yash bhaskar committed on
Commit
5fa6e3c
·
1 Parent(s): e170b87

Adding RRF code

Browse files
Files changed (1) hide show
  1. Ranking/RRF/RRF_implementation.py +239 -0
Ranking/RRF/RRF_implementation.py ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from collections import defaultdict
4
+
5
def load_and_merge_json_files(directory_path):
    """
    Load every ``.json`` file in a directory and group its ranked lists by query.

    Each file is expected to contain a list of dictionaries mapping a query to a
    ranked list. The lists found for the same query in different files are kept
    separate (one list per file entry), in sorted-filename order.

    Files that cannot be read or parsed are reported with ``print`` and skipped;
    entries with an unexpected shape are reported and skipped individually.

    Args:
        directory_path (str): Path to the directory containing the JSON files.

    Returns:
        list: One single-key dictionary per query, ``{query: [rank_list, ...]}``,
        in order of first appearance of each query.
    """
    merged_queries = defaultdict(list)

    # os.listdir() returns names in arbitrary order. Sorting makes the order of
    # the per-query lists (and therefore tie-breaking in the fusion step)
    # reproducible across runs and platforms.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith('.json'):
            continue

        file_path = os.path.join(directory_path, filename)
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                json_data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"Error reading {filename}: {e}")
            continue

        if not isinstance(json_data, list):
            print(f"Error reading {filename}: expected a list of dictionaries but got {type(json_data)}")
            continue

        # For each file, add the lists to the corresponding query
        for query_data in json_data:
            if not isinstance(query_data, dict):
                print(f"Warning: Expected a dictionary entry in {filename} but got {type(query_data)}")
                continue
            for query, rank_list in query_data.items():
                if isinstance(rank_list, list):  # Ensure rank_list is a list
                    merged_queries[query].append(rank_list)
                else:
                    print(f"Warning: Expected a list for query '{query}' but got {type(rank_list)}")

    # Convert the grouping into a list of single-key dictionaries
    return [{query: lists} for query, lists in merged_queries.items()]
37
+
38
def reciprocal_rank_fusion(json_input, K=60, top_n=100):
    """
    Fuse rankings from multiple IR systems for multiple queries using
    Reciprocal Rank Fusion (RRF).

    For every query, each document receives ``1 / (rank + K)`` from every
    system list it appears in (ranks start at 1), and documents are ordered by
    the sum of these contributions. A document repeated inside a single
    system's list is counted once, at its first (best) position, so that one
    system cannot contribute more than once per document.

    Args:
        json_input (list): A list of dictionaries where keys are queries, and
            values are lists of ranked document lists from different systems.
        K (int): A constant used in the RRF formula (default is 60).
        top_n (int): Number of top results to return for each query.

    Returns:
        list: A list of single-key dictionaries ``{query: fused_ranking}``, one
        per query entry in the input, in input order. Documents with equal
        scores keep the order in which they were first encountered.
    """
    query_fusion_results = []

    # Iterate over each query in the JSON input
    for query_data in json_input:
        for query, list_of_ranked_docs in query_data.items():
            rrf_map = defaultdict(float)

            # Fuse rankings for the query using RRF
            for rank_list in list_of_ranked_docs:
                # dict.fromkeys keeps the first occurrence of each document in
                # order, dropping repeats that would otherwise inflate scores.
                for rank, doc in enumerate(dict.fromkeys(rank_list), 1):
                    rrf_map[doc] += 1 / (rank + K)

            # sorted() is stable, so ties keep first-seen order.
            sorted_docs = sorted(rrf_map.items(), key=lambda x: x[1], reverse=True)
            fused_rankings = [doc for doc, _ in sorted_docs[:top_n]]  # Keep only top N results

            # Store the results for the current query
            query_fusion_results.append({query: fused_rankings})

    return query_fusion_results
70
+
71
def save_to_json(output_data, output_file_path):
    """
    Write the RRF results to a JSON file, pretty-printed with a 2-space indent.

    Args:
        output_data (list): The processed data to save.
        output_file_path (str): Path to the output JSON file (overwritten if
            it already exists).
    """
    encoder = json.JSONEncoder(indent=2)
    with open(output_file_path, 'w') as out_file:
        # Stream the encoded chunks straight to the file, as json.dump does.
        out_file.writelines(encoder.iterencode(output_data))
81
+
82
+ # # Example usage
83
+ # directory_path = "Modified_1_2" # Replace with your directory path
84
+ # output_file_path = "Modified_1_2/rrf_1_2_modified.json" # Replace with your desired output file path
85
+
86
+ # # Load and merge JSON files
87
+ # merged_input = load_and_merge_json_files(directory_path)
88
+
89
+ # print(merged_input[0]["5xvggq"])
90
+
91
+ # # Perform RRF on the merged input, keeping only the top 100 results
92
+ # combined_results = reciprocal_rank_fusion(merged_input, top_n=100)
93
+
94
+ # # Save the combined results to a JSON file
95
+ # save_to_json(combined_results, output_file_path)
96
+
97
+ # print(f"Combined results saved to {output_file_path}")
98
+
99
+
100
def reciprocal_rank_fusion_two(rank_list1, rank_list2, K=60, top_n=100):
    """
    Perform Reciprocal Rank Fusion (RRF) for two ranking lists.

    Each document receives ``1 / (rank + K)`` from every list it appears in
    (ranks start at 1). A document repeated inside one list is counted once,
    at its first position, so a single list cannot over-weight a document.

    Args:
        rank_list1 (list): First list of ranked documents.
        rank_list2 (list): Second list of ranked documents.
        K (int): A constant used in the RRF formula (default is 60).
        top_n (int): Number of top results to return (default is 100).

    Returns:
        list: Combined list of rankings after applying RRF, best first.
        Documents with equal scores keep first-seen order.
    """
    rrf_map = defaultdict(float)

    # One loop over both lists instead of a copy-pasted loop per list.
    for rank_list in (rank_list1, rank_list2):
        # dict.fromkeys drops repeated documents while preserving order.
        for rank, doc in enumerate(dict.fromkeys(rank_list), 1):  # Start ranks from 1
            rrf_map[doc] += 1 / (rank + K)

    # Sort the documents based on RRF scores in descending order (stable sort)
    sorted_docs = sorted(rrf_map.items(), key=lambda x: x[1], reverse=True)

    # Return only the top N results
    return [doc for doc, _ in sorted_docs[:top_n]]
128
+
129
+
130
def reciprocal_rank_fusion_three(rank_list1, rank_list2, rank_list3, K=60, top_n=100):
    """
    Perform Reciprocal Rank Fusion (RRF) for three ranking lists.

    Each document receives ``1 / (rank + K)`` from every list it appears in
    (ranks start at 1). A document repeated inside one list is counted once,
    at its first position, so a single list cannot over-weight a document.

    Args:
        rank_list1 (list): First list of ranked documents.
        rank_list2 (list): Second list of ranked documents.
        rank_list3 (list): Third list of ranked documents.
        K (int): A constant used in the RRF formula (default is 60).
        top_n (int): Number of top results to return (default is 100).

    Returns:
        list: Combined list of rankings after applying RRF, best first.
        Documents with equal scores keep first-seen order.
    """
    rrf_map = defaultdict(float)

    # One loop over all three lists instead of a copy-pasted loop per list.
    for rank_list in (rank_list1, rank_list2, rank_list3):
        # dict.fromkeys drops repeated documents while preserving order.
        for rank, doc in enumerate(dict.fromkeys(rank_list), 1):  # Start ranks from 1
            rrf_map[doc] += 1 / (rank + K)

    # Sort the documents based on RRF scores in descending order (stable sort)
    sorted_docs = sorted(rrf_map.items(), key=lambda x: x[1], reverse=True)

    # Return only the top N results
    return [doc for doc, _ in sorted_docs[:top_n]]
163
+
164
+
165
def reciprocal_rank_fusion_six(rank_list1, rank_list2, rank_list3, rank_list4, rank_list5, rank_list6, K=60, top_n=100):
    """
    Perform Reciprocal Rank Fusion (RRF) for six ranking lists.

    Each document receives ``1 / (rank + K)`` from every list it appears in
    (ranks start at 1). A document repeated inside one list is counted once,
    at its first position, so a single list cannot over-weight a document.

    Args:
        rank_list1 (list): First list of ranked documents.
        rank_list2 (list): Second list of ranked documents.
        rank_list3 (list): Third list of ranked documents.
        rank_list4 (list): Fourth list of ranked documents.
        rank_list5 (list): Fifth list of ranked documents.
        rank_list6 (list): Sixth list of ranked documents.
        K (int): A constant used in the RRF formula (default is 60).
        top_n (int): Number of top results to return (default is 100).

    Returns:
        list: Combined list of rankings after applying RRF, best first.
        Documents with equal scores keep first-seen order.
    """
    rrf_map = defaultdict(float)
    all_lists = (rank_list1, rank_list2, rank_list3, rank_list4, rank_list5, rank_list6)

    # One loop over all six lists instead of a copy-pasted loop per list.
    for rank_list in all_lists:
        # dict.fromkeys drops repeated documents while preserving order.
        for rank, doc in enumerate(dict.fromkeys(rank_list), 1):  # Start ranks from 1
            rrf_map[doc] += 1 / (rank + K)

    # Sort the documents based on RRF scores in descending order (stable sort)
    sorted_docs = sorted(rrf_map.items(), key=lambda x: x[1], reverse=True)

    # Return only the top N results
    return [doc for doc, _ in sorted_docs[:top_n]]
203
+
204
+
205
def reciprocal_rank_fusion_multiple_lists(ranking_lists, K=60, top_n=100):
    """
    Perform Reciprocal Rank Fusion (RRF) for multiple ranking lists for each query.

    Rankings for the same query ID are gathered across all systems, document
    identifiers are normalised with ``str()``, and each document receives
    ``1 / (rank + K)`` from every list it appears in (ranks start at 1). A
    document repeated inside one list (after normalisation) is counted once,
    at its first position.

    Args:
        ranking_lists (list of list of dict): Each element is a list of
            dictionaries, where each dictionary maps query IDs to ranked lists.
        K (int): A constant used in the RRF formula (default is 60).
        top_n (int): Number of top results to return for each query (default is 100).

    Returns:
        dict: A dictionary with query IDs as keys and their combined rankings
        (lists of ``str`` document identifiers, best first) as values.
        Documents with equal scores keep first-seen order.
    """
    # Group every ranked list under its query ID
    merged_rankings = defaultdict(list)
    for ranking_list in ranking_lists:
        for ranking_dict in ranking_list:
            for query_id, doc_list in ranking_dict.items():
                merged_rankings[query_id].append(doc_list)

    combined_results = {}

    # Apply RRF for each query
    for query_id, ranked_lists in merged_rankings.items():
        rrf_map = defaultdict(float)

        # Process rankings for each system
        for rank_list in ranked_lists:
            # Normalise IDs first, then drop repeats while preserving order so
            # that e.g. 1 and "1" in the same list are scored only once.
            distinct_docs = dict.fromkeys(str(doc) for doc in rank_list)
            for rank, doc in enumerate(distinct_docs, 1):  # Start rank from 1
                rrf_map[doc] += 1 / (rank + K)

        # Sort documents based on their RRF scores in descending order (stable sort)
        sorted_docs = sorted(rrf_map.items(), key=lambda x: x[1], reverse=True)
        combined_results[query_id] = [doc for doc, _ in sorted_docs[:top_n]]

    return combined_results