|
import json
import os
from collections import defaultdict
|
|
|
def load_and_merge_json_files(directory_path):
    """
    Load every ``*.json`` file in a directory and group its rankings by query.

    Each file is expected to contain a list of dictionaries that map a query
    to a ranked list of documents. Ranked lists found for the same query in
    different files are kept as separate lists, one per occurrence.

    Files are visited in sorted filename order so that the order of the
    per-query lists does not depend on the arbitrary order returned by
    ``os.listdir``. Files that cannot be read or parsed, and entries that do
    not have the expected shape, are reported with ``print`` and skipped.

    Args:
        directory_path (str): Path to the directory containing the JSON files.

    Returns:
        list: One single-key dictionary per query, mapping the query to the
            list of ranked lists collected for it.
    """
    merged_queries = defaultdict(list)

    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith('.json'):
            continue

        file_path = os.path.join(directory_path, filename)

        # Only reading and parsing are guarded; JSONDecodeError and
        # UnicodeDecodeError are both ValueError subclasses.
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                json_data = json.load(f)
        except (OSError, ValueError) as e:
            print(f"Error reading {filename}: {e}")
            continue

        if not isinstance(json_data, list):
            print(f"Error reading {filename}: expected a list of query dictionaries but got {type(json_data)}")
            continue

        for query_data in json_data:
            # A malformed entry is skipped on its own so the remaining
            # entries of the same file are still merged.
            if not isinstance(query_data, dict):
                print(f"Warning: Expected a dictionary entry in {filename} but got {type(query_data)}")
                continue

            for query, rank_list in query_data.items():
                if isinstance(rank_list, list):
                    merged_queries[query].append(rank_list)
                else:
                    print(f"Warning: Expected a list for query '{query}' but got {type(rank_list)}")

    return [{query: lists} for query, lists in merged_queries.items()]
|
|
|
def reciprocal_rank_fusion(json_input, K=60, top_n=100):
    """
    Fuse the rankings of several IR systems, query by query, with Reciprocal Rank Fusion.

    For every query, each document receives ``1 / (rank + K)`` from every
    ranked list it appears in (ranks start at 1); the contributions are
    summed and documents are ordered by descending total score.

    Args:
        json_input (list): A list of dictionaries where keys are queries and
            values are lists of ranked document lists, one per system.
        K (int): Constant added to the rank in the RRF denominator (default is 60).
        top_n (int): Number of top results to keep for each query.

    Returns:
        list: One single-key dictionary per query entry, mapping the query to
            its fused document ranking.
    """
    query_fusion_results = []

    for query_data in json_input:
        for query, list_of_ranked_docs in query_data.items():
            rrf_map = defaultdict(float)

            for rank_list in list_of_ranked_docs:
                for rank, doc in enumerate(rank_list, 1):
                    rrf_map[doc] += 1 / (rank + K)

            # sorted() is stable even with reverse=True, so documents with
            # equal scores keep the order in which they were first seen.
            fused_rankings = sorted(rrf_map, key=rrf_map.get, reverse=True)[:top_n]

            query_fusion_results.append({query: fused_rankings})

    return query_fusion_results
|
|
|
def save_to_json(output_data, output_file_path):
    """
    Write the given data to a JSON file, indented by two spaces.

    The file is opened with an explicit UTF-8 encoding so the result does not
    depend on the platform's default text encoding. An existing file at the
    same path is overwritten.

    Args:
        output_data (list): The processed data to save.
        output_file_path (str): Path to the output JSON file.
    """
    with open(output_file_path, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def reciprocal_rank_fusion_two(rank_list1, rank_list2, K=60, top_n=100):
    """
    Perform Reciprocal Rank Fusion (RRF) for two ranking lists.

    Each document receives ``1 / (rank + K)`` from every list it appears in
    (ranks start at 1); documents are returned by descending total score.

    Args:
        rank_list1 (list): First list of ranked documents.
        rank_list2 (list): Second list of ranked documents.
        K (int): Constant added to the rank in the RRF denominator (default is 60).
        top_n (int): Number of top results to return (default is 100).

    Returns:
        list: Combined list of documents after applying RRF.
    """
    rrf_map = defaultdict(float)

    # Lists are processed in argument order so score accumulation, and the
    # first-seen order used to break ties, match per-list processing.
    for rank_list in (rank_list1, rank_list2):
        for rank, doc in enumerate(rank_list, 1):
            rrf_map[doc] += 1 / (rank + K)

    # sorted() is stable, so equal scores keep first-seen order.
    return sorted(rrf_map, key=rrf_map.get, reverse=True)[:top_n]
|
|
|
|
|
def reciprocal_rank_fusion_three(rank_list1, rank_list2, rank_list3, K=60, top_n=100):
    """
    Perform Reciprocal Rank Fusion (RRF) for three ranking lists.

    Each document receives ``1 / (rank + K)`` from every list it appears in
    (ranks start at 1); documents are returned by descending total score.

    Args:
        rank_list1 (list): First list of ranked documents.
        rank_list2 (list): Second list of ranked documents.
        rank_list3 (list): Third list of ranked documents.
        K (int): Constant added to the rank in the RRF denominator (default is 60).
        top_n (int): Number of top results to return (default is 100).

    Returns:
        list: Combined list of documents after applying RRF.
    """
    rrf_map = defaultdict(float)

    # Lists are processed in argument order so score accumulation, and the
    # first-seen order used to break ties, match per-list processing.
    for rank_list in (rank_list1, rank_list2, rank_list3):
        for rank, doc in enumerate(rank_list, 1):
            rrf_map[doc] += 1 / (rank + K)

    # sorted() is stable, so equal scores keep first-seen order.
    return sorted(rrf_map, key=rrf_map.get, reverse=True)[:top_n]
|
|
|
|
|
def reciprocal_rank_fusion_six(rank_list1, rank_list2, rank_list3, rank_list4, rank_list5, rank_list6, K=60, top_n=100):
    """
    Perform Reciprocal Rank Fusion (RRF) for six ranking lists.

    Each document receives ``1 / (rank + K)`` from every list it appears in
    (ranks start at 1); documents are returned by descending total score.

    Args:
        rank_list1 (list): First list of ranked documents.
        rank_list2 (list): Second list of ranked documents.
        rank_list3 (list): Third list of ranked documents.
        rank_list4 (list): Fourth list of ranked documents.
        rank_list5 (list): Fifth list of ranked documents.
        rank_list6 (list): Sixth list of ranked documents.
        K (int): Constant added to the rank in the RRF denominator (default is 60).
        top_n (int): Number of top results to return (default is 100).

    Returns:
        list: Combined list of documents after applying RRF.
    """
    rrf_map = defaultdict(float)

    # Lists are processed in argument order so score accumulation, and the
    # first-seen order used to break ties, match per-list processing.
    all_rank_lists = (
        rank_list1,
        rank_list2,
        rank_list3,
        rank_list4,
        rank_list5,
        rank_list6,
    )
    for rank_list in all_rank_lists:
        for rank, doc in enumerate(rank_list, 1):
            rrf_map[doc] += 1 / (rank + K)

    # sorted() is stable, so equal scores keep first-seen order.
    return sorted(rrf_map, key=rrf_map.get, reverse=True)[:top_n]
|
|
|
|
|
def reciprocal_rank_fusion_multiple_lists(ranking_lists, K=60, top_n=100):
    """
    Perform Reciprocal Rank Fusion (RRF) for multiple ranking lists for each query.

    The ranked lists of every query ID are first gathered across all inputs;
    each document then receives ``1 / (rank + K)`` from every list it appears
    in (ranks start at 1). Documents are keyed by ``str(doc)``, so the
    returned identifiers are always strings and values with the same string
    form (for example ``1`` and ``"1"``) are scored as one document.

    Args:
        ranking_lists (list of list of dict): Each element is a list of
            dictionaries, where each dictionary maps query IDs to ranked lists.
        K (int): Constant added to the rank in the RRF denominator (default is 60).
        top_n (int): Number of top results to return for each query (default is 100).

    Returns:
        dict: A dictionary with query IDs as keys and their combined rankings as values.
    """
    # Gather, per query ID, every ranked list in the order it was supplied.
    merged_rankings = defaultdict(list)
    for ranking_list in ranking_lists:
        for ranking_dict in ranking_list:
            for query_id, doc_list in ranking_dict.items():
                merged_rankings[query_id].append(doc_list)

    combined_results = {}
    for query_id, ranked_lists in merged_rankings.items():
        rrf_map = defaultdict(float)

        for rank_list in ranked_lists:
            for rank, doc in enumerate(rank_list, 1):
                rrf_map[str(doc)] += 1 / (rank + K)

        # sorted() is stable, so equal scores keep first-seen order.
        combined_results[query_id] = sorted(rrf_map, key=rrf_map.get, reverse=True)[:top_n]

    return combined_results