Spaces:

yash9439
/

MultiAgent-QnA-ChatBot

Sleeping

App Files Files Community

yash bhaskar commited on Nov 21, 2024

Commit

5fa6e3c

1 Parent(s): e170b87

Adding RRF code

Browse files

Files changed (1) hide show

Ranking/RRF/RRF_implementation.py +239 -0

Ranking/RRF/RRF_implementation.py ADDED Viewed

	@@ -0,0 +1,239 @@

+import json
+import os
+from collections import defaultdict
+def load_and_merge_json_files(directory_path):
+    """
+    Load and merge JSON files from a directory into a single structure, keeping each list from different files separate for each query.
+    Args:
+    directory_path (str): Path to the directory containing the JSON files.
+    Returns:
+    list: Merged list of dictionaries, keeping separate lists for each query.
+    """
+    merged_queries = defaultdict(list)
+    # Iterate through all files in the directory
+    for filename in os.listdir(directory_path):
+        if filename.endswith('.json'):
+            file_path = os.path.join(directory_path, filename)
+            try:
+                with open(file_path, 'r') as f:
+                    json_data = json.load(f)
+                    # For each file, add the lists to the corresponding query
+                    for query_data in json_data:
+                        for query, rank_list in query_data.items():
+                            if isinstance(rank_list, list):  # Ensure rank_list is a list
+                                merged_queries[query].append(rank_list)
+                            else:
+                                print(f"Warning: Expected a list for query '{query}' but got {type(rank_list)}")
+            except Exception as e:
+                print(f"Error reading {filename}: {e}")
+    # Convert defaultdict to a list of dictionaries
+    return [{query: lists} for query, lists in merged_queries.items()]
+def reciprocal_rank_fusion(json_input, K=60, top_n=100):
+    """
+    Fuse rank from multiple IR systems for multiple queries using Reciprocal Rank Fusion.
+    Args:
+    json_input (list): A list of dictionaries where keys are queries, and values are ranked document lists from different systems.
+    K (int): A constant used in the RRF formula (default is 60).
+    top_n (int): Number of top results to return for each query.
+    Returns:
+    list: A list of dictionaries with each query and its respective fused document rankings.
+    """
+    query_fusion_results = []
+    # Iterate over each query in the JSON input
+    for query_data in json_input:
+        for query, list_of_ranked_docs in query_data.items():
+            rrf_map = defaultdict(float)
+            # Fuse rankings for the query using RRF
+            for rank_list in list_of_ranked_docs:
+                for rank, doc in enumerate(rank_list, 1):
+                    rrf_map[doc] += 1 / (rank + K)
+            # Sort the documents based on RRF scores in descending order
+            sorted_docs = sorted(rrf_map.items(), key=lambda x: x[1], reverse=True)
+            fused_rankings = [doc for doc, score in sorted_docs[:top_n]]  # Keep only top N results
+            # Store the results for the current query
+            query_fusion_results.append({query: fused_rankings})
+    return query_fusion_results
+def save_to_json(output_data, output_file_path):
+    """
+    Save the RRF results to a JSON file in the same format as the input.
+    Args:
+    output_data (list): The processed data to save.
+    output_file_path (str): Path to the output JSON file.
+    """
+    with open(output_file_path, 'w') as f:
+        json.dump(output_data, f, indent=2)
+# # Example usage
+# directory_path = "Modified_1_2"  # Replace with your directory path
+# output_file_path = "Modified_1_2/rrf_1_2_modified.json"  # Replace with your desired output file path
+# # Load and merge JSON files
+# merged_input = load_and_merge_json_files(directory_path)
+# print(merged_input[0]["5xvggq"])
+# # Perform RRF on the merged input, keeping only the top 100 results
+# combined_results = reciprocal_rank_fusion(merged_input, top_n=100)
+# # Save the combined results to a JSON file
+# save_to_json(combined_results, output_file_path)
+# print(f"Combined results saved to {output_file_path}")
+def reciprocal_rank_fusion_two(rank_list1, rank_list2, K=60, top_n=100):
+    """
+    Perform Reciprocal Rank Fusion (RRF) for two ranking lists.
+    Args:
+    rank_list1 (list): First list of ranked documents.
+    rank_list2 (list): Second list of ranked documents.
+    K (int): A constant used in the RRF formula (default is 60).
+    top_n (int): Number of top results to return (default is 100).
+    Returns:
+    list: Combined list of rankings after applying RRF.
+    """
+    rrf_map = defaultdict(float)
+    # Process the first ranking list
+    for rank, doc in enumerate(rank_list1, 1):  # Start ranks from 1
+        rrf_map[doc] += 1 / (rank + K)
+    # Process the second ranking list
+    for rank, doc in enumerate(rank_list2, 1):  # Start ranks from 1
+        rrf_map[doc] += 1 / (rank + K)
+    # Sort the documents based on RRF scores in descending order
+    sorted_docs = sorted(rrf_map.items(), key=lambda x: x[1], reverse=True)
+    # Return only the top N results
+    return [doc for doc, score in sorted_docs[:top_n]]
+def reciprocal_rank_fusion_three(rank_list1, rank_list2, rank_list3, K=60, top_n=100):
+    """
+    Perform Reciprocal Rank Fusion (RRF) for three ranking lists.
+    Args:
+    rank_list1 (list): First list of ranked documents.
+    rank_list2 (list): Second list of ranked documents.
+    rank_list3 (list): Third list of ranked documents.
+    K (int): A constant used in the RRF formula (default is 60).
+    top_n (int): Number of top results to return (default is 100).
+    Returns:
+    list: Combined list of rankings after applying RRF.
+    """
+    rrf_map = defaultdict(float)
+    # Process the first ranking list
+    for rank, doc in enumerate(rank_list1, 1):  # Start ranks from 1
+        rrf_map[doc] += 1 / (rank + K)
+    # Process the second ranking list
+    for rank, doc in enumerate(rank_list2, 1):  # Start ranks from 1
+        rrf_map[doc] += 1 / (rank + K)
+    # Process the third ranking list
+    for rank, doc in enumerate(rank_list3, 1):  # Start ranks from 1
+        rrf_map[doc] += 1 / (rank + K)
+    # Sort the documents based on RRF scores in descending order
+    sorted_docs = sorted(rrf_map.items(), key=lambda x: x[1], reverse=True)
+    # Return only the top N results
+    return [doc for doc, score in sorted_docs[:top_n]]
+def reciprocal_rank_fusion_six(rank_list1, rank_list2, rank_list3, rank_list4, rank_list5, rank_list6, K=60, top_n=100):
+    """
+    Perform Reciprocal Rank Fusion (RRF) for six ranking lists.
+    Args:
+    rank_list1 (list): First list of ranked documents.
+    rank_list2 (list): Second list of ranked documents.
+    rank_list3 (list): Third list of ranked documents.
+    rank_list4 (list): Fourth list of ranked documents.
+    rank_list5 (list): Fifth list of ranked documents.
+    rank_list6 (list): Sixth list of ranked documents.
+    K (int): A constant used in the RRF formula (default is 60).
+    top_n (int): Number of top results to return (default is 100).
+    Returns:
+    list: Combined list of rankings after applying RRF.
+    """
+    rrf_map = defaultdict(float)
+    # Process each ranking list
+    for rank, doc in enumerate(rank_list1, 1):
+        rrf_map[doc] += 1 / (rank + K)
+    for rank, doc in enumerate(rank_list2, 1):
+        rrf_map[doc] += 1 / (rank + K)
+    for rank, doc in enumerate(rank_list3, 1):
+        rrf_map[doc] += 1 / (rank + K)
+    for rank, doc in enumerate(rank_list4, 1):
+        rrf_map[doc] += 1 / (rank + K)
+    for rank, doc in enumerate(rank_list5, 1):
+        rrf_map[doc] += 1 / (rank + K)
+    for rank, doc in enumerate(rank_list6, 1):
+        rrf_map[doc] += 1 / (rank + K)
+    # Sort the documents based on RRF scores in descending order
+    sorted_docs = sorted(rrf_map.items(), key=lambda x: x[1], reverse=True)
+    # Return only the top N results
+    return [doc for doc, score in sorted_docs[:top_n]]
+def reciprocal_rank_fusion_multiple_lists(ranking_lists, K=60, top_n=100):
+    """
+    Perform Reciprocal Rank Fusion (RRF) for multiple ranking lists for each query.
+    Args:
+    ranking_lists (list of list of dict): Each element is a list of dictionaries, where each dictionary contains query IDs and ranked lists.
+    K (int): A constant used in the RRF formula (default is 60).
+    top_n (int): Number of top results to return for each query (default is 100).
+    Returns:
+    dict: A dictionary with query IDs as keys and their combined rankings as values.
+    """
+    combined_results = defaultdict(list)
+    # Flatten all ranking lists into a single dictionary per query
+    merged_rankings = defaultdict(list)
+    for ranking_list in ranking_lists:
+        for ranking_dict in ranking_list:
+            for query_id, doc_list in ranking_dict.items():
+                merged_rankings[query_id].append(doc_list)
+    # Apply RRF for each query
+    for query_id, ranked_lists in merged_rankings.items():
+        rrf_map = defaultdict(float)
+        # Process rankings for each system
+        for rank_list in ranked_lists:
+            for rank, doc in enumerate(rank_list, 1):  # Start rank from 1
+                rrf_map[str(doc)] += 1 / (rank + K)
+        # Sort documents based on their RRF scores in descending order
+        sorted_docs = sorted(rrf_map.items(), key=lambda x: x[1], reverse=True)
+        combined_results[query_id] = [doc for doc, score in sorted_docs[:top_n]]
+    return dict(combined_results)