from collections import defaultdict
import re
import heapq
import joblib
import os


def preprocess_text(text):
    """
    Preprocess the text for tokenization.
    Removes special characters, lowercases, and splits into words.
    """
    return re.findall(r'\w+', text.lower())
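# Illustrative behaviour on a made-up string (punctuation is dropped by \w+):
# preprocess_text("Boolean Retrieval, 101!") -> ['boolean', 'retrieval', '101']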


def create_inverted_index(wikipedia_dict):
    """
    Create an inverted index from the document dictionary.

    Args:
        wikipedia_dict (dict): A dictionary with document IDs as keys and text as values.

    Returns:
        dict: An inverted index mapping each term to the set of document IDs containing it.
    """
    inverted_index = defaultdict(set)
    for doc_id, text in wikipedia_dict.items():
        tokens = set(preprocess_text(text))  # Unique tokens per document
        for token in tokens:
            inverted_index[token].add(doc_id)
    return inverted_index
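# Illustrative example on a hypothetical two-document corpus (made up for
# demonstration; the return value is a defaultdict of sets):
# create_inverted_index({1: "the cat sat", 2: "the dog ran"})
# -> {'the': {1, 2}, 'cat': {1}, 'sat': {1}, 'dog': {2}, 'ran': {2}}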


def save_inverted_index(inverted_index, filepath="Baseline/inverted_index.pkl"):
    """
    Save the inverted index to a file using joblib.
    """
    # Create the target directory if it does not exist yet
    os.makedirs(os.path.dirname(filepath) or ".", exist_ok=True)
    joblib.dump(inverted_index, filepath)


def load_inverted_index(filepath="Baseline/inverted_index.pkl"):
    """
    Load the inverted index from a file using joblib.
    Returns None if the file does not exist.
    """
    if os.path.exists(filepath):
        return joblib.load(filepath)
    return None


def boolean_retrieval(queries_dict, inverted_index, wikipedia_dict, top_n=100):
    """
    Perform boolean (OR) retrieval for each query: a document matches if it
    contains any query term, and matches are ranked by term frequency.

    Args:
        queries_dict (dict): A dictionary with query IDs as keys and query text as values.
        inverted_index (dict): The inverted index created from the document collection.
        wikipedia_dict (dict): The original document dictionary (used for term-frequency scoring).
        top_n (int): The number of top documents to retrieve for each query.

    Returns:
        dict: A dictionary with query IDs as keys and a list of top document IDs as values.
    """
    query_results = {}
    for query_id, query_text in queries_dict.items():
        query_tokens = preprocess_text(query_text)
        # Collect all document IDs that contain any of the query terms
        relevant_docs = set()
        for token in query_tokens:
            if token in inverted_index:
                relevant_docs.update(inverted_index[token])
        # Rank candidate documents by total frequency of the query terms
        doc_scores = []
        for doc_id in relevant_docs:
            doc_text = preprocess_text(wikipedia_dict[doc_id])
            score = sum(doc_text.count(token) for token in query_tokens)
            doc_scores.append((score, doc_id))
        # Keep the `top_n` highest-scoring documents
        top_docs = heapq.nlargest(top_n, doc_scores)
        query_results[query_id] = [doc_id for _, doc_id in top_docs]
    return query_results
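# Illustrative output shape, assuming hypothetical inputs (not project data):
# boolean_retrieval({"q1": "cat"}, index, docs, top_n=2) -> {"q1": [doc_a, doc_b]}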


# Main flow
def main_boolean_retrieval(wikipedia_dict, queries_dict):
    """
    Build the inverted index and run boolean retrieval for all queries.
    """
    # Step 1: Create the inverted index
    inverted_index = create_inverted_index(wikipedia_dict)
    # Step 2: Perform boolean retrieval
    top_docs = boolean_retrieval(queries_dict, inverted_index, wikipedia_dict)
    return top_docs


def retrieve_single_query(query, wikipedia_dict, top_n=100, inverted_index_path="Baseline/inverted_index.pkl"):
    """
    Retrieve documents for a single query using the inverted index.
    If the inverted index is not found, it will be created and saved.

    Args:
        query (str): The query text.
        wikipedia_dict (dict): The original document dictionary.
        top_n (int): The number of top documents to retrieve.
        inverted_index_path (str): Path to the saved inverted index file.

    Returns:
        list: A list of top document IDs matching the query.
    """
    # Load the inverted index, or create and cache it on first use
    inverted_index = load_inverted_index(inverted_index_path)
    if inverted_index is None:
        print("Inverted index not found. Creating one...")
        inverted_index = create_inverted_index(wikipedia_dict)
        save_inverted_index(inverted_index, inverted_index_path)
    # Preprocess the query
    query_tokens = preprocess_text(query)
    # Collect documents containing any query term
    relevant_docs = set()
    for token in query_tokens:
        if token in inverted_index:
            relevant_docs.update(inverted_index[token])
    # Rank documents by total frequency of the query terms
    doc_scores = []
    for doc_id in relevant_docs:
        doc_text = preprocess_text(wikipedia_dict[doc_id])
        score = sum(doc_text.count(token) for token in query_tokens)
        doc_scores.append((score, doc_id))
    # Keep the `top_n` highest-scoring documents
    top_docs = heapq.nlargest(top_n, doc_scores)
    return [doc_id for _, doc_id in top_docs]


# Example usage:
# Assuming `wikipedia_dict` and `queries_dict` are already prepared
# top_results = main_boolean_retrieval(wikipedia_dict, queries_dict)
# print(top_results)
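

# A minimal smoke test on a tiny made-up corpus. Everything below is
# illustrative only: the documents and queries are hypothetical, not
# project data.
if __name__ == "__main__":
    toy_wikipedia_dict = {
        1: "The cat sat on the mat.",
        2: "Dogs and cats are common pets.",
        3: "Information retrieval ranks documents for a query.",
    }
    toy_queries_dict = {"q1": "cat", "q2": "document retrieval"}

    # Batch retrieval over all queries
    print(main_boolean_retrieval(toy_wikipedia_dict, toy_queries_dict))

    # Single-query retrieval; builds and caches the index on first use
    print(retrieve_single_query("cat pets", toy_wikipedia_dict, top_n=2))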