from collections import defaultdict
import re
import heapq
import os

import joblib


def preprocess_text(text):
    """
    Preprocess the text for tokenization.
    Removes special characters, lowercases, and splits into words.
    """
    return re.findall(r'\w+', text.lower())


def create_inverted_index(wikipedia_dict):
    """
    Create an inverted index from the document dictionary.

    Args:
        wikipedia_dict (dict): A dictionary with document IDs as keys and text as values.

    Returns:
        dict: An inverted index where each term maps to a set of document IDs containing it.
    """
    inverted_index = defaultdict(set)
    for doc_id, text in wikipedia_dict.items():
        tokens = set(preprocess_text(text))  # Unique tokens for each document
        for token in tokens:
            inverted_index[token].add(doc_id)
    return inverted_index


def save_inverted_index(inverted_index, filepath="Baseline/inverted_index.pkl"):
    """
    Save the inverted index to a file using joblib.
    """
    joblib.dump(inverted_index, filepath)


def load_inverted_index(filepath="Baseline/inverted_index.pkl"):
    """
    Load the inverted index from a file using joblib.
    Returns None if no index file exists at `filepath`.
    """
    if os.path.exists(filepath):
        return joblib.load(filepath)
    return None


def boolean_retrieval(queries_dict, inverted_index, wikipedia_dict, top_n=100):
    """
    Perform boolean (OR) retrieval for each query.

    Args:
        queries_dict (dict): A dictionary with query IDs as keys and query text as values.
        inverted_index (dict): The inverted index created from the document collection.
        wikipedia_dict (dict): The original document dictionary (used for scoring).
        top_n (int): The number of top documents to retrieve for each query.

    Returns:
        dict: A dictionary with query IDs as keys and a list of top document IDs as values.
    """
    query_results = {}
    for query_id, query_text in queries_dict.items():
        query_tokens = preprocess_text(query_text)

        # Collect all document IDs that contain any of the query terms
        relevant_docs = set()
        for token in query_tokens:
            if token in inverted_index:
                relevant_docs.update(inverted_index[token])

        # Rank the candidate documents by total frequency of the query terms
        doc_scores = []
        for doc_id in relevant_docs:
            doc_text = preprocess_text(wikipedia_dict[doc_id])
            score = sum(doc_text.count(token) for token in query_tokens)  # Term-frequency score
            doc_scores.append((score, doc_id))

        # Keep the `top_n` documents with the highest scores
        top_docs = heapq.nlargest(top_n, doc_scores)
        query_results[query_id] = [doc_id for _, doc_id in top_docs]
    return query_results


# Main flow
def main_boolean_retrieval(wikipedia_dict, queries_dict):
    # Step 1: Create inverted index
    inverted_index = create_inverted_index(wikipedia_dict)

    # Step 2: Perform boolean retrieval
    top_docs = boolean_retrieval(queries_dict, inverted_index, wikipedia_dict)
    return top_docs


def retrieve_single_query(query, wikipedia_dict, top_n=100,
                          inverted_index_path="Baseline/inverted_index.pkl"):
    """
    Retrieve documents for a single query using the inverted index.
    If the inverted index is not found on disk, it is created and saved.

    Args:
        query (str): The query text.
        wikipedia_dict (dict): The original document dictionary.
        top_n (int): The number of top documents to retrieve.
        inverted_index_path (str): Path to the saved inverted index file.

    Returns:
        list: A list of top document IDs matching the query.
    """
    # Load or create the inverted index
    inverted_index = load_inverted_index(inverted_index_path)
    if inverted_index is None:
        print("Inverted index not found. Creating one...")
        inverted_index = create_inverted_index(wikipedia_dict)
        save_inverted_index(inverted_index, inverted_index_path)

    # Preprocess the query
    query_tokens = preprocess_text(query)

    # Collect relevant documents
    relevant_docs = set()
    for token in query_tokens:
        if token in inverted_index:
            relevant_docs.update(inverted_index[token])

    # Rank documents by total frequency of the query terms
    doc_scores = []
    for doc_id in relevant_docs:
        doc_text = preprocess_text(wikipedia_dict[doc_id])
        score = sum(doc_text.count(token) for token in query_tokens)
        doc_scores.append((score, doc_id))

    # Keep the `top_n` documents with the highest scores
    top_docs = heapq.nlargest(top_n, doc_scores)
    return [doc_id for _, doc_id in top_docs]


# Example usage:
# Assuming `wikipedia_dict` and `queries_dict` are already prepared
# top_results = main_boolean_retrieval(wikipedia_dict, queries_dict)
# print(top_results)
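

# ---------------------------------------------------------------------------
# Minimal runnable sketch (illustration only). The toy corpus, queries, and
# the local index path below are made-up examples, not part of the pipeline;
# the default path assumes a "Baseline/" directory already exists.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Hypothetical toy documents standing in for `wikipedia_dict`
    toy_wikipedia_dict = {
        "doc1": "The quick brown fox jumps over the lazy dog.",
        "doc2": "Foxes are small to medium-sized omnivorous mammals.",
        "doc3": "Dogs were the first species to be domesticated.",
    }
    toy_queries_dict = {"q1": "fox", "q2": "domesticated dogs"}

    # Batch retrieval over all queries
    batch_results = main_boolean_retrieval(toy_wikipedia_dict, toy_queries_dict)
    print(batch_results)  # expected: {'q1': ['doc1'], 'q2': ['doc3']}

    # Single-query retrieval; uses a local index path so no directory is required
    single_result = retrieve_single_query(
        "brown fox", toy_wikipedia_dict, top_n=2,
        inverted_index_path="inverted_index.pkl",
    )
    print(single_result)  # expected: ['doc1']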