raghuv-aditya's picture
Upload 24 files
9f21f05 verified
import heapq
import os
import re
from collections import Counter, defaultdict

import joblib
def preprocess_text(text):
    """
    Tokenize text into lowercase word tokens.

    The text is lowercased and every maximal run of word characters
    (regex ``\\w+``) becomes one token; everything else acts as a separator.

    Args:
        text (str): The raw text to tokenize.
    Returns:
        list: The tokens in order of appearance, duplicates included.
    """
    lowered = text.lower()
    return [match.group() for match in re.finditer(r'\w+', lowered)]
def create_inverted_index(wikipedia_dict):
    """
    Build an inverted index over a document collection.

    Args:
        wikipedia_dict (dict): Document IDs mapped to document text.
    Returns:
        defaultdict: Each token mapped to the set of document IDs whose
        text contains that token at least once.
    """
    postings = defaultdict(set)
    for doc_id, body in wikipedia_dict.items():
        # Each document contributes a term at most once, so deduplicate first.
        for term in set(preprocess_text(body)):
            postings[term].add(doc_id)
    return postings
def save_inverted_index(inverted_index, filepath="Baseline/inverted_index.pkl"):
    """
    Save the inverted index to a file using joblib.

    The parent directory of `filepath` is created when it does not exist yet,
    so saving to the default location works on a fresh checkout.

    Args:
        inverted_index (dict): The inverted index to persist.
        filepath: Destination passed to `joblib.dump`. Directory creation is
            only attempted for path-like values (str / os.PathLike); any other
            value (e.g. an open file object) is handed to joblib unchanged.
    """
    if isinstance(filepath, (str, os.PathLike)):
        parent_dir = os.path.dirname(filepath)
        # An empty dirname means "current directory": nothing to create.
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
    joblib.dump(inverted_index, filepath)
def load_inverted_index(filepath="Baseline/inverted_index.pkl"):
    """
    Load a previously saved inverted index using joblib.

    Args:
        filepath (str): Location of the saved index.
    Returns:
        The object stored at `filepath`, or None when nothing exists there.
    """
    if not os.path.exists(filepath):
        return None
    # NOTE: joblib.load unpickles the file, so only load index files that
    # come from a trusted source.
    return joblib.load(filepath)
def boolean_retrieval(queries_dict, inverted_index, wikipedia_dict, top_n=100):
    """
    Retrieve and rank documents for each query.

    A document is a candidate when it contains at least one query term
    (boolean OR over the inverted index). Candidates are ranked by the total
    number of occurrences of the query terms in the document; a term that is
    repeated in the query is counted once per repetition.

    Args:
        queries_dict (dict): Query IDs mapped to query text.
        inverted_index (dict): Each term mapped to the document IDs containing it.
        wikipedia_dict (dict): Document IDs mapped to document text; used to
            score the candidates.
        top_n (int): Maximum number of document IDs returned per query.
    Returns:
        dict: Query IDs mapped to a list of at most `top_n` document IDs,
        highest score first. Documents with equal scores are ordered by
        descending document ID, so document IDs must be mutually comparable.
    Raises:
        KeyError: If the index refers to a document ID that is missing from
            `wikipedia_dict`.
    """
    query_results = {}
    for query_id, query_text in queries_dict.items():
        query_tokens = preprocess_text(query_text)

        # Candidates: union of the posting sets of every query term.
        relevant_docs = set()
        for token in query_tokens:
            if token in inverted_index:
                relevant_docs.update(inverted_index[token])

        # Every candidate is scored, regardless of how many there are.
        doc_scores = []
        for doc_id in relevant_docs:
            # Count all terms of the document in a single pass instead of
            # rescanning its token list once per query term.
            term_counts = Counter(preprocess_text(wikipedia_dict[doc_id]))
            score = sum(term_counts[token] for token in query_tokens)
            doc_scores.append((score, doc_id))

        # Keep the `top_n` best (score, doc_id) pairs and drop the scores.
        top_docs = heapq.nlargest(top_n, doc_scores)
        query_results[query_id] = [doc_id for _, doc_id in top_docs]
    return query_results
# Main flow
def main_boolean_retrieval(wikipedia_dict, queries_dict):
    """
    Index the document collection and run every query against it.

    Args:
        wikipedia_dict (dict): Document IDs mapped to document text.
        queries_dict (dict): Query IDs mapped to query text.
    Returns:
        dict: The result of `boolean_retrieval` with its default `top_n`.
    """
    # The index is built in memory on every call; it is not saved to disk here.
    index = create_inverted_index(wikipedia_dict)
    return boolean_retrieval(queries_dict, index, wikipedia_dict)
def retrieve_single_query(query, wikipedia_dict, top_n=100, inverted_index_path="Baseline/inverted_index.pkl"):
    """
    Retrieve documents for a single query using the inverted index.

    The index is loaded from `inverted_index_path`; when nothing is stored
    there it is built from `wikipedia_dict` and saved for later calls.
    A loaded index is used as-is and is not checked against `wikipedia_dict`.

    Args:
        query (str): The query text.
        wikipedia_dict (dict): Document IDs mapped to document text.
        top_n (int): The number of top documents to retrieve.
        inverted_index_path (str): Path to the saved inverted index file.
    Returns:
        list: At most `top_n` document IDs matching the query, ranked exactly
        as `boolean_retrieval` ranks them.
    """
    # Load or create the inverted index
    inverted_index = load_inverted_index(inverted_index_path)
    if inverted_index is None:
        print("Inverted index not found. Creating one...")
        inverted_index = create_inverted_index(wikipedia_dict)
        save_inverted_index(inverted_index, inverted_index_path)

    # Run the query as a one-entry batch so that single-query and batch
    # retrieval share one implementation and cannot drift apart.
    single_query_id = "query"
    results = boolean_retrieval(
        {single_query_id: query}, inverted_index, wikipedia_dict, top_n=top_n
    )
    return results[single_query_id]
# Example usage:
# Assuming `wikipedia_dict` and `queries_dict` are already prepared
# top_results = main_boolean_retrieval(wikipedia_dict, queries_dict)
# print(top_results)