|
import heapq
import os
import re
from collections import Counter, defaultdict

import joblib
|
|
|
def preprocess_text(text):
    """
    Tokenize text into lowercase word tokens.

    The input is lowercased and every maximal run of regex word characters
    becomes one token; all other characters only act as separators and are
    dropped.

    Args:
        text (str): The raw text to tokenize.

    Returns:
        list: The tokens in order of appearance (duplicates are kept).
    """
    lowered = text.lower()
    word_pattern = re.compile(r'\w+')
    return word_pattern.findall(lowered)
|
|
|
def create_inverted_index(wikipedia_dict):
    """
    Build a term -> document-IDs index over a document collection.

    Args:
        wikipedia_dict (dict): Maps each document ID to that document's text.

    Returns:
        defaultdict: Maps each token produced by ``preprocess_text`` to the
            set of document IDs whose text contains that token.
    """
    index = defaultdict(set)

    for doc_id, body in wikipedia_dict.items():
        # Collapse repeated tokens so each (term, doc) pair is added once.
        unique_terms = set(preprocess_text(body))
        for term in unique_terms:
            postings = index[term]
            postings.add(doc_id)

    return index
|
|
|
def save_inverted_index(inverted_index, filepath="Baseline/inverted_index.pkl"):
    """
    Save the inverted index to a file using joblib.

    The parent directory of ``filepath`` is created first when it does not
    exist, so saving to the default ``Baseline/`` location does not fail
    just because that directory is missing.

    Args:
        inverted_index: The index object to persist.
        filepath: Destination passed to ``joblib.dump``.

    Returns:
        None
    """
    # joblib.dump also accepts an already-open file object; only path-like
    # targets have a parent directory that may need creating.
    if isinstance(filepath, (str, os.PathLike)):
        parent_dir = os.path.dirname(filepath)
        if parent_dir:  # empty when the file goes into the current directory
            os.makedirs(parent_dir, exist_ok=True)

    joblib.dump(inverted_index, filepath)
|
|
|
def load_inverted_index(filepath="Baseline/inverted_index.pkl"):
    """
    Read a previously saved inverted index back from disk with joblib.

    Args:
        filepath: Location of the saved index file.

    Returns:
        The object stored in the file, or None when ``filepath`` does not
        exist.
    """
    if not os.path.exists(filepath):
        return None

    # NOTE: joblib.load unpickles the file, so only load index files that
    # come from a trusted source.
    return joblib.load(filepath)
|
|
|
def boolean_retrieval(queries_dict, inverted_index, wikipedia_dict, top_n=100):
    """
    Retrieve and rank documents for each query.

    Matching is a disjunction: a document is a candidate when it contains at
    least one token of the query. Candidates are scored by the total number
    of occurrences of the query's tokens in the document (a token repeated
    in the query is counted once per repetition) and the ``top_n`` highest
    scoring documents are returned, best first. Documents with equal scores
    are ordered by document ID, larger first, because ``(score, doc_id)``
    tuples are compared.

    Args:
        queries_dict (dict): A dictionary with query IDs as keys and query
            text as values.
        inverted_index (dict): Maps each token to the collection of document
            IDs containing it.
        wikipedia_dict (dict): Maps document IDs to document text; used to
            score the candidates. Must contain every candidate document ID.
        top_n (int): The maximum number of documents to return per query.

    Returns:
        dict: A dictionary with query IDs as keys and a list of at most
            ``top_n`` document IDs as values.
    """
    query_results = {}

    for query_id, query_text in queries_dict.items():
        query_tokens = preprocess_text(query_text)

        # Union of the posting sets of every query token.
        candidate_docs = set()
        for token in query_tokens:
            candidate_docs.update(inverted_index.get(token, ()))

        doc_scores = []
        for doc_id in candidate_docs:
            # Count the document's tokens once instead of rescanning the
            # whole token list for every query token; absent tokens count 0.
            term_counts = Counter(preprocess_text(wikipedia_dict[doc_id]))
            score = sum(term_counts[token] for token in query_tokens)
            doc_scores.append((score, doc_id))

        top_docs = heapq.nlargest(top_n, doc_scores)
        query_results[query_id] = [doc_id for _, doc_id in top_docs]

    return query_results
|
|
|
|
|
def main_boolean_retrieval(wikipedia_dict, queries_dict):
    """
    Index the document collection and run every query against it.

    A fresh inverted index is built from ``wikipedia_dict`` on each call;
    nothing is read from or written to disk here.

    Args:
        wikipedia_dict (dict): Maps document IDs to document text.
        queries_dict (dict): Maps query IDs to query text.

    Returns:
        dict: The result of ``boolean_retrieval`` with its default ``top_n``.
    """
    return boolean_retrieval(
        queries_dict,
        create_inverted_index(wikipedia_dict),
        wikipedia_dict,
    )
|
|
|
def retrieve_single_query(
    query,
    wikipedia_dict,
    top_n=100,
    inverted_index_path="Baseline/inverted_index.pkl",
):
    """
    Retrieve documents for a single query using the inverted index.

    The index is loaded from ``inverted_index_path``. If no index file is
    found there, one is built from ``wikipedia_dict`` and saved to that path
    before the query is answered. A loaded index is used as-is; it is not
    checked against ``wikipedia_dict``.

    Args:
        query (str): The query text.
        wikipedia_dict (dict): The original document dictionary.
        top_n (int): The number of top documents to retrieve.
        inverted_index_path (str): Path to the saved inverted index file.

    Returns:
        list: A list of top document IDs matching the query, best first.
    """
    inverted_index = load_inverted_index(inverted_index_path)
    if inverted_index is None:
        print("Inverted index not found. Creating one...")
        inverted_index = create_inverted_index(wikipedia_dict)
        save_inverted_index(inverted_index, inverted_index_path)

    # Matching and ranking are exactly what boolean_retrieval does per query,
    # so run it on a one-entry batch instead of keeping a second copy of
    # that logic here.
    single_query_key = 0
    results = boolean_retrieval(
        {single_query_key: query},
        inverted_index,
        wikipedia_dict,
        top_n=top_n,
    )
    return results[single_query_key]
|
|
|
|
|
|
|
|
|
|
|
|