from collections import defaultdict
import re
import heapq
import joblib
import os


def preprocess_text(text):
    """
    Preprocess the text for tokenization.
    Removes special characters, lowercases, and splits into words.
    """
    return re.findall(r'\w+', text.lower())
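# Illustrative behaviour on a made-up string (punctuation is dropped by \w+):
# preprocess_text("Boolean Retrieval, 101!") -> ['boolean', 'retrieval', '101']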


def create_inverted_index(wikipedia_dict):
    """
    Create an inverted index from the document dictionary.

    Args:
        wikipedia_dict (dict): A dictionary with document IDs as keys and text as values.

    Returns:
        dict: An inverted index mapping each term to the set of document IDs containing it.
    """
    inverted_index = defaultdict(set)
    for doc_id, text in wikipedia_dict.items():
        tokens = set(preprocess_text(text))  # Unique tokens per document
        for token in tokens:
            inverted_index[token].add(doc_id)
    return inverted_index
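# Illustrative example on a hypothetical two-document corpus (made up for
# demonstration; the return value is a defaultdict of sets):
# create_inverted_index({1: "the cat sat", 2: "the dog ran"})
# -> {'the': {1, 2}, 'cat': {1}, 'sat': {1}, 'dog': {2}, 'ran': {2}}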


def save_inverted_index(inverted_index, filepath="Baseline/inverted_index.pkl"):
    """
    Save the inverted index to a file using joblib.
    """
    # Create the target directory if it does not exist yet
    os.makedirs(os.path.dirname(filepath) or ".", exist_ok=True)
    joblib.dump(inverted_index, filepath)


def load_inverted_index(filepath="Baseline/inverted_index.pkl"):
    """
    Load the inverted index from a file using joblib.
    Returns None if the file does not exist.
    """
    if os.path.exists(filepath):
        return joblib.load(filepath)
    return None


def boolean_retrieval(queries_dict, inverted_index, wikipedia_dict, top_n=100):
    """
    Perform boolean (OR) retrieval for each query: a document matches if it
    contains any query term, and matches are ranked by term frequency.

    Args:
        queries_dict (dict): A dictionary with query IDs as keys and query text as values.
        inverted_index (dict): The inverted index created from the document collection.
        wikipedia_dict (dict): The original document dictionary (used for term-frequency scoring).
        top_n (int): The number of top documents to retrieve for each query.

    Returns:
        dict: A dictionary with query IDs as keys and a list of top document IDs as values.
    """
    query_results = {}
    for query_id, query_text in queries_dict.items():
        query_tokens = preprocess_text(query_text)
        # Collect all document IDs that contain any of the query terms
        relevant_docs = set()
        for token in query_tokens:
            if token in inverted_index:
                relevant_docs.update(inverted_index[token])
        # Rank candidate documents by total frequency of the query terms
        doc_scores = []
        for doc_id in relevant_docs:
            doc_text = preprocess_text(wikipedia_dict[doc_id])
            score = sum(doc_text.count(token) for token in query_tokens)
            doc_scores.append((score, doc_id))
        # Keep the `top_n` highest-scoring documents
        top_docs = heapq.nlargest(top_n, doc_scores)
        query_results[query_id] = [doc_id for _, doc_id in top_docs]
    return query_results
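# Illustrative output shape, assuming hypothetical inputs (not project data):
# boolean_retrieval({"q1": "cat"}, index, docs, top_n=2) -> {"q1": [doc_a, doc_b]}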


# Main flow
def main_boolean_retrieval(wikipedia_dict, queries_dict):
    """
    Build the inverted index and run boolean retrieval for all queries.
    """
    # Step 1: Create the inverted index
    inverted_index = create_inverted_index(wikipedia_dict)
    # Step 2: Perform boolean retrieval
    top_docs = boolean_retrieval(queries_dict, inverted_index, wikipedia_dict)
    return top_docs


def retrieve_single_query(query, wikipedia_dict, top_n=100, inverted_index_path="Baseline/inverted_index.pkl"):
    """
    Retrieve documents for a single query using the inverted index.
    If the inverted index is not found, it will be created and saved.

    Args:
        query (str): The query text.
        wikipedia_dict (dict): The original document dictionary.
        top_n (int): The number of top documents to retrieve.
        inverted_index_path (str): Path to the saved inverted index file.

    Returns:
        list: A list of top document IDs matching the query.
    """
    # Load the inverted index, or create and cache it on first use
    inverted_index = load_inverted_index(inverted_index_path)
    if inverted_index is None:
        print("Inverted index not found. Creating one...")
        inverted_index = create_inverted_index(wikipedia_dict)
        save_inverted_index(inverted_index, inverted_index_path)
    # Preprocess the query
    query_tokens = preprocess_text(query)
    # Collect documents containing any query term
    relevant_docs = set()
    for token in query_tokens:
        if token in inverted_index:
            relevant_docs.update(inverted_index[token])
    # Rank documents by total frequency of the query terms
    doc_scores = []
    for doc_id in relevant_docs:
        doc_text = preprocess_text(wikipedia_dict[doc_id])
        score = sum(doc_text.count(token) for token in query_tokens)
        doc_scores.append((score, doc_id))
    # Keep the `top_n` highest-scoring documents
    top_docs = heapq.nlargest(top_n, doc_scores)
    return [doc_id for _, doc_id in top_docs]


# Example usage:
# Assuming `wikipedia_dict` and `queries_dict` are already prepared
# top_results = main_boolean_retrieval(wikipedia_dict, queries_dict)
# print(top_results)
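

# A minimal smoke test on a tiny made-up corpus. Everything below is
# illustrative only: the documents and queries are hypothetical, not
# project data.
if __name__ == "__main__":
    toy_wikipedia_dict = {
        1: "The cat sat on the mat.",
        2: "Dogs and cats are common pets.",
        3: "Information retrieval ranks documents for a query.",
    }
    toy_queries_dict = {"q1": "cat", "q2": "document retrieval"}

    # Batch retrieval over all queries
    print(main_boolean_retrieval(toy_wikipedia_dict, toy_queries_dict))

    # Single-query retrieval; builds and caches the index on first use
    print(retrieve_single_query("cat pets", toy_wikipedia_dict, top_n=2))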