File size: 4,950 Bytes
9f21f05
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
from collections import Counter, defaultdict
import heapq
import os
import re

import joblib

def preprocess_text(text):
    """Tokenize *text* for indexing and querying.

    Lowercases the input and extracts maximal runs of word characters
    (letters, digits, underscore); punctuation and whitespace are dropped.

    Args:
        text (str): Raw document or query text.

    Returns:
        list[str]: Lowercased tokens, in order of appearance.
    """
    lowered = text.lower()
    return re.findall(r'\w+', lowered)

def create_inverted_index(wikipedia_dict):
    """Build an inverted index over the document collection.

    Args:
        wikipedia_dict (dict): Maps document IDs to their raw text.

    Returns:
        defaultdict[str, set]: Maps each term to the set of document IDs
        whose text contains that term at least once.
    """
    index = defaultdict(set)
    for doc_id, text in wikipedia_dict.items():
        # Deduplicate tokens so each document is recorded once per term.
        for term in set(preprocess_text(text)):
            index[term].add(doc_id)
    return index

def save_inverted_index(inverted_index, filepath="Baseline/inverted_index.pkl"):
    """
    Save the inverted index to a file using joblib.

    Creates the parent directory of *filepath* if it does not already
    exist; previously, saving to the default "Baseline/" path raised
    FileNotFoundError on a fresh checkout without that directory.

    Args:
        inverted_index (dict): The index to persist.
        filepath (str): Destination path for the pickled index.
    """
    parent = os.path.dirname(filepath)
    if parent:
        os.makedirs(parent, exist_ok=True)
    joblib.dump(inverted_index, filepath)

def load_inverted_index(filepath="Baseline/inverted_index.pkl"):
    """
    Load a previously saved inverted index with joblib.

    Args:
        filepath (str): Path to the pickled index file.

    Returns:
        dict | None: The loaded index, or None when no file exists at
        *filepath*.
    """
    if not os.path.exists(filepath):
        return None
    return joblib.load(filepath)

def boolean_retrieval(queries_dict, inverted_index, wikipedia_dict, top_n=100):
    """
    Perform boolean (OR) retrieval with term-frequency ranking for each query.

    Args:
        queries_dict (dict): A dictionary with query IDs as keys and query text as values.
        inverted_index (dict): The inverted index created from the document collection.
        wikipedia_dict (dict): The original document dictionary (used for scoring).
        top_n (int): The number of top documents to retrieve for each query.

    Returns:
        dict: A dictionary with query IDs as keys and a list of top document IDs as values.
    """
    query_results = {}
    # Cache per-document term counts so each document is tokenized at most
    # once across ALL queries. The original code re-tokenized every candidate
    # document for every query and then called list.count() once per query
    # token — O(|doc| * |query|) per document instead of O(|doc| + |query|).
    doc_term_counts = {}

    for query_id, query_text in queries_dict.items():
        query_tokens = preprocess_text(query_text)

        # Union of postings: any document containing at least one query term.
        relevant_docs = set()
        for token in query_tokens:
            if token in inverted_index:
                relevant_docs.update(inverted_index[token])

        # Score = total occurrences of query tokens in the document.
        # Duplicated query tokens are deliberately counted once each, matching
        # the original sum-over-tokens behavior.
        doc_scores = []
        for doc_id in relevant_docs:
            if doc_id not in doc_term_counts:
                doc_term_counts[doc_id] = Counter(preprocess_text(wikipedia_dict[doc_id]))
            counts = doc_term_counts[doc_id]
            score = sum(counts[token] for token in query_tokens)
            doc_scores.append((score, doc_id))

        # Keep the `top_n` highest-scoring documents (ties broken by doc ID).
        top_docs = heapq.nlargest(top_n, doc_scores)
        query_results[query_id] = [doc_id for _, doc_id in top_docs]

    return query_results

# Main flow
def main_boolean_retrieval(wikipedia_dict, queries_dict):
    """Build the inverted index, then run boolean retrieval for every query.

    Args:
        wikipedia_dict (dict): Maps document IDs to document text.
        queries_dict (dict): Maps query IDs to query text.

    Returns:
        dict: Maps each query ID to its ranked list of top document IDs.
    """
    index = create_inverted_index(wikipedia_dict)
    return boolean_retrieval(queries_dict, index, wikipedia_dict)

def retrieve_single_query(query, wikipedia_dict, top_n=100, inverted_index_path="Baseline/inverted_index.pkl"):
    """
    Retrieve documents for a single query using the inverted index.
    If the inverted index is not found, it will be created and saved.

    Args:
        query (str): The query text.
        wikipedia_dict (dict): The original document dictionary.
        top_n (int): The number of top documents to retrieve.
        inverted_index_path (str): Path to the saved inverted index file.

    Returns:
        list: A list of top document IDs matching the query, ranked by the
        total number of query-term occurrences in each document.
    """
    # Load the cached index, building and persisting it on first use.
    inverted_index = load_inverted_index(inverted_index_path)
    if inverted_index is None:
        print("Inverted index not found. Creating one...")
        inverted_index = create_inverted_index(wikipedia_dict)
        save_inverted_index(inverted_index, inverted_index_path)

    query_tokens = preprocess_text(query)

    # Union of postings: any document containing at least one query term.
    relevant_docs = set()
    for token in query_tokens:
        if token in inverted_index:
            relevant_docs.update(inverted_index[token])

    # Rank by term frequency. Counting all terms once with Counter replaces
    # the original repeated list.count() calls, which were
    # O(|doc| * |query|) per document; the scores are identical.
    doc_scores = []
    for doc_id in relevant_docs:
        counts = Counter(preprocess_text(wikipedia_dict[doc_id]))
        score = sum(counts[token] for token in query_tokens)
        doc_scores.append((score, doc_id))

    # Keep the `top_n` highest-scoring documents (ties broken by doc ID).
    top_docs = heapq.nlargest(top_n, doc_scores)
    return [doc_id for _, doc_id in top_docs]

# Example usage:
# Assuming `wikipedia_dict` and `queries_dict` are already prepared
# top_results = main_boolean_retrieval(wikipedia_dict, queries_dict)
# print(top_results)