Adding Baseline and DocumentCollection
- Baseline/boolean.py +54 -0
- Baseline/boolean_retrieval.py +134 -0
- Baseline/data_processor.py +98 -0
- Baseline/inverted_index.pkl +3 -0
- Datasets/mini_wiki_collection.json +3 -0
Baseline/boolean.py
ADDED
@@ -0,0 +1,54 @@
from Baseline.data_processor import process_json_data, process_queries, merge_documents
from Baseline.boolean_retrieval import main_boolean_retrieval, retrieve_single_query
import json

def boolean_pipeline(query, wikipedia_data_path="Datasets/mini_wiki_collection.json", top_n=100):
    # Load the JSON file
    with open(wikipedia_data_path, "r") as file1:
        wikipedia_data = json.load(file1)

    # Process the JSON file into a {wikipedia_id: text} dictionary
    wikipedia_dict = process_json_data(wikipedia_data)

    # Retrieve the top documents for the query
    top_results = retrieve_single_query(query, wikipedia_dict, top_n)

    return top_results

# def main():
#     # Load the JSON files
#     # boolean_retrieval("In the United States, why are positions like Attorney General, Secretary of State, etc. appointed by the president at the federal level but elected by the people at the state level? Had it ever been proposed to do this differently?")
#     # return
#     with open("../Datasets/mini_wiki_collection.json", "r") as file1:  # Replace with the actual path to your file
#         wikipedia_data = json.load(file1)

#     with open("../Datasets/mini_wiki_collection_10000_documents.json", "r") as file1:  # Replace with the actual path to your file
#         additional_json_file = json.load(file1)

#     with open("../Datasets/FinalDataset_WithModifiedQuery.json", "r") as file2:  # Replace with the actual path to your file
#         queries_data = json.load(file2)

#     # Process the JSON files
#     wikipedia_dict = process_json_data(wikipedia_data)
#     updated_main_dict = merge_documents(wikipedia_dict, additional_json_file, limit=2000)
#     queries_dict = process_queries(queries_data)

#     # Print the processed data
#     print("Processed Wikipedia Data:")
#     print(wikipedia_dict["420538"])
#     print("\nProcessed Queries Data:")
#     print(queries_dict["5xvggq"])

#     top_results = main_boolean_retrieval(updated_main_dict, queries_dict)

#     # Print the results for a specific query
#     print("\nTop results for query '5xvggq':")
#     print(top_results.get("5xvggq", []))

#     # Optionally, save the top results to a JSON file
#     with open("boolean_retrieval_1_2_query.json", "w") as output_file:
#         json.dump(top_results, output_file, indent=4)


# # if __name__ == "__main__":
# #     main()
Baseline/boolean_retrieval.py
ADDED
@@ -0,0 +1,134 @@
from collections import defaultdict
import re
import heapq
import joblib
import os

def preprocess_text(text):
    """
    Preprocess the text for tokenization.
    Removes special characters, lowercases, and splits into words.
    """
    return re.findall(r'\w+', text.lower())

def create_inverted_index(wikipedia_dict):
    """
    Create an inverted index from the document dictionary.
    Args:
        wikipedia_dict (dict): A dictionary with document IDs as keys and text as values.

    Returns:
        dict: An inverted index where each term maps to the set of document IDs containing it.
    """
    inverted_index = defaultdict(set)
    for doc_id, text in wikipedia_dict.items():
        tokens = set(preprocess_text(text))  # Unique tokens for each document
        for token in tokens:
            inverted_index[token].add(doc_id)
    return inverted_index

def save_inverted_index(inverted_index, filepath="Baseline/inverted_index.pkl"):
    """
    Save the inverted index to a file using joblib.
    """
    joblib.dump(inverted_index, filepath)

def load_inverted_index(filepath="Baseline/inverted_index.pkl"):
    """
    Load the inverted index from a file using joblib.
    """
    if os.path.exists(filepath):
        return joblib.load(filepath)
    return None

def boolean_retrieval(queries_dict, inverted_index, wikipedia_dict, top_n=100):
    """
    Perform boolean retrieval for each query.
    Args:
        queries_dict (dict): A dictionary with query IDs as keys and query text as values.
        inverted_index (dict): The inverted index created from the document collection.
        wikipedia_dict (dict): The original document dictionary (used for scoring).
        top_n (int): The number of top documents to retrieve for each query.

    Returns:
        dict: A dictionary with query IDs as keys and a list of top document IDs as values.
    """
    query_results = {}

    for query_id, query_text in queries_dict.items():
        query_tokens = preprocess_text(query_text)

        # Collect all document IDs that contain any of the query terms
        relevant_docs = set()
        for token in query_tokens:
            if token in inverted_index:
                relevant_docs.update(inverted_index[token])

        # If more than `top_n` documents match, rank them by frequency of query terms in the doc
        doc_scores = []
        for doc_id in relevant_docs:
            doc_text = preprocess_text(wikipedia_dict[doc_id])
            score = sum(doc_text.count(token) for token in query_tokens)  # Term frequency score
            doc_scores.append((score, doc_id))

        # Get the top `top_n` documents based on the score
        top_docs = heapq.nlargest(top_n, doc_scores)
        query_results[query_id] = [doc_id for _, doc_id in top_docs]

    return query_results

# Main flow
def main_boolean_retrieval(wikipedia_dict, queries_dict):
    # Step 1: Create the inverted index
    inverted_index = create_inverted_index(wikipedia_dict)

    # Step 2: Perform boolean retrieval
    top_docs = boolean_retrieval(queries_dict, inverted_index, wikipedia_dict)

    return top_docs

def retrieve_single_query(query, wikipedia_dict, top_n=100, inverted_index_path="Baseline/inverted_index.pkl"):
    """
    Retrieve documents for a single query using the inverted index.
    If the inverted index is not found, it will be created and saved.

    Args:
        query (str): The query text.
        wikipedia_dict (dict): The original document dictionary.
        top_n (int): The number of top documents to retrieve.
        inverted_index_path (str): Path to the saved inverted index file.

    Returns:
        list: A list of top document IDs matching the query.
    """
    # Load or create the inverted index
    inverted_index = load_inverted_index(inverted_index_path)
    if inverted_index is None:
        print("Inverted index not found. Creating one...")
        inverted_index = create_inverted_index(wikipedia_dict)
        save_inverted_index(inverted_index, inverted_index_path)

    # Preprocess the query
    query_tokens = preprocess_text(query)

    # Collect relevant documents
    relevant_docs = set()
    for token in query_tokens:
        if token in inverted_index:
            relevant_docs.update(inverted_index[token])

    # Rank documents by frequency of query terms
    doc_scores = []
    for doc_id in relevant_docs:
        doc_text = preprocess_text(wikipedia_dict[doc_id])
        score = sum(doc_text.count(token) for token in query_tokens)
        doc_scores.append((score, doc_id))

    # Get the top `top_n` documents based on the score
    top_docs = heapq.nlargest(top_n, doc_scores)
    return [doc_id for _, doc_id in top_docs]

# Example usage:
# Assuming `wikipedia_dict` and `queries_dict` are already prepared
# top_results = main_boolean_retrieval(wikipedia_dict, queries_dict)
# print(top_results)
Baseline/data_processor.py
ADDED
@@ -0,0 +1,98 @@
import re

def merge_documents(main_dict, additional_json, limit=1000):
    """
    Adds a subset of documents from an additional JSON file to the main dictionary.

    Args:
        main_dict (dict): The main dictionary where processed documents are stored.
        additional_json (list): The additional JSON data containing documents.
        limit (int): The maximum number of documents to add to the main dictionary.

    Returns:
        dict: The updated main dictionary with additional documents added.
    """
    # Counter to track how many documents have been added
    count = 0

    for doc in additional_json:
        if count >= limit:
            break

        # Extract wikipedia_id and text from the document
        wikipedia_id = doc.get("wikipedia_id")
        text = doc.get("text", [])

        # Check if the document ID is unique to avoid overwriting
        if wikipedia_id not in main_dict:
            # Process and sanitize the document
            joined_text = " ".join(text)
            sanitized_text = sanitize_text(joined_text)

            # Add to the main dictionary
            main_dict[wikipedia_id] = sanitized_text
            count += 1

    print(f"{count} documents added to the main dictionary.")
    return main_dict

def sanitize_text(text):
    """
    Cleans and standardizes text by keeping only alphanumeric characters and spaces.
    Args:
        text (str): Text to sanitize.
    Returns:
        str: Sanitized text.
    """
    if isinstance(text, str):
        # Use regex to keep only alphanumeric characters and spaces
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
        # Collapse multiple spaces into a single space
        text = re.sub(r'\s+', ' ', text).strip()
    return text


def process_json_data(json_data):
    """
    Converts the Wikipedia JSON collection into a dictionary mapping
    wikipedia_id to sanitized document text.
    """
    result_dict = {}

    for doc in json_data:
        # Extract wikipedia_id and text
        wikipedia_id = doc.get("wikipedia_id")
        text = doc.get("text", [])

        # Join the text content and sanitize
        joined_text = " ".join(text)
        sanitized_text = sanitize_text(joined_text)

        # Store in the dictionary
        result_dict[wikipedia_id] = sanitized_text

    return result_dict

def process_queries(json_data):
    """
    Processes a JSON object containing queries and query IDs.

    Args:
        json_data (dict): The input JSON data.

    Returns:
        dict: A dictionary with query_id as the key and query text as the value.
    """
    result_dict = {}

    for query_id, query_info in json_data.items():
        # Extract the query input
        query_text = query_info.get("input", "")

        # Store query_id and text in the result dictionary
        result_dict[query_id] = query_text

    return result_dict

# Example usage
# Assuming `query_json_file` contains your JSON data
# processed_queries = process_queries(query_json_file)
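
These helpers expect records shaped like the fields they read: each document carries a "wikipedia_id" and a list of text passages, and each query is keyed by its ID with an "input" field. A quick check on made-up records (not taken from the actual datasets):

from Baseline.data_processor import process_json_data, process_queries

fake_collection = [
    {"wikipedia_id": "123", "text": ["Hello, world!", "Second passage; with punctuation."]},
]
fake_queries = {"q1": {"input": "What is a hello world program?"}}

docs = process_json_data(fake_collection)
queries = process_queries(fake_queries)
print(docs["123"])    # "Hello world Second passage with punctuation"
print(queries["q1"])  # "What is a hello world program?"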
Baseline/inverted_index.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4c47f19521041e7b2a5681da4128cfae538eba1bc653528f04c7dc9df300fbc5
size 4671080
Datasets/mini_wiki_collection.json
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:986eedb174550564ce95cf9b08de1207cfb1e2290646b4aeb60257c9edceb27a
size 41656963