yash9439 committed on
Commit f7f9f67 · verified · 1 Parent(s): d72cf5d

Adding Baseline and DocumentCollection

Baseline/boolean.py ADDED
@@ -0,0 +1,54 @@
+ from Baseline.data_processor import process_json_data, process_queries, merge_documents
+ from Baseline.boolean_retrieval import main_boolean_retrieval, retrieve_single_query
+ import json
+
+ def boolean_pipeline(query, wikipedia_data_path="Datasets/mini_wiki_collection.json", top_n=100):
+     # Load the JSON files
+     with open(wikipedia_data_path, "r") as file1:
+         wikipedia_data = json.load(file1)
+
+     # Process the JSON files
+     wikipedia_dict = process_json_data(wikipedia_data)
+     # Print the processed data
+
+     top_results = retrieve_single_query(query, wikipedia_dict, top_n)
+
+     return top_results
+
+ # def main():
+ #     # Load the JSON files
+ #     # boolean_retrieval("In the United States, why are positions like Attorney General, Secretary of State, etc. appointed by the president at the federal level but elected by the people at the state level? Had it ever been proposed to do this differently?")
+ #     # return
+ #     with open("../Datasets/mini_wiki_collection.json", "r") as file1:  # Replace with the actual path to your file
+ #         wikipedia_data = json.load(file1)
+
+ #     with open("../Datasets/mini_wiki_collection_10000_documents.json", "r") as file1:  # Replace with the actual path to your file
+ #         additional_json_file = json.load(file1)
+
+ #     with open("../Datasets/FinalDataset_WithModifiedQuery.json", "r") as file2:  # Replace with the actual path to your file
+ #         queries_data = json.load(file2)
+
+ #     # Process the JSON files
+ #     wikipedia_dict = process_json_data(wikipedia_data)
+ #     updated_main_dict = merge_documents(wikipedia_dict, additional_json_file, limit=2000)
+ #     queries_dict = process_queries(queries_data)
+
+ #     # Print the processed data
+ #     print("Processed Wikipedia Data:")
+ #     print(wikipedia_dict["420538"])
+ #     print("\nProcessed Queries Data:")
+ #     print(queries_dict["5xvggq"])
+
+ #     top_results = main_boolean_retrieval(updated_main_dict, queries_dict)
+
+ #     # Print the results for a specific query
+ #     print("\nTop results for query '5xvggq':")
+ #     print(top_results.get("5xvggq", []))
+
+ #     # Optionally, save the top results to a JSON file
+ #     with open("boolean_retrieval_1_2_query.json", "w") as output_file:
+ #         json.dump(top_results, output_file, indent=4)
+
+
+ # # if __name__ == "__main__":
+ # #     main()
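A minimal usage sketch of the boolean_pipeline entry point added above. It assumes the repository root is the working directory (so the default Datasets/mini_wiki_collection.json path resolves) and that the Baseline package is importable; the query string is illustrative only.

from Baseline.boolean import boolean_pipeline

# Illustrative query; boolean_pipeline returns a list of matching document IDs.
query = "Why are state attorneys general elected rather than appointed?"
top_docs = boolean_pipeline(query, top_n=10)
print(top_docs)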
Baseline/boolean_retrieval.py ADDED
@@ -0,0 +1,134 @@
+ from collections import defaultdict
+ import re
+ import heapq
+ import joblib
+ import os
+
+ def preprocess_text(text):
+     """
+     Preprocess the text for tokenization.
+     Removes special characters, lowercases, and splits into words.
+     """
+     return re.findall(r'\w+', text.lower())
+
+ def create_inverted_index(wikipedia_dict):
+     """
+     Create an inverted index from the document dictionary.
+     Args:
+         wikipedia_dict (dict): A dictionary with document IDs as keys and text as values.
+
+     Returns:
+         dict: An inverted index where each term maps to the set of document IDs containing it.
+     """
+     inverted_index = defaultdict(set)
+     for doc_id, text in wikipedia_dict.items():
+         tokens = set(preprocess_text(text))  # Unique tokens for each document
+         for token in tokens:
+             inverted_index[token].add(doc_id)
+     return inverted_index
+
+ def save_inverted_index(inverted_index, filepath="Baseline/inverted_index.pkl"):
+     """
+     Save the inverted index to a file using joblib.
+     """
+     joblib.dump(inverted_index, filepath)
+
+ def load_inverted_index(filepath="Baseline/inverted_index.pkl"):
+     """
+     Load the inverted index from a file using joblib.
+     """
+     if os.path.exists(filepath):
+         return joblib.load(filepath)
+     return None
+
+ def boolean_retrieval(queries_dict, inverted_index, wikipedia_dict, top_n=100):
+     """
+     Perform boolean retrieval for each query.
+     Args:
+         queries_dict (dict): A dictionary with query IDs as keys and query text as values.
+         inverted_index (dict): The inverted index created from the document collection.
+         wikipedia_dict (dict): The original document dictionary (for scoring if needed).
+         top_n (int): The number of top documents to retrieve for each query.
+
+     Returns:
+         dict: A dictionary with query IDs as keys and a list of top document IDs as values.
+     """
+     query_results = {}
+
+     for query_id, query_text in queries_dict.items():
+         query_tokens = preprocess_text(query_text)
+
+         # Collect all document IDs that contain any of the query terms
+         relevant_docs = set()
+         for token in query_tokens:
+             if token in inverted_index:
+                 relevant_docs.update(inverted_index[token])
+
+         # If more than `top_n` documents, sort by some criteria (e.g., frequency of terms in the doc)
+         doc_scores = []
+         for doc_id in relevant_docs:
+             doc_text = preprocess_text(wikipedia_dict[doc_id])
+             score = sum(doc_text.count(token) for token in query_tokens)  # Term frequency score
+             doc_scores.append((score, doc_id))
+
+         # Get the top `top_n` documents based on the score
+         top_docs = heapq.nlargest(top_n, doc_scores)
+         query_results[query_id] = [doc_id for _, doc_id in top_docs]
+
+     return query_results
+
+ # Main flow
+ def main_boolean_retrieval(wikipedia_dict, queries_dict):
+     # Step 1: Create inverted index
+     inverted_index = create_inverted_index(wikipedia_dict)
+
+     # Step 2: Perform boolean retrieval
+     top_docs = boolean_retrieval(queries_dict, inverted_index, wikipedia_dict)
+
+     return top_docs
+
+ def retrieve_single_query(query, wikipedia_dict, top_n=100, inverted_index_path="Baseline/inverted_index.pkl"):
+     """
+     Retrieve documents for a single query using the inverted index.
+     If the inverted index is not found, it will be created and saved.
+
+     Args:
+         query (str): The query text.
+         wikipedia_dict (dict): The original document dictionary.
+         top_n (int): The number of top documents to retrieve.
+         inverted_index_path (str): Path to the saved inverted index file.
+
+     Returns:
+         list: A list of top document IDs matching the query.
+     """
+     # Load or create the inverted index
+     inverted_index = load_inverted_index(inverted_index_path)
+     if inverted_index is None:
+         print("Inverted index not found. Creating one...")
+         inverted_index = create_inverted_index(wikipedia_dict)
+         save_inverted_index(inverted_index, inverted_index_path)
+
+     # Preprocess the query
+     query_tokens = preprocess_text(query)
+
+     # Collect relevant documents
+     relevant_docs = set()
+     for token in query_tokens:
+         if token in inverted_index:
+             relevant_docs.update(inverted_index[token])
+
+     # Rank documents by frequency of terms
+     doc_scores = []
+     for doc_id in relevant_docs:
+         doc_text = preprocess_text(wikipedia_dict[doc_id])
+         score = sum(doc_text.count(token) for token in query_tokens)
+         doc_scores.append((score, doc_id))
+
+     # Get the top `top_n` documents based on the score
+     top_docs = heapq.nlargest(top_n, doc_scores)
+     return [doc_id for _, doc_id in top_docs]
+
+ # Example usage:
+ # Assuming `wikipedia_dict` and `queries_dict` are already prepared
+ # top_results = main_boolean_retrieval(wikipedia_dict, queries_dict)
+ # print(top_results)
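As a rough illustration of the flow in this file, the sketch below builds an inverted index over a toy two-document collection and scores one query with boolean_retrieval. The documents, IDs, and query text are made up for the example; only the functions defined above are used, and the import assumes the Baseline package is on the Python path.

from Baseline.boolean_retrieval import create_inverted_index, boolean_retrieval

# Made-up toy collection and query, for illustration only.
docs = {
    "d1": "the president appoints the attorney general at the federal level",
    "d2": "state officials such as the secretary of state are elected by voters",
}
queries = {"q1": "who appoints the attorney general"}

index = create_inverted_index(docs)                        # term -> set of document IDs
results = boolean_retrieval(queries, index, docs, top_n=2)
print(results)                                             # {'q1': ['d1', 'd2']} (d1 has the higher term-frequency score)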
Baseline/data_processor.py ADDED
@@ -0,0 +1,98 @@
+ # sanitize_text (defined below) is used by the document-processing helpers
+
+ import re
+
+ def merge_documents(main_dict, additional_json, limit=1000):
+     """
+     Adds a subset of documents from an additional JSON file to the main dictionary.
+
+     Args:
+         main_dict (dict): The main dictionary where processed documents are stored.
+         additional_json (list): The additional JSON data containing documents.
+         limit (int): The maximum number of documents to add to the main dictionary.
+
+     Returns:
+         dict: The updated main dictionary with additional documents added.
+     """
+     # Counter to track how many documents have been added
+     count = 0
+
+     for doc in additional_json:
+         if count >= limit:
+             break
+
+         # Extract wikipedia_id and text from the document
+         wikipedia_id = doc.get("wikipedia_id")
+         text = doc.get("text", [])
+
+         # Check if the document ID is unique to avoid overwriting
+         if wikipedia_id not in main_dict:
+             # Process and sanitize the document
+             joined_text = " ".join(text)
+             sanitized_text = sanitize_text(joined_text)
+
+             # Add to the main dictionary
+             main_dict[wikipedia_id] = sanitized_text
+             count += 1
+
+     print(f"{count} documents added to the main dictionary.")
+     return main_dict
+
+ def sanitize_text(text):
+     """
+     Cleans and standardizes text by keeping only alphanumeric characters and spaces.
+     Args:
+         text (str): Text to sanitize.
+     Returns:
+         str: Sanitized text.
+     """
+     if isinstance(text, str):
+         # Use regex to keep only alphanumeric characters and spaces
+         text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
+         # Optionally, collapse multiple spaces into a single space
+         text = re.sub(r'\s+', ' ', text).strip()
+     return text
+
+
+ def process_json_data(json_data):
+     result_dict = {}
+
+     for doc in json_data:
+         # Extract wikipedia_id and text
+         wikipedia_id = doc.get("wikipedia_id")
+         text = doc.get("text", [])
+
+         # Join the text content and sanitize
+         joined_text = " ".join(text)
+         sanitized_text = sanitize_text(joined_text)
+
+         # Store in the dictionary
+         result_dict[wikipedia_id] = sanitized_text
+
+     return result_dict
+
+ def process_queries(json_data):
+     """
+     Processes a JSON object containing queries and query IDs.
+
+     Args:
+         json_data (dict): The input JSON data.
+
+     Returns:
+         dict: A dictionary with query_id as the key and query text as the value.
+     """
+     result_dict = {}
+
+     for query_id, query_info in json_data.items():
+         # Extract the query input
+         query_text = query_info.get("input", "")
+
+         # Store query_id and text in the result dictionary
+         result_dict[query_id] = query_text
+
+     return result_dict
+
+ # Example usage
+ # Assuming `query_json_file` contains your JSON data
+ # processed_queries = process_queries(query_json_file)
+
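A small, self-contained sketch of how these helpers fit together, using made-up records in the shapes the functions above expect (a wikipedia_id/text pair per document, an "input" field per query). The records and query text are hypothetical, and the import assumes the Baseline package is importable from the repository root.

from Baseline.data_processor import process_json_data, process_queries, sanitize_text

# Made-up records mirroring the expected input shapes; for illustration only.
wiki_json = [{"wikipedia_id": "42", "text": ["Hello, world!", "Second sentence."]}]
query_json = {"q1": {"input": "hello world?"}}

docs = process_json_data(wiki_json)         # {'42': 'Hello world Second sentence'}
queries = process_queries(query_json)       # {'q1': 'hello world?'}
print(sanitize_text("A/B testing & more"))  # -> AB testing more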
Baseline/inverted_index.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4c47f19521041e7b2a5681da4128cfae538eba1bc653528f04c7dc9df300fbc5
+ size 4671080
Datasets/mini_wiki_collection.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:986eedb174550564ce95cf9b08de1207cfb1e2290646b4aeb60257c9edceb27a
+ size 41656963