seanpedrickcase committed fea085c (parent: d3ff2e2)
Changed all intermediate file outputs to save to output folder
app.py
CHANGED
@@ -12,11 +12,6 @@ from search_funcs.helper_functions import display_info, initial_data_load, put_c
 from search_funcs.spacy_search_funcs import spacy_fuzzy_search
 from search_funcs.aws_functions import load_data_from_aws
 
-#from fastapi import FastAPI
-#app = FastAPI()
-
-
-
 # Attempt to delete temporary files generated by previous use of the app (as the files can be very big!). Only setup to work for local runs in Windows
 temp_folder_path = get_temp_folder_path()
 empty_folder(temp_folder_path)
@@ -104,8 +99,7 @@ depends on factors such as the type of documents or queries. Information taken f
     **Thematic/semantic search**
 
     This search type enables you to search for broader themes (e.g. happiness, nature) and the search will pick out text passages that relate to these themes even if they don't contain the exact words. 1. Load in data file (ideally a file with '_cleaned' at the end of the name, a pkl.gz file), with (optionally) the 'embeddings... .npz' file in the same folder to save loading time. 2. Select the field in your data to search. 3. Wait for the data file to be prepared for search. 4. Enter the search term in the 'Enter semantic search query here' box below and press Enter/click on 'Start semantic search'. 5. Your search results will be saved in a csv file and will be presented in the 'File output' area below.
-    """)
-
+    """)
 
     with gr.Row():
        current_source_semantic = gr.Textbox(label="Current data source(s)", value="None")
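For reference, `get_temp_folder_path()` and `empty_folder()` are imported from `search_funcs.helper_functions`; their implementations are not part of this diff. A minimal sketch of what such helpers could look like — the bodies below are illustrative assumptions, not the repository's actual code:

import os
import tempfile

def get_temp_folder_path() -> str:
    # Assumed behaviour: return the system temp directory (e.g. %TEMP% on Windows).
    return tempfile.gettempdir()

def empty_folder(folder_path: str) -> None:
    # Assumed behaviour: best-effort deletion of files left over from earlier runs.
    for entry in os.listdir(folder_path):
        full_path = os.path.join(folder_path, entry)
        try:
            if os.path.isfile(full_path):
                os.remove(full_path)
        except OSError:
            pass  # Skip files that are locked or already gone.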
search_funcs/bm25_functions.py
CHANGED
@@ -220,12 +220,12 @@ class BM25:
         return list(indices), docs, list(scores)
 
     def save(self, filename):
-        with open(f"{filename}.pkl", "wb") as fsave:
+        with open(f"{output_folder}{filename}.pkl", "wb") as fsave:
             pickle.dump(self, fsave, protocol=pickle.HIGHEST_PROTOCOL)
 
     @staticmethod
     def load(filename):
-        with open(f"{filename}.pkl", "rb") as fsave:
+        with open(f"{output_folder}{filename}.pkl", "rb") as fsave:
             return pickle.load(fsave)
 
 # These following functions are my own work
@@ -432,9 +432,9 @@ def prepare_bm25(corpus, in_file, text_column, search_index, clean, return_inter
     progress(0.8, desc = "Saving search index to file")
 
     if clean == "Yes":
-        bm25_search_file_name = data_file_name_no_ext + '_cleaned_search_index.pkl.gz'
+        bm25_search_file_name = output_folder + data_file_name_no_ext + '_cleaned_search_index.pkl.gz'
     else:
-        bm25_search_file_name = data_file_name_no_ext + '_search_index.pkl.gz'
+        bm25_search_file_name = output_folder + data_file_name_no_ext + '_search_index.pkl.gz'
     #np.savez_compressed(bm25_search_file_name, bm25)
 
     with gzip.open(bm25_search_file_name, 'wb') as file:
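One caveat with the concatenation style used here and in the other files: `f"{output_folder}{filename}.pkl"` and `output_folder + ...` only form a valid path if `output_folder` already ends with a separator (e.g. `'output/'`). A small sketch of the more defensive alternative with `os.path.join`, shown as an option rather than what the commit does:

import os

def in_output_folder(output_folder: str, file_name: str) -> str:
    # os.path.join inserts the separator only when it is missing, so both
    # 'output' and 'output/' resolve to 'output/results.pkl'.
    return os.path.join(output_folder, file_name)

print(in_output_folder("output", "results.pkl"))   # output/results.pkl
print(in_output_folder("output/", "results.pkl"))  # output/results.pkl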
search_funcs/helper_functions.py
CHANGED
@@ -231,11 +231,6 @@ def put_columns_in_join_df(in_file):
     return gr.Dropdown(choices=concat_choices), new_df, out_message
 
 
-    """
-    A dummy function that exists just so that dropdown updates work correctly.
-    """
-    return None
-
 def display_info(info_component):
     gr.Info(info_component)
 
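The other files changed in this commit import `output_folder` from this module (see the spacy_search_funcs.py hunk below), but its definition falls outside the diff. A plausible minimal definition, offered purely as an assumption consistent with how it is concatenated elsewhere:

import os

# Assumed definition: the trailing slash matters, because callers build paths
# with '+' and f-strings rather than os.path.join.
output_folder = "output/"
os.makedirs(output_folder, exist_ok=True)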
search_funcs/semantic_ingest_functions.py
CHANGED
@@ -68,9 +68,7 @@ def parse_file_not_used(file_paths, text_column='text'):
     file_names = []
 
     for file_path in file_paths:
-
-        #file = open(file_path.name, 'r')
-        #print(file)
+
         file_extension = detect_file_type(file_path.name)
         if file_extension in extension_to_parser:
             parsed_contents[file_path.name] = extension_to_parser[file_extension](file_path.name)
@@ -222,19 +220,11 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
     if "prepared_docs" in data_file_name:
         print("Loading in documents from file.")
 
-        #print(df[0:5])
-        #section_series = df.iloc[:,0]
-        #section_series = "{" + section_series + "}"
-
         doc_sections = df
 
-        #print(doc_sections[0])
-
         # Convert each element in the Series to a Document instance
-        #doc_sections = section_series.apply(lambda x: Document(**x))
 
         return doc_sections, "Finished preparing documents", output_list
-        # df = document_to_dataframe(df.iloc[:,0])
 
     ingest_tic = time.perf_counter()
 
@@ -248,16 +238,9 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
         clean_tic = time.perf_counter()
         print("Starting data clean.")
 
-        #df = df.drop_duplicates(text_column)
-
         df_list = list(df[text_column])
         df_list = initial_clean(df_list)
 
-        # Get rid of old data and keep only the new
-        #df = df.drop(text_column, axis = 1)
-
-
-
         # Save to file if you have cleaned the data. Text column has now been renamed with '_cleaned' at the end
         out_file_name, text_column, df = save_prepared_bm25_data(data_file_name, df_list, df, text_column)
 
@@ -272,13 +255,6 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
 
     df["metadata"] = combine_metadata_columns(df, cols)
 
-    #df = df.rename(columns={text_column:"page_content"})
-
-    #print(df[["page_content", "metadata"]].to_dict(orient='records'))
-
-    #doc_sections = df[["page_content", "metadata"]].to_dict(orient='records')
-    #doc_sections = [Document(**row) for row in df[["page_content", "metadata"]].to_dict(orient='records')]
-
     progress(0.3, desc = "Converting data to document format")
 
     # Create a list of Document objects
@@ -295,29 +271,17 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
     progress(0.5, desc = "Saving prepared documents")
     data_file_out_name_no_ext = get_file_path_end(data_file_name)
     file_name = data_file_out_name_no_ext
-    #print(doc_sections)
-    #page_content_series_string = pd.Series(doc_sections).astype(str)
-    #page_content_series_string = page_content_series_string.str.replace(" type='Document'", "").str.replace("' metadata=", "', 'metadata':").str.replace("page_content=", "{'page_content':")
-    #page_content_series_string = page_content_series_string + "}"
-    #print(page_content_series_string[0])
-    #metadata_series_string = pd.Series(doc_sections[1]).astype(str)
-
 
     if clean == "No":
-        #pd.DataFrame(data = {"Documents":page_content_series_string}).to_parquet(file_name + "_prepared_docs.parquet")
         out_doc_file_name = output_folder + file_name + "_prepared_docs.pkl.gz"
         with gzip.open(out_doc_file_name, 'wb') as file:
             pickle.dump(doc_sections, file)
 
-        #pd.Series(doc_sections).to_pickle(file_name + "_prepared_docs.pkl")
     elif clean == "Yes":
-        #pd.DataFrame(data = {"Documents":page_content_series_string}).to_parquet(file_name + "_prepared_docs_clean.parquet")
-
         out_doc_file_name = output_folder + file_name + "_cleaned_prepared_docs.pkl.gz"
         with gzip.open(out_doc_file_name, 'wb') as file:
             pickle.dump(doc_sections, file)
 
-        #pd.Series(doc_sections).to_pickle(file_name + "_prepared_docs_clean.pkl")
     output_list.append(out_doc_file_name)
     print("Documents saved to file.")
 
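Since the prepared documents are now written under the output folder as gzipped pickles, a consumer reads them back with the mirror image of the save code in the hunk above. A short sketch — the example path is hypothetical but follows the `_prepared_docs.pkl.gz` naming used here:

import gzip
import pickle

def load_prepared_docs(path: str):
    # Mirror of the save side: a gzip-compressed pickle of the document sections.
    with gzip.open(path, 'rb') as file:
        return pickle.load(file)

doc_sections = load_prepared_docs("output/my_data_prepared_docs.pkl.gz")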
search_funcs/spacy_search_funcs.py
CHANGED
@@ -7,7 +7,7 @@ import gradio as gr
 import pandas as pd
 from typing import List, Type
 from datetime import datetime
-from search_funcs.helper_functions import create_highlighted_excel_wb
+from search_funcs.helper_functions import create_highlighted_excel_wb, output_folder
 
 PandasDataFrame = Type[pd.DataFrame]
 
@@ -106,7 +106,7 @@ def spacy_fuzzy_search(string_query:str, df_list: List[str], original_data: Pand
 
     # Out file
     query_str_file = ("_").join(tokenised_query)
-    results_df_name = "fuzzy_keyword_search_result_" + today_rev + "_" + query_str_file + ".xlsx"
+    results_df_name = output_folder + "fuzzy_keyword_search_result_" + today_rev + "_" + query_str_file + ".xlsx"
 
     print("Saving search file output")
     progress(0.7, desc = "Saving search output to file")
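With this change the fuzzy-search results land in the output folder rather than the working directory. A worked example of the file-name construction, assuming `output_folder = 'output/'` and a `today_rev` date stamp like `datetime.now().strftime('%Y%m%d')` (the real format is defined upstream and not shown in this diff):

from datetime import datetime

output_folder = "output/"                      # assumed; imported from helper_functions in the real code
today_rev = datetime.now().strftime("%Y%m%d")  # assumed date-stamp format
tokenised_query = ["happiness", "nature"]      # example query tokens

query_str_file = "_".join(tokenised_query)
results_df_name = output_folder + "fuzzy_keyword_search_result_" + today_rev + "_" + query_str_file + ".xlsx"
print(results_df_name)  # e.g. output/fuzzy_keyword_search_result_20240315_happiness_nature.xlsx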