Commit 739b386
Parent(s): c6dc87d

Cut out semantic search temporarily while issues with the Jina gated model are resolved. Improved error/progress tracking and messaging. Added a placeholder for spaCy fuzzy search.

Files changed:
- app.py: +34 -33
- how_to_create_exe_dist.txt: +3 -0
- requirements.txt: +4 -4
- search_funcs/bm25_functions.py: +24 -18
- search_funcs/helper_functions.py: +13 -5
- search_funcs/semantic_functions.py: +2 -2
- search_funcs/semantic_ingest_functions.py: +2 -2
- search_funcs/spacy_search_funcs.py: +137 -0
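A note on the error/progress tracking changes: the reworked functions report progress through Gradio's built-in progress API, taking progress=gr.Progress(track_tqdm=True) as a keyword argument and calling progress(fraction, desc=...) at each stage. A minimal sketch of that pattern, with an illustrative function and UI rather than the app's own components:

import time
import gradio as gr

# Illustrative only: the function name and steps below are not part of this commit.
def load_and_prepare(in_file, progress=gr.Progress(track_tqdm=True)):
    if not in_file:
        return "Please load in at least one csv/Excel/parquet data file."
    progress(0.1, desc="Cleaning data")
    time.sleep(0.5)   # stand-in for the real cleaning step
    progress(0.4, desc="Tokenising text")
    time.sleep(0.5)   # stand-in for tokenisation
    progress(0.8, desc="Saving search index to file")
    time.sleep(0.5)   # stand-in for writing output files
    return "Data loaded"

with gr.Blocks() as block:
    in_file = gr.File(label="Upload data file")
    load_finished_message = gr.Textbox(label="Load progress")
    in_file.upload(load_and_prepare, inputs=[in_file], outputs=[load_finished_message])

block.queue().launch()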
app.py
CHANGED
@@ -1,8 +1,8 @@
 from typing import Type
 from search_funcs.bm25_functions import prepare_bm25_input_data, prepare_bm25, bm25_search
-from search_funcs.semantic_ingest_functions import parse_csv_or_excel, csv_excel_text_to_docs
-from search_funcs.semantic_functions import docs_to_jina_embed_np_array, jina_simple_retrieval
-from search_funcs.helper_functions import dummy_function, display_info,
+#from search_funcs.semantic_ingest_functions import parse_csv_or_excel, csv_excel_text_to_docs
+#from search_funcs.semantic_functions import docs_to_jina_embed_np_array, jina_simple_retrieval
+from search_funcs.helper_functions import dummy_function, display_info, initial_data_load, put_columns_in_join_df, get_temp_folder_path, empty_folder

 import gradio as gr
 import pandas as pd
@@ -25,6 +25,7 @@ with block:
 vectorstore_state = gr.State() # globals()["vectorstore"]
 embeddings_state = gr.State(np.array([])) # globals()["embeddings"]
 search_index_state = gr.State()
+tokenised_state = gr.State()

 k_val = gr.State(9999)
 out_passages = gr.State(9999)
@@ -82,31 +83,31 @@ depends on factors such as the type of documents or queries. Information taken f
 output_single_text = gr.Textbox(label="Top result")
 output_file = gr.File(label="File output")

-with gr.Tab("Semantic search"):
-gr.Markdown(
-"""
-**Thematic/semantic search**
+# with gr.Tab("Semantic search"):
+# gr.Markdown(
+# """
+# **Thematic/semantic search**

-This search type enables you to search for broader themes (e.g. happiness, nature) and the search will pick out text passages that relate to these themes even if they don't contain the exact words. 1. Load in data file (ideally a file with '_cleaned' at the end of the name, a pkl.gz file), with (optionally) the 'embeddings... .npz' file in the same folder to save loading time. 2. Select the field in your data to search. If you loaded in a documents pkl.gz file, this will be 'page_contents'. 3. Wait for the data file to be prepared for search. 4. Enter the search term in the 'Enter semantic search query here' box below and press Enter/click on 'Start semantic search'. 4. Your search results will be saved in a csv file and will be presented in the 'File output' area below.
-""")
-with gr.Row():
-current_source_semantic = gr.Textbox(label="Current data source(s)", value="None")
+# This search type enables you to search for broader themes (e.g. happiness, nature) and the search will pick out text passages that relate to these themes even if they don't contain the exact words. 1. Load in data file (ideally a file with '_cleaned' at the end of the name, a pkl.gz file), with (optionally) the 'embeddings... .npz' file in the same folder to save loading time. 2. Select the field in your data to search. If you loaded in a documents pkl.gz file, this will be 'page_contents'. 3. Wait for the data file to be prepared for search. 4. Enter the search term in the 'Enter semantic search query here' box below and press Enter/click on 'Start semantic search'. 4. Your search results will be saved in a csv file and will be presented in the 'File output' area below.
+# """)
+# with gr.Row():
+# current_source_semantic = gr.Textbox(label="Current data source(s)", value="None")

-with gr.Accordion("Load in data", open = True):
-in_semantic_file = gr.File(label="Upload data file for semantic search", file_count= 'multiple', file_types = ['.parquet', '.csv', '.npy', '.npz', '.pkl', '.pkl.gz'])
+# with gr.Accordion("Load in data", open = True):
+# in_semantic_file = gr.File(label="Upload data file for semantic search", file_count= 'multiple', file_types = ['.parquet', '.csv', '.npy', '.npz', '.pkl', '.pkl.gz'])

-with gr.Row():
-in_semantic_column = gr.Dropdown(label="Enter the name of the text column in the data file to search")
-load_semantic_data_button = gr.Button(value="Load data", variant="secondary")
+# with gr.Row():
+# in_semantic_column = gr.Dropdown(label="Enter the name of the text column in the data file to search")
+# load_semantic_data_button = gr.Button(value="Load data", variant="secondary")

-semantic_load_progress = gr.Textbox(label="Load progress")
+# semantic_load_progress = gr.Textbox(label="Load progress")

-semantic_query = gr.Textbox(label="Enter semantic search query here")
-semantic_submit = gr.Button(value="Start semantic search", variant="secondary", scale = 1)
+# semantic_query = gr.Textbox(label="Enter semantic search query here")
+# semantic_submit = gr.Button(value="Start semantic search", variant="secondary", scale = 1)

-with gr.Row():
-semantic_output_single_text = gr.Textbox(label="Top result")
-semantic_output_file = gr.File(label="File output")
+# with gr.Row():
+# semantic_output_single_text = gr.Textbox(label="Top result")
+# semantic_output_file = gr.File(label="File output")

 with gr.Tab(label="Advanced options"):
 with gr.Accordion(label="Data load / save options", open = True):
@@ -148,12 +149,12 @@ depends on factors such as the type of documents or queries. Information taken f

 ### BM25 SEARCH ###
 # Update dropdowns upon initial file load
-in_bm25_file.upload(
+in_bm25_file.upload(initial_data_load, inputs=[in_bm25_file, in_bm25_column], outputs=[in_bm25_column, search_df_join_column, keyword_data_state, search_index_state, embeddings_state, tokenised_state, load_finished_message, current_source])
 in_join_file.upload(put_columns_in_join_df, inputs=[in_join_file], outputs=[in_join_column, join_data_state, in_join_message])

 # Load in BM25 data
-load_bm25_data_button.click(fn=prepare_bm25_input_data, inputs=[in_bm25_file, in_bm25_column, keyword_data_state, in_clean_data, return_intermediate_files], outputs=[corpus_state, load_finished_message, keyword_data_state, output_file, output_file
-then(fn=prepare_bm25, inputs=[corpus_state, in_bm25_file, in_bm25_column, search_index_state, return_intermediate_files, in_k1, in_b, in_alpha], outputs=[load_finished_message, output_file])#.\
+load_bm25_data_button.click(fn=prepare_bm25_input_data, inputs=[in_bm25_file, in_bm25_column, keyword_data_state, tokenised_state, in_clean_data, return_intermediate_files], outputs=[corpus_state, load_finished_message, keyword_data_state, output_file, output_file]).\
+then(fn=prepare_bm25, inputs=[corpus_state, in_bm25_file, in_bm25_column, search_index_state, in_clean_data, return_intermediate_files, in_k1, in_b, in_alpha], outputs=[load_finished_message, output_file])#.\

 # BM25 search functions on click or enter
 keyword_search_button.click(fn=bm25_search, inputs=[keyword_query, in_no_search_results, keyword_data_state, in_bm25_column, join_data_state, in_clean_data, in_join_column, search_df_join_column], outputs=[output_single_text, output_file], api_name="keyword")
@@ -161,20 +162,20 @@ depends on factors such as the type of documents or queries. Information taken f

 ### SEMANTIC SEARCH ###
 # Load in a csv/excel file for semantic search
-in_semantic_file.upload(
-load_semantic_data_button.click(parse_csv_or_excel, inputs=[in_semantic_file, semantic_data_state, in_semantic_column], outputs=[ingest_text, current_source_semantic, semantic_load_progress]).\
-then(csv_excel_text_to_docs, inputs=[ingest_text, in_semantic_file, in_semantic_column, in_clean_data, return_intermediate_files], outputs=[ingest_docs, semantic_load_progress]).\
-then(docs_to_jina_embed_np_array, inputs=[ingest_docs, in_semantic_file, embeddings_state, return_intermediate_files, embedding_super_compress], outputs=[semantic_load_progress, vectorstore_state, semantic_output_file])
+# in_semantic_file.upload(initial_data_load, inputs=[in_semantic_file, in_semantic_column], outputs=[in_semantic_column, search_df_join_column, semantic_data_state, search_index_state, embeddings_state, semantic_load_progress, current_source])
+# load_semantic_data_button.click(parse_csv_or_excel, inputs=[in_semantic_file, semantic_data_state, in_semantic_column], outputs=[ingest_text, current_source_semantic, semantic_load_progress]).\
+# then(csv_excel_text_to_docs, inputs=[ingest_text, in_semantic_file, in_semantic_column, in_clean_data, return_intermediate_files], outputs=[ingest_docs, semantic_load_progress]).\
+# then(docs_to_jina_embed_np_array, inputs=[ingest_docs, in_semantic_file, embeddings_state, return_intermediate_files, embedding_super_compress], outputs=[semantic_load_progress, vectorstore_state, semantic_output_file])

-# Semantic search query
-semantic_submit.click(jina_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, semantic_min_distance, vec_weight, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic")
-semantic_query.submit(jina_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, semantic_min_distance, vec_weight, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file])
+# # Semantic search query
+# semantic_submit.click(jina_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, semantic_min_distance, vec_weight, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic")
+# semantic_query.submit(jina_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, semantic_min_distance, vec_weight, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file])

 # Dummy functions just to get dropdowns to work correctly with Gradio 3.50
 in_bm25_column.change(dummy_function, in_bm25_column, None)
 search_df_join_column.change(dummy_function, search_df_join_column, None)
 in_join_column.change(dummy_function, in_join_column, None)
-in_semantic_column.change(dummy_function, in_join_column, None)
+# in_semantic_column.change(dummy_function, in_join_column, None)

 block.queue().launch(debug=True)

how_to_create_exe_dist.txt
CHANGED
@@ -19,6 +19,9 @@ NOTE: for ensuring that spaCy models are loaded into the program correctly in re
 For one single file:
 python -m PyInstaller --additional-hooks-dir="build_deps\\" --hidden-import pyarrow.vendored.version --add-data="build_deps\\types.json;gradio_client" --add-data "model;model" --onefile --clean --noconfirm --name DataSearchApp_0.2.2 app.py

+If not using embedding model:
+python -m PyInstaller --additional-hooks-dir="build_deps\\" --hidden-import pyarrow.vendored.version --add-data="build_deps\\types.json;gradio_client" --onefile --clean --noconfirm --name DataSearchApp_0.2.2_keyword app.py
+
 For a small exe with a folder of dependencies:
 python -m PyInstaller --additional-hooks-dir="build_deps\\" --hidden-import pyarrow.vendored.version --add-data="build_deps\\types.json;gradio_client" --add-data "model;model" --clean --noconfirm --name DataSearchApp_0.2.2 app.py

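The commands above pass --additional-hooks-dir="build_deps\\" so PyInstaller picks up any custom hook files in that folder (the note at the top of this file concerns getting the spaCy model bundled correctly). The hook itself is not part of this commit; purely as an illustration under that assumption, a hook for the pinned en_core_web_sm model could be written with PyInstaller's standard helpers:

# Hypothetical build_deps/hook-en_core_web_sm.py (file name and contents assumed, not shown in this commit)
from PyInstaller.utils.hooks import collect_data_files, collect_submodules

# Bundle the model package's data files and submodules so that
# spacy.load("en_core_web_sm") also works inside the frozen executable.
datas = collect_data_files("en_core_web_sm")
hiddenimports = collect_submodules("en_core_web_sm")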
requirements.txt
CHANGED
@@ -2,9 +2,9 @@ pandas==2.1.4
 polars==0.20.3
 pyarrow==14.0.2
 openpyxl==3.1.2
-transformers==4.32.1
-accelerate==0.26.0
-torch==2.1.2
+# transformers==4.32.1
+# accelerate==0.26.0
+# torch==2.1.2
 spacy==3.7.2
-en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.
+en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.2/en_core_web_sm-3.7.2.tar.gz
 gradio==3.50.2
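With transformers, accelerate and torch commented out, any code path that still imports them will fail at import time; this commit handles that by commenting out the semantic-search imports in app.py. As a sketch only (not what app.py does here), the same effect can be achieved with a guarded optional import so a keyword-only install degrades gracefully:

# Sketch of an optional-dependency guard; the commit itself simply comments the imports out.
try:
    from search_funcs.semantic_functions import docs_to_jina_embed_np_array, jina_simple_retrieval
    SEMANTIC_SEARCH_AVAILABLE = True
except ImportError:
    # torch/transformers not installed: BM25 keyword search still works.
    SEMANTIC_SEARCH_AVAILABLE = False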
search_funcs/bm25_functions.py
CHANGED
@@ -231,7 +231,7 @@ class BM25:

 # These following functions are my own work

-def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", return_intermediate_files = "No", progress=gr.Progress(track_tqdm=True)):
+def prepare_bm25_input_data(in_file, text_column, data_state, tokenised_state, clean="No", return_intermediate_files = "No", progress=gr.Progress(track_tqdm=True)):
 print(in_file)

 if not in_file:
@@ -243,7 +243,7 @@ def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", retur

 #print(file_list)

-data_file_names = [string
+data_file_names = [string for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower() and "gz" not in string.lower()]

 if not data_file_names:
 return None, "Please load in at least one csv/Excel/parquet data file.", data_state, None, None, None
@@ -260,8 +260,8 @@ def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", retur
 ## Load in pre-tokenised corpus if exists
 tokenised_df = pd.DataFrame()

-tokenised_file_names = [string
-search_index_file_names = [string
+tokenised_file_names = [string for string in file_list if "tokenised" in string.lower()]
+search_index_file_names = [string for string in file_list if "gz" in string.lower()]

 df[text_column] = df[text_column].astype(str).str.lower()

@@ -271,8 +271,7 @@ def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", retur
 print(message)
 return corpus, message, df, None, None, None

-
-tokenised_df = read_file(tokenised_file_names[0])
+

 if clean == "Yes":
 progress(0.1, desc = "Cleaning data")
@@ -300,12 +299,12 @@ def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", retur

 progress(0.4, desc = "Tokenising text")

-if
+if tokenised_state:
+tokenised_df = tokenised_state
 corpus = tokenised_df.iloc[:,0].tolist()
-print("
+print("Tokenised data loaded from file")
 #print("Corpus is: ", corpus[0:5])

-# If doesn't already exist, tokenize texts in batches
 else:
 tokeniser_tic = time.perf_counter()
 corpus = []
@@ -316,7 +315,6 @@ def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", retur
 tokeniser_toc = time.perf_counter()
 tokenizer_time_out = f"Tokenising the text took {tokeniser_toc - tokeniser_tic:0.1f} seconds."
 print(tokenizer_time_out)
-

 if len(df_list) >= 20:
 message = "Data loaded"
@@ -324,12 +322,16 @@ def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", retur
 message = "Data loaded. Warning: dataset may be too short to get consistent search results."

 if return_intermediate_files == "Yes":
-
+if clean == "Yes":
+tokenised_data_file_name = data_file_out_name_no_ext + "_cleaned_tokenised.parquet"
+else:
+tokenised_data_file_name = data_file_out_name_no_ext + "_tokenised.parquet"
+
 pd.DataFrame(data={"Corpus":corpus}).to_parquet(tokenised_data_file_name)

-return corpus, message, df, out_file_name, tokenised_data_file_name
+return corpus, message, df, out_file_name, tokenised_data_file_name

-return corpus, message, df, out_file_name, None
+return corpus, message, df, out_file_name, None # tokenised_data_file_name

 def save_prepared_bm25_data(in_file_name, prepared_text_list, in_df, in_bm25_column, progress=gr.Progress(track_tqdm=True)):

@@ -357,7 +359,7 @@ def save_prepared_bm25_data(in_file_name, prepared_text_list, in_df, in_bm25_col

 return file_name, new_text_column

-def prepare_bm25(corpus, in_file, text_column, search_index, return_intermediate_files, k1=1.5, b = 0.75, alpha=-5, progress=gr.Progress(track_tqdm=True)):
+def prepare_bm25(corpus, in_file, text_column, search_index, clean, return_intermediate_files, k1=1.5, b = 0.75, alpha=-5, progress=gr.Progress(track_tqdm=True)):
 #bm25.save("saved_df_bm25")
 #bm25 = BM25.load(re.sub(r'\.pkl$', '', file_in.name))

@@ -385,7 +387,7 @@ def prepare_bm25(corpus, in_file, text_column, search_index, return_intermediate
 #print(file_list)

 # Get data file name
-data_file_names = [string
+data_file_names = [string for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower() and "gz" not in string.lower()]

 if not data_file_names:
 return "Please load in at least one csv/Excel/parquet data file.", None
@@ -395,7 +397,7 @@ def prepare_bm25(corpus, in_file, text_column, search_index, return_intermediate
 data_file_name_no_ext = get_file_path_end(data_file_name)

 # Check if there is a search index file already
-#index_file_names = [string
+#index_file_names = [string for string in file_list if "gz" in string.lower()]

 progress(0.6, desc = "Preparing search index")

@@ -422,8 +424,12 @@ def prepare_bm25(corpus, in_file, text_column, search_index, return_intermediate
 if return_intermediate_files == "Yes":
 print("Saving search index file")
 progress(0.8, desc = "Saving search index to file")
-
-
+
+if clean == "Yes":
+bm25_search_file_name = data_file_name_no_ext + '_cleaned_search_index.pkl.gz'
+else:
+bm25_search_file_name = data_file_name_no_ext + '_search_index.pkl.gz'
+#np.savez_compressed(bm25_search_file_name, bm25)

 with gzip.open(bm25_search_file_name, 'wb') as file:
 pickle.dump(bm25, file)
search_funcs/helper_functions.py
CHANGED
@@ -88,7 +88,7 @@ def read_file(filename):

 return file

-def put_columns_in_df(in_file, in_bm25_column):
+def initial_data_load(in_file, in_bm25_column):
 '''
 When file is loaded, update the column dropdown choices
 '''
@@ -96,13 +96,15 @@ def put_columns_in_df(in_file, in_bm25_column):
 concat_choices = []
 index_load = None
 embed_load = np.array([])
+tokenised_load =[]
 out_message = ""
+current_source = ""

 file_list = [string.name for string in in_file]

 #print(file_list)

-data_file_names = [string
+data_file_names = [string for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower() and "search_index" not in string.lower()]

 if not data_file_names:
 out_message = "Please load in at least one csv/Excel/parquet data file."
@@ -110,6 +112,8 @@ def put_columns_in_df(in_file, in_bm25_column):
 return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), pd.DataFrame(), bm25_load, out_message

 data_file_name = data_file_names[0]
+
+current_source = get_file_path_end_with_ext(data_file_name)


 df = read_file(data_file_name)
@@ -128,13 +132,13 @@ def put_columns_in_df(in_file, in_bm25_column):
 concat_choices.extend(new_choices)

 # Check if there is a search index file already
-index_file_names = [string
+index_file_names = [string for string in file_list if "gz" in string.lower()]

 if index_file_names:
 index_file_name = index_file_names[0]
 index_load = read_file(index_file_name)

-embeddings_file_names = [string
+embeddings_file_names = [string for string in file_list if "embedding" in string.lower()]

 if embeddings_file_names:
 print("Loading embeddings from file.")
@@ -146,10 +150,14 @@ def put_columns_in_df(in_file, in_bm25_column):
 else:
 embed_load = np.array([])

+tokenised_file_names = [string for string in file_list if "tokenised" in string.lower()]
+if tokenised_file_names:
+tokenised_load = read_file(tokenised_file_names[0])
+
 out_message = "Initial data check successful. Next, choose a data column to search in the drop down above, then click 'Load data'"
 print(out_message)

-return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df, index_load, embed_load, out_message
+return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df, index_load, embed_load, tokenised_load, out_message, current_source

 def put_columns_in_join_df(in_file):
 '''
search_funcs/semantic_functions.py
CHANGED
@@ -92,8 +92,8 @@ def docs_to_jina_embed_np_array(docs_out, in_file, embeddings_state, return_inte

 #print(file_list)

-embeddings_file_names = [string
-data_file_names = [string
+embeddings_file_names = [string for string in file_list if "embedding" in string.lower()]
+data_file_names = [string for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower()]# and "gz" not in string.lower()]
 data_file_name = data_file_names[0]
 data_file_name_no_ext = get_file_path_end(data_file_name)

search_funcs/semantic_ingest_functions.py
CHANGED
@@ -130,7 +130,7 @@ def parse_csv_or_excel(file_path, data_state, text_column = "text"):

 #print(file_list)

-data_file_names = [string
+data_file_names = [string for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower()]# and "gz" not in string.lower()]

 data_file_name = data_file_names[0]

@@ -303,7 +303,7 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm

 file_list = [string.name for string in in_file]

-data_file_names = [string
+data_file_names = [string for string in file_list if "tokenised" not in string and "npz" not in string.lower()]

 if not data_file_names:
 return doc_sections, "Please load in at least one csv/Excel/parquet data file."
search_funcs/spacy_search_funcs.py
ADDED
@@ -0,0 +1,137 @@
+import spacy
+from spacy.matcher import Matcher
+import numpy as np
+import gradio as gr
+import pandas as pd
+from typing import List, Type
+
+PandasDataFrame = Type[pd.DataFrame]
+
+nlp = spacy.load("en_core_web_sm")
+
+string_query = "knife attack run fast"
+df_list = ["Last week someone was grievously injured in a knife attack on Exmoor road. Running away. They ran as fast as possible. I run.","This is the 3rd knifing in the area in as many weeks; knives everywhere.", "attacks of this kind have been increasing for years. Knife attack or knife attack.", "Nothing happened here"]
+
+
+def spacy_fuzzy_search(string_query:str, df_list: List[str], original_data: PandasDataFrame, search_df_join_column:str, in_join_column:str, no_spelling_mistakes:int = 1, progress=gr.Progress(track_tqdm=True)):
+    ''' Conduct fuzzy match on a list of data.'''
+
+    query = nlp(string_query)
+    tokenised_query = [token.text for token in query]
+    print(tokenised_query)
+
+    spelling_mistakes_fuzzy_pattern = "FUZZY" + str(no_spelling_mistakes)
+
+    # %%
+    if len(tokenised_query) > 1:
+        pattern_lemma = [{"LEMMA": {"IN": tokenised_query}}]
+        pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: {"IN": tokenised_query}}}]
+    elif len(tokenised_query) == 1:
+        pattern_lemma = [{"LEMMA": tokenised_query[0]}]
+        pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: tokenised_query[0]}}]
+    else:
+        tokenised_query = [""]
+
+    # %%
+    search_pattern = pattern_fuzz.copy()
+    search_pattern.extend(pattern_lemma)
+
+
+    # %%
+    matcher = Matcher(nlp.vocab)
+
+    # %% [markdown]
+    # from spacy.tokens import Span
+    # from spacy import displacy
+    #
+    # def add_event_ent(matcher, doc, i, matches):
+    # # Get the current match and create tuple of entity label, start and end.
+    # # Append entity to the doc's entity. (Don't overwrite doc.ents!)
+    # match_id, start, end = matches[i]
+    # entity = Span(doc, start, end, label="EVENT")
+    # doc.ents += (entity,)
+    # print(entity.text)
+
+    # %% [markdown]
+    # matched_sents = [] # Collect data of matched sentences to be visualized
+    #
+    # def collect_sents(matcher, doc, i, matches):
+    # match_id, start, end = matches[i]
+    # span = doc[start:end] # Matched span
+    # sent = span.sent # Sentence containing matched span
+    # # Append mock entity for match in displaCy style to matched_sents
+    # # get the match span by ofsetting the start and end of the span with the
+    # # start and end of the sentence in the doc
+    # match_ents = [{
+    # "start": span.start_char - sent.start_char,
+    # "end": span.end_char - sent.start_char,
+    # "label": "MATCH",
+    # }]
+    # matched_sents.append({"text": sent.text, "ents": match_ents})
+
+    # %%
+    matcher.add(string_query, [pattern_fuzz])#, on_match=add_event_ent)
+    matcher.add(string_query, [pattern_lemma])#, on_match=add_event_ent)
+
+    # %%
+    batch_size = 256
+    docs = nlp.pipe(df_list, batch_size=batch_size)
+
+    # %%
+    all_matches = []
+
+    # Get number of matches per doc
+    for doc in progress.tqdm(docs, desc = "Searching text", unit = "rows"):
+        matches = matcher(doc)
+        match_count = len(matches)
+        all_matches.append(match_count)
+
+    print("Search complete")
+
+    ## Get document lengths
+    lengths = []
+    for element in df_list:
+        lengths.append(len(element))
+
+    # Score is number of matches divided by length of document
+    match_scores = (np.array(all_matches)/np.array(lengths)).tolist()
+
+    # Prepare results and export
+    results_df = pd.DataFrame(data={"index": list(range(len(df_list))),
+                                    "search_text": df_list,
+                                    "search_score_abs": match_scores})
+    results_df['search_score_abs'] = abs(round(results_df['search_score_abs'], 2))
+    results_df_out = results_df[['index', 'search_text', 'search_score_abs']].merge(original_data,left_on="index", right_index=True, how="left")#.drop("index", axis=1)
+
+    # Join on additional files
+    if not in_join_file.empty:
+        progress(0.5, desc = "Joining on additional data file")
+        join_df = in_join_file
+        join_df[in_join_column] = join_df[in_join_column].astype(str).str.replace("\.0$","", regex=True)
+        results_df_out[search_df_join_column] = results_df_out[search_df_join_column].astype(str).str.replace("\.0$","", regex=True)
+
+        # Duplicates dropped so as not to expand out dataframe
+        join_df = join_df.drop_duplicates(in_join_column)
+
+        results_df_out = results_df_out.merge(join_df,left_on=search_df_join_column, right_on=in_join_column, how="left")#.drop(in_join_column, axis=1)
+
+    # Reorder results by score
+    results_df_out = results_df_out.sort_values('search_score_abs', ascending=False)
+
+    # Out file
+    query_str_file = ("_").join(token_query)
+    results_df_name = "keyword_search_result_" + today_rev + "_" + query_str_file + ".xlsx"
+
+    print("Saving search file output")
+    progress(0.7, desc = "Saving search output to file")
+
+    results_df_out.to_excel(results_df_name, index= None)
+    results_first_text = results_df_out[text_column].iloc[0]
+
+    print("Returning results")
+
+    return results_first_text, results_df_name
+
+
+match_list = spacy_fuzzy_search(string_query, df_list)
+print(match_list)
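As the commit message says, spacy_search_funcs.py is only a placeholder: spacy_fuzzy_search still refers to names that are not defined in the module (in_join_file, token_query, today_rev, text_column), and the module-level call at the bottom passes just two of its required arguments. The core idea, however, is spaCy's fuzzy Matcher patterns, which can be exercised on their own. A self-contained sketch, assuming en_core_web_sm is installed as pinned in requirements.txt:

import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# "FUZZY1" allows one edit per token; combined with "IN" it matches any of the
# query tokens approximately, mirroring the pattern built in spacy_fuzzy_search.
pattern = [{"TEXT": {"FUZZY1": {"IN": ["knife", "attack"]}}}]
matcher.add("fuzzy_keywords", [pattern])

doc = nlp("This is the 3rd knive attack in the area in as many weeks.")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)   # e.g. "knive", "attack"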