seanpedrickcase committed on
Commit 739b386 · 1 Parent(s): c6dc87d

Cut out semantic search temporarily while issues with the Jina gated model are resolved. Improved error/progress tracking and messaging. Added a placeholder for spaCy fuzzy search.

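The "improved error/progress tracking" mentioned above uses Gradio's progress-callback pattern that appears throughout the changed functions below (e.g. prepare_bm25_input_data, prepare_bm25): the function accepts progress=gr.Progress(track_tqdm=True) and reports named stages as it runs. A minimal sketch of that pattern, assuming the Gradio 3.50.2 pinned in requirements.txt; the function and component names here are illustrative only, not part of the commit:

import gradio as gr

def prepare_data(text, progress=gr.Progress(track_tqdm=True)):
    # Each progress() call updates the bar shown on the output component while this step runs
    progress(0.1, desc="Cleaning data")
    cleaned = text.strip().lower()
    progress(0.4, desc="Tokenising text")
    tokens = cleaned.split()
    return f"Data loaded: {len(tokens)} tokens"

with gr.Blocks() as demo:
    in_text = gr.Textbox(label="Input text")
    load_message = gr.Textbox(label="Load progress")
    load_button = gr.Button("Load data")
    load_button.click(prepare_data, inputs=[in_text], outputs=[load_message])

# demo.queue().launch()  # queue() is required for the progress bar to display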
app.py CHANGED
@@ -1,8 +1,8 @@
 from typing import Type
 from search_funcs.bm25_functions import prepare_bm25_input_data, prepare_bm25, bm25_search
- from search_funcs.semantic_ingest_functions import parse_csv_or_excel, csv_excel_text_to_docs
- from search_funcs.semantic_functions import docs_to_jina_embed_np_array, jina_simple_retrieval
- from search_funcs.helper_functions import dummy_function, display_info, put_columns_in_df, put_columns_in_join_df, get_temp_folder_path, empty_folder
+ #from search_funcs.semantic_ingest_functions import parse_csv_or_excel, csv_excel_text_to_docs
+ #from search_funcs.semantic_functions import docs_to_jina_embed_np_array, jina_simple_retrieval
+ from search_funcs.helper_functions import dummy_function, display_info, initial_data_load, put_columns_in_join_df, get_temp_folder_path, empty_folder
 
 import gradio as gr
 import pandas as pd
@@ -25,6 +25,7 @@ with block:
 vectorstore_state = gr.State() # globals()["vectorstore"]
 embeddings_state = gr.State(np.array([])) # globals()["embeddings"]
 search_index_state = gr.State()
+ tokenised_state = gr.State()
 
 k_val = gr.State(9999)
 out_passages = gr.State(9999)
@@ -82,31 +83,31 @@ depends on factors such as the type of documents or queries. Information taken f
 output_single_text = gr.Textbox(label="Top result")
 output_file = gr.File(label="File output")
 
- with gr.Tab("Semantic search"):
- gr.Markdown(
- """
- **Thematic/semantic search**
- 
- This search type enables you to search for broader themes (e.g. happiness, nature) and the search will pick out text passages that relate to these themes even if they don't contain the exact words. 1. Load in data file (ideally a file with '_cleaned' at the end of the name, a pkl.gz file), with (optionally) the 'embeddings... .npz' file in the same folder to save loading time. 2. Select the field in your data to search. If you loaded in a documents pkl.gz file, this will be 'page_contents'. 3. Wait for the data file to be prepared for search. 4. Enter the search term in the 'Enter semantic search query here' box below and press Enter/click on 'Start semantic search'. 4. Your search results will be saved in a csv file and will be presented in the 'File output' area below.
- """)
- with gr.Row():
- current_source_semantic = gr.Textbox(label="Current data source(s)", value="None")
- 
- with gr.Accordion("Load in data", open = True):
- in_semantic_file = gr.File(label="Upload data file for semantic search", file_count= 'multiple', file_types = ['.parquet', '.csv', '.npy', '.npz', '.pkl', '.pkl.gz'])
- 
- with gr.Row():
- in_semantic_column = gr.Dropdown(label="Enter the name of the text column in the data file to search")
- load_semantic_data_button = gr.Button(value="Load data", variant="secondary")
- 
- semantic_load_progress = gr.Textbox(label="Load progress")
- 
- semantic_query = gr.Textbox(label="Enter semantic search query here")
- semantic_submit = gr.Button(value="Start semantic search", variant="secondary", scale = 1)
- 
- with gr.Row():
- semantic_output_single_text = gr.Textbox(label="Top result")
- semantic_output_file = gr.File(label="File output")
+ # with gr.Tab("Semantic search"):
+ # gr.Markdown(
+ # """
+ # **Thematic/semantic search**
+ 
+ # This search type enables you to search for broader themes (e.g. happiness, nature) and the search will pick out text passages that relate to these themes even if they don't contain the exact words. 1. Load in data file (ideally a file with '_cleaned' at the end of the name, a pkl.gz file), with (optionally) the 'embeddings... .npz' file in the same folder to save loading time. 2. Select the field in your data to search. If you loaded in a documents pkl.gz file, this will be 'page_contents'. 3. Wait for the data file to be prepared for search. 4. Enter the search term in the 'Enter semantic search query here' box below and press Enter/click on 'Start semantic search'. 4. Your search results will be saved in a csv file and will be presented in the 'File output' area below.
+ # """)
+ # with gr.Row():
+ # current_source_semantic = gr.Textbox(label="Current data source(s)", value="None")
+ 
+ # with gr.Accordion("Load in data", open = True):
+ # in_semantic_file = gr.File(label="Upload data file for semantic search", file_count= 'multiple', file_types = ['.parquet', '.csv', '.npy', '.npz', '.pkl', '.pkl.gz'])
+ 
+ # with gr.Row():
+ # in_semantic_column = gr.Dropdown(label="Enter the name of the text column in the data file to search")
+ # load_semantic_data_button = gr.Button(value="Load data", variant="secondary")
+ 
+ # semantic_load_progress = gr.Textbox(label="Load progress")
+ 
+ # semantic_query = gr.Textbox(label="Enter semantic search query here")
+ # semantic_submit = gr.Button(value="Start semantic search", variant="secondary", scale = 1)
+ 
+ # with gr.Row():
+ # semantic_output_single_text = gr.Textbox(label="Top result")
+ # semantic_output_file = gr.File(label="File output")
 
 with gr.Tab(label="Advanced options"):
 with gr.Accordion(label="Data load / save options", open = True):
@@ -148,12 +149,12 @@ depends on factors such as the type of documents or queries. Information taken f
 
 ### BM25 SEARCH ###
 # Update dropdowns upon initial file load
- in_bm25_file.upload(put_columns_in_df, inputs=[in_bm25_file, in_bm25_column], outputs=[in_bm25_column, search_df_join_column, keyword_data_state, search_index_state, embeddings_state, load_finished_message])
+ in_bm25_file.upload(initial_data_load, inputs=[in_bm25_file, in_bm25_column], outputs=[in_bm25_column, search_df_join_column, keyword_data_state, search_index_state, embeddings_state, tokenised_state, load_finished_message, current_source])
 in_join_file.upload(put_columns_in_join_df, inputs=[in_join_file], outputs=[in_join_column, join_data_state, in_join_message])
 
 # Load in BM25 data
- load_bm25_data_button.click(fn=prepare_bm25_input_data, inputs=[in_bm25_file, in_bm25_column, keyword_data_state, in_clean_data, return_intermediate_files], outputs=[corpus_state, load_finished_message, keyword_data_state, output_file, output_file, current_source]).\
- then(fn=prepare_bm25, inputs=[corpus_state, in_bm25_file, in_bm25_column, search_index_state, return_intermediate_files, in_k1, in_b, in_alpha], outputs=[load_finished_message, output_file])#.\
+ load_bm25_data_button.click(fn=prepare_bm25_input_data, inputs=[in_bm25_file, in_bm25_column, keyword_data_state, tokenised_state, in_clean_data, return_intermediate_files], outputs=[corpus_state, load_finished_message, keyword_data_state, output_file, output_file]).\
+ then(fn=prepare_bm25, inputs=[corpus_state, in_bm25_file, in_bm25_column, search_index_state, in_clean_data, return_intermediate_files, in_k1, in_b, in_alpha], outputs=[load_finished_message, output_file])#.\
 
 # BM25 search functions on click or enter
 keyword_search_button.click(fn=bm25_search, inputs=[keyword_query, in_no_search_results, keyword_data_state, in_bm25_column, join_data_state, in_clean_data, in_join_column, search_df_join_column], outputs=[output_single_text, output_file], api_name="keyword")
@@ -161,20 +162,20 @@ depends on factors such as the type of documents or queries. Information taken f
 
 ### SEMANTIC SEARCH ###
 # Load in a csv/excel file for semantic search
- in_semantic_file.upload(put_columns_in_df, inputs=[in_semantic_file, in_semantic_column], outputs=[in_semantic_column, search_df_join_column, semantic_data_state, search_index_state, embeddings_state, semantic_load_progress])
- load_semantic_data_button.click(parse_csv_or_excel, inputs=[in_semantic_file, semantic_data_state, in_semantic_column], outputs=[ingest_text, current_source_semantic, semantic_load_progress]).\
- then(csv_excel_text_to_docs, inputs=[ingest_text, in_semantic_file, in_semantic_column, in_clean_data, return_intermediate_files], outputs=[ingest_docs, semantic_load_progress]).\
- then(docs_to_jina_embed_np_array, inputs=[ingest_docs, in_semantic_file, embeddings_state, return_intermediate_files, embedding_super_compress], outputs=[semantic_load_progress, vectorstore_state, semantic_output_file])
+ # in_semantic_file.upload(initial_data_load, inputs=[in_semantic_file, in_semantic_column], outputs=[in_semantic_column, search_df_join_column, semantic_data_state, search_index_state, embeddings_state, semantic_load_progress, current_source])
+ # load_semantic_data_button.click(parse_csv_or_excel, inputs=[in_semantic_file, semantic_data_state, in_semantic_column], outputs=[ingest_text, current_source_semantic, semantic_load_progress]).\
+ # then(csv_excel_text_to_docs, inputs=[ingest_text, in_semantic_file, in_semantic_column, in_clean_data, return_intermediate_files], outputs=[ingest_docs, semantic_load_progress]).\
+ # then(docs_to_jina_embed_np_array, inputs=[ingest_docs, in_semantic_file, embeddings_state, return_intermediate_files, embedding_super_compress], outputs=[semantic_load_progress, vectorstore_state, semantic_output_file])
 
- # Semantic search query
- semantic_submit.click(jina_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, semantic_min_distance, vec_weight, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic")
- semantic_query.submit(jina_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, semantic_min_distance, vec_weight, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file])
+ # # Semantic search query
+ # semantic_submit.click(jina_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, semantic_min_distance, vec_weight, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic")
+ # semantic_query.submit(jina_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, semantic_min_distance, vec_weight, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file])
 
 # Dummy functions just to get dropdowns to work correctly with Gradio 3.50
 in_bm25_column.change(dummy_function, in_bm25_column, None)
 search_df_join_column.change(dummy_function, search_df_join_column, None)
 in_join_column.change(dummy_function, in_join_column, None)
- in_semantic_column.change(dummy_function, in_join_column, None)
+ # in_semantic_column.change(dummy_function, in_join_column, None)
 
 block.queue().launch(debug=True)
how_to_create_exe_dist.txt CHANGED
@@ -19,6 +19,9 @@ NOTE: for ensuring that spaCy models are loaded into the program correctly in re
 For one single file:
 python -m PyInstaller --additional-hooks-dir="build_deps\\" --hidden-import pyarrow.vendored.version --add-data="build_deps\\types.json;gradio_client" --add-data "model;model" --onefile --clean --noconfirm --name DataSearchApp_0.2.2 app.py
 
+ If not using embedding model:
+ python -m PyInstaller --additional-hooks-dir="build_deps\\" --hidden-import pyarrow.vendored.version --add-data="build_deps\\types.json;gradio_client" --onefile --clean --noconfirm --name DataSearchApp_0.2.2_keyword app.py
+
 For a small exe with a folder of dependencies:
 python -m PyInstaller --additional-hooks-dir="build_deps\\" --hidden-import pyarrow.vendored.version --add-data="build_deps\\types.json;gradio_client" --add-data "model;model" --clean --noconfirm --name DataSearchApp_0.2.2 app.py
requirements.txt CHANGED
@@ -2,9 +2,9 @@ pandas==2.1.4
 polars==0.20.3
 pyarrow==14.0.2
 openpyxl==3.1.2
- transformers==4.32.1
- accelerate==0.26.0
- torch==2.1.2
+ # transformers==4.32.1
+ # accelerate==0.26.0
+ # torch==2.1.2
 spacy==3.7.2
- en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
+ en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.2/en_core_web_sm-3.7.2.tar.gz
 gradio==3.50.2
search_funcs/bm25_functions.py CHANGED
@@ -231,7 +231,7 @@ class BM25:
 
 # These following functions are my own work
 
- def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", return_intermediate_files = "No", progress=gr.Progress(track_tqdm=True)):
+ def prepare_bm25_input_data(in_file, text_column, data_state, tokenised_state, clean="No", return_intermediate_files = "No", progress=gr.Progress(track_tqdm=True)):
 print(in_file)
 
 if not in_file:
@@ -243,7 +243,7 @@ def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", retur
 
 #print(file_list)
 
- data_file_names = [string.lower() for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower() and "gz" not in string.lower()]
+ data_file_names = [string for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower() and "gz" not in string.lower()]
 
 if not data_file_names:
 return None, "Please load in at least one csv/Excel/parquet data file.", data_state, None, None, None
@@ -260,8 +260,8 @@ def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", retur
 ## Load in pre-tokenised corpus if exists
 tokenised_df = pd.DataFrame()
 
- tokenised_file_names = [string.lower() for string in file_list if "tokenised" in string.lower()]
- search_index_file_names = [string.lower() for string in file_list if "gz" in string.lower()]
+ tokenised_file_names = [string for string in file_list if "tokenised" in string.lower()]
+ search_index_file_names = [string for string in file_list if "gz" in string.lower()]
 
 df[text_column] = df[text_column].astype(str).str.lower()
 
@@ -271,8 +271,7 @@ def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", retur
 print(message)
 return corpus, message, df, None, None, None
 
- if tokenised_file_names:
- tokenised_df = read_file(tokenised_file_names[0])
+
 
 if clean == "Yes":
 progress(0.1, desc = "Cleaning data")
@@ -300,12 +299,12 @@ def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", retur
 
 progress(0.4, desc = "Tokenising text")
 
- if not tokenised_df.empty:
+ if tokenised_state:
+ tokenised_df = tokenised_state
 corpus = tokenised_df.iloc[:,0].tolist()
- print("Tokeniser loaded from file")
+ print("Tokenised data loaded from file")
 #print("Corpus is: ", corpus[0:5])
 
- # If doesn't already exist, tokenize texts in batches
 else:
 tokeniser_tic = time.perf_counter()
 corpus = []
@@ -316,7 +315,6 @@ def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", retur
 tokeniser_toc = time.perf_counter()
 tokenizer_time_out = f"Tokenising the text took {tokeniser_toc - tokeniser_tic:0.1f} seconds."
 print(tokenizer_time_out)
-
 
 if len(df_list) >= 20:
 message = "Data loaded"
@@ -324,12 +322,16 @@ def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", retur
 message = "Data loaded. Warning: dataset may be too short to get consistent search results."
 
 if return_intermediate_files == "Yes":
- tokenised_data_file_name = data_file_out_name_no_ext + "_" + "tokenised.parquet"
+ if clean == "Yes":
+ tokenised_data_file_name = data_file_out_name_no_ext + "_cleaned_tokenised.parquet"
+ else:
+ tokenised_data_file_name = data_file_out_name_no_ext + "_tokenised.parquet"
+
 pd.DataFrame(data={"Corpus":corpus}).to_parquet(tokenised_data_file_name)
 
- return corpus, message, df, out_file_name, tokenised_data_file_name, data_file_out_name
+ return corpus, message, df, out_file_name, tokenised_data_file_name
 
- return corpus, message, df, out_file_name, None, data_file_out_name # tokenised_data_file_name
+ return corpus, message, df, out_file_name, None # tokenised_data_file_name
 
 def save_prepared_bm25_data(in_file_name, prepared_text_list, in_df, in_bm25_column, progress=gr.Progress(track_tqdm=True)):
 
@@ -357,7 +359,7 @@ def save_prepared_bm25_data(in_file_name, prepared_text_list, in_df, in_bm25_col
 
 return file_name, new_text_column
 
- def prepare_bm25(corpus, in_file, text_column, search_index, return_intermediate_files, k1=1.5, b = 0.75, alpha=-5, progress=gr.Progress(track_tqdm=True)):
+ def prepare_bm25(corpus, in_file, text_column, search_index, clean, return_intermediate_files, k1=1.5, b = 0.75, alpha=-5, progress=gr.Progress(track_tqdm=True)):
 #bm25.save("saved_df_bm25")
 #bm25 = BM25.load(re.sub(r'\.pkl$', '', file_in.name))
 
@@ -385,7 +387,7 @@ def prepare_bm25(corpus, in_file, text_column, search_index, return_intermediate
 #print(file_list)
 
 # Get data file name
- data_file_names = [string.lower() for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower() and "gz" not in string.lower()]
+ data_file_names = [string for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower() and "gz" not in string.lower()]
 
 if not data_file_names:
 return "Please load in at least one csv/Excel/parquet data file.", None
@@ -395,7 +397,7 @@ def prepare_bm25(corpus, in_file, text_column, search_index, return_intermediate
 data_file_name_no_ext = get_file_path_end(data_file_name)
 
 # Check if there is a search index file already
- #index_file_names = [string.lower() for string in file_list if "gz" in string.lower()]
+ #index_file_names = [string for string in file_list if "gz" in string.lower()]
 
 progress(0.6, desc = "Preparing search index")
 
@@ -422,8 +424,12 @@ def prepare_bm25(corpus, in_file, text_column, search_index, return_intermediate
 if return_intermediate_files == "Yes":
 print("Saving search index file")
 progress(0.8, desc = "Saving search index to file")
- bm25_search_file_name = data_file_name_no_ext + '_' + 'search_index.pkl.gz'
- #np.savez_compressed(bm25_search_file_name, bm25)
+
+ if clean == "Yes":
+ bm25_search_file_name = data_file_name_no_ext + '_cleaned_search_index.pkl.gz'
+ else:
+ bm25_search_file_name = data_file_name_no_ext + '_search_index.pkl.gz'
+ #np.savez_compressed(bm25_search_file_name, bm25)
 
 with gzip.open(bm25_search_file_name, 'wb') as file:
 pickle.dump(bm25, file)
search_funcs/helper_functions.py CHANGED
@@ -88,7 +88,7 @@ def read_file(filename):
 
 return file
 
- def put_columns_in_df(in_file, in_bm25_column):
+ def initial_data_load(in_file, in_bm25_column):
 '''
 When file is loaded, update the column dropdown choices
 '''
@@ -96,13 +96,15 @@ def put_columns_in_df(in_file, in_bm25_column):
 concat_choices = []
 index_load = None
 embed_load = np.array([])
+ tokenised_load =[]
 out_message = ""
+ current_source = ""
 
 file_list = [string.name for string in in_file]
 
 #print(file_list)
 
- data_file_names = [string.lower() for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower() and "search_index" not in string.lower()]
+ data_file_names = [string for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower() and "search_index" not in string.lower()]
 
 if not data_file_names:
 out_message = "Please load in at least one csv/Excel/parquet data file."
@@ -110,6 +112,8 @@ def put_columns_in_df(in_file, in_bm25_column):
 
 return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), pd.DataFrame(), bm25_load, out_message
 
 data_file_name = data_file_names[0]
+
+ current_source = get_file_path_end_with_ext(data_file_name)
 
 
 df = read_file(data_file_name)
@@ -128,13 +132,13 @@ def put_columns_in_df(in_file, in_bm25_column):
 concat_choices.extend(new_choices)
 
 # Check if there is a search index file already
- index_file_names = [string.lower() for string in file_list if "gz" in string.lower()]
+ index_file_names = [string for string in file_list if "gz" in string.lower()]
 
 if index_file_names:
 index_file_name = index_file_names[0]
 index_load = read_file(index_file_name)
 
- embeddings_file_names = [string.lower() for string in file_list if "embedding" in string.lower()]
+ embeddings_file_names = [string for string in file_list if "embedding" in string.lower()]
 
 if embeddings_file_names:
 print("Loading embeddings from file.")
@@ -146,10 +150,14 @@ def put_columns_in_df(in_file, in_bm25_column):
 
 else:
 embed_load = np.array([])
 
+ tokenised_file_names = [string for string in file_list if "tokenised" in string.lower()]
+ if tokenised_file_names:
+ tokenised_load = read_file(tokenised_file_names[0])
+
 out_message = "Initial data check successful. Next, choose a data column to search in the drop down above, then click 'Load data'"
 print(out_message)
 
- return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df, index_load, embed_load, out_message
+ return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df, index_load, embed_load, tokenised_load, out_message, current_source
 
 def put_columns_in_join_df(in_file):
 '''
search_funcs/semantic_functions.py CHANGED
@@ -92,8 +92,8 @@ def docs_to_jina_embed_np_array(docs_out, in_file, embeddings_state, return_inte
 
 #print(file_list)
 
- embeddings_file_names = [string.lower() for string in file_list if "embedding" in string.lower()]
- data_file_names = [string.lower() for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower()]# and "gz" not in string.lower()]
+ embeddings_file_names = [string for string in file_list if "embedding" in string.lower()]
+ data_file_names = [string for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower()]# and "gz" not in string.lower()]
 data_file_name = data_file_names[0]
 data_file_name_no_ext = get_file_path_end(data_file_name)
 
search_funcs/semantic_ingest_functions.py CHANGED
@@ -130,7 +130,7 @@ def parse_csv_or_excel(file_path, data_state, text_column = "text"):
 
 #print(file_list)
 
- data_file_names = [string.lower() for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower()]# and "gz" not in string.lower()]
+ data_file_names = [string for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower()]# and "gz" not in string.lower()]
 
 data_file_name = data_file_names[0]
 
@@ -303,7 +303,7 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
 
 file_list = [string.name for string in in_file]
 
- data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower()]
+ data_file_names = [string for string in file_list if "tokenised" not in string and "npz" not in string.lower()]
 
 if not data_file_names:
 return doc_sections, "Please load in at least one csv/Excel/parquet data file."
search_funcs/spacy_search_funcs.py ADDED
@@ -0,0 +1,137 @@
+ import spacy
+ from spacy.matcher import Matcher
+ import numpy as np
+ import gradio as gr
+ import pandas as pd
+ from typing import List, Type
+
+ PandasDataFrame = Type[pd.DataFrame]
+
+ nlp = spacy.load("en_core_web_sm")
+
+ string_query = "knife attack run fast"
+ df_list = ["Last week someone was grievously injured in a knife attack on Exmoor road. Running away. They ran as fast as possible. I run.","This is the 3rd knifing in the area in as many weeks; knives everywhere.", "attacks of this kind have been increasing for years. Knife attack or knife attack.", "Nothing happened here"]
+
+
+ def spacy_fuzzy_search(string_query:str, df_list: List[str], original_data: PandasDataFrame, search_df_join_column:str, in_join_column:str, no_spelling_mistakes:int = 1, progress=gr.Progress(track_tqdm=True)):
+ ''' Conduct fuzzy match on a list of data.'''
+
+ query = nlp(string_query)
+ tokenised_query = [token.text for token in query]
+ print(tokenised_query)
+
+ spelling_mistakes_fuzzy_pattern = "FUZZY" + str(no_spelling_mistakes)
+
+ # %%
+ if len(tokenised_query) > 1:
+ pattern_lemma = [{"LEMMA": {"IN": tokenised_query}}]
+ pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: {"IN": tokenised_query}}}]
+ elif len(tokenised_query) == 1:
+ pattern_lemma = [{"LEMMA": tokenised_query[0]}]
+ pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: tokenised_query[0]}}]
+ else:
+ tokenised_query = [""]
+
+ # %%
+ search_pattern = pattern_fuzz.copy()
+ search_pattern.extend(pattern_lemma)
+
+
+ # %%
+ matcher = Matcher(nlp.vocab)
+
+ # %% [markdown]
+ # from spacy.tokens import Span
+ # from spacy import displacy
+ #
+ # def add_event_ent(matcher, doc, i, matches):
+ # # Get the current match and create tuple of entity label, start and end.
+ # # Append entity to the doc's entity. (Don't overwrite doc.ents!)
+ # match_id, start, end = matches[i]
+ # entity = Span(doc, start, end, label="EVENT")
+ # doc.ents += (entity,)
+ # print(entity.text)
+
+ # %% [markdown]
+ # matched_sents = [] # Collect data of matched sentences to be visualized
+ #
+ # def collect_sents(matcher, doc, i, matches):
+ # match_id, start, end = matches[i]
+ # span = doc[start:end] # Matched span
+ # sent = span.sent # Sentence containing matched span
+ # # Append mock entity for match in displaCy style to matched_sents
+ # # get the match span by ofsetting the start and end of the span with the
+ # # start and end of the sentence in the doc
+ # match_ents = [{
+ # "start": span.start_char - sent.start_char,
+ # "end": span.end_char - sent.start_char,
+ # "label": "MATCH",
+ # }]
+ # matched_sents.append({"text": sent.text, "ents": match_ents})
+
+ # %%
+ matcher.add(string_query, [pattern_fuzz])#, on_match=add_event_ent)
+ matcher.add(string_query, [pattern_lemma])#, on_match=add_event_ent)
+
+ # %%
+ batch_size = 256
+ docs = nlp.pipe(df_list, batch_size=batch_size)
+
+ # %%
+ all_matches = []
+
+ # Get number of matches per doc
+ for doc in progress.tqdm(docs, desc = "Searching text", unit = "rows"):
+ matches = matcher(doc)
+ match_count = len(matches)
+ all_matches.append(match_count)
+
+ print("Search complete")
+
+ ## Get document lengths
+ lengths = []
+ for element in df_list:
+ lengths.append(len(element))
+
+ # Score is number of matches divided by length of document
+ match_scores = (np.array(all_matches)/np.array(lengths)).tolist()
+
+ # Prepare results and export
+ results_df = pd.DataFrame(data={"index": list(range(len(df_list))),
+ "search_text": df_list,
+ "search_score_abs": match_scores})
+ results_df['search_score_abs'] = abs(round(results_df['search_score_abs'], 2))
+ results_df_out = results_df[['index', 'search_text', 'search_score_abs']].merge(original_data,left_on="index", right_index=True, how="left")#.drop("index", axis=1)
+
+ # Join on additional files
+ if not in_join_file.empty:
+ progress(0.5, desc = "Joining on additional data file")
+ join_df = in_join_file
+ join_df[in_join_column] = join_df[in_join_column].astype(str).str.replace("\.0$","", regex=True)
+ results_df_out[search_df_join_column] = results_df_out[search_df_join_column].astype(str).str.replace("\.0$","", regex=True)
+
+ # Duplicates dropped so as not to expand out dataframe
+ join_df = join_df.drop_duplicates(in_join_column)
+
+ results_df_out = results_df_out.merge(join_df,left_on=search_df_join_column, right_on=in_join_column, how="left")#.drop(in_join_column, axis=1)
+
+ # Reorder results by score
+ results_df_out = results_df_out.sort_values('search_score_abs', ascending=False)
+
+ # Out file
+ query_str_file = ("_").join(token_query)
+ results_df_name = "keyword_search_result_" + today_rev + "_" + query_str_file + ".xlsx"
+
+ print("Saving search file output")
+ progress(0.7, desc = "Saving search output to file")
+
+ results_df_out.to_excel(results_df_name, index= None)
+ results_first_text = results_df_out[text_column].iloc[0]
+
+ print("Returning results")
+
+ return results_first_text, results_df_name
+
+
+ match_list = spacy_fuzzy_search(string_query, df_list)
+ print(match_list)
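The new spacy_search_funcs.py module is committed as a placeholder and is not yet wired into app.py. A self-contained sketch of the core idea it implements, building FUZZYn and LEMMA token patterns from the query and scoring each text by matches per character, assuming spacy 3.7.2 and en_core_web_sm as pinned in requirements.txt; the function name fuzzy_match_scores and the example texts are illustrative only, not part of the commit:

import spacy
from spacy.matcher import Matcher
import numpy as np
from typing import List

nlp = spacy.load("en_core_web_sm")

def fuzzy_match_scores(query: str, texts: List[str], no_spelling_mistakes: int = 1) -> List[float]:
    # Build one fuzzy pattern and one lemma pattern from the query tokens,
    # mirroring pattern_fuzz and pattern_lemma in spacy_fuzzy_search
    fuzzy_key = "FUZZY" + str(no_spelling_mistakes)
    tokens = [token.text for token in nlp(query)]
    matcher = Matcher(nlp.vocab)
    matcher.add("fuzzy", [[{"TEXT": {fuzzy_key: {"IN": tokens}}}]])
    matcher.add("lemma", [[{"LEMMA": {"IN": tokens}}]])
    # Count matches per document, then normalise by document length in characters
    counts = [len(matcher(doc)) for doc in nlp.pipe(texts, batch_size=256)]
    lengths = [len(text) for text in texts]
    return (np.array(counts) / np.array(lengths)).round(4).tolist()

print(fuzzy_match_scores("knife attack run fast",
                         ["They ran as fast as possible after the knife attack.",
                          "Nothing happened here."]))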