Commit 739b386
Parent(s): c6dc87d

Cut out semantic search temporarily while issues with the Jina gated model are resolved. Improved error/progress tracking and messaging. Added a placeholder for spaCy fuzzy search.

Files changed:
- app.py: +34 -33
- how_to_create_exe_dist.txt: +3 -0
- requirements.txt: +4 -4
- search_funcs/bm25_functions.py: +24 -18
- search_funcs/helper_functions.py: +13 -5
- search_funcs/semantic_functions.py: +2 -2
- search_funcs/semantic_ingest_functions.py: +2 -2
- search_funcs/spacy_search_funcs.py: +137 -0
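A note on the error/progress tracking changes: the reworked functions report progress through Gradio's built-in progress API, taking progress=gr.Progress(track_tqdm=True) as a keyword argument and calling progress(fraction, desc=...) at each stage. A minimal sketch of that pattern, with an illustrative function and UI rather than the app's own components:

import time
import gradio as gr

# Illustrative only: the function name and steps below are not part of this commit.
def load_and_prepare(in_file, progress=gr.Progress(track_tqdm=True)):
    if not in_file:
        return "Please load in at least one csv/Excel/parquet data file."
    progress(0.1, desc="Cleaning data")
    time.sleep(0.5)   # stand-in for the real cleaning step
    progress(0.4, desc="Tokenising text")
    time.sleep(0.5)   # stand-in for tokenisation
    progress(0.8, desc="Saving search index to file")
    time.sleep(0.5)   # stand-in for writing output files
    return "Data loaded"

with gr.Blocks() as block:
    in_file = gr.File(label="Upload data file")
    load_finished_message = gr.Textbox(label="Load progress")
    in_file.upload(load_and_prepare, inputs=[in_file], outputs=[load_finished_message])

block.queue().launch()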
app.py
CHANGED
@@ -1,8 +1,8 @@
 from typing import Type
 from search_funcs.bm25_functions import prepare_bm25_input_data, prepare_bm25, bm25_search
-from search_funcs.semantic_ingest_functions import parse_csv_or_excel, csv_excel_text_to_docs
-from search_funcs.semantic_functions import docs_to_jina_embed_np_array, jina_simple_retrieval
-from search_funcs.helper_functions import dummy_function, display_info,
+#from search_funcs.semantic_ingest_functions import parse_csv_or_excel, csv_excel_text_to_docs
+#from search_funcs.semantic_functions import docs_to_jina_embed_np_array, jina_simple_retrieval
+from search_funcs.helper_functions import dummy_function, display_info, initial_data_load, put_columns_in_join_df, get_temp_folder_path, empty_folder

 import gradio as gr
 import pandas as pd
@@ -25,6 +25,7 @@ with block:
 vectorstore_state = gr.State() # globals()["vectorstore"]
 embeddings_state = gr.State(np.array([])) # globals()["embeddings"]
 search_index_state = gr.State()
+tokenised_state = gr.State()

 k_val = gr.State(9999)
 out_passages = gr.State(9999)
@@ -82,31 +83,31 @@ depends on factors such as the type of documents or queries. Information taken f
 output_single_text = gr.Textbox(label="Top result")
 output_file = gr.File(label="File output")

-with gr.Tab("Semantic search"):
-gr.Markdown(
-"""
-**Thematic/semantic search**
+# with gr.Tab("Semantic search"):
+# gr.Markdown(
+# """
+# **Thematic/semantic search**

-This search type enables you to search for broader themes (e.g. happiness, nature) and the search will pick out text passages that relate to these themes even if they don't contain the exact words. 1. Load in data file (ideally a file with '_cleaned' at the end of the name, a pkl.gz file), with (optionally) the 'embeddings... .npz' file in the same folder to save loading time. 2. Select the field in your data to search. If you loaded in a documents pkl.gz file, this will be 'page_contents'. 3. Wait for the data file to be prepared for search. 4. Enter the search term in the 'Enter semantic search query here' box below and press Enter/click on 'Start semantic search'. 4. Your search results will be saved in a csv file and will be presented in the 'File output' area below.
-""")
-with gr.Row():
-current_source_semantic = gr.Textbox(label="Current data source(s)", value="None")
+# This search type enables you to search for broader themes (e.g. happiness, nature) and the search will pick out text passages that relate to these themes even if they don't contain the exact words. 1. Load in data file (ideally a file with '_cleaned' at the end of the name, a pkl.gz file), with (optionally) the 'embeddings... .npz' file in the same folder to save loading time. 2. Select the field in your data to search. If you loaded in a documents pkl.gz file, this will be 'page_contents'. 3. Wait for the data file to be prepared for search. 4. Enter the search term in the 'Enter semantic search query here' box below and press Enter/click on 'Start semantic search'. 4. Your search results will be saved in a csv file and will be presented in the 'File output' area below.
+# """)
+# with gr.Row():
+# current_source_semantic = gr.Textbox(label="Current data source(s)", value="None")

-with gr.Accordion("Load in data", open = True):
-in_semantic_file = gr.File(label="Upload data file for semantic search", file_count= 'multiple', file_types = ['.parquet', '.csv', '.npy', '.npz', '.pkl', '.pkl.gz'])
+# with gr.Accordion("Load in data", open = True):
+# in_semantic_file = gr.File(label="Upload data file for semantic search", file_count= 'multiple', file_types = ['.parquet', '.csv', '.npy', '.npz', '.pkl', '.pkl.gz'])

-with gr.Row():
-in_semantic_column = gr.Dropdown(label="Enter the name of the text column in the data file to search")
-load_semantic_data_button = gr.Button(value="Load data", variant="secondary")
+# with gr.Row():
+# in_semantic_column = gr.Dropdown(label="Enter the name of the text column in the data file to search")
+# load_semantic_data_button = gr.Button(value="Load data", variant="secondary")

-semantic_load_progress = gr.Textbox(label="Load progress")
+# semantic_load_progress = gr.Textbox(label="Load progress")

-semantic_query = gr.Textbox(label="Enter semantic search query here")
-semantic_submit = gr.Button(value="Start semantic search", variant="secondary", scale = 1)
+# semantic_query = gr.Textbox(label="Enter semantic search query here")
+# semantic_submit = gr.Button(value="Start semantic search", variant="secondary", scale = 1)

-with gr.Row():
-semantic_output_single_text = gr.Textbox(label="Top result")
-semantic_output_file = gr.File(label="File output")
+# with gr.Row():
+# semantic_output_single_text = gr.Textbox(label="Top result")
+# semantic_output_file = gr.File(label="File output")

 with gr.Tab(label="Advanced options"):
 with gr.Accordion(label="Data load / save options", open = True):
@@ -148,12 +149,12 @@ depends on factors such as the type of documents or queries. Information taken f

 ### BM25 SEARCH ###
 # Update dropdowns upon initial file load
-in_bm25_file.upload(
+in_bm25_file.upload(initial_data_load, inputs=[in_bm25_file, in_bm25_column], outputs=[in_bm25_column, search_df_join_column, keyword_data_state, search_index_state, embeddings_state, tokenised_state, load_finished_message, current_source])
 in_join_file.upload(put_columns_in_join_df, inputs=[in_join_file], outputs=[in_join_column, join_data_state, in_join_message])

 # Load in BM25 data
-load_bm25_data_button.click(fn=prepare_bm25_input_data, inputs=[in_bm25_file, in_bm25_column, keyword_data_state, in_clean_data, return_intermediate_files], outputs=[corpus_state, load_finished_message, keyword_data_state, output_file, output_file
-then(fn=prepare_bm25, inputs=[corpus_state, in_bm25_file, in_bm25_column, search_index_state, return_intermediate_files, in_k1, in_b, in_alpha], outputs=[load_finished_message, output_file])#.\
+load_bm25_data_button.click(fn=prepare_bm25_input_data, inputs=[in_bm25_file, in_bm25_column, keyword_data_state, tokenised_state, in_clean_data, return_intermediate_files], outputs=[corpus_state, load_finished_message, keyword_data_state, output_file, output_file]).\
+then(fn=prepare_bm25, inputs=[corpus_state, in_bm25_file, in_bm25_column, search_index_state, in_clean_data, return_intermediate_files, in_k1, in_b, in_alpha], outputs=[load_finished_message, output_file])#.\

 # BM25 search functions on click or enter
 keyword_search_button.click(fn=bm25_search, inputs=[keyword_query, in_no_search_results, keyword_data_state, in_bm25_column, join_data_state, in_clean_data, in_join_column, search_df_join_column], outputs=[output_single_text, output_file], api_name="keyword")
@@ -161,20 +162,20 @@ depends on factors such as the type of documents or queries. Information taken f

 ### SEMANTIC SEARCH ###
 # Load in a csv/excel file for semantic search
-in_semantic_file.upload(
-load_semantic_data_button.click(parse_csv_or_excel, inputs=[in_semantic_file, semantic_data_state, in_semantic_column], outputs=[ingest_text, current_source_semantic, semantic_load_progress]).\
-then(csv_excel_text_to_docs, inputs=[ingest_text, in_semantic_file, in_semantic_column, in_clean_data, return_intermediate_files], outputs=[ingest_docs, semantic_load_progress]).\
-then(docs_to_jina_embed_np_array, inputs=[ingest_docs, in_semantic_file, embeddings_state, return_intermediate_files, embedding_super_compress], outputs=[semantic_load_progress, vectorstore_state, semantic_output_file])
+# in_semantic_file.upload(initial_data_load, inputs=[in_semantic_file, in_semantic_column], outputs=[in_semantic_column, search_df_join_column, semantic_data_state, search_index_state, embeddings_state, semantic_load_progress, current_source])
+# load_semantic_data_button.click(parse_csv_or_excel, inputs=[in_semantic_file, semantic_data_state, in_semantic_column], outputs=[ingest_text, current_source_semantic, semantic_load_progress]).\
+# then(csv_excel_text_to_docs, inputs=[ingest_text, in_semantic_file, in_semantic_column, in_clean_data, return_intermediate_files], outputs=[ingest_docs, semantic_load_progress]).\
+# then(docs_to_jina_embed_np_array, inputs=[ingest_docs, in_semantic_file, embeddings_state, return_intermediate_files, embedding_super_compress], outputs=[semantic_load_progress, vectorstore_state, semantic_output_file])

-# Semantic search query
-semantic_submit.click(jina_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, semantic_min_distance, vec_weight, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic")
-semantic_query.submit(jina_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, semantic_min_distance, vec_weight, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file])
+# # Semantic search query
+# semantic_submit.click(jina_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, semantic_min_distance, vec_weight, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic")
+# semantic_query.submit(jina_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, semantic_min_distance, vec_weight, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file])

 # Dummy functions just to get dropdowns to work correctly with Gradio 3.50
 in_bm25_column.change(dummy_function, in_bm25_column, None)
 search_df_join_column.change(dummy_function, search_df_join_column, None)
 in_join_column.change(dummy_function, in_join_column, None)
-in_semantic_column.change(dummy_function, in_join_column, None)
+# in_semantic_column.change(dummy_function, in_join_column, None)

 block.queue().launch(debug=True)

how_to_create_exe_dist.txt
CHANGED
@@ -19,6 +19,9 @@ NOTE: for ensuring that spaCy models are loaded into the program correctly in re
 For one single file:
 python -m PyInstaller --additional-hooks-dir="build_deps\\" --hidden-import pyarrow.vendored.version --add-data="build_deps\\types.json;gradio_client" --add-data "model;model" --onefile --clean --noconfirm --name DataSearchApp_0.2.2 app.py

+If not using embedding model:
+python -m PyInstaller --additional-hooks-dir="build_deps\\" --hidden-import pyarrow.vendored.version --add-data="build_deps\\types.json;gradio_client" --onefile --clean --noconfirm --name DataSearchApp_0.2.2_keyword app.py
+
 For a small exe with a folder of dependencies:
 python -m PyInstaller --additional-hooks-dir="build_deps\\" --hidden-import pyarrow.vendored.version --add-data="build_deps\\types.json;gradio_client" --add-data "model;model" --clean --noconfirm --name DataSearchApp_0.2.2 app.py

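The commands above pass --additional-hooks-dir="build_deps\\" so PyInstaller picks up any custom hook files in that folder (the note at the top of this file concerns getting the spaCy model bundled correctly). The hook itself is not part of this commit; purely as an illustration under that assumption, a hook for the pinned en_core_web_sm model could be written with PyInstaller's standard helpers:

# Hypothetical build_deps/hook-en_core_web_sm.py (file name and contents assumed, not shown in this commit)
from PyInstaller.utils.hooks import collect_data_files, collect_submodules

# Bundle the model package's data files and submodules so that
# spacy.load("en_core_web_sm") also works inside the frozen executable.
datas = collect_data_files("en_core_web_sm")
hiddenimports = collect_submodules("en_core_web_sm")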
requirements.txt
CHANGED
@@ -2,9 +2,9 @@ pandas==2.1.4
 polars==0.20.3
 pyarrow==14.0.2
 openpyxl==3.1.2
-transformers==4.32.1
-accelerate==0.26.0
-torch==2.1.2
+# transformers==4.32.1
+# accelerate==0.26.0
+# torch==2.1.2
 spacy==3.7.2
-en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.
+en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.2/en_core_web_sm-3.7.2.tar.gz
 gradio==3.50.2
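With transformers, accelerate and torch commented out, any code path that still imports them will fail at import time; this commit handles that by commenting out the semantic-search imports in app.py. As a sketch only (not what app.py does here), the same effect can be achieved with a guarded optional import so a keyword-only install degrades gracefully:

# Sketch of an optional-dependency guard; the commit itself simply comments the imports out.
try:
    from search_funcs.semantic_functions import docs_to_jina_embed_np_array, jina_simple_retrieval
    SEMANTIC_SEARCH_AVAILABLE = True
except ImportError:
    # torch/transformers not installed: BM25 keyword search still works.
    SEMANTIC_SEARCH_AVAILABLE = False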
search_funcs/bm25_functions.py
CHANGED
@@ -231,7 +231,7 @@ class BM25:

 # These following functions are my own work

-def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", return_intermediate_files = "No", progress=gr.Progress(track_tqdm=True)):
+def prepare_bm25_input_data(in_file, text_column, data_state, tokenised_state, clean="No", return_intermediate_files = "No", progress=gr.Progress(track_tqdm=True)):
 print(in_file)

 if not in_file:
@@ -243,7 +243,7 @@ def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", retur

 #print(file_list)

-data_file_names = [string
+data_file_names = [string for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower() and "gz" not in string.lower()]

 if not data_file_names:
 return None, "Please load in at least one csv/Excel/parquet data file.", data_state, None, None, None
@@ -260,8 +260,8 @@ def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", retur
 ## Load in pre-tokenised corpus if exists
 tokenised_df = pd.DataFrame()

-tokenised_file_names = [string
-search_index_file_names = [string
+tokenised_file_names = [string for string in file_list if "tokenised" in string.lower()]
+search_index_file_names = [string for string in file_list if "gz" in string.lower()]

 df[text_column] = df[text_column].astype(str).str.lower()

@@ -271,8 +271,7 @@ def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", retur
 print(message)
 return corpus, message, df, None, None, None

-
-tokenised_df = read_file(tokenised_file_names[0])
+

 if clean == "Yes":
 progress(0.1, desc = "Cleaning data")
@@ -300,12 +299,12 @@ def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", retur

 progress(0.4, desc = "Tokenising text")

-if
+if tokenised_state:
+tokenised_df = tokenised_state
 corpus = tokenised_df.iloc[:,0].tolist()
-print("
+print("Tokenised data loaded from file")
 #print("Corpus is: ", corpus[0:5])

-# If doesn't already exist, tokenize texts in batches
 else:
 tokeniser_tic = time.perf_counter()
 corpus = []
@@ -316,7 +315,6 @@ def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", retur
 tokeniser_toc = time.perf_counter()
 tokenizer_time_out = f"Tokenising the text took {tokeniser_toc - tokeniser_tic:0.1f} seconds."
 print(tokenizer_time_out)
-

 if len(df_list) >= 20:
 message = "Data loaded"
@@ -324,12 +322,16 @@ def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", retur
 message = "Data loaded. Warning: dataset may be too short to get consistent search results."

 if return_intermediate_files == "Yes":
-
+if clean == "Yes":
+tokenised_data_file_name = data_file_out_name_no_ext + "_cleaned_tokenised.parquet"
+else:
+tokenised_data_file_name = data_file_out_name_no_ext + "_tokenised.parquet"
+
 pd.DataFrame(data={"Corpus":corpus}).to_parquet(tokenised_data_file_name)

-return corpus, message, df, out_file_name, tokenised_data_file_name
+return corpus, message, df, out_file_name, tokenised_data_file_name

-return corpus, message, df, out_file_name, None
+return corpus, message, df, out_file_name, None # tokenised_data_file_name

 def save_prepared_bm25_data(in_file_name, prepared_text_list, in_df, in_bm25_column, progress=gr.Progress(track_tqdm=True)):

@@ -357,7 +359,7 @@ def save_prepared_bm25_data(in_file_name, prepared_text_list, in_df, in_bm25_col

 return file_name, new_text_column

-def prepare_bm25(corpus, in_file, text_column, search_index, return_intermediate_files, k1=1.5, b = 0.75, alpha=-5, progress=gr.Progress(track_tqdm=True)):
+def prepare_bm25(corpus, in_file, text_column, search_index, clean, return_intermediate_files, k1=1.5, b = 0.75, alpha=-5, progress=gr.Progress(track_tqdm=True)):
 #bm25.save("saved_df_bm25")
 #bm25 = BM25.load(re.sub(r'\.pkl$', '', file_in.name))

@@ -385,7 +387,7 @@ def prepare_bm25(corpus, in_file, text_column, search_index, return_intermediate
 #print(file_list)

 # Get data file name
-data_file_names = [string
+data_file_names = [string for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower() and "gz" not in string.lower()]

 if not data_file_names:
 return "Please load in at least one csv/Excel/parquet data file.", None
@@ -395,7 +397,7 @@ def prepare_bm25(corpus, in_file, text_column, search_index, return_intermediate
 data_file_name_no_ext = get_file_path_end(data_file_name)

 # Check if there is a search index file already
-#index_file_names = [string
+#index_file_names = [string for string in file_list if "gz" in string.lower()]

 progress(0.6, desc = "Preparing search index")

@@ -422,8 +424,12 @@ def prepare_bm25(corpus, in_file, text_column, search_index, return_intermediate
 if return_intermediate_files == "Yes":
 print("Saving search index file")
 progress(0.8, desc = "Saving search index to file")
-
-
+
+if clean == "Yes":
+bm25_search_file_name = data_file_name_no_ext + '_cleaned_search_index.pkl.gz'
+else:
+bm25_search_file_name = data_file_name_no_ext + '_search_index.pkl.gz'
+#np.savez_compressed(bm25_search_file_name, bm25)

 with gzip.open(bm25_search_file_name, 'wb') as file:
 pickle.dump(bm25, file)
search_funcs/helper_functions.py
CHANGED
@@ -88,7 +88,7 @@ def read_file(filename):

 return file

-def put_columns_in_df(in_file, in_bm25_column):
+def initial_data_load(in_file, in_bm25_column):
 '''
 When file is loaded, update the column dropdown choices
 '''
@@ -96,13 +96,15 @@ def put_columns_in_df(in_file, in_bm25_column):
 concat_choices = []
 index_load = None
 embed_load = np.array([])
+tokenised_load =[]
 out_message = ""
+current_source = ""

 file_list = [string.name for string in in_file]

 #print(file_list)

-data_file_names = [string
+data_file_names = [string for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower() and "search_index" not in string.lower()]

 if not data_file_names:
 out_message = "Please load in at least one csv/Excel/parquet data file."
@@ -110,6 +112,8 @@ def put_columns_in_df(in_file, in_bm25_column):
 return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), pd.DataFrame(), bm25_load, out_message

 data_file_name = data_file_names[0]
+
+current_source = get_file_path_end_with_ext(data_file_name)


 df = read_file(data_file_name)
@@ -128,13 +132,13 @@ def put_columns_in_df(in_file, in_bm25_column):
 concat_choices.extend(new_choices)

 # Check if there is a search index file already
-index_file_names = [string
+index_file_names = [string for string in file_list if "gz" in string.lower()]

 if index_file_names:
 index_file_name = index_file_names[0]
 index_load = read_file(index_file_name)

-embeddings_file_names = [string
+embeddings_file_names = [string for string in file_list if "embedding" in string.lower()]

 if embeddings_file_names:
 print("Loading embeddings from file.")
@@ -146,10 +150,14 @@ def put_columns_in_df(in_file, in_bm25_column):
 else:
 embed_load = np.array([])

+tokenised_file_names = [string for string in file_list if "tokenised" in string.lower()]
+if tokenised_file_names:
+tokenised_load = read_file(tokenised_file_names[0])
+
 out_message = "Initial data check successful. Next, choose a data column to search in the drop down above, then click 'Load data'"
 print(out_message)

-return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df, index_load, embed_load, out_message
+return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df, index_load, embed_load, tokenised_load, out_message, current_source

 def put_columns_in_join_df(in_file):
 '''
search_funcs/semantic_functions.py
CHANGED
@@ -92,8 +92,8 @@ def docs_to_jina_embed_np_array(docs_out, in_file, embeddings_state, return_inte

 #print(file_list)

-embeddings_file_names = [string
-data_file_names = [string
+embeddings_file_names = [string for string in file_list if "embedding" in string.lower()]
+data_file_names = [string for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower()]# and "gz" not in string.lower()]
 data_file_name = data_file_names[0]
 data_file_name_no_ext = get_file_path_end(data_file_name)

search_funcs/semantic_ingest_functions.py
CHANGED
@@ -130,7 +130,7 @@ def parse_csv_or_excel(file_path, data_state, text_column = "text"):

 #print(file_list)

-data_file_names = [string
+data_file_names = [string for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower()]# and "gz" not in string.lower()]

 data_file_name = data_file_names[0]

@@ -303,7 +303,7 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm

 file_list = [string.name for string in in_file]

-data_file_names = [string
+data_file_names = [string for string in file_list if "tokenised" not in string and "npz" not in string.lower()]

 if not data_file_names:
 return doc_sections, "Please load in at least one csv/Excel/parquet data file."
search_funcs/spacy_search_funcs.py
ADDED
@@ -0,0 +1,137 @@
+import spacy
+from spacy.matcher import Matcher
+import numpy as np
+import gradio as gr
+import pandas as pd
+from typing import List, Type
+
+PandasDataFrame = Type[pd.DataFrame]
+
+nlp = spacy.load("en_core_web_sm")
+
+string_query = "knife attack run fast"
+df_list = ["Last week someone was grievously injured in a knife attack on Exmoor road. Running away. They ran as fast as possible. I run.","This is the 3rd knifing in the area in as many weeks; knives everywhere.", "attacks of this kind have been increasing for years. Knife attack or knife attack.", "Nothing happened here"]
+
+
+def spacy_fuzzy_search(string_query:str, df_list: List[str], original_data: PandasDataFrame, search_df_join_column:str, in_join_column:str, no_spelling_mistakes:int = 1, progress=gr.Progress(track_tqdm=True)):
+    ''' Conduct fuzzy match on a list of data.'''
+
+    query = nlp(string_query)
+    tokenised_query = [token.text for token in query]
+    print(tokenised_query)
+
+    spelling_mistakes_fuzzy_pattern = "FUZZY" + str(no_spelling_mistakes)
+
+    # %%
+    if len(tokenised_query) > 1:
+        pattern_lemma = [{"LEMMA": {"IN": tokenised_query}}]
+        pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: {"IN": tokenised_query}}}]
+    elif len(tokenised_query) == 1:
+        pattern_lemma = [{"LEMMA": tokenised_query[0]}]
+        pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: tokenised_query[0]}}]
+    else:
+        tokenised_query = [""]
+
+    # %%
+    search_pattern = pattern_fuzz.copy()
+    search_pattern.extend(pattern_lemma)
+
+
+    # %%
+    matcher = Matcher(nlp.vocab)
+
+    # %% [markdown]
+    # from spacy.tokens import Span
+    # from spacy import displacy
+    #
+    # def add_event_ent(matcher, doc, i, matches):
+    # # Get the current match and create tuple of entity label, start and end.
+    # # Append entity to the doc's entity. (Don't overwrite doc.ents!)
+    # match_id, start, end = matches[i]
+    # entity = Span(doc, start, end, label="EVENT")
+    # doc.ents += (entity,)
+    # print(entity.text)
+
+    # %% [markdown]
+    # matched_sents = [] # Collect data of matched sentences to be visualized
+    #
+    # def collect_sents(matcher, doc, i, matches):
+    # match_id, start, end = matches[i]
+    # span = doc[start:end] # Matched span
+    # sent = span.sent # Sentence containing matched span
+    # # Append mock entity for match in displaCy style to matched_sents
+    # # get the match span by ofsetting the start and end of the span with the
+    # # start and end of the sentence in the doc
+    # match_ents = [{
+    # "start": span.start_char - sent.start_char,
+    # "end": span.end_char - sent.start_char,
+    # "label": "MATCH",
+    # }]
+    # matched_sents.append({"text": sent.text, "ents": match_ents})
+
+    # %%
+    matcher.add(string_query, [pattern_fuzz])#, on_match=add_event_ent)
+    matcher.add(string_query, [pattern_lemma])#, on_match=add_event_ent)
+
+    # %%
+    batch_size = 256
+    docs = nlp.pipe(df_list, batch_size=batch_size)
+
+    # %%
+    all_matches = []
+
+    # Get number of matches per doc
+    for doc in progress.tqdm(docs, desc = "Searching text", unit = "rows"):
+        matches = matcher(doc)
+        match_count = len(matches)
+        all_matches.append(match_count)
+
+    print("Search complete")
+
+    ## Get document lengths
+    lengths = []
+    for element in df_list:
+        lengths.append(len(element))
+
+    # Score is number of matches divided by length of document
+    match_scores = (np.array(all_matches)/np.array(lengths)).tolist()
+
+    # Prepare results and export
+    results_df = pd.DataFrame(data={"index": list(range(len(df_list))),
+                                    "search_text": df_list,
+                                    "search_score_abs": match_scores})
+    results_df['search_score_abs'] = abs(round(results_df['search_score_abs'], 2))
+    results_df_out = results_df[['index', 'search_text', 'search_score_abs']].merge(original_data,left_on="index", right_index=True, how="left")#.drop("index", axis=1)
+
+    # Join on additional files
+    if not in_join_file.empty:
+        progress(0.5, desc = "Joining on additional data file")
+        join_df = in_join_file
+        join_df[in_join_column] = join_df[in_join_column].astype(str).str.replace("\.0$","", regex=True)
+        results_df_out[search_df_join_column] = results_df_out[search_df_join_column].astype(str).str.replace("\.0$","", regex=True)
+
+        # Duplicates dropped so as not to expand out dataframe
+        join_df = join_df.drop_duplicates(in_join_column)
+
+        results_df_out = results_df_out.merge(join_df,left_on=search_df_join_column, right_on=in_join_column, how="left")#.drop(in_join_column, axis=1)
+
+    # Reorder results by score
+    results_df_out = results_df_out.sort_values('search_score_abs', ascending=False)
+
+    # Out file
+    query_str_file = ("_").join(token_query)
+    results_df_name = "keyword_search_result_" + today_rev + "_" + query_str_file + ".xlsx"
+
+    print("Saving search file output")
+    progress(0.7, desc = "Saving search output to file")
+
+    results_df_out.to_excel(results_df_name, index= None)
+    results_first_text = results_df_out[text_column].iloc[0]
+
+    print("Returning results")
+
+    return results_first_text, results_df_name
+
+
+match_list = spacy_fuzzy_search(string_query, df_list)
+print(match_list)
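As the commit message says, spacy_search_funcs.py is only a placeholder: spacy_fuzzy_search still refers to names that are not defined in the module (in_join_file, token_query, today_rev, text_column), and the module-level call at the bottom passes just two of its required arguments. The core idea, however, is spaCy's fuzzy Matcher patterns, which can be exercised on their own. A self-contained sketch, assuming en_core_web_sm is installed as pinned in requirements.txt:

import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# "FUZZY1" allows one edit per token; combined with "IN" it matches any of the
# query tokens approximately, mirroring the pattern built in spacy_fuzzy_search.
pattern = [{"TEXT": {"FUZZY1": {"IN": ["knife", "attack"]}}}]
matcher.add("fuzzy_keywords", [pattern])

doc = nlp("This is the 3rd knive attack in the area in as many weeks.")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)   # e.g. "knive", "attack"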