Sean-Case committed • Commit 63049fe • Parent(s): 3df8e40

Better error checking. Doesn't load in embeddings file twice now.

Files changed:
- README.md +1 -1
- app.py +18 -20
- how_to_create_exe_dist.txt +2 -2
- requirements.txt +1 -1
- search_funcs/bm25_functions.py +67 -22
- search_funcs/helper_functions.py +54 -17
- search_funcs/semantic_functions.py +43 -31
- search_funcs/semantic_ingest_functions.py +15 -1
README.md
CHANGED
@@ -4,7 +4,7 @@ emoji: π
colorFrom: purple
colorTo: green
sdk: gradio
-sdk_version:
+sdk_version: 4.16.0
app_file: app.py
pinned: false
license: apache-2.0
app.py
CHANGED
@@ -6,6 +6,7 @@ from search_funcs.helper_functions import dummy_function, display_info, put_colu

import gradio as gr
import pandas as pd
+import numpy as np

PandasDataFrame = Type[pd.DataFrame]

@@ -22,18 +23,16 @@ with block:
ingest_metadata = gr.State()
ingest_docs = gr.State()
vectorstore_state = gr.State() # globals()["vectorstore"]
-embeddings_state = gr.State() # globals()["embeddings"]
+embeddings_state = gr.State(np.array([])) # globals()["embeddings"]
+search_index_state = gr.State()

k_val = gr.State(9999)
out_passages = gr.State(9999)
vec_weight = gr.State(1)

-#docs_keep_as_doc_state = gr.State()
-#doc_df_state = gr.State()
-#docs_keep_out_state = gr.State()
-
corpus_state = gr.State()
keyword_data_state = gr.State(pd.DataFrame())
+join_data_state = gr.State(pd.DataFrame())
semantic_data_state = gr.State(pd.DataFrame())

in_k1_info = gr.State("""k1: Constant used for influencing the term frequency saturation. After saturation is reached, additional
@@ -76,7 +75,6 @@ depends on factors such as the type of documents or queries. Information taken f
with gr.Accordion(label = "Search data", open=True):
with gr.Row():
keyword_query = gr.Textbox(label="Enter your search term")
-#mod_query = gr.Textbox(label="Cleaned search term (the terms that are passed to the search engine)")

keyword_search_button = gr.Button(value="Search text")

@@ -115,7 +113,7 @@ depends on factors such as the type of documents or queries. Information taken f
with gr.Row():
in_clean_data = gr.Dropdown(label = "Clean text during load (remove html tags). For large files this may take some time!", value="No", choices=["Yes", "No"])
return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="No", choices=["Yes", "No"])
-embedding_super_compress = gr.Dropdown(label = "Round embeddings to three dp for smaller files with less accuracy.", value="
+embedding_super_compress = gr.Dropdown(label = "Round embeddings to three dp for smaller files with less accuracy.", value="Yes", choices=["Yes", "No"])
#save_clean_data_button = gr.Button(value = "Save loaded data to file", scale = 1)
with gr.Accordion(label="Keyword search options", open = False):
with gr.Row():
@@ -133,13 +131,14 @@ depends on factors such as the type of documents or queries. Information taken f
with gr.Row():
in_search_param_button = gr.Button(value="Load search parameters (Need to click this if you changed anything above)")
with gr.Accordion(label="Semantic search options", open = False):
-semantic_min_distance = gr.Slider(label = "Minimum distance score for search result to be included", value = 0.
+semantic_min_distance = gr.Slider(label = "Minimum distance score for search result to be included", value = 0.75, minimum=0, maximum=0.95, step=0.01)
with gr.Accordion(label = "Join on additional dataframes to results", open = False):
in_join_file = gr.File(label="Upload your data to join here")
+in_join_message = gr.Textbox(label="Join file load progress")
in_join_column = gr.Dropdown(label="Column to join in new data frame")
search_df_join_column = gr.Dropdown(label="Column to join in search data frame")

-in_search_param_button.click(fn=prepare_bm25, inputs=[corpus_state, in_k1, in_b, in_alpha], outputs=[load_finished_message])
+in_search_param_button.click(fn=prepare_bm25, inputs=[corpus_state, in_bm25_file, in_bm25_column, search_index_state, return_intermediate_files, in_k1, in_b, in_alpha], outputs=[load_finished_message])

# ---
in_k1_button.click(display_info, inputs=in_k1_info)
@@ -149,28 +148,27 @@ depends on factors such as the type of documents or queries. Information taken f

### BM25 SEARCH ###
# Update dropdowns upon initial file load
-in_bm25_file.upload(put_columns_in_df, inputs=[in_bm25_file, in_bm25_column], outputs=[in_bm25_column,
-in_join_file.upload(put_columns_in_join_df, inputs=[in_join_file
+in_bm25_file.upload(put_columns_in_df, inputs=[in_bm25_file, in_bm25_column], outputs=[in_bm25_column, search_df_join_column, keyword_data_state, search_index_state, embeddings_state, load_finished_message])
+in_join_file.upload(put_columns_in_join_df, inputs=[in_join_file], outputs=[in_join_column, join_data_state, in_join_message])

# Load in BM25 data
load_bm25_data_button.click(fn=prepare_bm25_input_data, inputs=[in_bm25_file, in_bm25_column, keyword_data_state, in_clean_data, return_intermediate_files], outputs=[corpus_state, load_finished_message, keyword_data_state, output_file, output_file, current_source]).\
-then(fn=prepare_bm25, inputs=[corpus_state, in_bm25_file, return_intermediate_files, in_k1, in_b, in_alpha], outputs=[load_finished_message, output_file])#.\
-
-
+then(fn=prepare_bm25, inputs=[corpus_state, in_bm25_file, in_bm25_column, search_index_state, return_intermediate_files, in_k1, in_b, in_alpha], outputs=[load_finished_message, output_file])#.\
+
# BM25 search functions on click or enter
-keyword_search_button.click(fn=bm25_search, inputs=[keyword_query, in_no_search_results, keyword_data_state, in_bm25_column,
-keyword_query.submit(fn=bm25_search, inputs=[keyword_query, in_no_search_results, keyword_data_state, in_bm25_column,
+keyword_search_button.click(fn=bm25_search, inputs=[keyword_query, in_no_search_results, keyword_data_state, in_bm25_column, join_data_state, in_clean_data, in_join_column, search_df_join_column], outputs=[output_single_text, output_file], api_name="keyword")
+keyword_query.submit(fn=bm25_search, inputs=[keyword_query, in_no_search_results, keyword_data_state, in_bm25_column, join_data_state, in_clean_data, in_join_column, search_df_join_column], outputs=[output_single_text, output_file])

### SEMANTIC SEARCH ###
# Load in a csv/excel file for semantic search
-in_semantic_file.upload(put_columns_in_df, inputs=[in_semantic_file, in_semantic_column], outputs=[in_semantic_column,
+in_semantic_file.upload(put_columns_in_df, inputs=[in_semantic_file, in_semantic_column], outputs=[in_semantic_column, search_df_join_column, semantic_data_state, search_index_state, embeddings_state, semantic_load_progress])
load_semantic_data_button.click(parse_csv_or_excel, inputs=[in_semantic_file, semantic_data_state, in_semantic_column], outputs=[ingest_text, current_source_semantic, semantic_load_progress]).\
then(csv_excel_text_to_docs, inputs=[ingest_text, in_semantic_file, in_semantic_column, in_clean_data, return_intermediate_files], outputs=[ingest_docs, semantic_load_progress]).\
-then(docs_to_jina_embed_np_array, inputs=[ingest_docs, in_semantic_file, return_intermediate_files, embedding_super_compress], outputs=[semantic_load_progress, vectorstore_state, semantic_output_file])
+then(docs_to_jina_embed_np_array, inputs=[ingest_docs, in_semantic_file, embeddings_state, return_intermediate_files, embedding_super_compress], outputs=[semantic_load_progress, vectorstore_state, semantic_output_file])

# Semantic search query
-semantic_submit.click(jina_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, semantic_min_distance, vec_weight,
-semantic_query.submit(jina_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, semantic_min_distance, vec_weight,
+semantic_submit.click(jina_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, semantic_min_distance, vec_weight, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic")
+semantic_query.submit(jina_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, semantic_min_distance, vec_weight, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file])

# Dummy functions just to get dropdowns to work correctly with Gradio 3.50
in_bm25_column.change(dummy_function, in_bm25_column, None)
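Note on the wiring above: embeddings_state is now seeded with an empty NumPy array and passed into docs_to_jina_embed_np_array, so embeddings loaded once at upload time are reused rather than read from disk a second time (the behaviour named in the commit message). A minimal, self-contained sketch of that caching pattern, using hypothetical component and function names rather than the app's real ones:

```python
# Sketch only: caching embeddings in gr.State so they are loaded at most once.
# Component names (in_files, status) and load_or_reuse are illustrative, not the app's API.
import gradio as gr
import numpy as np

def load_or_reuse(file_paths, cached):
    # An empty array in state means "nothing loaded yet"
    if cached.size > 0:
        return cached, "Reusing embeddings already held in state."
    npz_files = [str(f) for f in file_paths or [] if str(f).endswith(".npz")]
    if not npz_files:
        return cached, "No embeddings file supplied; they will be created later."
    loaded = np.load(npz_files[0])["arr_0"]
    return loaded, f"Loaded embeddings with shape {loaded.shape}."

with gr.Blocks() as demo:
    embeddings_state = gr.State(np.array([]))
    in_files = gr.File(file_count="multiple")
    status = gr.Textbox(label="Status")
    in_files.upload(load_or_reuse, inputs=[in_files, embeddings_state],
                    outputs=[embeddings_state, status])

if __name__ == "__main__":
    demo.launch()
```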
how_to_create_exe_dist.txt
CHANGED
@@ -17,10 +17,10 @@ NOTE: for ensuring that spaCy models are loaded into the program correctly in re
8. In command line, cd to the folder that contains app.py. Then run the following:

For one single file:
-python -m PyInstaller --additional-hooks-dir="build_deps\\" --hidden-import pyarrow.vendored.version --add-data="build_deps\\types.json;gradio_client" --add-data "model;model" --onefile --clean --noconfirm --name DataSearchApp_0.2.
+python -m PyInstaller --additional-hooks-dir="build_deps\\" --hidden-import pyarrow.vendored.version --add-data="build_deps\\types.json;gradio_client" --add-data "model;model" --onefile --clean --noconfirm --name DataSearchApp_0.2.2 app.py

For a small exe with a folder of dependencies:
-python -m PyInstaller --additional-hooks-dir="build_deps\\" --hidden-import pyarrow.vendored.version --add-data="build_deps\\types.json;gradio_client" --add-data "model;model" --clean --noconfirm --name DataSearchApp_0.2.
+python -m PyInstaller --additional-hooks-dir="build_deps\\" --hidden-import pyarrow.vendored.version --add-data="build_deps\\types.json;gradio_client" --add-data "model;model" --clean --noconfirm --name DataSearchApp_0.2.2 app.py

9. A 'dist' folder will be created with the executable inside along with all dependencies('dist\data_text_search').

requirements.txt
CHANGED
@@ -7,4 +7,4 @@ accelerate==0.26.0
torch==2.1.2
spacy==3.7.2
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
-gradio==
+gradio==4.16.0
search_funcs/bm25_functions.py
CHANGED
@@ -18,7 +18,7 @@ from search_funcs.clean_funcs import initial_clean # get_lemma_tokens, stem_sent
from search_funcs.helper_functions import read_file, get_file_path_end_with_ext, get_file_path_end

# Load the SpaCy model
-from spacy.cli import download
+from spacy.cli.download import download
import spacy
spacy.prefer_gpu()

@@ -231,13 +231,25 @@ class BM25:

# These following functions are my own work

-def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", return_intermediate_files = "No", progress=gr.Progress()):
+def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", return_intermediate_files = "No", progress=gr.Progress(track_tqdm=True)):
+print(in_file)

+if not in_file:
+print("No input file found. Please load in at least one file.")
+return None, "No input file found. Please load in at least one file.", data_state, None, None, None
+
+progress(0, desc = "Loading in data")
file_list = [string.name for string in in_file]

#print(file_list)

-data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower() and "gz" not in string.lower()]
+data_file_names = [string.lower() for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower() and "gz" not in string.lower()]
+
+if not data_file_names:
+return None, "Please load in at least one csv/Excel/parquet data file.", data_state, None, None, None
+
+if not text_column:
+return None, "Please enter a column name to search.", data_state, None, None, None

data_file_name = data_file_names[0]

@@ -263,6 +275,7 @@ def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", retur
tokenised_df = read_file(tokenised_file_names[0])

if clean == "Yes":
+progress(0.1, desc = "Cleaning data")
clean_tic = time.perf_counter()
print("Starting data clean.")

@@ -280,14 +293,16 @@ def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", retur
else:
# Don't clean or save file to disk
df_list = list(df[text_column])
-print("No data cleaning performed
+print("No data cleaning performed")
out_file_name = None

# Tokenise data. If tokenised df already exists, no need to do anything

+progress(0.4, desc = "Tokenising text")
+
if not tokenised_df.empty:
corpus = tokenised_df.iloc[:,0].tolist()
-print("Tokeniser loaded from file
+print("Tokeniser loaded from file")
#print("Corpus is: ", corpus[0:5])

# If doesn't already exist, tokenize texts in batches
@@ -316,7 +331,7 @@ def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", retur

return corpus, message, df, out_file_name, None, data_file_out_name # tokenised_data_file_name

-def save_prepared_bm25_data(in_file_name, prepared_text_list, in_df, in_bm25_column):
+def save_prepared_bm25_data(in_file_name, prepared_text_list, in_df, in_bm25_column, progress=gr.Progress(track_tqdm=True)):

# Check if the list and the dataframe have the same length
if len(prepared_text_list) != len(in_df):
@@ -342,31 +357,55 @@ def save_prepared_bm25_data(in_file_name, prepared_text_list, in_df, in_bm25_col

return file_name, new_text_column

-def prepare_bm25(corpus, in_file, return_intermediate_files, k1=1.5, b = 0.75, alpha=-5):
+def prepare_bm25(corpus, in_file, text_column, search_index, return_intermediate_files, k1=1.5, b = 0.75, alpha=-5, progress=gr.Progress(track_tqdm=True)):
#bm25.save("saved_df_bm25")
#bm25 = BM25.load(re.sub(r'\.pkl$', '', file_in.name))

+if not in_file:
+out_message ="No input file found. Please load in at least one file."
+print(out_message)
+return out_message, None
+
+if not corpus:
+out_message = "No data file found. Please load in at least one csv/Excel/Parquet file."
+print(out_message)
+return out_message, None
+
+if not text_column:
+out_message = "Please enter a column name to search."
+print(out_message)
+return out_message, None
+
file_list = [string.name for string in in_file]

#print(file_list)

# Get data file name
-data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower() and "gz" not in string.lower()]
+data_file_names = [string.lower() for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower() and "gz" not in string.lower()]
+
+if not data_file_names:
+return "Please load in at least one csv/Excel/parquet data file.", None

data_file_name = data_file_names[0]
data_file_out_name = get_file_path_end_with_ext(data_file_name)
data_file_name_no_ext = get_file_path_end(data_file_name)

# Check if there is a search index file already
-index_file_names = [string.lower() for string in file_list if "gz" in string.lower()]
+#index_file_names = [string.lower() for string in file_list if "gz" in string.lower()]

+progress(0.6, desc = "Preparing search index")

-if index_file_names:
-
+#if index_file_names:
+if search_index:
+#index_file_name = index_file_names[0]

-print(index_file_name)
+#print(index_file_name)

-bm25_load =
+bm25_load = search_index


#index_file_out_name = get_file_path_end_with_ext(index_file_name)
@@ -381,6 +420,8 @@ def prepare_bm25(corpus, in_file, return_intermediate_files, k1=1.5, b = 0.75, a
bm25 = bm25_load

if return_intermediate_files == "Yes":
+print("Saving search index file")
+progress(0.8, desc = "Saving search index to file")
bm25_search_file_name = data_file_name_no_ext + '_' + 'search_index.pkl.gz'
#np.savez_compressed(bm25_search_file_name, bm25)

@@ -420,8 +461,10 @@ def convert_bm25_query_to_tokens(free_text_query, clean="No"):

return out_query

-def bm25_search(free_text_query, in_no_search_results, original_data, text_column, clean = "No",
+def bm25_search(free_text_query, in_no_search_results, original_data, text_column, in_join_file, clean = "No", in_join_column = "", search_df_join_column = "", progress=gr.Progress(track_tqdm=True)):

+progress(0, desc = "Conducting keyword search")
+
# Prepare query
if (clean == "Yes") | (text_column.endswith("_cleaned")):
token_query = convert_bm25_query_to_tokens(free_text_query, clean="Yes")
@@ -435,7 +478,7 @@ def bm25_search(free_text_query, in_no_search_results, original_data, text_colum

results_index, results_text, results_scores = bm25.extract_documents_and_scores(token_query, bm25.corpus, n=in_no_search_results) #bm25.corpus #original_data[text_column]
if not results_index:
-return "No search results found", None
+return "No search results found", None

print("Search complete")

@@ -448,18 +491,16 @@ def bm25_search(free_text_query, in_no_search_results, original_data, text_colum
results_df_out = results_df[['index', 'search_text', 'search_score_abs']].merge(original_data,left_on="index", right_index=True, how="left")#.drop("index", axis=1)

# Join on additional files
-if in_join_file:
-
-
-# Import data
-join_df = read_file(join_filename)
+if not in_join_file.empty:
+progress(0.5, desc = "Joining on additional data file")
+join_df = in_join_file
join_df[in_join_column] = join_df[in_join_column].astype(str).str.replace("\.0$","", regex=True)
results_df_out[search_df_join_column] = results_df_out[search_df_join_column].astype(str).str.replace("\.0$","", regex=True)

# Duplicates dropped so as not to expand out dataframe
join_df = join_df.drop_duplicates(in_join_column)

-results_df_out = results_df_out.merge(join_df,left_on=search_df_join_column, right_on=in_join_column, how="left")
+results_df_out = results_df_out.merge(join_df,left_on=search_df_join_column, right_on=in_join_column, how="left")#.drop(in_join_column, axis=1)

# Reorder results by score
results_df_out = results_df_out.sort_values('search_score_abs', ascending=False)
@@ -467,9 +508,13 @@ def bm25_search(free_text_query, in_no_search_results, original_data, text_colum
# Out file
query_str_file = ("_").join(token_query)
results_df_name = "keyword_search_result_" + today_rev + "_" + query_str_file + ".xlsx"
+
+print("Saving search file output")
+progress(0.7, desc = "Saving search output to file")
+
results_df_out.to_excel(results_df_name, index= None)
results_first_text = results_df_out[text_column].iloc[0]

print("Returning results")

-return results_first_text, results_df_name
+return results_first_text, results_df_name
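The bulk of the additions above are guard clauses at the top of prepare_bm25_input_data, prepare_bm25 and bm25_search: each missing input prints and returns a user-facing message in place of the normal outputs instead of failing later. A small stand-alone sketch of the pattern, with generic names rather than the repository's exact signatures:

```python
# Illustrative guard-clause validation: check inputs up front and return a
# user-facing message shaped like the normal outputs. Names are generic stand-ins.
def prepare_inputs(in_files, text_column):
    if not in_files:
        out_message = "No input file found. Please load in at least one file."
        print(out_message)
        return None, out_message

    data_files = [f for f in in_files
                  if "tokenised" not in f.lower()
                  and not f.lower().endswith((".npz", ".gz"))]
    if not data_files:
        return None, "Please load in at least one csv/Excel/parquet data file."

    if not text_column:
        return None, "Please enter a column name to search."

    return data_files[0], "Input check passed."

print(prepare_inputs([], "text"))            # (None, 'No input file found. ...')
print(prepare_inputs(["data.csv"], "text"))  # ('data.csv', 'Input check passed.')
```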
search_funcs/helper_functions.py
CHANGED
@@ -7,6 +7,7 @@ import shutil
import getpass
import gzip
import pickle
+import numpy as np

# Attempt to delete content of gradio temp folder
def get_temp_folder_path():
@@ -89,19 +90,27 @@ def read_file(filename):

def put_columns_in_df(in_file, in_bm25_column):
'''
-When file is loaded, update the column dropdown choices
+When file is loaded, update the column dropdown choices
'''
+new_choices = []
+concat_choices = []
+index_load = None
+embed_load = np.array([])
+out_message = ""

file_list = [string.name for string in in_file]

#print(file_list)

-data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower()]
-data_file_name = data_file_names[0]
+data_file_names = [string.lower() for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower() and "search_index" not in string.lower()]

+if not data_file_names:
+out_message = "Please load in at least one csv/Excel/parquet data file."
+print(out_message)
+return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), pd.DataFrame(), bm25_load, out_message
+
+data_file_name = data_file_names[0]

df = read_file(data_file_name)

@@ -109,32 +118,60 @@ def put_columns_in_df(in_file, in_bm25_column):

new_choices = list(df.columns)

+elif "search_index" in data_file_name:
+# If only the search_index found, need a data file too
+new_choices = []
+
else: new_choices = ["page_contents"] + list(df[0].metadata.keys()) #["Documents"]
#print(new_choices)

-concat_choices.extend(new_choices)
+concat_choices.extend(new_choices)
+
+# Check if there is a search index file already
+index_file_names = [string.lower() for string in file_list if "gz" in string.lower()]
+
+if index_file_names:
+index_file_name = index_file_names[0]
+index_load = read_file(index_file_name)
+
+embeddings_file_names = [string.lower() for string in file_list if "embedding" in string.lower()]
+
+if embeddings_file_names:
+print("Loading embeddings from file.")
+embed_load = np.load(embeddings_file_names[0])['arr_0']
+
+# If embedding files have 'super_compress' in the title, they have been multiplied by 100 before save
+if "compress" in embeddings_file_names[0]:
+embed_load /= 100
+else:
+embed_load = np.array([])
+
+out_message = "Initial data check successful. Next, choose a data column to search in the drop down above, then click 'Load data'"
+print(out_message)

-return gr.Dropdown(choices=concat_choices), gr.Dropdown(
+return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df, index_load, embed_load, out_message

-def put_columns_in_join_df(in_file
+def put_columns_in_join_df(in_file):
'''
-When file is loaded, update the column dropdown choices
+When file is loaded, update the column dropdown choices
'''
-
-print("in_bm25_column")
+new_df = pd.DataFrame()
+#print("in_bm25_column")

new_choices = []
concat_choices = []


-
-new_choices = list(
+new_df = read_file(in_file.name)
+new_choices = list(new_df.columns)
+
+#print(new_choices)

-
+concat_choices.extend(new_choices)

-
+out_message = "File load successful. Now select a column to join below."

-return gr.Dropdown(choices=concat_choices)
+return gr.Dropdown(choices=concat_choices), new_df, out_message

def dummy_function(gradio_component):
"""
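put_columns_in_df now also picks up any search index (.gz) and embeddings (.npz) files supplied alongside the data file, dividing "super compressed" embeddings by 100 on load because they were scaled up before saving. A sketch of that load-time rescaling follows; the save step shown here is an assumption made only to complete the round trip (this hunk shows just the load path), and the file name is made up:

```python
import numpy as np

# Assumed save step (not part of this hunk): scale by 100 and store as integers
# so the compressed .npz is smaller. The file name is illustrative.
original = np.random.rand(3, 4).astype(np.float32)
np.savez_compressed("docs_embedding_compress.npz", (original * 100).round().astype(np.int16))

# Load path mirroring put_columns_in_df: read arr_0, then undo the x100 scaling
# whenever "compress" appears in the file name.
file_name = "docs_embedding_compress.npz"
embed_load = np.load(file_name)["arr_0"]
if "compress" in file_name:
    embed_load = embed_load / 100

print(np.allclose(original, embed_load, atol=0.01))  # True to within the rounding error
```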
search_funcs/semantic_functions.py
CHANGED
@@ -12,7 +12,6 @@ today_rev = datetime.now().strftime("%Y%m%d")
from transformers import AutoModel

from torch import cuda, backends, tensor, mm
-from search_funcs.helper_functions import read_file

# Check for torch cuda
print("Is CUDA enabled? ", cuda.is_available())
@@ -43,18 +42,6 @@ except:
embeddings_model = AutoModel.from_pretrained(embeddings_name, revision = revision_choice, trust_remote_code=True, device_map="auto")


-# Chroma support is currently deprecated
-# Import Chroma and instantiate a client. The default Chroma client is ephemeral, meaning it will not save to disk.
-#import chromadb
-#from chromadb.config import Settings
-#from typing_extensions import Protocol
-#from chromadb import Documents, EmbeddingFunction, Embeddings
-
-# Remove Chroma database file. If it exists as it can cause issues
-#chromadb_file = "chroma.sqlite3"
-
-#if os.path.isfile(chromadb_file):
-# os.remove(chromadb_file)
def get_file_path_end(file_path):
# First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
basename = os.path.basename(file_path)
@@ -82,10 +69,17 @@ def load_embeddings(embeddings_name = embeddings_name):

return embeddings

-def docs_to_jina_embed_np_array(docs_out, in_file, return_intermediate_files = "No", embeddings_super_compress = "No", embeddings = embeddings_model, progress=gr.Progress()):
+def docs_to_jina_embed_np_array(docs_out, in_file, embeddings_state, return_intermediate_files = "No", embeddings_super_compress = "No", embeddings = embeddings_model, progress=gr.Progress(track_tqdm=True)):
'''
Takes a Langchain document class and saves it into a Chroma sqlite file.
'''
+if not in_file:
+out_message = "No input file found. Please load in at least one file."
+print(out_message)
+return out_message, None, None
+
+progress(0.7, desc = "Loading/creating embeddings")

print(f"> Total split documents: {len(docs_out)}")

@@ -105,17 +99,9 @@ def docs_to_jina_embed_np_array(docs_out, in_file, return_intermediate_files = "

out_message = "Document processing complete. Ready to search."

-
-print("Loading embeddings from file.")
-embeddings_out = np.load(embeddings_file_names[0])['arr_0']
-
-# If embedding files have 'super_compress' in the title, they have been multiplied by 100 before save
-if "compress" in embeddings_file_names[0]:
-embeddings_out /= 100
+# print("embeddings loaded: ", embeddings_out)

-
-
-if not embeddings_file_names:
+if embeddings_state.size == 0:
tic = time.perf_counter()
print("Starting to embed documents.")
#embeddings_list = []
@@ -132,6 +118,7 @@ def docs_to_jina_embed_np_array(docs_out, in_file, return_intermediate_files = "

# If you want to save your files for next time
if return_intermediate_files == "Yes":
+progress(0.9, desc = "Saving embeddings to file")
if embeddings_super_compress == "No":
semantic_search_file_name = data_file_name_no_ext + '_' + 'embeddings.npz'
np.savez_compressed(semantic_search_file_name, embeddings_out)
@@ -144,12 +131,15 @@ def docs_to_jina_embed_np_array(docs_out, in_file, return_intermediate_files = "
return out_message, embeddings_out, semantic_search_file_name

return out_message, embeddings_out, None
+else:
+# Just return existing embeddings if already exist
+embeddings_out = embeddings_state

print(out_message)

return out_message, embeddings_out, None#, None

-def process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_cut_off, vec_weight, orig_df_col, in_join_column, search_df_join_column):
+def process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_cut_off, vec_weight, orig_df_col, in_join_column, search_df_join_column, progress = gr.Progress(track_tqdm=True)):

def create_docs_keep_from_df(df):
dict_out = {'ids' : [df['ids']],
@@ -213,11 +203,10 @@ def process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_c
# results_df_out = orig_df.merge(length_more_limit[['ids', 'distances']], left_index = True, right_on = "ids", how="inner").sort_values("distances")

# Join on additional files
-if in_join_file:
-
+if not in_join_file.empty:
+progress(0.5, desc = "Joining on additional data file")
+join_df = in_join_file

-# Import data
-join_df = read_file(join_filename)
join_df[in_join_column] = join_df[in_join_column].astype(str).str.replace("\.0$","", regex=True)

# Duplicates dropped so as not to expand out dataframe
@@ -225,14 +214,17 @@ def process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_c

results_df_out[search_df_join_column] = results_df_out[search_df_join_column].astype(str).str.replace("\.0$","", regex=True)

-results_df_out = results_df_out.merge(join_df,left_on=search_df_join_column, right_on=in_join_column, how="left")
+results_df_out = results_df_out.merge(join_df,left_on=search_df_join_column, right_on=in_join_column, how="left")#.drop(in_join_column, axis=1)

return results_df_out

def jina_simple_retrieval(query_str:str, vectorstore, docs, orig_df_col:str, k_val:int, out_passages:int,
-vec_score_cut_off:float, vec_weight:float, in_join_file
+vec_score_cut_off:float, vec_weight:float, in_join_file, in_join_column = None, search_df_join_column = None, device = torch_device, embeddings = embeddings_model, progress=gr.Progress(track_tqdm=True)): # ,vectorstore, embeddings

# print("vectorstore loaded: ", vectorstore)
+progress(0, desc = "Conducting semantic search")
+
+print("Searching")

# Convert it to a PyTorch tensor and transfer to GPU
vectorstore_tensor = tensor(vectorstore).to(device)
@@ -277,6 +269,8 @@ def jina_simple_retrieval(query_str:str, vectorstore, docs, orig_df_col:str, k_v

results_df_out = process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_cut_off, vec_weight, orig_df_col, in_join_column, search_df_join_column)

+print("Search complete")
+
# If nothing found, return error message
if results_df_out.empty:
return 'No result found!', None
@@ -284,12 +278,30 @@ def jina_simple_retrieval(query_str:str, vectorstore, docs, orig_df_col:str, k_v
query_str_file = query_str.replace(" ", "_")

results_df_name = "semantic_search_result_" + today_rev + "_" + query_str_file + ".xlsx"
+
+print("Saving search output to file")
+progress(0.7, desc = "Saving search output to file")
+
results_df_out.to_excel(results_df_name, index= None)
results_first_text = results_df_out.iloc[0, 1]

+print("Returning results")
+
return results_first_text, results_df_name

# Deprecated Chroma functions - kept just in case needed in future.
+# Chroma support is currently deprecated
+# Import Chroma and instantiate a client. The default Chroma client is ephemeral, meaning it will not save to disk.
+#import chromadb
+#from chromadb.config import Settings
+#from typing_extensions import Protocol
+#from chromadb import Documents, EmbeddingFunction, Embeddings
+
+# Remove Chroma database file. If it exists as it can cause issues
+#chromadb_file = "chroma.sqlite3"
+
+#if os.path.isfile(chromadb_file):
+# os.remove(chromadb_file)

def docs_to_chroma_save_deprecated(docs_out, embeddings = embeddings_model, progress=gr.Progress()):
'''
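The embedding step now receives the embeddings already held in Gradio state and only encodes documents when that array is empty, which is what stops the embeddings file from being processed twice. A condensed sketch of that control flow (the real function also handles saving and progress reporting; embed_if_needed and encode_fn are illustrative names):

```python
import numpy as np

def embed_if_needed(texts, embeddings_state, encode_fn):
    # Only embed when nothing has been loaded into state yet
    if embeddings_state.size == 0:
        print("Starting to embed documents.")
        embeddings_out = encode_fn(texts)
    else:
        # Just return the embeddings that already exist in state
        embeddings_out = embeddings_state
    return "Document processing complete. Ready to search.", embeddings_out

fake_encoder = lambda texts: np.ones((len(texts), 4), dtype=np.float32)
msg, first = embed_if_needed(["a", "b"], np.array([]), fake_encoder)  # encodes
msg, again = embed_if_needed(["a", "b"], first, fake_encoder)         # reuses the cache
print(msg, first.shape, again is first)
```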
search_funcs/semantic_ingest_functions.py
CHANGED
@@ -294,12 +294,23 @@ def parse_metadata(row):

# return doc_sections, message

-def csv_excel_text_to_docs(df, in_file, text_column
+def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_intermediate_files = "No", chunk_size=None, progress=gr.Progress(track_tqdm=True)) -> List[Document]:
"""Converts a DataFrame's content to a list of dictionaries in the 'Document' format, containing page_content and associated metadata."""
+if not in_file:
+return None, "Please load in at least one file.", data_state, None, None, None
+
+progress(0, desc = "Loading in data")

file_list = [string.name for string in in_file]

data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower()]
+
+if not data_file_names:
+return doc_sections, "Please load in at least one csv/Excel/parquet data file."
+
+if not text_column:
+return None, "Please enter a column name to search", data_state, None, None, None
+
data_file_name = data_file_names[0]

# Check if file is a document format, and explode out as needed
@@ -326,6 +337,7 @@ def csv_excel_text_to_docs(df, in_file, text_column='text', clean = "No", return
df[text_column] = df[text_column].astype(str).str.strip() # Ensure column is a string column

if clean == "Yes":
+progress(0.1, desc = "Cleaning data")
clean_tic = time.perf_counter()
print("Starting data clean.")

@@ -352,6 +364,7 @@ def csv_excel_text_to_docs(df, in_file, text_column='text', clean = "No", return
#doc_sections = df[["page_content", "metadata"]].to_dict(orient='records')
#doc_sections = [Document(**row) for row in df[["page_content", "metadata"]].to_dict(orient='records')]

+progress(0.3, desc = "Converting data to document format")

# Create a list of Document objects
doc_sections = [Document(page_content=row['page_content'],
@@ -364,6 +377,7 @@ def csv_excel_text_to_docs(df, in_file, text_column='text', clean = "No", return
print(ingest_time_out)

if return_intermediate_files == "Yes":
+progress(0.5, desc = "Saving prepared documents")
data_file_out_name_no_ext = get_file_path_end(data_file_name)
file_name = data_file_out_name_no_ext
#print(doc_sections)