Sean-Case
committed on
Commit 4ee3470
1 Parent(s): 352c02a
Improved code for cleaning and outputting files. Added Dockerfile
- Dockerfile +30 -0
- app.py +6 -5
- how_to_create_exe_dist.txt +2 -2
- search_funcs/bm25_functions.py +37 -5
- search_funcs/helper_functions.py +1 -0
- search_funcs/semantic_functions.py +7 -5
- search_funcs/semantic_ingest_functions.py +14 -9
- search_funcs/spacy_search_funcs.py +7 -1
Dockerfile
ADDED
@@ -0,0 +1,30 @@
+FROM public.ecr.aws/docker/library/python:3.10.13-slim
+
+WORKDIR /src
+
+COPY requirements.txt .
+
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Set up a new user named "user" with user ID 1000
+RUN useradd -m -u 1000 user
+# Switch to the "user" user
+USER user
+# Set home to the user's home directory
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH \
+    PYTHONPATH=$HOME/app \
+    PYTHONUNBUFFERED=1 \
+    GRADIO_ALLOW_FLAGGING=never \
+    GRADIO_NUM_PORTS=1 \
+    GRADIO_SERVER_NAME=0.0.0.0 \
+    GRADIO_THEME=huggingface \
+    SYSTEM=spaces
+
+# Set the working directory to the user's home directory
+WORKDIR $HOME/app
+
+# Copy the current directory contents into the container at $HOME/app setting the owner to the user
+COPY --chown=user . $HOME/app
+
+CMD ["python", "app.py"]
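Note: GRADIO_SERVER_NAME=0.0.0.0 is what makes the Gradio app reachable from outside the container (Gradio binds to 127.0.0.1 by default), and COPY --chown=user matters because the image runs as the non-root "user" created above. The app's launch code is not part of this diff; a minimal sketch, assuming a Gradio Blocks app, of a launch call that respects that variable (illustration only, not the repo's app.py):

    import os
    import gradio as gr

    # Sketch only (assumption): bind to the host name set in the Dockerfile.
    # Recent Gradio versions also read GRADIO_SERVER_NAME themselves; passing it
    # explicitly here just makes the dependency visible.
    with gr.Blocks() as block:
        gr.Markdown("Data text search app placeholder")

    block.launch(server_name=os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0"))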
app.py
CHANGED
@@ -37,6 +37,7 @@ with block:
     corpus_state = gr.State()
     keyword_data_list_state = gr.State([])
     join_data_state = gr.State(pd.DataFrame())
+    output_file_state = gr.State([])
 
     orig_keyword_data_state = gr.State(pd.DataFrame())
     keyword_data_state = gr.State(pd.DataFrame())
@@ -122,7 +123,7 @@ depends on factors such as the type of documents or queries. Information taken f
     with gr.Tab(label="Advanced options"):
         with gr.Accordion(label="Data load / save options", open = True):
             with gr.Row():
-                in_clean_data = gr.Dropdown(label = "Clean text during load (remove html tags). For large files this may take some time!", value="
+                in_clean_data = gr.Dropdown(label = "Clean text during load (remove html tags). For large files this may take some time!", value="Yes", choices=["Yes", "No"])
                 return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="No", choices=["Yes", "No"])
                 embedding_super_compress = gr.Dropdown(label = "Round embeddings to three dp for smaller files with less accuracy.", value="Yes", choices=["Yes", "No"])
                 #save_clean_data_button = gr.Button(value = "Save loaded data to file", scale = 1)
@@ -170,8 +171,8 @@ depends on factors such as the type of documents or queries. Information taken f
 
 
     # BM25 search functions on click or enter
-    keyword_search_button.click(fn=bm25_search, inputs=[keyword_query, in_no_search_results, keyword_data_state, in_bm25_column, join_data_state, in_clean_data, in_join_column, search_df_join_column], outputs=[output_single_text, output_file], api_name="keyword")
-    keyword_query.submit(fn=bm25_search, inputs=[keyword_query, in_no_search_results, keyword_data_state, in_bm25_column, join_data_state, in_clean_data, in_join_column, search_df_join_column], outputs=[output_single_text, output_file])
+    keyword_search_button.click(fn=bm25_search, inputs=[keyword_query, in_no_search_results, orig_keyword_data_state, keyword_data_state, in_bm25_column, join_data_state, in_clean_data, in_join_column, search_df_join_column], outputs=[output_single_text, output_file], api_name="keyword")
+    keyword_query.submit(fn=bm25_search, inputs=[keyword_query, in_no_search_results, orig_keyword_data_state, keyword_data_state, in_bm25_column, join_data_state, in_clean_data, in_join_column, search_df_join_column], outputs=[output_single_text, output_file])
 
     # Fuzzy search functions on click
     fuzzy_search_button.click(fn=spacy_fuzzy_search, inputs=[keyword_query, keyword_data_list_state, keyword_data_state, in_bm25_column, join_data_state, search_df_join_column, in_join_column, no_spelling_mistakes], outputs=[output_single_text, output_file], api_name="fuzzy")
@@ -181,8 +182,8 @@ depends on factors such as the type of documents or queries. Information taken f
     # Load in a csv/excel file for semantic search
     in_semantic_file.upload(initial_data_load, inputs=[in_semantic_file], outputs=[in_semantic_column, search_df_join_column, semantic_data_state, orig_semantic_data_state, search_index_state, embeddings_state, tokenised_state, semantic_load_progress, current_source_semantic])
     load_semantic_data_button.click(
-        csv_excel_text_to_docs, inputs=[semantic_data_state, in_semantic_file, in_semantic_column, in_clean_data, return_intermediate_files], outputs=[ingest_docs, semantic_load_progress]).\
-        then(docs_to_bge_embed_np_array, inputs=[ingest_docs, in_semantic_file, embeddings_state, in_clean_data, return_intermediate_files, embedding_super_compress], outputs=[semantic_load_progress, vectorstore_state, semantic_output_file])
+        csv_excel_text_to_docs, inputs=[semantic_data_state, in_semantic_file, in_semantic_column, in_clean_data, return_intermediate_files], outputs=[ingest_docs, semantic_load_progress, output_file_state]).\
+        then(docs_to_bge_embed_np_array, inputs=[ingest_docs, in_semantic_file, embeddings_state, output_file_state, in_clean_data, return_intermediate_files, embedding_super_compress], outputs=[semantic_load_progress, vectorstore_state, semantic_output_file, output_file_state])
 
     # Semantic search query
     semantic_submit.click(bge_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, semantic_min_distance, vec_weight, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic")
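Note: the new output_file_state is a gr.State list threaded through the chained .click(...).then(...) events, so file names produced by the ingest step are visible to the embedding step. A minimal, self-contained sketch of that pattern (hypothetical component and function names, not the app's real ones):

    import gradio as gr

    def prepare(files_so_far):
        # First step appends the file it wrote to the shared state list.
        return "Prepared documents", files_so_far + ["prepared_docs.pkl.gz"]

    def embed(files_so_far):
        # Second step receives the list returned by the first step and extends it.
        return "Embeddings created", files_so_far + ["embeddings.npz"]

    with gr.Blocks() as demo:
        output_file_state = gr.State([])
        status = gr.Textbox(label="Progress")
        run = gr.Button("Load and embed")

        # Chained events: state written by `prepare` is re-read as input by `embed`.
        run.click(prepare, inputs=[output_file_state], outputs=[status, output_file_state]).\
            then(embed, inputs=[output_file_state], outputs=[status, output_file_state])

    demo.launch()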
how_to_create_exe_dist.txt
CHANGED
@@ -14,7 +14,7 @@ NOTE: for ensuring that spaCy models are loaded into the program correctly in re
 
 9.Run the following, assuming you want to make one single .exe file (This helped me: https://github.com/pyinstaller/pyinstaller/issues/8108):
 
-a) In command line: pyi-makespec --additional-hooks-dir="build_deps\\" --collect-data=gradio_client --collect-data=gradio --hidden-import pyarrow.vendored.version --onefile --name DataSearchApp_0.
+a) In command line: pyi-makespec --additional-hooks-dir="build_deps\\" --collect-data=gradio_client --collect-data=gradio --hidden-import pyarrow.vendored.version --onefile --name DataSearchApp_0.3 app.py
 
 b) Open the created spec file in Notepad. Add the following to the end of the Analysis section then save:
 
@@ -25,7 +25,7 @@ a = Analysis(
 }
 )
 
-c) Back in command line, run this: pyinstaller --clean --noconfirm DataSearchApp_0.
+c) Back in command line, run this: pyinstaller --clean --noconfirm DataSearchApp_0.3.spec
 
 
 9. A 'dist' folder will be created with the executable inside along with all dependencies('dist\data_text_search').
search_funcs/bm25_functions.py
CHANGED
@@ -264,6 +264,10 @@ def prepare_bm25_input_data(in_file, text_column, data_state, tokenised_state, c
 
     df[text_column] = df[text_column].astype(str).str.lower()
 
+    if "copy_of_case_note_id" in df.columns:
+        print("copy column found")
+        df.loc[~df["copy_of_case_note_id"].isna(), text_column] = ""
+
     if search_index_file_names:
         corpus = list(df[text_column])
         message = "Tokenisation skipped - loading search index from file."
@@ -271,7 +275,6 @@ def prepare_bm25_input_data(in_file, text_column, data_state, tokenised_state, c
         return corpus, message, df, None, None, [], gr.Dropdown(allow_custom_value=True, value=text_column, choices=data_state.columns.to_list())
 
 
-
     if clean == "Yes":
         progress(0.1, desc = "Cleaning data")
         clean_tic = time.perf_counter()
@@ -466,7 +469,7 @@ def convert_bm25_query_to_tokens(free_text_query, clean="No"):
 
     return out_query
 
-def bm25_search(free_text_query, in_no_search_results, original_data, text_column, in_join_file, clean
+def bm25_search(free_text_query, in_no_search_results, original_data, searched_data, text_column, in_join_file, clean, in_join_column = "", search_df_join_column = "", progress=gr.Progress(track_tqdm=True)):
 
     progress(0, desc = "Conducting keyword search")
 
@@ -493,8 +496,37 @@ def bm25_search(free_text_query, in_no_search_results, original_data, text_colum
                                "search_text": joined_texts,
                                "search_score_abs": results_scores})
     results_df['search_score_abs'] = abs(round(results_df['search_score_abs'], 2))
-    results_df_out = results_df[['index', 'search_text', 'search_score_abs']].merge(original_data,left_on="index", right_index=True, how="left")#.drop("index", axis=1)
 
+    # Join scores onto searched data
+    results_df_out = results_df[['index', 'search_text', 'search_score_abs']].merge(searched_data,left_on="index", right_index=True, how="left", suffixes = ("", "_y")).drop("index_y", axis=1, errors="ignore")
+
+    # Join on data from duplicate case notes
+    if ("copy_of_case_note_id" in original_data.columns) and ("note_id" in results_df_out.columns):
+        if clean == "No":
+            print("Clean is no")
+            orig_text_column = text_column
+        else:
+            print("Clean is yes")
+            orig_text_column = text_column.replace("_cleaned", "")
+
+        #print(orig_text_column)
+        #print(original_data.columns)
+
+        original_data["original_note_id"] = original_data["copy_of_case_note_id"]
+        original_data["original_note_id"] = original_data["original_note_id"].combine_first(original_data["note_id"])
+
+        results_df_out = results_df_out.merge(original_data[["original_note_id", "note_id", "copy_of_case_note_id", "person_id"]],left_on="note_id", right_on="original_note_id", how="left", suffixes=("_primary", "")) # .drop(orig_text_column, axis = 1)
+        results_df_out.loc[~results_df_out["copy_of_case_note_id"].isnull(), "search_text"] = ""
+        results_df_out.loc[~results_df_out["copy_of_case_note_id"].isnull(), text_column] = ""
+
+        #results_df_out = pd.concat([results_df_out, original_data[~original_data["copy_of_case_note_id"].isna()][["copy_of_case_note_id", "person_id"]]])
+        # Replace NaN with an empty string
+        # results_df_out.fillna('', inplace=True)
+
+
     # Join on additional files
     if not in_join_file.empty:
         progress(0.5, desc = "Joining on additional data file")
@@ -507,8 +539,8 @@ def bm25_search(free_text_query, in_no_search_results, original_data, text_colum
 
         results_df_out = results_df_out.merge(join_df,left_on=search_df_join_column, right_on=in_join_column, how="left", suffixes=('','_y'))#.drop(in_join_column, axis=1)
 
-    # Reorder results by score
-    results_df_out = results_df_out.sort_values('search_score_abs', ascending=False)
+    # Reorder results by score, and whether there is text
+    results_df_out = results_df_out.sort_values(['search_score_abs', "search_text"], ascending=False)
 
     # Out file
     query_str_file = ("_").join(token_query)
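Note: the duplicate-case-note handling above builds an original_note_id column with combine_first: copies keep the id of the note they were copied from, originals fall back to their own note_id, and scores found on the original can then be merged back onto its copies. A tiny standalone illustration of that step (hypothetical data, not the app's):

    import pandas as pd

    # Hypothetical miniature of the duplicate-note lookup: row 2 is a copy of note 1.
    original_data = pd.DataFrame({
        "note_id": [1, 2, 3],
        "copy_of_case_note_id": [pd.NA, 1, pd.NA],
        "person_id": ["a", "b", "c"],
    })

    # Copied-from id where present, otherwise the row's own note_id.
    original_data["original_note_id"] = original_data["copy_of_case_note_id"]
    original_data["original_note_id"] = original_data["original_note_id"].combine_first(original_data["note_id"])

    # original_note_id is now [1, 1, 3], so a merge on note_id == original_note_id
    # attaches the scored original's row to each of its copies.
    print(original_data)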
search_funcs/helper_functions.py
CHANGED
@@ -110,6 +110,7 @@ def initial_data_load(in_file):
     #print(file_list)
 
     data_file_names = [string for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower() and "search_index" not in string.lower()]
+    print(data_file_names)
 
     if not data_file_names:
         out_message = "Please load in at least one csv/Excel/parquet data file."
search_funcs/semantic_functions.py
CHANGED
@@ -48,14 +48,14 @@ local_embeddings_location = "model/bge/"
 # Not using SentenceTransformer here
 embeddings_model = SentenceTransformer(embeddings_name)
 
-def docs_to_bge_embed_np_array(docs_out, in_file, embeddings_state, clean, return_intermediate_files = "No", embeddings_super_compress = "No", embeddings_model = embeddings_model, progress=gr.Progress(track_tqdm=True)):
+def docs_to_bge_embed_np_array(docs_out, in_file, embeddings_state, output_file_state, clean, return_intermediate_files = "No", embeddings_super_compress = "No", embeddings_model = embeddings_model, progress=gr.Progress(track_tqdm=True)):
     '''
     Takes a Langchain document class and saves it into a Chroma sqlite file.
     '''
     if not in_file:
         out_message = "No input file found. Please load in at least one file."
         print(out_message)
-        return out_message, None, None
+        return out_message, None, None, output_file_state
 
 
     progress(0.6, desc = "Loading/creating embeddings")
@@ -114,16 +114,18 @@ def docs_to_bge_embed_np_array(docs_out, in_file, embeddings_state, clean, retur
             embeddings_out_round *= 100 # Rounding not currently used
             np.savez_compressed(semantic_search_file_name, embeddings_out_round)
 
-
+        output_file_state.append(semantic_search_file_name)
 
-
+        return out_message, embeddings_out, output_file_state, output_file_state
+
+        return out_message, embeddings_out, output_file_state, output_file_state
     else:
         # Just return existing embeddings if already exist
         embeddings_out = embeddings_state
 
     print(out_message)
 
-    return out_message, embeddings_out,
+    return out_message, embeddings_out, output_file_state, output_file_state
 
 def process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_cut_off, vec_weight, orig_df_col, in_join_column, search_df_join_column, progress = gr.Progress(track_tqdm=True)):
 
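Note: docs_to_bge_embed_np_array now appends the saved .npz file name to output_file_state and returns it, so the UI can expose the intermediate embeddings for re-use. For reference, a minimal sketch of the np.savez_compressed round trip (hypothetical file name and array, not the app's data):

    import numpy as np

    embeddings_out = np.random.rand(100, 384).astype(np.float32)  # stand-in for BGE embeddings
    semantic_search_file_name = "example_embeddings.npz"

    # Write the compressed archive, as the function above does.
    np.savez_compressed(semantic_search_file_name, embeddings_out)

    # Unnamed arrays are stored under arr_0, arr_1, ...; reload later to skip re-embedding.
    with np.load(semantic_search_file_name) as data:
        reloaded = data["arr_0"]

    assert reloaded.shape == embeddings_out.shape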
search_funcs/semantic_ingest_functions.py
CHANGED
@@ -197,8 +197,11 @@ def parse_metadata(row):
 
 def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_intermediate_files = "No", chunk_size=None, progress=gr.Progress(track_tqdm=True)) -> List[Document]:
     """Converts a DataFrame's content to a list of dictionaries in the 'Document' format, containing page_content and associated metadata."""
+
+    output_list = []
+
     if not in_file:
-        return None, "Please load in at least one file.",
+        return None, "Please load in at least one file.", output_list
 
     progress(0, desc = "Loading in data")
 
@@ -207,10 +210,10 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
     data_file_names = [string for string in file_list if "tokenised" not in string and "npz" not in string.lower()]
 
     if not data_file_names:
-        return doc_sections, "Please load in at least one csv/Excel/parquet data file."
+        return doc_sections, "Please load in at least one csv/Excel/parquet data file.", output_list
 
     if not text_column:
-        return None, "Please enter a column name to search"
+        return None, "Please enter a column name to search"
 
     data_file_name = data_file_names[0]
 
@@ -229,7 +232,7 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
         # Convert each element in the Series to a Document instance
         #doc_sections = section_series.apply(lambda x: Document(**x))
 
-        return doc_sections, "Finished preparing documents"
+        return doc_sections, "Finished preparing documents", output_list
     # df = document_to_dataframe(df.iloc[:,0])
 
     ingest_tic = time.perf_counter()
@@ -255,7 +258,7 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
 
 
         # Save to file if you have cleaned the data. Text column has now been renamed with '_cleaned' at the send
-        out_file_name, text_column = save_prepared_bm25_data(data_file_name, df_list, df, text_column)
+        out_file_name, text_column, df = save_prepared_bm25_data(data_file_name, df_list, df, text_column)
 
         df[text_column] = df_list
 
@@ -301,21 +304,23 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
 
         if clean == "No":
             #pd.DataFrame(data = {"Documents":page_content_series_string}).to_parquet(file_name + "_prepared_docs.parquet")
-
-            with gzip.open(
+            out_doc_file_name = file_name + "_prepared_docs.pkl.gz"
+            with gzip.open(out_doc_file_name, 'wb') as file:
                 pickle.dump(doc_sections, file)
 
             #pd.Series(doc_sections).to_pickle(file_name + "_prepared_docs.pkl")
         elif clean == "Yes":
             #pd.DataFrame(data = {"Documents":page_content_series_string}).to_parquet(file_name + "_prepared_docs_clean.parquet")
 
-
+            out_doc_file_name = file_name + "_cleaned_prepared_docs.pkl.gz"
+            with gzip.open(out_doc_file_name, 'wb') as file:
                 pickle.dump(doc_sections, file)
 
             #pd.Series(doc_sections).to_pickle(file_name + "_prepared_docs_clean.pkl")
+        output_list.append(out_doc_file_name)
         print("Documents saved to file.")
 
-    return doc_sections, "Finished preparing documents."
+    return doc_sections, "Finished preparing documents.", output_list
 
 def document_to_dataframe(documents):
     '''
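Note: the prepared document sections are now written as gzip-compressed pickles whose names are collected in output_list. A short sketch of that write/read round trip (hypothetical file name and contents):

    import gzip
    import pickle

    # Hypothetical stand-in for the prepared Document sections.
    doc_sections = [{"page_content": "some case note text", "metadata": {"row": 0}}]
    out_doc_file_name = "example_prepared_docs.pkl.gz"

    # Write, following the same pattern as the diff.
    with gzip.open(out_doc_file_name, 'wb') as file:
        pickle.dump(doc_sections, file)

    # Read back later to skip the document-preparation step.
    with gzip.open(out_doc_file_name, 'rb') as file:
        reloaded_sections = pickle.load(file)

    assert reloaded_sections == doc_sections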
search_funcs/spacy_search_funcs.py
CHANGED
@@ -7,6 +7,7 @@ import gradio as gr
 import pandas as pd
 from typing import List, Type
 from datetime import datetime
+from search_funcs.helper_functions import create_highlighted_excel_wb
 
 PandasDataFrame = Type[pd.DataFrame]
 
@@ -110,7 +111,12 @@ def spacy_fuzzy_search(string_query:str, df_list: List[str], original_data: Pand
     print("Saving search file output")
     progress(0.7, desc = "Saving search output to file")
 
-    results_df_out.to_excel(results_df_name, index= None)
+    #results_df_out.to_excel(results_df_name, index= None)
+
+    # Highlight found text and save to file
+    results_df_out_wb = create_highlighted_excel_wb(results_df_out, free_text_query, "search_text")
+    results_df_out_wb.save(results_df_name)
+
     results_first_text = results_df_out[text_column].iloc[0]
 
     print("Returning results")
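Note: create_highlighted_excel_wb is imported from search_funcs/helper_functions.py and its body is not part of this diff. Purely as an illustration of the kind of helper the call implies, an openpyxl sketch that flags matching rows (all names and the highlighting rule are assumptions, not the repo's implementation):

    import pandas as pd
    from openpyxl import Workbook
    from openpyxl.styles import Font
    from openpyxl.utils.dataframe import dataframe_to_rows

    def create_highlighted_excel_wb_sketch(df: pd.DataFrame, query: str, text_column: str) -> Workbook:
        # Illustrative only: write the DataFrame to a workbook and mark rows whose
        # text column contains the query with a bold red font.
        wb = Workbook()
        ws = wb.active
        for row in dataframe_to_rows(df, index=False, header=True):
            ws.append(row)

        text_col_idx = list(df.columns).index(text_column) + 1  # openpyxl columns are 1-based
        for excel_row in range(2, ws.max_row + 1):  # skip the header row
            cell = ws.cell(row=excel_row, column=text_col_idx)
            if query.lower() in str(cell.value).lower():
                cell.font = Font(bold=True, color="FF0000")
        return wb

    # Usage mirrors the diff: build the workbook, then save to the results file name.
    # wb = create_highlighted_excel_wb_sketch(results_df_out, "search term", "search_text")
    # wb.save("results.xlsx")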