Sean-Case committed
Commit 4ee3470
1 Parent(s): 352c02a

Improved code for cleaning and outputting files. Added Dockerfile

Dockerfile ADDED
@@ -0,0 +1,30 @@
+ FROM public.ecr.aws/docker/library/python:3.10.13-slim
+
+ WORKDIR /src
+
+ COPY requirements.txt .
+
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Set up a new user named "user" with user ID 1000
+ RUN useradd -m -u 1000 user
+ # Switch to the "user" user
+ USER user
+ # Set home to the user's home directory
+ ENV HOME=/home/user \
+ PATH=/home/user/.local/bin:$PATH \
+ PYTHONPATH=$HOME/app \
+ PYTHONUNBUFFERED=1 \
+ GRADIO_ALLOW_FLAGGING=never \
+ GRADIO_NUM_PORTS=1 \
+ GRADIO_SERVER_NAME=0.0.0.0 \
+ GRADIO_THEME=huggingface \
+ SYSTEM=spaces
+
+ # Set the working directory to the user's home directory
+ WORKDIR $HOME/app
+
+ # Copy the current directory contents into the container at $HOME/app setting the owner to the user
+ COPY --chown=user . $HOME/app
+
+ CMD ["python", "app.py"]
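
The GRADIO_* values above are environment variables that Gradio reads when the app launches. As a rough, hypothetical sketch (not code from this commit) of how app.py picks them up when the container runs "python app.py", assuming the app ends by launching a Blocks interface:

# Hypothetical sketch only - the real launch call in app.py is not shown in this commit.
# GRADIO_SERVER_NAME is read by Gradio automatically; it is read explicitly here just to
# make the container wiring visible. GRADIO_SERVER_PORT and the placeholder UI are assumptions.
import os
import gradio as gr

with gr.Blocks() as block:
    gr.Markdown("Data text search")  # placeholder UI for the sketch

block.launch(
    server_name=os.environ.get("GRADIO_SERVER_NAME", "127.0.0.1"),  # 0.0.0.0 inside the container
    server_port=int(os.environ.get("GRADIO_SERVER_PORT", "7860")),  # assumed default port
)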
app.py CHANGED
@@ -37,6 +37,7 @@ with block:
  corpus_state = gr.State()
  keyword_data_list_state = gr.State([])
  join_data_state = gr.State(pd.DataFrame())
+ output_file_state = gr.State([])

  orig_keyword_data_state = gr.State(pd.DataFrame())
  keyword_data_state = gr.State(pd.DataFrame())
@@ -122,7 +123,7 @@ depends on factors such as the type of documents or queries. Information taken f
  with gr.Tab(label="Advanced options"):
  with gr.Accordion(label="Data load / save options", open = True):
  with gr.Row():
- in_clean_data = gr.Dropdown(label = "Clean text during load (remove html tags). For large files this may take some time!", value="No", choices=["Yes", "No"])
+ in_clean_data = gr.Dropdown(label = "Clean text during load (remove html tags). For large files this may take some time!", value="Yes", choices=["Yes", "No"])
  return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="No", choices=["Yes", "No"])
  embedding_super_compress = gr.Dropdown(label = "Round embeddings to three dp for smaller files with less accuracy.", value="Yes", choices=["Yes", "No"])
  #save_clean_data_button = gr.Button(value = "Save loaded data to file", scale = 1)
@@ -170,8 +171,8 @@ depends on factors such as the type of documents or queries. Information taken f


  # BM25 search functions on click or enter
- keyword_search_button.click(fn=bm25_search, inputs=[keyword_query, in_no_search_results, keyword_data_state, in_bm25_column, join_data_state, in_clean_data, in_join_column, search_df_join_column], outputs=[output_single_text, output_file], api_name="keyword")
- keyword_query.submit(fn=bm25_search, inputs=[keyword_query, in_no_search_results, keyword_data_state, in_bm25_column, join_data_state, in_clean_data, in_join_column, search_df_join_column], outputs=[output_single_text, output_file])
+ keyword_search_button.click(fn=bm25_search, inputs=[keyword_query, in_no_search_results, orig_keyword_data_state, keyword_data_state, in_bm25_column, join_data_state, in_clean_data, in_join_column, search_df_join_column], outputs=[output_single_text, output_file], api_name="keyword")
+ keyword_query.submit(fn=bm25_search, inputs=[keyword_query, in_no_search_results, orig_keyword_data_state, keyword_data_state, in_bm25_column, join_data_state, in_clean_data, in_join_column, search_df_join_column], outputs=[output_single_text, output_file])

  # Fuzzy search functions on click
  fuzzy_search_button.click(fn=spacy_fuzzy_search, inputs=[keyword_query, keyword_data_list_state, keyword_data_state, in_bm25_column, join_data_state, search_df_join_column, in_join_column, no_spelling_mistakes], outputs=[output_single_text, output_file], api_name="fuzzy")
@@ -181,8 +182,8 @@ depends on factors such as the type of documents or queries. Information taken f
  # Load in a csv/excel file for semantic search
  in_semantic_file.upload(initial_data_load, inputs=[in_semantic_file], outputs=[in_semantic_column, search_df_join_column, semantic_data_state, orig_semantic_data_state, search_index_state, embeddings_state, tokenised_state, semantic_load_progress, current_source_semantic])
  load_semantic_data_button.click(
- csv_excel_text_to_docs, inputs=[semantic_data_state, in_semantic_file, in_semantic_column, in_clean_data, return_intermediate_files], outputs=[ingest_docs, semantic_load_progress]).\
- then(docs_to_bge_embed_np_array, inputs=[ingest_docs, in_semantic_file, embeddings_state, in_clean_data, return_intermediate_files, embedding_super_compress], outputs=[semantic_load_progress, vectorstore_state, semantic_output_file])
+ csv_excel_text_to_docs, inputs=[semantic_data_state, in_semantic_file, in_semantic_column, in_clean_data, return_intermediate_files], outputs=[ingest_docs, semantic_load_progress, output_file_state]).\
+ then(docs_to_bge_embed_np_array, inputs=[ingest_docs, in_semantic_file, embeddings_state, output_file_state, in_clean_data, return_intermediate_files, embedding_super_compress], outputs=[semantic_load_progress, vectorstore_state, semantic_output_file, output_file_state])

  # Semantic search query
  semantic_submit.click(bge_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, semantic_min_distance, vec_weight, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic")
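
To make the new output_file_state wiring easier to follow, here is a minimal, self-contained Gradio sketch. The function bodies and component names are invented; only the pattern of threading a gr.State list through a chained .click().then() mirrors the diff above.

# Sketch of the State-through-then() pattern; not code from app.py.
import gradio as gr

def prepare(files_so_far):
    files_so_far = files_so_far + ["prepared_docs.pkl.gz"]  # pretend step 1 wrote a file
    return "Finished preparing documents.", files_so_far

def embed(files_so_far):
    files_so_far = files_so_far + ["embeddings.npz"]  # pretend step 2 wrote embeddings
    return "Embeddings saved.", files_so_far

with gr.Blocks() as demo:
    output_file_state = gr.State([])  # shared list of output file paths
    status = gr.Textbox(label="Status")
    out_files = gr.JSON(label="Output files")
    go = gr.Button("Load")

    # Step 1 appends to the state; step 2 reads the updated state and shows the result
    go.click(prepare, inputs=[output_file_state], outputs=[status, output_file_state]).\
        then(embed, inputs=[output_file_state], outputs=[status, out_files])

demo.launch()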
how_to_create_exe_dist.txt CHANGED
@@ -14,7 +14,7 @@ NOTE: for ensuring that spaCy models are loaded into the program correctly in re
 
  9.Run the following, assuming you want to make one single .exe file (This helped me: https://github.com/pyinstaller/pyinstaller/issues/8108):
 
- a) In command line: pyi-makespec --additional-hooks-dir="build_deps\\" --collect-data=gradio_client --collect-data=gradio --hidden-import pyarrow.vendored.version --onefile --name DataSearchApp_0.2.3 app.py
+ a) In command line: pyi-makespec --additional-hooks-dir="build_deps\\" --collect-data=gradio_client --collect-data=gradio --hidden-import pyarrow.vendored.version --onefile --name DataSearchApp_0.3 app.py
 
  b) Open the created spec file in Notepad. Add the following to the end of the Analysis section then save:
 
@@ -25,7 +25,7 @@ a = Analysis(
  }
  )
 
- c) Back in command line, run this: pyinstaller --clean --noconfirm DataSearchApp_0.2.3.spec
+ c) Back in command line, run this: pyinstaller --clean --noconfirm DataSearchApp_0.3.spec
 
 
  9. A 'dist' folder will be created with the executable inside along with all dependencies('dist\data_text_search').
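
For reference, the hunk above only shows the closing "}" and ")" of the block that step b) asks you to add to the spec file. A hypothetical reconstruction of that Analysis edit, based on the linked pyinstaller issue rather than on anything shown in this commit (treat the argument and its keys as assumptions):

# Hypothetical .spec file excerpt - not taken from this commit.
# module_collection_mode is the workaround discussed in pyinstaller issue #8108;
# only the closing "}" and ")" appear in the diff above.
a = Analysis(
    ['app.py'],
    # ... other arguments generated by pyi-makespec left unchanged ...
    module_collection_mode={
        'gradio': 'py',  # assumption: collect gradio as source .py files so its dynamic imports resolve
    }
)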
search_funcs/bm25_functions.py CHANGED
@@ -264,6 +264,10 @@ def prepare_bm25_input_data(in_file, text_column, data_state, tokenised_state, c
 
  df[text_column] = df[text_column].astype(str).str.lower()
 
+ if "copy_of_case_note_id" in df.columns:
+ print("copy column found")
+ df.loc[~df["copy_of_case_note_id"].isna(), text_column] = ""
+
  if search_index_file_names:
  corpus = list(df[text_column])
  message = "Tokenisation skipped - loading search index from file."
@@ -271,7 +275,6 @@
  return corpus, message, df, None, None, [], gr.Dropdown(allow_custom_value=True, value=text_column, choices=data_state.columns.to_list())
 
 
-
  if clean == "Yes":
  progress(0.1, desc = "Cleaning data")
  clean_tic = time.perf_counter()
@@ -466,7 +469,7 @@ def convert_bm25_query_to_tokens(free_text_query, clean="No"):
 
  return out_query
 
- def bm25_search(free_text_query, in_no_search_results, original_data, text_column, in_join_file, clean = "No", in_join_column = "", search_df_join_column = "", progress=gr.Progress(track_tqdm=True)):
+ def bm25_search(free_text_query, in_no_search_results, original_data, searched_data, text_column, in_join_file, clean, in_join_column = "", search_df_join_column = "", progress=gr.Progress(track_tqdm=True)):
 
  progress(0, desc = "Conducting keyword search")
 
@@ -493,8 +496,37 @@ def bm25_search(free_text_query, in_no_search_results, original_data, text_colum
  "search_text": joined_texts,
  "search_score_abs": results_scores})
  results_df['search_score_abs'] = abs(round(results_df['search_score_abs'], 2))
- results_df_out = results_df[['index', 'search_text', 'search_score_abs']].merge(original_data,left_on="index", right_index=True, how="left")#.drop("index", axis=1)
 
+ # Join scores onto searched data
+ results_df_out = results_df[['index', 'search_text', 'search_score_abs']].merge(searched_data,left_on="index", right_index=True, how="left", suffixes = ("", "_y")).drop("index_y", axis=1, errors="ignore")
+
+ # Join on data from duplicate case notes
+ if ("copy_of_case_note_id" in original_data.columns) and ("note_id" in results_df_out.columns):
+ if clean == "No":
+ print("Clean is no")
+ orig_text_column = text_column
+ else:
+ print("Clean is yes")
+ orig_text_column = text_column.replace("_cleaned", "")
+
+ #print(orig_text_column)
+ #print(original_data.columns)
+
+ original_data["original_note_id"] = original_data["copy_of_case_note_id"]
+ original_data["original_note_id"] = original_data["original_note_id"].combine_first(original_data["note_id"])
+
+ results_df_out = results_df_out.merge(original_data[["original_note_id", "note_id", "copy_of_case_note_id", "person_id"]],left_on="note_id", right_on="original_note_id", how="left", suffixes=("_primary", "")) # .drop(orig_text_column, axis = 1)
+ results_df_out.loc[~results_df_out["copy_of_case_note_id"].isnull(), "search_text"] = ""
+ results_df_out.loc[~results_df_out["copy_of_case_note_id"].isnull(), text_column] = ""
+
+ #results_df_out = pd.concat([results_df_out, original_data[~original_data["copy_of_case_note_id"].isna()][["copy_of_case_note_id", "person_id"]]])
+ # Replace NaN with an empty string
+ # results_df_out.fillna('', inplace=True)
+
  # Join on additional files
  if not in_join_file.empty:
  progress(0.5, desc = "Joining on additional data file")
@@ -507,8 +539,8 @@ def bm25_search(free_text_query, in_no_search_results, original_data, text_colum
 
  results_df_out = results_df_out.merge(join_df,left_on=search_df_join_column, right_on=in_join_column, how="left", suffixes=('','_y'))#.drop(in_join_column, axis=1)
 
- # Reorder results by score
- results_df_out = results_df_out.sort_values('search_score_abs', ascending=False)
+ # Reorder results by score, and whether there is text
+ results_df_out = results_df_out.sort_values(['search_score_abs', "search_text"], ascending=False)
 
  # Out file
  query_str_file = ("_").join(token_query)
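
A toy pandas example (made-up data, column names copied from the diff) showing what the combine_first step in the new duplicate-note join produces: every row gets an original_note_id pointing at the note it is a copy of, or at itself.

import pandas as pd

original_data = pd.DataFrame({
    "note_id": [1, 2, 3],
    "copy_of_case_note_id": [None, 1, None],  # note 2 is a copy of note 1
    "person_id": [10, 11, 12],
})

original_data["original_note_id"] = original_data["copy_of_case_note_id"]
original_data["original_note_id"] = original_data["original_note_id"].combine_first(original_data["note_id"])

print(original_data[["note_id", "copy_of_case_note_id", "original_note_id"]])
# note 1 -> 1.0, note 2 -> 1.0 (its original), note 3 -> 3.0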
search_funcs/helper_functions.py CHANGED
@@ -110,6 +110,7 @@ def initial_data_load(in_file):
  #print(file_list)
 
  data_file_names = [string for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower() and "search_index" not in string.lower()]
+ print(data_file_names)
 
  if not data_file_names:
  out_message = "Please load in at least one csv/Excel/parquet data file."
search_funcs/semantic_functions.py CHANGED
@@ -48,14 +48,14 @@ local_embeddings_location = "model/bge/"
  # Not using SentenceTransformer here
  embeddings_model = SentenceTransformer(embeddings_name)
 
- def docs_to_bge_embed_np_array(docs_out, in_file, embeddings_state, clean, return_intermediate_files = "No", embeddings_super_compress = "No", embeddings_model = embeddings_model, progress=gr.Progress(track_tqdm=True)):
+ def docs_to_bge_embed_np_array(docs_out, in_file, embeddings_state, output_file_state, clean, return_intermediate_files = "No", embeddings_super_compress = "No", embeddings_model = embeddings_model, progress=gr.Progress(track_tqdm=True)):
  '''
  Takes a Langchain document class and saves it into a Chroma sqlite file.
  '''
  if not in_file:
  out_message = "No input file found. Please load in at least one file."
  print(out_message)
- return out_message, None, None
+ return out_message, None, None, output_file_state
 
 
  progress(0.6, desc = "Loading/creating embeddings")
@@ -114,16 +114,18 @@ def docs_to_bge_embed_np_array(docs_out, in_file, embeddings_state, clean, retur
  embeddings_out_round *= 100 # Rounding not currently used
  np.savez_compressed(semantic_search_file_name, embeddings_out_round)
 
- return out_message, embeddings_out, semantic_search_file_name
+ output_file_state.append(semantic_search_file_name)
 
- return out_message, embeddings_out, None
+ return out_message, embeddings_out, output_file_state, output_file_state
+
+ return out_message, embeddings_out, output_file_state, output_file_state
  else:
  # Just return existing embeddings if already exist
  embeddings_out = embeddings_state
 
  print(out_message)
 
- return out_message, embeddings_out, None#, None
+ return out_message, embeddings_out, output_file_state, output_file_state
 
  def process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_cut_off, vec_weight, orig_df_col, in_join_column, search_df_join_column, progress = gr.Progress(track_tqdm=True)):
 
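
For context, a small sketch of the rounding and np.savez_compressed step that produces the .npz embedding files now collected in output_file_state. The array shape and file name here are invented; only the round/save/load pattern reflects the code above.

import numpy as np

embeddings_out = np.random.rand(1000, 1024).astype(np.float32)  # stand-in for BGE embeddings

embeddings_out_round = np.round(embeddings_out, 3)  # the "super compress" option rounds to 3 dp
semantic_search_file_name = "docs_embeddings_compress.npz"  # hypothetical file name
np.savez_compressed(semantic_search_file_name, embeddings_out_round)

# Reloading on a later run: np.load returns an NpzFile keyed by array name ("arr_0" by default)
with np.load(semantic_search_file_name) as data:
    reloaded = data["arr_0"]
print(reloaded.shape, reloaded.dtype)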
search_funcs/semantic_ingest_functions.py CHANGED
@@ -197,8 +197,11 @@ def parse_metadata(row):
 
  def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_intermediate_files = "No", chunk_size=None, progress=gr.Progress(track_tqdm=True)) -> List[Document]:
  """Converts a DataFrame's content to a list of dictionaries in the 'Document' format, containing page_content and associated metadata."""
+
+ output_list = []
+
  if not in_file:
- return None, "Please load in at least one file.", df, None, None, None
+ return None, "Please load in at least one file.", output_list
 
  progress(0, desc = "Loading in data")
 
@@ -207,10 +210,10 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
  data_file_names = [string for string in file_list if "tokenised" not in string and "npz" not in string.lower()]
 
  if not data_file_names:
- return doc_sections, "Please load in at least one csv/Excel/parquet data file."
+ return doc_sections, "Please load in at least one csv/Excel/parquet data file.", output_list
 
  if not text_column:
- return None, "Please enter a column name to search", df, None, None, None
+ return None, "Please enter a column name to search"
 
  data_file_name = data_file_names[0]
 
@@ -229,7 +232,7 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
  # Convert each element in the Series to a Document instance
  #doc_sections = section_series.apply(lambda x: Document(**x))
 
- return doc_sections, "Finished preparing documents"
+ return doc_sections, "Finished preparing documents", output_list
  # df = document_to_dataframe(df.iloc[:,0])
 
  ingest_tic = time.perf_counter()
@@ -255,7 +258,7 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
 
 
  # Save to file if you have cleaned the data. Text column has now been renamed with '_cleaned' at the send
- out_file_name, text_column = save_prepared_bm25_data(data_file_name, df_list, df, text_column)
+ out_file_name, text_column, df = save_prepared_bm25_data(data_file_name, df_list, df, text_column)
 
  df[text_column] = df_list
 
@@ -301,21 +304,23 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
 
  if clean == "No":
  #pd.DataFrame(data = {"Documents":page_content_series_string}).to_parquet(file_name + "_prepared_docs.parquet")
-
- with gzip.open(file_name + "_prepared_docs.pkl.gz", 'wb') as file:
+ out_doc_file_name = file_name + "_prepared_docs.pkl.gz"
+ with gzip.open(out_doc_file_name, 'wb') as file:
  pickle.dump(doc_sections, file)
 
  #pd.Series(doc_sections).to_pickle(file_name + "_prepared_docs.pkl")
  elif clean == "Yes":
  #pd.DataFrame(data = {"Documents":page_content_series_string}).to_parquet(file_name + "_prepared_docs_clean.parquet")
 
- with gzip.open(file_name + "_cleaned_prepared_docs.pkl.gz", 'wb') as file:
+ out_doc_file_name = file_name + "_cleaned_prepared_docs.pkl.gz"
+ with gzip.open(out_doc_file_name, 'wb') as file:
  pickle.dump(doc_sections, file)
 
  #pd.Series(doc_sections).to_pickle(file_name + "_prepared_docs_clean.pkl")
+ output_list.append(out_doc_file_name)
  print("Documents saved to file.")
 
- return doc_sections, "Finished preparing documents."
+ return doc_sections, "Finished preparing documents.", output_list
 
  def document_to_dataframe(documents):
  '''
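
A short sketch (the file name is hypothetical) of how the gzip-pickled prepared-docs files written above can be loaded back on a later run to skip re-preparation:

import gzip
import pickle

out_doc_file_name = "notes_cleaned_prepared_docs.pkl.gz"  # assumed example path

with gzip.open(out_doc_file_name, "rb") as file:
    doc_sections = pickle.load(file)

print(f"Loaded {len(doc_sections)} prepared documents")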
search_funcs/spacy_search_funcs.py CHANGED
@@ -7,6 +7,7 @@ import gradio as gr
  import pandas as pd
  from typing import List, Type
  from datetime import datetime
+ from search_funcs.helper_functions import create_highlighted_excel_wb
 
  PandasDataFrame = Type[pd.DataFrame]
 
@@ -110,7 +111,12 @@ def spacy_fuzzy_search(string_query:str, df_list: List[str], original_data: Pand
  print("Saving search file output")
  progress(0.7, desc = "Saving search output to file")
 
- results_df_out.to_excel(results_df_name, index= None)
+ #results_df_out.to_excel(results_df_name, index= None)
+
+ # Highlight found text and save to file
+ results_df_out_wb = create_highlighted_excel_wb(results_df_out, free_text_query, "search_text")
+ results_df_out_wb.save(results_df_name)
+
  results_first_text = results_df_out[text_column].iloc[0]
 
  print("Returning results")
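
The create_highlighted_excel_wb helper imported above lives in search_funcs/helper_functions.py and its body is not part of this excerpt. Purely as an illustration of the idea, a sketch of what such a helper could look like with openpyxl (function name, styling, and matching logic are assumptions, not the repo's implementation):

import pandas as pd
from openpyxl import Workbook
from openpyxl.styles import Font
from openpyxl.utils.dataframe import dataframe_to_rows

def create_highlighted_excel_wb_sketch(df: pd.DataFrame, query: str, column_to_highlight: str) -> Workbook:
    wb = Workbook()
    ws = wb.active

    # Write the DataFrame, header row first
    for row in dataframe_to_rows(df, index=False, header=True):
        ws.append(row)

    # Emphasise cells in the chosen column that contain the query text
    col_idx = list(df.columns).index(column_to_highlight) + 1  # openpyxl columns are 1-based
    for row_idx in range(2, ws.max_row + 1):
        cell = ws.cell(row=row_idx, column=col_idx)
        if query.lower() in str(cell.value).lower():
            cell.font = Font(bold=True)  # whole-cell emphasis; per-word highlighting needs rich text
    return wb

# Usage mirroring the diff (names assumed):
# results_df_out_wb = create_highlighted_excel_wb_sketch(results_df_out, free_text_query, "search_text")
# results_df_out_wb.save("results.xlsx")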