Sean-Case
committed on
Commit 4ee3470
1 Parent(s): 352c02a
Improved code for cleaning and outputting files. Added Dockerfile
- Dockerfile +30 -0
- app.py +6 -5
- how_to_create_exe_dist.txt +2 -2
- search_funcs/bm25_functions.py +37 -5
- search_funcs/helper_functions.py +1 -0
- search_funcs/semantic_functions.py +7 -5
- search_funcs/semantic_ingest_functions.py +14 -9
- search_funcs/spacy_search_funcs.py +7 -1
Dockerfile
ADDED
@@ -0,0 +1,30 @@
+FROM public.ecr.aws/docker/library/python:3.10.13-slim
+
+WORKDIR /src
+
+COPY requirements.txt .
+
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Set up a new user named "user" with user ID 1000
+RUN useradd -m -u 1000 user
+# Switch to the "user" user
+USER user
+# Set home to the user's home directory
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH \
+    PYTHONPATH=$HOME/app \
+    PYTHONUNBUFFERED=1 \
+    GRADIO_ALLOW_FLAGGING=never \
+    GRADIO_NUM_PORTS=1 \
+    GRADIO_SERVER_NAME=0.0.0.0 \
+    GRADIO_THEME=huggingface \
+    SYSTEM=spaces
+
+# Set the working directory to the user's home directory
+WORKDIR $HOME/app
+
+# Copy the current directory contents into the container at $HOME/app setting the owner to the user
+COPY --chown=user . $HOME/app
+
+CMD ["python", "app.py"]
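Note: GRADIO_SERVER_NAME=0.0.0.0 is what makes the Gradio app reachable from outside the container (Gradio binds to 127.0.0.1 by default), and COPY --chown=user matters because the image runs as the non-root "user" created above. The app's launch code is not part of this diff; a minimal sketch, assuming a Gradio Blocks app, of a launch call that respects that variable (illustration only, not the repo's app.py):

    import os
    import gradio as gr

    # Sketch only (assumption): bind to the host name set in the Dockerfile.
    # Recent Gradio versions also read GRADIO_SERVER_NAME themselves; passing it
    # explicitly here just makes the dependency visible.
    with gr.Blocks() as block:
        gr.Markdown("Data text search app placeholder")

    block.launch(server_name=os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0"))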
app.py
CHANGED
@@ -37,6 +37,7 @@ with block:
     corpus_state = gr.State()
     keyword_data_list_state = gr.State([])
     join_data_state = gr.State(pd.DataFrame())
+    output_file_state = gr.State([])
 
     orig_keyword_data_state = gr.State(pd.DataFrame())
     keyword_data_state = gr.State(pd.DataFrame())
@@ -122,7 +123,7 @@ depends on factors such as the type of documents or queries. Information taken f
     with gr.Tab(label="Advanced options"):
         with gr.Accordion(label="Data load / save options", open = True):
             with gr.Row():
-                in_clean_data = gr.Dropdown(label = "Clean text during load (remove html tags). For large files this may take some time!", value="
+                in_clean_data = gr.Dropdown(label = "Clean text during load (remove html tags). For large files this may take some time!", value="Yes", choices=["Yes", "No"])
                 return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="No", choices=["Yes", "No"])
                 embedding_super_compress = gr.Dropdown(label = "Round embeddings to three dp for smaller files with less accuracy.", value="Yes", choices=["Yes", "No"])
                 #save_clean_data_button = gr.Button(value = "Save loaded data to file", scale = 1)
@@ -170,8 +171,8 @@ depends on factors such as the type of documents or queries. Information taken f
 
 
     # BM25 search functions on click or enter
-    keyword_search_button.click(fn=bm25_search, inputs=[keyword_query, in_no_search_results, keyword_data_state, in_bm25_column, join_data_state, in_clean_data, in_join_column, search_df_join_column], outputs=[output_single_text, output_file], api_name="keyword")
-    keyword_query.submit(fn=bm25_search, inputs=[keyword_query, in_no_search_results, keyword_data_state, in_bm25_column, join_data_state, in_clean_data, in_join_column, search_df_join_column], outputs=[output_single_text, output_file])
+    keyword_search_button.click(fn=bm25_search, inputs=[keyword_query, in_no_search_results, orig_keyword_data_state, keyword_data_state, in_bm25_column, join_data_state, in_clean_data, in_join_column, search_df_join_column], outputs=[output_single_text, output_file], api_name="keyword")
+    keyword_query.submit(fn=bm25_search, inputs=[keyword_query, in_no_search_results, orig_keyword_data_state, keyword_data_state, in_bm25_column, join_data_state, in_clean_data, in_join_column, search_df_join_column], outputs=[output_single_text, output_file])
 
     # Fuzzy search functions on click
     fuzzy_search_button.click(fn=spacy_fuzzy_search, inputs=[keyword_query, keyword_data_list_state, keyword_data_state, in_bm25_column, join_data_state, search_df_join_column, in_join_column, no_spelling_mistakes], outputs=[output_single_text, output_file], api_name="fuzzy")
@@ -181,8 +182,8 @@ depends on factors such as the type of documents or queries. Information taken f
     # Load in a csv/excel file for semantic search
     in_semantic_file.upload(initial_data_load, inputs=[in_semantic_file], outputs=[in_semantic_column, search_df_join_column, semantic_data_state, orig_semantic_data_state, search_index_state, embeddings_state, tokenised_state, semantic_load_progress, current_source_semantic])
     load_semantic_data_button.click(
-        csv_excel_text_to_docs, inputs=[semantic_data_state, in_semantic_file, in_semantic_column, in_clean_data, return_intermediate_files], outputs=[ingest_docs, semantic_load_progress]).\
-        then(docs_to_bge_embed_np_array, inputs=[ingest_docs, in_semantic_file, embeddings_state, in_clean_data, return_intermediate_files, embedding_super_compress], outputs=[semantic_load_progress, vectorstore_state, semantic_output_file])
+        csv_excel_text_to_docs, inputs=[semantic_data_state, in_semantic_file, in_semantic_column, in_clean_data, return_intermediate_files], outputs=[ingest_docs, semantic_load_progress, output_file_state]).\
+        then(docs_to_bge_embed_np_array, inputs=[ingest_docs, in_semantic_file, embeddings_state, output_file_state, in_clean_data, return_intermediate_files, embedding_super_compress], outputs=[semantic_load_progress, vectorstore_state, semantic_output_file, output_file_state])
 
     # Semantic search query
     semantic_submit.click(bge_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, semantic_min_distance, vec_weight, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic")
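Note: the new output_file_state is a gr.State list threaded through the chained .click(...).then(...) events, so file names produced by the ingest step are visible to the embedding step. A minimal, self-contained sketch of that pattern (hypothetical component and function names, not the app's real ones):

    import gradio as gr

    def prepare(files_so_far):
        # First step appends the file it wrote to the shared state list.
        return "Prepared documents", files_so_far + ["prepared_docs.pkl.gz"]

    def embed(files_so_far):
        # Second step receives the list returned by the first step and extends it.
        return "Embeddings created", files_so_far + ["embeddings.npz"]

    with gr.Blocks() as demo:
        output_file_state = gr.State([])
        status = gr.Textbox(label="Progress")
        run = gr.Button("Load and embed")

        # Chained events: state written by `prepare` is re-read as input by `embed`.
        run.click(prepare, inputs=[output_file_state], outputs=[status, output_file_state]).\
            then(embed, inputs=[output_file_state], outputs=[status, output_file_state])

    demo.launch()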
how_to_create_exe_dist.txt
CHANGED
@@ -14,7 +14,7 @@ NOTE: for ensuring that spaCy models are loaded into the program correctly in re
 
 9.Run the following, assuming you want to make one single .exe file (This helped me: https://github.com/pyinstaller/pyinstaller/issues/8108):
 
-a) In command line: pyi-makespec --additional-hooks-dir="build_deps\\" --collect-data=gradio_client --collect-data=gradio --hidden-import pyarrow.vendored.version --onefile --name DataSearchApp_0.
+a) In command line: pyi-makespec --additional-hooks-dir="build_deps\\" --collect-data=gradio_client --collect-data=gradio --hidden-import pyarrow.vendored.version --onefile --name DataSearchApp_0.3 app.py
 
 b) Open the created spec file in Notepad. Add the following to the end of the Analysis section then save:
 
@@ -25,7 +25,7 @@ a = Analysis(
 }
 )
 
-c) Back in command line, run this: pyinstaller --clean --noconfirm DataSearchApp_0.
+c) Back in command line, run this: pyinstaller --clean --noconfirm DataSearchApp_0.3.spec
 
 
 9. A 'dist' folder will be created with the executable inside along with all dependencies('dist\data_text_search').
search_funcs/bm25_functions.py
CHANGED
@@ -264,6 +264,10 @@ def prepare_bm25_input_data(in_file, text_column, data_state, tokenised_state, c
 
     df[text_column] = df[text_column].astype(str).str.lower()
 
+    if "copy_of_case_note_id" in df.columns:
+        print("copy column found")
+        df.loc[~df["copy_of_case_note_id"].isna(), text_column] = ""
+
     if search_index_file_names:
         corpus = list(df[text_column])
         message = "Tokenisation skipped - loading search index from file."
@@ -271,7 +275,6 @@ def prepare_bm25_input_data(in_file, text_column, data_state, tokenised_state, c
         return corpus, message, df, None, None, [], gr.Dropdown(allow_custom_value=True, value=text_column, choices=data_state.columns.to_list())
 
 
-
     if clean == "Yes":
         progress(0.1, desc = "Cleaning data")
         clean_tic = time.perf_counter()
@@ -466,7 +469,7 @@ def convert_bm25_query_to_tokens(free_text_query, clean="No"):
 
     return out_query
 
-def bm25_search(free_text_query, in_no_search_results, original_data, text_column, in_join_file, clean
+def bm25_search(free_text_query, in_no_search_results, original_data, searched_data, text_column, in_join_file, clean, in_join_column = "", search_df_join_column = "", progress=gr.Progress(track_tqdm=True)):
 
     progress(0, desc = "Conducting keyword search")
 
@@ -493,8 +496,37 @@ def bm25_search(free_text_query, in_no_search_results, original_data, text_colum
                                "search_text": joined_texts,
                                "search_score_abs": results_scores})
     results_df['search_score_abs'] = abs(round(results_df['search_score_abs'], 2))
-    results_df_out = results_df[['index', 'search_text', 'search_score_abs']].merge(original_data,left_on="index", right_index=True, how="left")#.drop("index", axis=1)
 
+    # Join scores onto searched data
+    results_df_out = results_df[['index', 'search_text', 'search_score_abs']].merge(searched_data,left_on="index", right_index=True, how="left", suffixes = ("", "_y")).drop("index_y", axis=1, errors="ignore")
+
+    # Join on data from duplicate case notes
+    if ("copy_of_case_note_id" in original_data.columns) and ("note_id" in results_df_out.columns):
+        if clean == "No":
+            print("Clean is no")
+            orig_text_column = text_column
+        else:
+            print("Clean is yes")
+            orig_text_column = text_column.replace("_cleaned", "")
+
+        #print(orig_text_column)
+        #print(original_data.columns)
+
+        original_data["original_note_id"] = original_data["copy_of_case_note_id"]
+        original_data["original_note_id"] = original_data["original_note_id"].combine_first(original_data["note_id"])
+
+        results_df_out = results_df_out.merge(original_data[["original_note_id", "note_id", "copy_of_case_note_id", "person_id"]],left_on="note_id", right_on="original_note_id", how="left", suffixes=("_primary", "")) # .drop(orig_text_column, axis = 1)
+        results_df_out.loc[~results_df_out["copy_of_case_note_id"].isnull(), "search_text"] = ""
+        results_df_out.loc[~results_df_out["copy_of_case_note_id"].isnull(), text_column] = ""
+
+        #results_df_out = pd.concat([results_df_out, original_data[~original_data["copy_of_case_note_id"].isna()][["copy_of_case_note_id", "person_id"]]])
+        # Replace NaN with an empty string
+        # results_df_out.fillna('', inplace=True)
+
+
     # Join on additional files
     if not in_join_file.empty:
         progress(0.5, desc = "Joining on additional data file")
@@ -507,8 +539,8 @@ def bm25_search(free_text_query, in_no_search_results, original_data, text_colum
 
         results_df_out = results_df_out.merge(join_df,left_on=search_df_join_column, right_on=in_join_column, how="left", suffixes=('','_y'))#.drop(in_join_column, axis=1)
 
-    # Reorder results by score
-    results_df_out = results_df_out.sort_values('search_score_abs', ascending=False)
+    # Reorder results by score, and whether there is text
+    results_df_out = results_df_out.sort_values(['search_score_abs', "search_text"], ascending=False)
 
     # Out file
     query_str_file = ("_").join(token_query)
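Note: the duplicate-case-note handling above builds an original_note_id column with combine_first: copies keep the id of the note they were copied from, originals fall back to their own note_id, and scores found on the original can then be merged back onto its copies. A tiny standalone illustration of that step (hypothetical data, not the app's):

    import pandas as pd

    # Hypothetical miniature of the duplicate-note lookup: row 2 is a copy of note 1.
    original_data = pd.DataFrame({
        "note_id": [1, 2, 3],
        "copy_of_case_note_id": [pd.NA, 1, pd.NA],
        "person_id": ["a", "b", "c"],
    })

    # Copied-from id where present, otherwise the row's own note_id.
    original_data["original_note_id"] = original_data["copy_of_case_note_id"]
    original_data["original_note_id"] = original_data["original_note_id"].combine_first(original_data["note_id"])

    # original_note_id is now [1, 1, 3], so a merge on note_id == original_note_id
    # attaches the scored original's row to each of its copies.
    print(original_data)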
search_funcs/helper_functions.py
CHANGED
@@ -110,6 +110,7 @@ def initial_data_load(in_file):
     #print(file_list)
 
     data_file_names = [string for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower() and "search_index" not in string.lower()]
+    print(data_file_names)
 
     if not data_file_names:
         out_message = "Please load in at least one csv/Excel/parquet data file."
search_funcs/semantic_functions.py
CHANGED
@@ -48,14 +48,14 @@ local_embeddings_location = "model/bge/"
 # Not using SentenceTransformer here
 embeddings_model = SentenceTransformer(embeddings_name)
 
-def docs_to_bge_embed_np_array(docs_out, in_file, embeddings_state, clean, return_intermediate_files = "No", embeddings_super_compress = "No", embeddings_model = embeddings_model, progress=gr.Progress(track_tqdm=True)):
+def docs_to_bge_embed_np_array(docs_out, in_file, embeddings_state, output_file_state, clean, return_intermediate_files = "No", embeddings_super_compress = "No", embeddings_model = embeddings_model, progress=gr.Progress(track_tqdm=True)):
     '''
     Takes a Langchain document class and saves it into a Chroma sqlite file.
     '''
     if not in_file:
         out_message = "No input file found. Please load in at least one file."
         print(out_message)
-        return out_message, None, None
+        return out_message, None, None, output_file_state
 
 
     progress(0.6, desc = "Loading/creating embeddings")
@@ -114,16 +114,18 @@ def docs_to_bge_embed_np_array(docs_out, in_file, embeddings_state, clean, retur
             embeddings_out_round *= 100 # Rounding not currently used
             np.savez_compressed(semantic_search_file_name, embeddings_out_round)
 
-
+        output_file_state.append(semantic_search_file_name)
 
-
+        return out_message, embeddings_out, output_file_state, output_file_state
+
+        return out_message, embeddings_out, output_file_state, output_file_state
     else:
         # Just return existing embeddings if already exist
         embeddings_out = embeddings_state
 
     print(out_message)
 
-    return out_message, embeddings_out,
+    return out_message, embeddings_out, output_file_state, output_file_state
 
 def process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_cut_off, vec_weight, orig_df_col, in_join_column, search_df_join_column, progress = gr.Progress(track_tqdm=True)):
 
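Note: docs_to_bge_embed_np_array now appends the saved .npz file name to output_file_state and returns it, so the UI can expose the intermediate embeddings for re-use. For reference, a minimal sketch of the np.savez_compressed round trip (hypothetical file name and array, not the app's data):

    import numpy as np

    embeddings_out = np.random.rand(100, 384).astype(np.float32)  # stand-in for BGE embeddings
    semantic_search_file_name = "example_embeddings.npz"

    # Write the compressed archive, as the function above does.
    np.savez_compressed(semantic_search_file_name, embeddings_out)

    # Unnamed arrays are stored under arr_0, arr_1, ...; reload later to skip re-embedding.
    with np.load(semantic_search_file_name) as data:
        reloaded = data["arr_0"]

    assert reloaded.shape == embeddings_out.shape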
search_funcs/semantic_ingest_functions.py
CHANGED
@@ -197,8 +197,11 @@ def parse_metadata(row):
 
 def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_intermediate_files = "No", chunk_size=None, progress=gr.Progress(track_tqdm=True)) -> List[Document]:
     """Converts a DataFrame's content to a list of dictionaries in the 'Document' format, containing page_content and associated metadata."""
+
+    output_list = []
+
     if not in_file:
-        return None, "Please load in at least one file.",
+        return None, "Please load in at least one file.", output_list
 
     progress(0, desc = "Loading in data")
 
@@ -207,10 +210,10 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
     data_file_names = [string for string in file_list if "tokenised" not in string and "npz" not in string.lower()]
 
     if not data_file_names:
-        return doc_sections, "Please load in at least one csv/Excel/parquet data file."
+        return doc_sections, "Please load in at least one csv/Excel/parquet data file.", output_list
 
     if not text_column:
-        return None, "Please enter a column name to search"
+        return None, "Please enter a column name to search"
 
     data_file_name = data_file_names[0]
 
@@ -229,7 +232,7 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
         # Convert each element in the Series to a Document instance
         #doc_sections = section_series.apply(lambda x: Document(**x))
 
-        return doc_sections, "Finished preparing documents"
+        return doc_sections, "Finished preparing documents", output_list
     # df = document_to_dataframe(df.iloc[:,0])
 
     ingest_tic = time.perf_counter()
@@ -255,7 +258,7 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
 
 
         # Save to file if you have cleaned the data. Text column has now been renamed with '_cleaned' at the send
-        out_file_name, text_column = save_prepared_bm25_data(data_file_name, df_list, df, text_column)
+        out_file_name, text_column, df = save_prepared_bm25_data(data_file_name, df_list, df, text_column)
 
         df[text_column] = df_list
 
@@ -301,21 +304,23 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
 
         if clean == "No":
             #pd.DataFrame(data = {"Documents":page_content_series_string}).to_parquet(file_name + "_prepared_docs.parquet")
-
-            with gzip.open(
+            out_doc_file_name = file_name + "_prepared_docs.pkl.gz"
+            with gzip.open(out_doc_file_name, 'wb') as file:
                 pickle.dump(doc_sections, file)
 
             #pd.Series(doc_sections).to_pickle(file_name + "_prepared_docs.pkl")
         elif clean == "Yes":
             #pd.DataFrame(data = {"Documents":page_content_series_string}).to_parquet(file_name + "_prepared_docs_clean.parquet")
 
-
+            out_doc_file_name = file_name + "_cleaned_prepared_docs.pkl.gz"
+            with gzip.open(out_doc_file_name, 'wb') as file:
                 pickle.dump(doc_sections, file)
 
             #pd.Series(doc_sections).to_pickle(file_name + "_prepared_docs_clean.pkl")
+        output_list.append(out_doc_file_name)
         print("Documents saved to file.")
 
-    return doc_sections, "Finished preparing documents."
+    return doc_sections, "Finished preparing documents.", output_list
 
 def document_to_dataframe(documents):
     '''
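Note: the prepared document sections are now written as gzip-compressed pickles whose names are collected in output_list. A short sketch of that write/read round trip (hypothetical file name and contents):

    import gzip
    import pickle

    # Hypothetical stand-in for the prepared Document sections.
    doc_sections = [{"page_content": "some case note text", "metadata": {"row": 0}}]
    out_doc_file_name = "example_prepared_docs.pkl.gz"

    # Write, following the same pattern as the diff.
    with gzip.open(out_doc_file_name, 'wb') as file:
        pickle.dump(doc_sections, file)

    # Read back later to skip the document-preparation step.
    with gzip.open(out_doc_file_name, 'rb') as file:
        reloaded_sections = pickle.load(file)

    assert reloaded_sections == doc_sections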
search_funcs/spacy_search_funcs.py
CHANGED
@@ -7,6 +7,7 @@ import gradio as gr
 import pandas as pd
 from typing import List, Type
 from datetime import datetime
+from search_funcs.helper_functions import create_highlighted_excel_wb
 
 PandasDataFrame = Type[pd.DataFrame]
 
@@ -110,7 +111,12 @@ def spacy_fuzzy_search(string_query:str, df_list: List[str], original_data: Pand
     print("Saving search file output")
     progress(0.7, desc = "Saving search output to file")
 
-    results_df_out.to_excel(results_df_name, index= None)
+    #results_df_out.to_excel(results_df_name, index= None)
+
+    # Highlight found text and save to file
+    results_df_out_wb = create_highlighted_excel_wb(results_df_out, free_text_query, "search_text")
+    results_df_out_wb.save(results_df_name)
+
     results_first_text = results_df_out[text_column].iloc[0]
 
     print("Returning results")
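Note: create_highlighted_excel_wb is imported from search_funcs/helper_functions.py and its body is not part of this diff. Purely as an illustration of the kind of helper the call implies, an openpyxl sketch that flags matching rows (all names and the highlighting rule are assumptions, not the repo's implementation):

    import pandas as pd
    from openpyxl import Workbook
    from openpyxl.styles import Font
    from openpyxl.utils.dataframe import dataframe_to_rows

    def create_highlighted_excel_wb_sketch(df: pd.DataFrame, query: str, text_column: str) -> Workbook:
        # Illustrative only: write the DataFrame to a workbook and mark rows whose
        # text column contains the query with a bold red font.
        wb = Workbook()
        ws = wb.active
        for row in dataframe_to_rows(df, index=False, header=True):
            ws.append(row)

        text_col_idx = list(df.columns).index(text_column) + 1  # openpyxl columns are 1-based
        for excel_row in range(2, ws.max_row + 1):  # skip the header row
            cell = ws.cell(row=excel_row, column=text_col_idx)
            if query.lower() in str(cell.value).lower():
                cell.font = Font(bold=True, color="FF0000")
        return wb

    # Usage mirrors the diff: build the workbook, then save to the results file name.
    # wb = create_highlighted_excel_wb_sketch(results_df_out, "search term", "search_text")
    # wb.save("results.xlsx")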