Sean-Case committed • Commit 63049fe • Parent(s): 3df8e40

Better error checking. Doesn't load in embeddings file twice now.

Files changed:
- README.md +1 -1
- app.py +18 -20
- how_to_create_exe_dist.txt +2 -2
- requirements.txt +1 -1
- search_funcs/bm25_functions.py +67 -22
- search_funcs/helper_functions.py +54 -17
- search_funcs/semantic_functions.py +43 -31
- search_funcs/semantic_ingest_functions.py +15 -1
README.md
CHANGED
@@ -4,7 +4,7 @@ emoji: π
colorFrom: purple
colorTo: green
sdk: gradio
-sdk_version:
+sdk_version: 4.16.0
app_file: app.py
pinned: false
license: apache-2.0
app.py
CHANGED
@@ -6,6 +6,7 @@ from search_funcs.helper_functions import dummy_function, display_info, put_colu

import gradio as gr
import pandas as pd
+import numpy as np

PandasDataFrame = Type[pd.DataFrame]

@@ -22,18 +23,16 @@ with block:
ingest_metadata = gr.State()
ingest_docs = gr.State()
vectorstore_state = gr.State() # globals()["vectorstore"]
-embeddings_state = gr.State() # globals()["embeddings"]
+embeddings_state = gr.State(np.array([])) # globals()["embeddings"]
+search_index_state = gr.State()

k_val = gr.State(9999)
out_passages = gr.State(9999)
vec_weight = gr.State(1)

-#docs_keep_as_doc_state = gr.State()
-#doc_df_state = gr.State()
-#docs_keep_out_state = gr.State()
-
corpus_state = gr.State()
keyword_data_state = gr.State(pd.DataFrame())
+join_data_state = gr.State(pd.DataFrame())
semantic_data_state = gr.State(pd.DataFrame())

in_k1_info = gr.State("""k1: Constant used for influencing the term frequency saturation. After saturation is reached, additional
@@ -76,7 +75,6 @@ depends on factors such as the type of documents or queries. Information taken f
with gr.Accordion(label = "Search data", open=True):
with gr.Row():
keyword_query = gr.Textbox(label="Enter your search term")
-#mod_query = gr.Textbox(label="Cleaned search term (the terms that are passed to the search engine)")

keyword_search_button = gr.Button(value="Search text")

@@ -115,7 +113,7 @@ depends on factors such as the type of documents or queries. Information taken f
with gr.Row():
in_clean_data = gr.Dropdown(label = "Clean text during load (remove html tags). For large files this may take some time!", value="No", choices=["Yes", "No"])
return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="No", choices=["Yes", "No"])
-embedding_super_compress = gr.Dropdown(label = "Round embeddings to three dp for smaller files with less accuracy.", value="
+embedding_super_compress = gr.Dropdown(label = "Round embeddings to three dp for smaller files with less accuracy.", value="Yes", choices=["Yes", "No"])
#save_clean_data_button = gr.Button(value = "Save loaded data to file", scale = 1)
with gr.Accordion(label="Keyword search options", open = False):
with gr.Row():
@@ -133,13 +131,14 @@ depends on factors such as the type of documents or queries. Information taken f
with gr.Row():
in_search_param_button = gr.Button(value="Load search parameters (Need to click this if you changed anything above)")
with gr.Accordion(label="Semantic search options", open = False):
-semantic_min_distance = gr.Slider(label = "Minimum distance score for search result to be included", value = 0.
+semantic_min_distance = gr.Slider(label = "Minimum distance score for search result to be included", value = 0.75, minimum=0, maximum=0.95, step=0.01)
with gr.Accordion(label = "Join on additional dataframes to results", open = False):
in_join_file = gr.File(label="Upload your data to join here")
+in_join_message = gr.Textbox(label="Join file load progress")
in_join_column = gr.Dropdown(label="Column to join in new data frame")
search_df_join_column = gr.Dropdown(label="Column to join in search data frame")

-in_search_param_button.click(fn=prepare_bm25, inputs=[corpus_state, in_k1, in_b, in_alpha], outputs=[load_finished_message])
+in_search_param_button.click(fn=prepare_bm25, inputs=[corpus_state, in_bm25_file, in_bm25_column, search_index_state, return_intermediate_files, in_k1, in_b, in_alpha], outputs=[load_finished_message])

# ---
in_k1_button.click(display_info, inputs=in_k1_info)
@@ -149,28 +148,27 @@ depends on factors such as the type of documents or queries. Information taken f

### BM25 SEARCH ###
# Update dropdowns upon initial file load
-in_bm25_file.upload(put_columns_in_df, inputs=[in_bm25_file, in_bm25_column], outputs=[in_bm25_column,
-in_join_file.upload(put_columns_in_join_df, inputs=[in_join_file
+in_bm25_file.upload(put_columns_in_df, inputs=[in_bm25_file, in_bm25_column], outputs=[in_bm25_column, search_df_join_column, keyword_data_state, search_index_state, embeddings_state, load_finished_message])
+in_join_file.upload(put_columns_in_join_df, inputs=[in_join_file], outputs=[in_join_column, join_data_state, in_join_message])

# Load in BM25 data
load_bm25_data_button.click(fn=prepare_bm25_input_data, inputs=[in_bm25_file, in_bm25_column, keyword_data_state, in_clean_data, return_intermediate_files], outputs=[corpus_state, load_finished_message, keyword_data_state, output_file, output_file, current_source]).\
-then(fn=prepare_bm25, inputs=[corpus_state, in_bm25_file, return_intermediate_files, in_k1, in_b, in_alpha], outputs=[load_finished_message, output_file])#.\
-
-
+then(fn=prepare_bm25, inputs=[corpus_state, in_bm25_file, in_bm25_column, search_index_state, return_intermediate_files, in_k1, in_b, in_alpha], outputs=[load_finished_message, output_file])#.\
+
# BM25 search functions on click or enter
-keyword_search_button.click(fn=bm25_search, inputs=[keyword_query, in_no_search_results, keyword_data_state, in_bm25_column,
-keyword_query.submit(fn=bm25_search, inputs=[keyword_query, in_no_search_results, keyword_data_state, in_bm25_column,
+keyword_search_button.click(fn=bm25_search, inputs=[keyword_query, in_no_search_results, keyword_data_state, in_bm25_column, join_data_state, in_clean_data, in_join_column, search_df_join_column], outputs=[output_single_text, output_file], api_name="keyword")
+keyword_query.submit(fn=bm25_search, inputs=[keyword_query, in_no_search_results, keyword_data_state, in_bm25_column, join_data_state, in_clean_data, in_join_column, search_df_join_column], outputs=[output_single_text, output_file])

### SEMANTIC SEARCH ###
# Load in a csv/excel file for semantic search
-in_semantic_file.upload(put_columns_in_df, inputs=[in_semantic_file, in_semantic_column], outputs=[in_semantic_column,
+in_semantic_file.upload(put_columns_in_df, inputs=[in_semantic_file, in_semantic_column], outputs=[in_semantic_column, search_df_join_column, semantic_data_state, search_index_state, embeddings_state, semantic_load_progress])
load_semantic_data_button.click(parse_csv_or_excel, inputs=[in_semantic_file, semantic_data_state, in_semantic_column], outputs=[ingest_text, current_source_semantic, semantic_load_progress]).\
then(csv_excel_text_to_docs, inputs=[ingest_text, in_semantic_file, in_semantic_column, in_clean_data, return_intermediate_files], outputs=[ingest_docs, semantic_load_progress]).\
-then(docs_to_jina_embed_np_array, inputs=[ingest_docs, in_semantic_file, return_intermediate_files, embedding_super_compress], outputs=[semantic_load_progress, vectorstore_state, semantic_output_file])
+then(docs_to_jina_embed_np_array, inputs=[ingest_docs, in_semantic_file, embeddings_state, return_intermediate_files, embedding_super_compress], outputs=[semantic_load_progress, vectorstore_state, semantic_output_file])

# Semantic search query
-semantic_submit.click(jina_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, semantic_min_distance, vec_weight,
-semantic_query.submit(jina_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, semantic_min_distance, vec_weight,
+semantic_submit.click(jina_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, semantic_min_distance, vec_weight, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic")
+semantic_query.submit(jina_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, semantic_min_distance, vec_weight, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file])

# Dummy functions just to get dropdowns to work correctly with Gradio 3.50
in_bm25_column.change(dummy_function, in_bm25_column, None)
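Note on the wiring above: embeddings_state is now seeded with an empty NumPy array and passed into docs_to_jina_embed_np_array, so embeddings loaded once at upload time are reused rather than read from disk a second time (the behaviour named in the commit message). A minimal, self-contained sketch of that caching pattern, using hypothetical component and function names rather than the app's real ones:

```python
# Sketch only: caching embeddings in gr.State so they are loaded at most once.
# Component names (in_files, status) and load_or_reuse are illustrative, not the app's API.
import gradio as gr
import numpy as np

def load_or_reuse(file_paths, cached):
    # An empty array in state means "nothing loaded yet"
    if cached.size > 0:
        return cached, "Reusing embeddings already held in state."
    npz_files = [str(f) for f in file_paths or [] if str(f).endswith(".npz")]
    if not npz_files:
        return cached, "No embeddings file supplied; they will be created later."
    loaded = np.load(npz_files[0])["arr_0"]
    return loaded, f"Loaded embeddings with shape {loaded.shape}."

with gr.Blocks() as demo:
    embeddings_state = gr.State(np.array([]))
    in_files = gr.File(file_count="multiple")
    status = gr.Textbox(label="Status")
    in_files.upload(load_or_reuse, inputs=[in_files, embeddings_state],
                    outputs=[embeddings_state, status])

if __name__ == "__main__":
    demo.launch()
```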
how_to_create_exe_dist.txt
CHANGED
@@ -17,10 +17,10 @@ NOTE: for ensuring that spaCy models are loaded into the program correctly in re
8. In command line, cd to the folder that contains app.py. Then run the following:

For one single file:
-python -m PyInstaller --additional-hooks-dir="build_deps\\" --hidden-import pyarrow.vendored.version --add-data="build_deps\\types.json;gradio_client" --add-data "model;model" --onefile --clean --noconfirm --name DataSearchApp_0.2.
+python -m PyInstaller --additional-hooks-dir="build_deps\\" --hidden-import pyarrow.vendored.version --add-data="build_deps\\types.json;gradio_client" --add-data "model;model" --onefile --clean --noconfirm --name DataSearchApp_0.2.2 app.py

For a small exe with a folder of dependencies:
-python -m PyInstaller --additional-hooks-dir="build_deps\\" --hidden-import pyarrow.vendored.version --add-data="build_deps\\types.json;gradio_client" --add-data "model;model" --clean --noconfirm --name DataSearchApp_0.2.
+python -m PyInstaller --additional-hooks-dir="build_deps\\" --hidden-import pyarrow.vendored.version --add-data="build_deps\\types.json;gradio_client" --add-data "model;model" --clean --noconfirm --name DataSearchApp_0.2.2 app.py

9. A 'dist' folder will be created with the executable inside along with all dependencies('dist\data_text_search').

requirements.txt
CHANGED
@@ -7,4 +7,4 @@ accelerate==0.26.0
torch==2.1.2
spacy==3.7.2
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
-gradio==
+gradio==4.16.0
search_funcs/bm25_functions.py
CHANGED
@@ -18,7 +18,7 @@ from search_funcs.clean_funcs import initial_clean # get_lemma_tokens, stem_sent
from search_funcs.helper_functions import read_file, get_file_path_end_with_ext, get_file_path_end

# Load the SpaCy model
-from spacy.cli import download
+from spacy.cli.download import download
import spacy
spacy.prefer_gpu()

@@ -231,13 +231,25 @@ class BM25:

# These following functions are my own work

-def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", return_intermediate_files = "No", progress=gr.Progress()):
+def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", return_intermediate_files = "No", progress=gr.Progress(track_tqdm=True)):
+print(in_file)

+if not in_file:
+print("No input file found. Please load in at least one file.")
+return None, "No input file found. Please load in at least one file.", data_state, None, None, None
+
+progress(0, desc = "Loading in data")
file_list = [string.name for string in in_file]

#print(file_list)

-data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower() and "gz" not in string.lower()]
+data_file_names = [string.lower() for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower() and "gz" not in string.lower()]
+
+if not data_file_names:
+return None, "Please load in at least one csv/Excel/parquet data file.", data_state, None, None, None
+
+if not text_column:
+return None, "Please enter a column name to search.", data_state, None, None, None

data_file_name = data_file_names[0]

@@ -263,6 +275,7 @@ def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", retur
tokenised_df = read_file(tokenised_file_names[0])

if clean == "Yes":
+progress(0.1, desc = "Cleaning data")
clean_tic = time.perf_counter()
print("Starting data clean.")

@@ -280,14 +293,16 @@ def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", retur
else:
# Don't clean or save file to disk
df_list = list(df[text_column])
-print("No data cleaning performed
+print("No data cleaning performed")
out_file_name = None

# Tokenise data. If tokenised df already exists, no need to do anything

+progress(0.4, desc = "Tokenising text")
+
if not tokenised_df.empty:
corpus = tokenised_df.iloc[:,0].tolist()
-print("Tokeniser loaded from file
+print("Tokeniser loaded from file")
#print("Corpus is: ", corpus[0:5])

# If doesn't already exist, tokenize texts in batches
@@ -316,7 +331,7 @@ def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", retur

return corpus, message, df, out_file_name, None, data_file_out_name # tokenised_data_file_name

-def save_prepared_bm25_data(in_file_name, prepared_text_list, in_df, in_bm25_column):
+def save_prepared_bm25_data(in_file_name, prepared_text_list, in_df, in_bm25_column, progress=gr.Progress(track_tqdm=True)):

# Check if the list and the dataframe have the same length
if len(prepared_text_list) != len(in_df):
@@ -342,31 +357,55 @@ def save_prepared_bm25_data(in_file_name, prepared_text_list, in_df, in_bm25_col

return file_name, new_text_column

-def prepare_bm25(corpus, in_file, return_intermediate_files, k1=1.5, b = 0.75, alpha=-5):
+def prepare_bm25(corpus, in_file, text_column, search_index, return_intermediate_files, k1=1.5, b = 0.75, alpha=-5, progress=gr.Progress(track_tqdm=True)):
#bm25.save("saved_df_bm25")
#bm25 = BM25.load(re.sub(r'\.pkl$', '', file_in.name))

+if not in_file:
+out_message ="No input file found. Please load in at least one file."
+print(out_message)
+return out_message, None
+
+if not corpus:
+out_message = "No data file found. Please load in at least one csv/Excel/Parquet file."
+print(out_message)
+return out_message, None
+
+if not text_column:
+out_message = "Please enter a column name to search."
+print(out_message)
+return out_message, None
+
file_list = [string.name for string in in_file]

#print(file_list)

# Get data file name
-data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower() and "gz" not in string.lower()]
+data_file_names = [string.lower() for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower() and "gz" not in string.lower()]
+
+if not data_file_names:
+return "Please load in at least one csv/Excel/parquet data file.", None

data_file_name = data_file_names[0]
data_file_out_name = get_file_path_end_with_ext(data_file_name)
data_file_name_no_ext = get_file_path_end(data_file_name)

# Check if there is a search index file already
-index_file_names = [string.lower() for string in file_list if "gz" in string.lower()]
+#index_file_names = [string.lower() for string in file_list if "gz" in string.lower()]

+progress(0.6, desc = "Preparing search index")

-if index_file_names:
-
+#if index_file_names:
+if search_index:
+#index_file_name = index_file_names[0]

-print(index_file_name)
+#print(index_file_name)

-bm25_load =
+bm25_load = search_index


#index_file_out_name = get_file_path_end_with_ext(index_file_name)
@@ -381,6 +420,8 @@ def prepare_bm25(corpus, in_file, return_intermediate_files, k1=1.5, b = 0.75, a
bm25 = bm25_load

if return_intermediate_files == "Yes":
+print("Saving search index file")
+progress(0.8, desc = "Saving search index to file")
bm25_search_file_name = data_file_name_no_ext + '_' + 'search_index.pkl.gz'
#np.savez_compressed(bm25_search_file_name, bm25)

@@ -420,8 +461,10 @@ def convert_bm25_query_to_tokens(free_text_query, clean="No"):

return out_query

-def bm25_search(free_text_query, in_no_search_results, original_data, text_column, clean = "No",
+def bm25_search(free_text_query, in_no_search_results, original_data, text_column, in_join_file, clean = "No", in_join_column = "", search_df_join_column = "", progress=gr.Progress(track_tqdm=True)):

+progress(0, desc = "Conducting keyword search")
+
# Prepare query
if (clean == "Yes") | (text_column.endswith("_cleaned")):
token_query = convert_bm25_query_to_tokens(free_text_query, clean="Yes")
@@ -435,7 +478,7 @@ def bm25_search(free_text_query, in_no_search_results, original_data, text_colum

results_index, results_text, results_scores = bm25.extract_documents_and_scores(token_query, bm25.corpus, n=in_no_search_results) #bm25.corpus #original_data[text_column]
if not results_index:
-return "No search results found", None
+return "No search results found", None

print("Search complete")

@@ -448,18 +491,16 @@ def bm25_search(free_text_query, in_no_search_results, original_data, text_colum
results_df_out = results_df[['index', 'search_text', 'search_score_abs']].merge(original_data,left_on="index", right_index=True, how="left")#.drop("index", axis=1)

# Join on additional files
-if in_join_file:
-
-
-# Import data
-join_df = read_file(join_filename)
+if not in_join_file.empty:
+progress(0.5, desc = "Joining on additional data file")
+join_df = in_join_file
join_df[in_join_column] = join_df[in_join_column].astype(str).str.replace("\.0$","", regex=True)
results_df_out[search_df_join_column] = results_df_out[search_df_join_column].astype(str).str.replace("\.0$","", regex=True)

# Duplicates dropped so as not to expand out dataframe
join_df = join_df.drop_duplicates(in_join_column)

-results_df_out = results_df_out.merge(join_df,left_on=search_df_join_column, right_on=in_join_column, how="left")
+results_df_out = results_df_out.merge(join_df,left_on=search_df_join_column, right_on=in_join_column, how="left")#.drop(in_join_column, axis=1)

# Reorder results by score
results_df_out = results_df_out.sort_values('search_score_abs', ascending=False)
@@ -467,9 +508,13 @@ def bm25_search(free_text_query, in_no_search_results, original_data, text_colum
# Out file
query_str_file = ("_").join(token_query)
results_df_name = "keyword_search_result_" + today_rev + "_" + query_str_file + ".xlsx"
+
+print("Saving search file output")
+progress(0.7, desc = "Saving search output to file")
+
results_df_out.to_excel(results_df_name, index= None)
results_first_text = results_df_out[text_column].iloc[0]

print("Returning results")

-return results_first_text, results_df_name
+return results_first_text, results_df_name
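The bulk of the additions above are guard clauses at the top of prepare_bm25_input_data, prepare_bm25 and bm25_search: each missing input prints and returns a user-facing message in place of the normal outputs instead of failing later. A small stand-alone sketch of the pattern, with generic names rather than the repository's exact signatures:

```python
# Illustrative guard-clause validation: check inputs up front and return a
# user-facing message shaped like the normal outputs. Names are generic stand-ins.
def prepare_inputs(in_files, text_column):
    if not in_files:
        out_message = "No input file found. Please load in at least one file."
        print(out_message)
        return None, out_message

    data_files = [f for f in in_files
                  if "tokenised" not in f.lower()
                  and not f.lower().endswith((".npz", ".gz"))]
    if not data_files:
        return None, "Please load in at least one csv/Excel/parquet data file."

    if not text_column:
        return None, "Please enter a column name to search."

    return data_files[0], "Input check passed."

print(prepare_inputs([], "text"))            # (None, 'No input file found. ...')
print(prepare_inputs(["data.csv"], "text"))  # ('data.csv', 'Input check passed.')
```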
search_funcs/helper_functions.py
CHANGED
@@ -7,6 +7,7 @@ import shutil
import getpass
import gzip
import pickle
+import numpy as np

# Attempt to delete content of gradio temp folder
def get_temp_folder_path():
@@ -89,19 +90,27 @@ def read_file(filename):

def put_columns_in_df(in_file, in_bm25_column):
'''
-When file is loaded, update the column dropdown choices
+When file is loaded, update the column dropdown choices
'''
+new_choices = []
+concat_choices = []
+index_load = None
+embed_load = np.array([])
+out_message = ""

file_list = [string.name for string in in_file]

#print(file_list)

-data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower()]
-data_file_name = data_file_names[0]
+data_file_names = [string.lower() for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower() and "search_index" not in string.lower()]

+if not data_file_names:
+out_message = "Please load in at least one csv/Excel/parquet data file."
+print(out_message)
+return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), pd.DataFrame(), bm25_load, out_message
+
+data_file_name = data_file_names[0]

df = read_file(data_file_name)

@@ -109,32 +118,60 @@ def put_columns_in_df(in_file, in_bm25_column):

new_choices = list(df.columns)

+elif "search_index" in data_file_name:
+# If only the search_index found, need a data file too
+new_choices = []
+
else: new_choices = ["page_contents"] + list(df[0].metadata.keys()) #["Documents"]
#print(new_choices)

-concat_choices.extend(new_choices)
+concat_choices.extend(new_choices)
+
+# Check if there is a search index file already
+index_file_names = [string.lower() for string in file_list if "gz" in string.lower()]
+
+if index_file_names:
+index_file_name = index_file_names[0]
+index_load = read_file(index_file_name)
+
+embeddings_file_names = [string.lower() for string in file_list if "embedding" in string.lower()]
+
+if embeddings_file_names:
+print("Loading embeddings from file.")
+embed_load = np.load(embeddings_file_names[0])['arr_0']
+
+# If embedding files have 'super_compress' in the title, they have been multiplied by 100 before save
+if "compress" in embeddings_file_names[0]:
+embed_load /= 100
+else:
+embed_load = np.array([])
+
+out_message = "Initial data check successful. Next, choose a data column to search in the drop down above, then click 'Load data'"
+print(out_message)

-return gr.Dropdown(choices=concat_choices), gr.Dropdown(
+return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df, index_load, embed_load, out_message

-def put_columns_in_join_df(in_file
+def put_columns_in_join_df(in_file):
'''
-When file is loaded, update the column dropdown choices
+When file is loaded, update the column dropdown choices
'''
-
-print("in_bm25_column")
+new_df = pd.DataFrame()
+#print("in_bm25_column")

new_choices = []
concat_choices = []


-
-new_choices = list(
+new_df = read_file(in_file.name)
+new_choices = list(new_df.columns)
+
+#print(new_choices)

-
+concat_choices.extend(new_choices)

-
+out_message = "File load successful. Now select a column to join below."

-return gr.Dropdown(choices=concat_choices)
+return gr.Dropdown(choices=concat_choices), new_df, out_message

def dummy_function(gradio_component):
"""
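put_columns_in_df now also picks up any search index (.gz) and embeddings (.npz) files supplied alongside the data file, dividing "super compressed" embeddings by 100 on load because they were scaled up before saving. A sketch of that load-time rescaling follows; the save step shown here is an assumption made only to complete the round trip (this hunk shows just the load path), and the file name is made up:

```python
import numpy as np

# Assumed save step (not part of this hunk): scale by 100 and store as integers
# so the compressed .npz is smaller. The file name is illustrative.
original = np.random.rand(3, 4).astype(np.float32)
np.savez_compressed("docs_embedding_compress.npz", (original * 100).round().astype(np.int16))

# Load path mirroring put_columns_in_df: read arr_0, then undo the x100 scaling
# whenever "compress" appears in the file name.
file_name = "docs_embedding_compress.npz"
embed_load = np.load(file_name)["arr_0"]
if "compress" in file_name:
    embed_load = embed_load / 100

print(np.allclose(original, embed_load, atol=0.01))  # True to within the rounding error
```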
search_funcs/semantic_functions.py
CHANGED
@@ -12,7 +12,6 @@ today_rev = datetime.now().strftime("%Y%m%d")
from transformers import AutoModel

from torch import cuda, backends, tensor, mm
-from search_funcs.helper_functions import read_file

# Check for torch cuda
print("Is CUDA enabled? ", cuda.is_available())
@@ -43,18 +42,6 @@ except:
embeddings_model = AutoModel.from_pretrained(embeddings_name, revision = revision_choice, trust_remote_code=True, device_map="auto")


-# Chroma support is currently deprecated
-# Import Chroma and instantiate a client. The default Chroma client is ephemeral, meaning it will not save to disk.
-#import chromadb
-#from chromadb.config import Settings
-#from typing_extensions import Protocol
-#from chromadb import Documents, EmbeddingFunction, Embeddings
-
-# Remove Chroma database file. If it exists as it can cause issues
-#chromadb_file = "chroma.sqlite3"
-
-#if os.path.isfile(chromadb_file):
-# os.remove(chromadb_file)
def get_file_path_end(file_path):
# First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
basename = os.path.basename(file_path)
@@ -82,10 +69,17 @@ def load_embeddings(embeddings_name = embeddings_name):

return embeddings

-def docs_to_jina_embed_np_array(docs_out, in_file, return_intermediate_files = "No", embeddings_super_compress = "No", embeddings = embeddings_model, progress=gr.Progress()):
+def docs_to_jina_embed_np_array(docs_out, in_file, embeddings_state, return_intermediate_files = "No", embeddings_super_compress = "No", embeddings = embeddings_model, progress=gr.Progress(track_tqdm=True)):
'''
Takes a Langchain document class and saves it into a Chroma sqlite file.
'''
+if not in_file:
+out_message = "No input file found. Please load in at least one file."
+print(out_message)
+return out_message, None, None
+
+progress(0.7, desc = "Loading/creating embeddings")

print(f"> Total split documents: {len(docs_out)}")

@@ -105,17 +99,9 @@ def docs_to_jina_embed_np_array(docs_out, in_file, return_intermediate_files = "

out_message = "Document processing complete. Ready to search."

-
-print("Loading embeddings from file.")
-embeddings_out = np.load(embeddings_file_names[0])['arr_0']
-
-# If embedding files have 'super_compress' in the title, they have been multiplied by 100 before save
-if "compress" in embeddings_file_names[0]:
-embeddings_out /= 100
+# print("embeddings loaded: ", embeddings_out)

-
-
-if not embeddings_file_names:
+if embeddings_state.size == 0:
tic = time.perf_counter()
print("Starting to embed documents.")
#embeddings_list = []
@@ -132,6 +118,7 @@ def docs_to_jina_embed_np_array(docs_out, in_file, return_intermediate_files = "

# If you want to save your files for next time
if return_intermediate_files == "Yes":
+progress(0.9, desc = "Saving embeddings to file")
if embeddings_super_compress == "No":
semantic_search_file_name = data_file_name_no_ext + '_' + 'embeddings.npz'
np.savez_compressed(semantic_search_file_name, embeddings_out)
@@ -144,12 +131,15 @@ def docs_to_jina_embed_np_array(docs_out, in_file, return_intermediate_files = "
return out_message, embeddings_out, semantic_search_file_name

return out_message, embeddings_out, None
+else:
+# Just return existing embeddings if already exist
+embeddings_out = embeddings_state

print(out_message)

return out_message, embeddings_out, None#, None

-def process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_cut_off, vec_weight, orig_df_col, in_join_column, search_df_join_column):
+def process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_cut_off, vec_weight, orig_df_col, in_join_column, search_df_join_column, progress = gr.Progress(track_tqdm=True)):

def create_docs_keep_from_df(df):
dict_out = {'ids' : [df['ids']],
@@ -213,11 +203,10 @@ def process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_c
# results_df_out = orig_df.merge(length_more_limit[['ids', 'distances']], left_index = True, right_on = "ids", how="inner").sort_values("distances")

# Join on additional files
-if in_join_file:
-
+if not in_join_file.empty:
+progress(0.5, desc = "Joining on additional data file")
+join_df = in_join_file

-# Import data
-join_df = read_file(join_filename)
join_df[in_join_column] = join_df[in_join_column].astype(str).str.replace("\.0$","", regex=True)

# Duplicates dropped so as not to expand out dataframe
@@ -225,14 +214,17 @@ def process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_c

results_df_out[search_df_join_column] = results_df_out[search_df_join_column].astype(str).str.replace("\.0$","", regex=True)

-results_df_out = results_df_out.merge(join_df,left_on=search_df_join_column, right_on=in_join_column, how="left")
+results_df_out = results_df_out.merge(join_df,left_on=search_df_join_column, right_on=in_join_column, how="left")#.drop(in_join_column, axis=1)

return results_df_out

def jina_simple_retrieval(query_str:str, vectorstore, docs, orig_df_col:str, k_val:int, out_passages:int,
-vec_score_cut_off:float, vec_weight:float, in_join_file
+vec_score_cut_off:float, vec_weight:float, in_join_file, in_join_column = None, search_df_join_column = None, device = torch_device, embeddings = embeddings_model, progress=gr.Progress(track_tqdm=True)): # ,vectorstore, embeddings

# print("vectorstore loaded: ", vectorstore)
+progress(0, desc = "Conducting semantic search")
+
+print("Searching")

# Convert it to a PyTorch tensor and transfer to GPU
vectorstore_tensor = tensor(vectorstore).to(device)
@@ -277,6 +269,8 @@ def jina_simple_retrieval(query_str:str, vectorstore, docs, orig_df_col:str, k_v

results_df_out = process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_cut_off, vec_weight, orig_df_col, in_join_column, search_df_join_column)

+print("Search complete")
+
# If nothing found, return error message
if results_df_out.empty:
return 'No result found!', None
@@ -284,12 +278,30 @@ def jina_simple_retrieval(query_str:str, vectorstore, docs, orig_df_col:str, k_v
query_str_file = query_str.replace(" ", "_")

results_df_name = "semantic_search_result_" + today_rev + "_" + query_str_file + ".xlsx"
+
+print("Saving search output to file")
+progress(0.7, desc = "Saving search output to file")
+
results_df_out.to_excel(results_df_name, index= None)
results_first_text = results_df_out.iloc[0, 1]

+print("Returning results")
+
return results_first_text, results_df_name

# Deprecated Chroma functions - kept just in case needed in future.
+# Chroma support is currently deprecated
+# Import Chroma and instantiate a client. The default Chroma client is ephemeral, meaning it will not save to disk.
+#import chromadb
+#from chromadb.config import Settings
+#from typing_extensions import Protocol
+#from chromadb import Documents, EmbeddingFunction, Embeddings
+
+# Remove Chroma database file. If it exists as it can cause issues
+#chromadb_file = "chroma.sqlite3"
+
+#if os.path.isfile(chromadb_file):
+# os.remove(chromadb_file)

def docs_to_chroma_save_deprecated(docs_out, embeddings = embeddings_model, progress=gr.Progress()):
'''
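The embedding step now receives the embeddings already held in Gradio state and only encodes documents when that array is empty, which is what stops the embeddings file from being processed twice. A condensed sketch of that control flow (the real function also handles saving and progress reporting; embed_if_needed and encode_fn are illustrative names):

```python
import numpy as np

def embed_if_needed(texts, embeddings_state, encode_fn):
    # Only embed when nothing has been loaded into state yet
    if embeddings_state.size == 0:
        print("Starting to embed documents.")
        embeddings_out = encode_fn(texts)
    else:
        # Just return the embeddings that already exist in state
        embeddings_out = embeddings_state
    return "Document processing complete. Ready to search.", embeddings_out

fake_encoder = lambda texts: np.ones((len(texts), 4), dtype=np.float32)
msg, first = embed_if_needed(["a", "b"], np.array([]), fake_encoder)  # encodes
msg, again = embed_if_needed(["a", "b"], first, fake_encoder)         # reuses the cache
print(msg, first.shape, again is first)
```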
search_funcs/semantic_ingest_functions.py
CHANGED
@@ -294,12 +294,23 @@ def parse_metadata(row):

# return doc_sections, message

-def csv_excel_text_to_docs(df, in_file, text_column
+def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_intermediate_files = "No", chunk_size=None, progress=gr.Progress(track_tqdm=True)) -> List[Document]:
"""Converts a DataFrame's content to a list of dictionaries in the 'Document' format, containing page_content and associated metadata."""
+if not in_file:
+return None, "Please load in at least one file.", data_state, None, None, None
+
+progress(0, desc = "Loading in data")

file_list = [string.name for string in in_file]

data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower()]
+
+if not data_file_names:
+return doc_sections, "Please load in at least one csv/Excel/parquet data file."
+
+if not text_column:
+return None, "Please enter a column name to search", data_state, None, None, None
+
data_file_name = data_file_names[0]

# Check if file is a document format, and explode out as needed
@@ -326,6 +337,7 @@ def csv_excel_text_to_docs(df, in_file, text_column='text', clean = "No", return
df[text_column] = df[text_column].astype(str).str.strip() # Ensure column is a string column

if clean == "Yes":
+progress(0.1, desc = "Cleaning data")
clean_tic = time.perf_counter()
print("Starting data clean.")

@@ -352,6 +364,7 @@ def csv_excel_text_to_docs(df, in_file, text_column='text', clean = "No", return
#doc_sections = df[["page_content", "metadata"]].to_dict(orient='records')
#doc_sections = [Document(**row) for row in df[["page_content", "metadata"]].to_dict(orient='records')]

+progress(0.3, desc = "Converting data to document format")

# Create a list of Document objects
doc_sections = [Document(page_content=row['page_content'],
@@ -364,6 +377,7 @@ def csv_excel_text_to_docs(df, in_file, text_column='text', clean = "No", return
print(ingest_time_out)

if return_intermediate_files == "Yes":
+progress(0.5, desc = "Saving prepared documents")
data_file_out_name_no_ext = get_file_path_end(data_file_name)
file_name = data_file_out_name_no_ext
#print(doc_sections)