Commit 200480d
Parent: f2db299

Minor changes to file path for outputs, documentation, location of pyinstaller build dependencies

Files changed:
- .gitignore  +2 -1
- README.md  +4 -4
- app.py  +2 -2
- hook-en_core_web_sm.py  +0 -8
- hook-gradio.py  +0 -8
- how_to_create_exe_dist.txt  +2 -2
- search_funcs/bm25_functions.py  +43 -42
- search_funcs/helper_functions.py  +1 -1
- search_funcs/semantic_functions.py  +13 -13
- search_funcs/semantic_ingest_functions.py  +4 -4
.gitignore CHANGED
@@ -20,4 +20,5 @@ dist/*
 __pycache__/*
 db/*
 experiments/*
-model/*
+model/*
+build_deps/*
README.md CHANGED
@@ -15,8 +15,8 @@ Search through long-form text fields in your tabular data. Either for exact, spe
 # Guide
 ## Keyword search
 
-1. Load in your tabular data file (.csv, .parquet, .xlsx - first sheet).
-2. Wait
+1. Load in your tabular data file (.csv, .parquet, .xlsx - first sheet). If the 'Keyword search' folder has been prepared, select both of the .parquet files in this folder (both the file with and without 'tokenised' in the name) to load into the app.
+2. Wait for the file(s) to upload, then in the dropdown menu below 'Enter the name of the text column...' choose the column from the data file that you want to search.
 3. Hit 'Load data'. The 'Load progress' text box will let you know when the file is ready.
 4. In the 'Enter your search term' area below this, type in the key words you want to find in your text. Note that if the term is not spelled exactly as it is found in the text, it will not be found!
 5. Hit search text. You may have to wait depending on the size of the data you are searching.
@@ -26,8 +26,8 @@ Search through long-form text fields in your tabular data. Either for exact, spe
 
 This search type enables you to search for broader themes (e.g. happiness, nature) and the search will pick out text passages that relate to these themes even if they don't contain the exact words.
 
-1. Load in your tabular data file (.csv, .parquet, .xlsx - first sheet).
-2. Wait
+1. Load in your tabular data file (.csv, .parquet, .xlsx - first sheet). If the 'Semantic search' folder has been prepared, select both of the files in this folder to load into the app. This should be a '..prepared_docs.pkl.gz' file, and a '...embeddings_compress.npz' or 'embeddings.npz' file.
+2. Wait for the file(s) to upload, then in the dropdown menu below 'Enter the name of the text column...' choose the column from the data file that you want to search. If the 'Semantic search' folder has been prepared, this field should be 'page_contents'.
 3. Hit 'Load data'. The 'Load progress' text box will let you know when the file is ready.
 4. In the 'Enter semantic search query here' area below this, type in the terms you would like to search for.
 5. Press 'Start semantic search'. You may have to wait depending on the size of the data you are searching.
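For anyone preparing these files outside the app, the following is a minimal sketch (not part of this commit) of inspecting the prepared 'Keyword search' and 'Semantic search' files before upload. File names are made-up examples that follow the naming patterns mentioned above, and the sketch assumes the .pkl.gz file is a gzip-compressed pickle, as its extension suggests.

    import gzip
    import pickle

    import numpy as np
    import pandas as pd

    # Keyword search inputs: the data file plus its 'tokenised' companion parquet
    df = pd.read_parquet("keyword_search/responses_cleaned.parquet")
    tokenised = pd.read_parquet("keyword_search/responses_cleaned_tokenised.parquet")
    print(df.columns)  # choose the text column to search from these

    # Semantic search inputs: prepared documents plus an embeddings .npz file
    with gzip.open("semantic_search/responses_prepared_docs.pkl.gz", "rb") as f:
        docs = pickle.load(f)

    embeddings = np.load("semantic_search/responses_embeddings.npz")["arr_0"]
    print(embeddings.shape)  # one embedding vector per document/passage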
app.py CHANGED
@@ -58,7 +58,7 @@ depends on factors such as the type of documents or queries. Information taken f
    """
    **Exact term keyword search**
 
-    1. Load in data file (ideally a file with '_cleaned' at the end of the name), with (optionally) the '...
+    1. Load in data file (ideally a file with '_cleaned' at the end of the name), with (optionally) the '...tokenised.parquet' in the same folder to save loading time. 2. Select the field in your data to search. A field with the suffix '_cleaned' means that html tags have been removed. 3. Wait for the data file to be prepared for search. 4. Enter the search term in the relevant box below and press Enter/click on 'Search text'. 4. Your search results will be saved in a csv file and will be presented in the 'File output' area below.
    """)
    with gr.Row():
        current_source = gr.Textbox(label="Current data source(s)", value="None")
@@ -88,7 +88,7 @@ depends on factors such as the type of documents or queries. Information taken f
    """
    **Thematic/semantic search**
 
-    This search type enables you to search for broader themes (e.g. happiness, nature) and the search will pick out text passages that relate to these themes even if they don't contain the exact words. 1. Load in data file (ideally a file with '_cleaned' at the end of the name), with (optionally) the '
+    This search type enables you to search for broader themes (e.g. happiness, nature) and the search will pick out text passages that relate to these themes even if they don't contain the exact words. 1. Load in data file (ideally a file with '_cleaned' at the end of the name, a pkl.gz file), with (optionally) the 'embeddings... .npz' file in the same folder to save loading time. 2. Select the field in your data to search. If you loaded in a documents pkl.gz file, this will be 'page_contents'. 3. Wait for the data file to be prepared for search. 4. Enter the search term in the 'Enter semantic search query here' box below and press Enter/click on 'Start semantic search'. 4. Your search results will be saved in a csv file and will be presented in the 'File output' area below.
    """)
    with gr.Row():
        current_source_semantic = gr.Textbox(label="Current data source(s)", value="None")
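The app.py changes above only swap the Markdown help text inside the existing Gradio layout. As a rough, illustrative sketch of that layout pattern (not the repository's actual code), the help text and the status Textbox sit together like this:

    import gradio as gr

    with gr.Blocks() as block:
        gr.Markdown(
            """
            **Exact term keyword search**

            1. Load in data file. 2. Select the field to search. 3. Wait for the data to be prepared. 4. Enter the search term and press 'Search text'.
            """)
        with gr.Row():
            # Read-only style status box showing which file(s) are currently loaded
            current_source = gr.Textbox(label="Current data source(s)", value="None")

    block.launch()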
hook-en_core_web_sm.py DELETED
@@ -1,8 +0,0 @@
-from PyInstaller.utils.hooks import collect_data_files
-
-hiddenimports = [
-    'en_core_web_sm'
-]
-
-# Use collect_data_files to find data files. Replace 'en_core_web_sm' with the correct package name if it's different.
-datas = collect_data_files('en_core_web_sm')
hook-gradio.py DELETED
@@ -1,8 +0,0 @@
-from PyInstaller.utils.hooks import collect_data_files
-
-hiddenimports = [
-    'gradio'
-]
-
-# Use collect_data_files to find data files. Replace 'gradio' with the correct package name if it's different.
-datas = collect_data_files('gradio')
how_to_create_exe_dist.txt CHANGED
@@ -17,10 +17,10 @@ NOTE: for ensuring that spaCy models are loaded into the program correctly in re
 8. In command line, cd to the folder that contains app.py. Then run the following:
 
 For one single file:
-python -m PyInstaller --additional-hooks-dir
+python -m PyInstaller --additional-hooks-dir="build_deps\\" --hidden-import pyarrow.vendored.version --add-data="build_deps\\types.json;gradio_client" --add-data "model;model" --onefile --clean --noconfirm --name DataSearchApp_0.1.1 app.py
 
 For a small exe with a folder of dependencies:
-python -m PyInstaller --additional-hooks-dir
+python -m PyInstaller --additional-hooks-dir="build_deps\\" --hidden-import pyarrow.vendored.version --add-data="build_deps\\types.json;gradio_client" --add-data "model;model" --clean --noconfirm --name DataSearchApp_0.1.1 app.py
 
 9. A 'dist' folder will be created with the executable inside along with all dependencies('dist\data_text_search').
 
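The two hook files deleted above are standard PyInstaller hooks; given the commit message and the --additional-hooks-dir="build_deps\\" flag in the updated commands, they presumably now live under build_deps/ (an assumption, not confirmed by this diff). For reference, a hook of this shape, mirroring the deleted hook-gradio.py, is just:

    # build_deps/hook-gradio.py  (presumed location - assumption based on the commit message;
    # the body mirrors the deleted file above)
    from PyInstaller.utils.hooks import collect_data_files

    # Ensure gradio is bundled even if PyInstaller's import analysis misses it
    hiddenimports = ['gradio']

    # Copy gradio's non-Python data files (templates, frontend assets) into the build
    datas = collect_data_files('gradio')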
search_funcs/bm25_functions.py CHANGED
@@ -235,7 +235,7 @@ def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", retur
 
    #print(file_list)
 
-    data_file_names = [string for string in file_list if "tokenised" not in string and "
+    data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower()]
 
    data_file_name = data_file_names[0]
 
@@ -246,7 +246,7 @@ def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", retur
    ## Load in pre-tokenised corpus if exists
    tokenised_df = pd.DataFrame()
 
-    tokenised_file_names = [string for string in file_list if "tokenised" in string]
+    tokenised_file_names = [string.lower() for string in file_list if "tokenised" in string.lower()]
 
    if tokenised_file_names:
        tokenised_df = read_file(tokenised_file_names[0])
@@ -303,7 +303,7 @@ def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", retur
        message = "Data loaded. Warning: dataset may be too short to get consistent search results."
 
    if return_intermediate_files == "Yes":
-        tokenised_data_file_name = data_file_out_name_no_ext + "_" + "
+        tokenised_data_file_name = data_file_out_name_no_ext + "_" + "tokenised.parquet"
        pd.DataFrame(data={"Corpus":corpus}).to_parquet(tokenised_data_file_name)
 
    return corpus, message, df, out_file_name, tokenised_data_file_name, data_file_out_name
@@ -374,53 +374,54 @@ def convert_bm25_query_to_tokens(free_text_query, clean="No"):
 
 def bm25_search(free_text_query, in_no_search_results, original_data, text_column, clean = "No", in_join_file = None, in_join_column = "", search_df_join_column = ""):
 
-    print("Searching")
-                                    "search_score_abs": results_scores})
-    results_df['search_score_abs'] = abs(round(results_df['search_score_abs'], 2))
-    results_df_out = results_df[['index', 'search_text', 'search_score_abs']].merge(original_data,left_on="index", right_index=True, how="left")#.drop("index", axis=1)
-
-    # Join on additional files
-    if in_join_file:
-        join_filename = in_join_file.name
-
-        join_df[in_join_column] = join_df[in_join_column].astype(str).str.replace("\.0$","", regex=True)
-        results_df_out[search_df_join_column] = results_df_out[search_df_join_column].astype(str).str.replace("\.0$","", regex=True)
-
-        join_df = join_df.drop_duplicates(in_join_column)
-
-    # Reorder results by score
-    results_df_out = results_df_out.sort_values('search_score_abs', ascending=False)
+    # Prepare query
+    if (clean == "Yes") | (text_column.endswith("_cleaned")):
+        token_query = convert_bm25_query_to_tokens(free_text_query, clean="Yes")
+    else:
+        token_query = convert_bm25_query_to_tokens(free_text_query, clean="No")
+
+    #print(token_query)
+
+    # Perform search
+    print("Searching")
+
+    results_index, results_text, results_scores = bm25.extract_documents_and_scores(token_query, bm25.corpus, n=in_no_search_results) #bm25.corpus #original_data[text_column]
+    if not results_index:
+        return "No search results found", None, token_query
+
+    print("Search complete")
+
+    # Prepare results and export
+    joined_texts = [' '.join(inner_list) for inner_list in results_text]
+    results_df = pd.DataFrame(data={"index": results_index,
+                                    "search_text": joined_texts,
+                                    "search_score_abs": results_scores})
+    results_df['search_score_abs'] = abs(round(results_df['search_score_abs'], 2))
+    results_df_out = results_df[['index', 'search_text', 'search_score_abs']].merge(original_data,left_on="index", right_index=True, how="left")#.drop("index", axis=1)
+
+    # Join on additional files
+    if in_join_file:
+        join_filename = in_join_file.name
+
+        # Import data
+        join_df = read_file(join_filename)
+        join_df[in_join_column] = join_df[in_join_column].astype(str).str.replace("\.0$","", regex=True)
+        results_df_out[search_df_join_column] = results_df_out[search_df_join_column].astype(str).str.replace("\.0$","", regex=True)
+
+        # Duplicates dropped so as not to expand out dataframe
+        join_df = join_df.drop_duplicates(in_join_column)
+
+        results_df_out = results_df_out.merge(join_df,left_on=search_df_join_column, right_on=in_join_column, how="left").drop(in_join_column, axis=1)
+
+    # Reorder results by score
+    results_df_out = results_df_out.sort_values('search_score_abs', ascending=False)
+
+    # Out file
+    query_str_file = ("_").join(token_query)
+    results_df_name = "keyword_search_result_" + today_rev + "_" + query_str_file + ".csv"
+    results_df_out.to_csv(results_df_name, index= None)
+    results_first_text = results_df_out[text_column].iloc[0]
+
+    print("Returning results")
+
+    return results_first_text, results_df_name, token_query
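The recurring one-line change in this and the following files (helper_functions.py, semantic_functions.py, semantic_ingest_functions.py) swaps string for string.lower() in the file-name filters. A small illustration of the effect (file names invented for the example): the 'npz' check becomes case-insensitive and the surviving names are returned lower-cased.

    file_list = ["Consultation_responses_cleaned.parquet",
                 "Consultation_responses_cleaned_tokenised.parquet",
                 "Consultation_responses_embedding_compress.NPZ"]

    # Exclude tokenised and embeddings files, keeping only candidate data files
    data_file_names = [string.lower() for string in file_list
                       if "tokenised" not in string and "npz" not in string.lower()]

    print(data_file_names)  # ['consultation_responses_cleaned.parquet']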
search_funcs/helper_functions.py CHANGED
@@ -97,7 +97,7 @@ def put_columns_in_df(in_file, in_bm25_column):
 
    #print(file_list)
 
-    data_file_names = [string for string in file_list if "tokenised" not in string and "
+    data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower()]
    data_file_name = data_file_names[0]
 
    new_choices = []
search_funcs/semantic_functions.py CHANGED
@@ -96,10 +96,10 @@ def docs_to_jina_embed_np_array(docs_out, in_file, return_intermediate_files = "
    ## Load in pre-embedded file if exists
    file_list = [string.name for string in in_file]
 
+    print(file_list)
 
-    embeddings_file_names = [string for string in file_list if "
-    data_file_names = [string for string in file_list if "tokenised" not in string]
+    embeddings_file_names = [string.lower() for string in file_list if "npz" in string.lower()]
+    data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower()]
    data_file_name = data_file_names[0]
    data_file_name_no_ext = get_file_path_end(data_file_name)
 
@@ -110,7 +110,7 @@ def docs_to_jina_embed_np_array(docs_out, in_file, return_intermediate_files = "
        embeddings_out = np.load(embeddings_file_names[0])['arr_0']
 
        # If embedding files have 'super_compress' in the title, they have been multiplied by 100 before save
-        if "
+        if "compress" in embeddings_file_names[0]:
            embeddings_out /= 100
 
        # print("embeddings loaded: ", embeddings_out)
@@ -125,8 +125,6 @@ def docs_to_jina_embed_np_array(docs_out, in_file, return_intermediate_files = "
        embeddings_out = embeddings.encode(sentences=page_contents, max_length=1024, show_progress_bar = True, batch_size = 32) # For Jina embeddings
        #embeddings_list = embeddings.encode(sentences=page_contents, normalize_embeddings=True).tolist() # For BGE embeddings
        #embeddings_list = embeddings.encode(sentences=page_contents).tolist() # For minilm
-
-
 
    toc = time.perf_counter()
    time_out = f"The embedding took {toc - tic:0.1f} seconds"
@@ -135,10 +133,10 @@ def docs_to_jina_embed_np_array(docs_out, in_file, return_intermediate_files = "
    # If you want to save your files for next time
    if return_intermediate_files == "Yes":
        if embeddings_super_compress == "No":
-            semantic_search_file_name = data_file_name_no_ext + '_' + '
+            semantic_search_file_name = data_file_name_no_ext + '_' + 'embeddings.npz'
            np.savez_compressed(semantic_search_file_name, embeddings_out)
        else:
-            semantic_search_file_name = data_file_name_no_ext + '_' + '
+            semantic_search_file_name = data_file_name_no_ext + '_' + 'embedding_compress.npz'
            embeddings_out_round = np.round(embeddings_out, 3)
            embeddings_out_round *= 100 # Rounding not currently used
            np.savez_compressed(semantic_search_file_name, embeddings_out_round)
@@ -231,7 +229,7 @@ def process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_c
 
    return results_df_out
 
-def jina_simple_retrieval(
+def jina_simple_retrieval(query_str:str, vectorstore, docs, orig_df_col:str, k_val:int, out_passages:int,
                          vec_score_cut_off:float, vec_weight:float, in_join_file = None, in_join_column = None, search_df_join_column = None, device = torch_device, embeddings = embeddings_model, progress=gr.Progress()): # ,vectorstore, embeddings
 
    # print("vectorstore loaded: ", vectorstore)
@@ -243,7 +241,7 @@ def jina_simple_retrieval(new_question_kworded:str, vectorstore, docs, orig_df_c
    embeddings = embeddings.to(device)
 
    # Encode the query using the sentence transformer and convert to a PyTorch tensor
-    query = embeddings.encode(
+    query = embeddings.encode(query_str)
    query_tensor = tensor(query).to(device)
 
    if query_tensor.dim() == 1:
@@ -282,8 +280,10 @@ def jina_simple_retrieval(new_question_kworded:str, vectorstore, docs, orig_df_c
    # If nothing found, return error message
    if results_df_out.empty:
        return 'No result found!', None
+
+    query_str_file = query_str.replace(" ", "_")
 
-    results_df_name = "semantic_search_result_" + today_rev + ".csv"
+    results_df_name = "semantic_search_result_" + today_rev + "_" + query_str_file + ".csv"
    results_df_out.to_csv(results_df_name, index= None)
    results_first_text = results_df_out.iloc[0, 1]
 
@@ -394,10 +394,10 @@ def docs_to_chroma_save_deprecated(docs_out, embeddings = embeddings_model, prog
 
    return out_message, collection
 
-def chroma_retrieval_deprecated(
+def chroma_retrieval_deprecated(query_str:str, vectorstore, docs, orig_df_col:str, k_val:int, out_passages:int,
                                vec_score_cut_off:float, vec_weight:float, in_join_file = None, in_join_column = None, search_df_join_column = None, embeddings = embeddings_model): # ,vectorstore, embeddings
 
-    query = embeddings.encode(
+    query = embeddings.encode(query_str).tolist()
 
    docs = vectorstore.query(
        query_embeddings=query,
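As a minimal sketch (illustration only, not repository code) of the 'super compressed' embedding round trip handled above: values are rounded, scaled by 100 and written with np.savez_compressed, and any loaded file with 'compress' in its name is divided by 100 to undo the scaling. The file name below is a made-up example.

    import numpy as np

    embeddings_out = np.random.rand(5, 768).astype(np.float32)  # stand-in embeddings

    # Save path (the embeddings_super_compress == "Yes" branch)
    embeddings_out_round = np.round(embeddings_out, 3)
    embeddings_out_round *= 100
    np.savez_compressed("example_embedding_compress.npz", embeddings_out_round)

    # Load path: savez_compressed stores a positional array under the key 'arr_0'
    embeddings_loaded = np.load("example_embedding_compress.npz")["arr_0"]
    if "compress" in "example_embedding_compress.npz":
        embeddings_loaded /= 100

    print(np.allclose(embeddings_loaded, embeddings_out, atol=5e-3))  # True, within rounding error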
search_funcs/semantic_ingest_functions.py CHANGED
@@ -130,7 +130,7 @@ def parse_csv_or_excel(file_path, data_state, text_column = "text"):
 
    #print(file_list)
 
-    data_file_names = [string for string in file_list if "tokenised" not in string and "
+    data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower()]
 
    data_file_name = data_file_names[0]
 
@@ -299,7 +299,7 @@ def csv_excel_text_to_docs(df, in_file, text_column='text', clean = "No", return
 
    file_list = [string.name for string in in_file]
 
-    data_file_names = [string for string in file_list if "tokenised" not in string and "
+    data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower()]
    data_file_name = data_file_names[0]
 
    # Check if file is a document format, and explode out as needed
@@ -312,7 +312,7 @@ def csv_excel_text_to_docs(df, in_file, text_column='text', clean = "No", return
 
    doc_sections = df
 
-    print(doc_sections[0])
+    #print(doc_sections[0])
 
    # Convert each element in the Series to a Document instance
    #doc_sections = section_series.apply(lambda x: Document(**x))
@@ -365,7 +365,7 @@ def csv_excel_text_to_docs(df, in_file, text_column='text', clean = "No", return
 
    if return_intermediate_files == "Yes":
        data_file_out_name_no_ext = get_file_path_end(data_file_name)
-        file_name = data_file_out_name_no_ext
+        file_name = data_file_out_name_no_ext
        #print(doc_sections)
        #page_content_series_string = pd.Series(doc_sections).astype(str)
        #page_content_series_string = page_content_series_string.str.replace(" type='Document'", "").str.replace("' metadata=", "', 'metadata':").str.replace("page_content=", "{'page_content':")