Sean-Case committed · Commit 3df8e40 · Parent(s): 200480d

Fixed data input for semantic search. Allowed for docs to be loaded in directly for semantic search. 0.2.1
Files changed:
- README.md (+1, -1)
- app.py (+14, -13)
- how_to_create_exe_dist.txt (+2, -2)
- search_funcs/bm25_functions.py (+66, -18)
- search_funcs/helper_functions.py (+5, -6)
- search_funcs/semantic_functions.py (+5, -5)
- search_funcs/semantic_ingest_functions.py (+2, -2)
README.md CHANGED

```diff
@@ -15,7 +15,7 @@ Search through long-form text fields in your tabular data. Either for exact, spe
 # Guide
 ## Keyword search
 
-1. Load in your tabular data file (.csv, .parquet, .xlsx - first sheet). If the 'Keyword search' folder has been prepared, select both of the .
+1. Load in your tabular data file (.csv, .parquet, .xlsx - first sheet). If the 'Keyword search' folder has been prepared, select both of the files in this folder (both the data file and the file ending 'search_index.pkl.gz') to load into the app.
 2. Wait for the file(s) to upload, then in the dropdown menu below 'Enter the name of the text column...' choose the column from the data file that you want to search.
 3. Hit 'Load data'. The 'Load progress' text box will let you know when the file is ready.
 4. In the 'Enter your search term' area below this, type in the key words you want to find in your text. Note that if the term is not spelled exactly as it is found in the text, it will not be found!
```
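As a side note, step 1's prepared-folder convention can be sanity-checked before upload. A minimal sketch, assuming a folder laid out as the README describes (the helper name and folder path are hypothetical, not part of the app):

```python
# Hypothetical helper: check a prepared 'Keyword search' folder for the
# file pair step 1 asks for (a data file plus a '*search_index.pkl.gz' index).
from pathlib import Path

def find_keyword_search_pair(folder: str):
    folder_path = Path(folder)
    # The prepared search index ends in 'search_index.pkl.gz', per step 1 above
    index_files = list(folder_path.glob("*search_index.pkl.gz"))
    # The data file itself: .csv, .parquet, or .xlsx, per step 1
    data_files = [p for ext in (".csv", ".parquet", ".xlsx")
                  for p in folder_path.glob(f"*{ext}")]
    return (data_files[0], index_files[0]) if data_files and index_files else None

print(find_keyword_search_pair("Keyword search"))  # None if the folder is not prepared
```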
app.py CHANGED

```diff
@@ -28,12 +28,13 @@ with block:
     out_passages = gr.State(9999)
     vec_weight = gr.State(1)
 
-    docs_keep_as_doc_state = gr.State()
-    doc_df_state = gr.State()
-    docs_keep_out_state = gr.State()
+    #docs_keep_as_doc_state = gr.State()
+    #doc_df_state = gr.State()
+    #docs_keep_out_state = gr.State()
 
     corpus_state = gr.State()
-
+    keyword_data_state = gr.State(pd.DataFrame())
+    semantic_data_state = gr.State(pd.DataFrame())
 
     in_k1_info = gr.State("""k1: Constant used for influencing the term frequency saturation. After saturation is reached, additional
     presence for the term adds a significantly less additional score. According to [1]_, experiments suggest
@@ -58,13 +59,13 @@ depends on factors such as the type of documents or queries. Information taken f
     """
     **Exact term keyword search**
 
-    1. Load in data file (ideally a file with '_cleaned' at the end of the name), with (optionally) the '...
+    1. Load in data file (ideally a file with '_cleaned' at the end of the name), with (optionally) the '...search_index.pkl.gz' in the same folder to save loading time. 2. Select the field in your data to search. A field with the suffix '_cleaned' means that html tags have been removed. 3. Wait for the data file to be prepared for search. 4. Enter the search term in the relevant box below and press Enter/click on 'Search text'. 4. Your search results will be saved in a csv file and will be presented in the 'File output' area below.
     """)
     with gr.Row():
        current_source = gr.Textbox(label="Current data source(s)", value="None")
 
     with gr.Accordion(label = "Load in data", open=True):
-        in_bm25_file = gr.File(label="Upload data for keyword search", file_count= 'multiple', file_types =
+        in_bm25_file = gr.File(label="Upload data for keyword search", file_count= 'multiple', file_types =['.parquet', '.csv', '.pkl', '.pkl.gz'])
     with gr.Row():
         in_bm25_column = gr.Dropdown(label="Enter the name of the text column in the data file to search")
         load_bm25_data_button = gr.Button(value="Load data")
@@ -148,22 +149,22 @@ depends on factors such as the type of documents or queries. Information taken f
 
     ### BM25 SEARCH ###
     # Update dropdowns upon initial file load
-    in_bm25_file.upload(put_columns_in_df, inputs=[in_bm25_file, in_bm25_column], outputs=[in_bm25_column, in_clean_data, search_df_join_column,
+    in_bm25_file.upload(put_columns_in_df, inputs=[in_bm25_file, in_bm25_column], outputs=[in_bm25_column, in_clean_data, search_df_join_column, keyword_data_state])
     in_join_file.upload(put_columns_in_join_df, inputs=[in_join_file, in_join_column], outputs=[in_join_column])
 
     # Load in BM25 data
-    load_bm25_data_button.click(fn=prepare_bm25_input_data, inputs=[in_bm25_file, in_bm25_column,
-        then(fn=prepare_bm25, inputs=[corpus_state, in_k1, in_b, in_alpha], outputs=[load_finished_message])#.\
+    load_bm25_data_button.click(fn=prepare_bm25_input_data, inputs=[in_bm25_file, in_bm25_column, keyword_data_state, in_clean_data, return_intermediate_files], outputs=[corpus_state, load_finished_message, keyword_data_state, output_file, output_file, current_source]).\
+        then(fn=prepare_bm25, inputs=[corpus_state, in_bm25_file, return_intermediate_files, in_k1, in_b, in_alpha], outputs=[load_finished_message, output_file])#.\
     #then(fn=put_columns_in_df, inputs=[in_bm25_file, in_bm25_column], outputs=[in_bm25_column, in_clean_data, search_df_join_column])
 
     # BM25 search functions on click or enter
-    keyword_search_button.click(fn=bm25_search, inputs=[keyword_query, in_no_search_results,
-    keyword_query.submit(fn=bm25_search, inputs=[keyword_query, in_no_search_results,
+    keyword_search_button.click(fn=bm25_search, inputs=[keyword_query, in_no_search_results, keyword_data_state, in_bm25_column, in_clean_data, in_join_file, in_join_column, search_df_join_column], outputs=[output_single_text, output_file], api_name="keyword")
+    keyword_query.submit(fn=bm25_search, inputs=[keyword_query, in_no_search_results, keyword_data_state, in_bm25_column, in_clean_data, in_join_file, in_join_column, search_df_join_column], outputs=[output_single_text, output_file])
 
     ### SEMANTIC SEARCH ###
     # Load in a csv/excel file for semantic search
-    in_semantic_file.upload(put_columns_in_df, inputs=[in_semantic_file, in_semantic_column], outputs=[in_semantic_column, in_clean_data, search_df_join_column,
-    load_semantic_data_button.click(parse_csv_or_excel, inputs=[in_semantic_file,
+    in_semantic_file.upload(put_columns_in_df, inputs=[in_semantic_file, in_semantic_column], outputs=[in_semantic_column, in_clean_data, search_df_join_column, semantic_data_state])
+    load_semantic_data_button.click(parse_csv_or_excel, inputs=[in_semantic_file, semantic_data_state, in_semantic_column], outputs=[ingest_text, current_source_semantic, semantic_load_progress]).\
         then(csv_excel_text_to_docs, inputs=[ingest_text, in_semantic_file, in_semantic_column, in_clean_data, return_intermediate_files], outputs=[ingest_docs, semantic_load_progress]).\
         then(docs_to_jina_embed_np_array, inputs=[ingest_docs, in_semantic_file, return_intermediate_files, embedding_super_compress], outputs=[semantic_load_progress, vectorstore_state, semantic_output_file])
```
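For readers unfamiliar with the pattern this diff leans on: the commit swaps the commented-out document states for `gr.State(pd.DataFrame())` values that upload and click handlers write to and later steps read from, chained with `.then()`. A minimal, self-contained sketch of that pattern, assuming a recent Gradio 3.x API (the component and function names here are illustrative, not the app's):

```python
# Sketch of the gr.State + event-chaining pattern used in app.py.
import gradio as gr
import pandas as pd

def load_file(file, state_df):
    df = pd.read_csv(file.name)
    return df, f"Loaded {len(df)} rows"   # first output replaces the State value

def row_count(state_df):
    return f"State holds a DataFrame with {len(state_df)} rows"

with gr.Blocks() as demo:
    data_state = gr.State(pd.DataFrame())  # per-session DataFrame, like keyword_data_state
    in_file = gr.File(label="Upload a csv")
    status = gr.Textbox(label="Status")
    # Chain two steps, as app.py does with .click(...).then(...)
    in_file.upload(load_file, inputs=[in_file, data_state], outputs=[data_state, status]).\
        then(row_count, inputs=[data_state], outputs=[status])

if __name__ == "__main__":
    demo.launch()
```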
how_to_create_exe_dist.txt CHANGED

```diff
@@ -17,10 +17,10 @@ NOTE: for ensuring that spaCy models are loaded into the program correctly in re
 8. In command line, cd to the folder that contains app.py. Then run the following:
 
 For one single file:
-python -m PyInstaller --additional-hooks-dir="build_deps\\" --hidden-import pyarrow.vendored.version --add-data="build_deps\\types.json;gradio_client" --add-data "model;model" --onefile --clean --noconfirm --name DataSearchApp_0.
+python -m PyInstaller --additional-hooks-dir="build_deps\\" --hidden-import pyarrow.vendored.version --add-data="build_deps\\types.json;gradio_client" --add-data "model;model" --onefile --clean --noconfirm --name DataSearchApp_0.2.1 app.py
 
 For a small exe with a folder of dependencies:
-python -m PyInstaller --additional-hooks-dir="build_deps\\" --hidden-import pyarrow.vendored.version --add-data="build_deps\\types.json;gradio_client" --add-data "model;model" --clean --noconfirm --name DataSearchApp_0.
+python -m PyInstaller --additional-hooks-dir="build_deps\\" --hidden-import pyarrow.vendored.version --add-data="build_deps\\types.json;gradio_client" --add-data "model;model" --clean --noconfirm --name DataSearchApp_0.2.1 app.py
 
 9. A 'dist' folder will be created with the executable inside along with all dependencies('dist\data_text_search').
```
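The same build can also be driven from Python rather than the shell. A sketch assuming PyInstaller is installed; `PyInstaller.__main__.run` accepts the same arguments as the command line above (note the ';' separator in --add-data is Windows-specific):

```python
# Sketch: invoking the one-file build programmatically with the same flags.
import PyInstaller.__main__

PyInstaller.__main__.run([
    "--additional-hooks-dir=build_deps\\",
    "--hidden-import", "pyarrow.vendored.version",
    "--add-data=build_deps\\types.json;gradio_client",  # ';' path separator: Windows only
    "--add-data", "model;model",
    "--onefile", "--clean", "--noconfirm",
    "--name", "DataSearchApp_0.2.1",
    "app.py",
])
```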
search_funcs/bm25_functions.py CHANGED

```diff
@@ -3,8 +3,10 @@ import heapq
 import math
 import pickle
 import sys
+import gzip
 import time
 import pandas as pd
+import numpy as np
 from numpy import inf
 import gradio as gr
 
@@ -235,7 +237,7 @@ def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", retur
 
     #print(file_list)
 
-    data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower()]
+    data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower() and "gz" not in string.lower()]
 
     data_file_name = data_file_names[0]
 
@@ -247,20 +249,24 @@
     tokenised_df = pd.DataFrame()
 
     tokenised_file_names = [string.lower() for string in file_list if "tokenised" in string.lower()]
+    search_index_file_names = [string.lower() for string in file_list if "gz" in string.lower()]
+
+    df[text_column] = df[text_column].astype(str).str.lower()
+
+    if search_index_file_names:
+        corpus = list(df[text_column])
+        message = "Tokenisation skipped - loading search index from file."
+        print(message)
+        return corpus, message, df, None, None, None
 
     if tokenised_file_names:
         tokenised_df = read_file(tokenised_file_names[0])
-        #print("Tokenised df is: ", tokenised_df.head())
-
-    #df = pd.read_parquet(file_in.name)
-
-    df[text_column] = df[text_column].astype(str).str.lower()
 
     if clean == "Yes":
         clean_tic = time.perf_counter()
         print("Starting data clean.")
 
-        df = df.drop_duplicates(text_column)
+        #df = df.drop_duplicates(text_column)
         df_list = list(df[text_column])
         df_list = initial_clean(df_list)
 
@@ -336,20 +342,62 @@ def save_prepared_bm25_data(in_file_name, prepared_text_list, in_df, in_bm25_col
 
     return file_name, new_text_column
 
-def prepare_bm25(corpus, k1=1.5, b = 0.75, alpha=-5):
-
-    bm25 = BM25(corpus, k1=k1, b=b, alpha=alpha)
-
+def prepare_bm25(corpus, in_file, return_intermediate_files, k1=1.5, b = 0.75, alpha=-5):
+    #bm25.save("saved_df_bm25")
+    #bm25 = BM25.load(re.sub(r'\.pkl$', '', file_in.name))
+
+    file_list = [string.name for string in in_file]
+
+    #print(file_list)
+
+    # Get data file name
+    data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower() and "gz" not in string.lower()]
+
+    data_file_name = data_file_names[0]
+    data_file_out_name = get_file_path_end_with_ext(data_file_name)
+    data_file_name_no_ext = get_file_path_end(data_file_name)
+
+    # Check if there is a search index file already
+    index_file_names = [string.lower() for string in file_list if "gz" in string.lower()]
+
+    if index_file_names:
+        index_file_name = index_file_names[0]
+
+        print(index_file_name)
+
+        bm25_load = read_file(index_file_name)
+
+        #index_file_out_name = get_file_path_end_with_ext(index_file_name)
+        #index_file_name_no_ext = get_file_path_end(index_file_name)
+
+    else:
+        print("Preparing BM25 corpus")
+
+        bm25_load = BM25(corpus, k1=k1, b=b, alpha=alpha)
+
+    global bm25
+    bm25 = bm25_load
+
+    if return_intermediate_files == "Yes":
+        bm25_search_file_name = data_file_name_no_ext + '_' + 'search_index.pkl.gz'
+        #np.savez_compressed(bm25_search_file_name, bm25)
+
+        with gzip.open(bm25_search_file_name, 'wb') as file:
+            pickle.dump(bm25, file)
+
+        print("Search index saved to file")
+
+        message = "Search parameters loaded."
+
+        return message, bm25_search_file_name
+
+    message = "Search parameters loaded."
+
+    print(message)
+
+    return message, None
 
 def convert_bm25_query_to_tokens(free_text_query, clean="No"):
     '''
@@ -418,8 +466,8 @@ def bm25_search(free_text_query, in_no_search_results, original_data, text_colum
 
     # Out file
     query_str_file = ("_").join(token_query)
-    results_df_name = "keyword_search_result_" + today_rev + "_" + query_str_file + ".
-    results_df_out.
+    results_df_name = "keyword_search_result_" + today_rev + "_" + query_str_file + ".xlsx"
+    results_df_out.to_excel(results_df_name, index= None)
     results_first_text = results_df_out[text_column].iloc[0]
 
     print("Returning results")
```
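The core of this file's change is caching the BM25 index as a gzipped pickle: `prepare_bm25` writes a `*_search_index.pkl.gz` when intermediate files are requested, and both it and `prepare_bm25_input_data` skip corpus preparation when such a file is uploaded. A minimal sketch of that round trip (`DummyIndex` stands in for the app's BM25 class):

```python
# Sketch of the index-caching round trip: pickle an index object into a gzip
# stream on save, and read it back with gzip + pickle on load (as read_file
# does for the '.pkl.gz' suffix).
import gzip
import pickle

class DummyIndex:
    """Stand-in for the app's BM25 class."""
    def __init__(self, corpus):
        self.doc_count = len(corpus)

corpus = [["first", "document"], ["second", "document"]]
index = DummyIndex(corpus)

# Save, as prepare_bm25 does when return_intermediate_files == "Yes"
with gzip.open("example_search_index.pkl.gz", "wb") as f:
    pickle.dump(index, f)

# Load, as read_file does for 'pkl.gz' files
with gzip.open("example_search_index.pkl.gz", "rb") as f:
    loaded = pickle.load(f)

print(loaded.doc_count)  # 2: the index survives the round trip
```

This is why reloading a prepared folder is fast: the tokenisation and index-building steps are bypassed entirely when the gzipped index is present.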
search_funcs/helper_functions.py CHANGED

```diff
@@ -2,10 +2,6 @@ import os
 import re
 import pandas as pd
 import gradio as gr
-
-import os
-import shutil
-
 import os
 import shutil
 import getpass
@@ -35,7 +31,6 @@ def empty_folder(directory_path):
 
 
 
-
 def get_file_path_end(file_path):
     # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
     basename = os.path.basename(file_path)
@@ -64,6 +59,8 @@ def detect_file_type(filename):
         return 'parquet'
     elif filename.endswith('.pkl.gz'):
         return 'pkl.gz'
+    #elif filename.endswith('.gz'):
+    #    return 'gz'
     else:
         raise ValueError("Unsupported file type.")
 
@@ -82,7 +79,9 @@ def read_file(filename):
     elif file_type == 'pkl.gz':
         with gzip.open(filename, 'rb') as file:
             file = pickle.load(file)
-
+    #elif file_type == ".gz":
+    #    with gzip.open(filename, 'rb') as file:
+    #        file = pickle.load(file)
 
     print("File load complete")
 
```
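helper_functions.py dispatches on the file suffix, and ordering matters here: every '.pkl.gz' name also ends with '.gz', which is likely why the generic '.gz' branch stayed commented out below the '.pkl.gz' one. A condensed sketch of the dispatch (simplified from the file, with only three branches):

```python
# Sketch of the suffix-dispatch idea behind detect_file_type/read_file.
import gzip
import pickle
import pandas as pd

def detect_file_type_sketch(filename: str) -> str:
    if filename.endswith(".csv"):
        return "csv"
    elif filename.endswith(".parquet"):
        return "parquet"
    elif filename.endswith(".pkl.gz"):   # must be tested before any plain '.gz' rule
        return "pkl.gz"
    else:
        raise ValueError("Unsupported file type.")

def read_file_sketch(filename: str):
    file_type = detect_file_type_sketch(filename)
    if file_type == "csv":
        return pd.read_csv(filename)
    elif file_type == "parquet":
        return pd.read_parquet(filename)
    elif file_type == "pkl.gz":
        with gzip.open(filename, "rb") as f:
            return pickle.load(f)   # gzipped pickle, e.g. the BM25 search index
```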
search_funcs/semantic_functions.py CHANGED

```diff
@@ -96,10 +96,10 @@ def docs_to_jina_embed_np_array(docs_out, in_file, return_intermediate_files = "
     ## Load in pre-embedded file if exists
     file_list = [string.name for string in in_file]
 
-    print(file_list)
+    #print(file_list)
 
-    embeddings_file_names = [string.lower() for string in file_list if "
-    data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower()]
+    embeddings_file_names = [string.lower() for string in file_list if "embedding" in string.lower()]
+    data_file_names = [string.lower() for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower()]# and "gz" not in string.lower()]
     data_file_name = data_file_names[0]
     data_file_name_no_ext = get_file_path_end(data_file_name)
 
@@ -283,8 +283,8 @@ def jina_simple_retrieval(query_str:str, vectorstore, docs, orig_df_col:str, k_v
 
     query_str_file = query_str.replace(" ", "_")
 
-    results_df_name = "semantic_search_result_" + today_rev + "_" + query_str_file + ".
-    results_df_out.
+    results_df_name = "semantic_search_result_" + today_rev + "_" + query_str_file + ".xlsx"
+    results_df_out.to_excel(results_df_name, index= None)
     results_first_text = results_df_out.iloc[0, 1]
 
     return results_first_text, results_df_name
```
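Both search paths now export results with pandas' `to_excel`, producing an .xlsx file named after the date and query. A small sketch of that export in isolation (the DataFrame contents and date string are made up; writing .xlsx requires an engine such as openpyxl to be installed):

```python
# Sketch of the new result-export step, mirroring the file-name pattern in the diff.
import pandas as pd

today_rev = "20240101"                        # stand-in for the app's date string
query_str_file = "example query".replace(" ", "_")

results_df_out = pd.DataFrame({"search_text": ["first hit", "second hit"],
                               "search_score": [91.2, 87.5]})   # hypothetical columns

results_df_name = "semantic_search_result_" + today_rev + "_" + query_str_file + ".xlsx"
results_df_out.to_excel(results_df_name, index=None)  # index=None drops the row index, as in the diff
print("Wrote", results_df_name)
```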
search_funcs/semantic_ingest_functions.py CHANGED

```diff
@@ -130,7 +130,7 @@ def parse_csv_or_excel(file_path, data_state, text_column = "text"):
 
     #print(file_list)
 
-    data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower()]
+    data_file_names = [string.lower() for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower()]# and "gz" not in string.lower()]
 
     data_file_name = data_file_names[0]
 
@@ -329,7 +329,7 @@ def csv_excel_text_to_docs(df, in_file, text_column='text', clean = "No", return
     clean_tic = time.perf_counter()
     print("Starting data clean.")
 
-    df = df.drop_duplicates(text_column)
+    #df = df.drop_duplicates(text_column)
 
     df[text_column] = initial_clean(df[text_column])
     df_list = list(df[text_column])
```
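Finally, both cleaning paths comment out `df.drop_duplicates(text_column)`. For reference, a sketch of what that call did: with a column name as the positional (subset) argument, it keeps only the first row per distinct value of that column, so disabling it preserves every row for ingest:

```python
# Sketch of the now-disabled dedup step and its effect on row counts.
import pandas as pd

df = pd.DataFrame({"id": [1, 2, 3],
                   "text": ["same text", "same text", "other text"]})

deduped = df.drop_duplicates("text")   # equivalent to subset="text"
print(len(df), "rows before,", len(deduped), "rows after")  # 3 rows before, 2 rows after
```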