seanpedrickcase committed
Commit 650da6e
Parent(s): 58d3f97

Improvements with embeddings load and file save

Files changed:
- app.py +3 -3
- search_funcs/helper_functions.py +7 -9
- search_funcs/semantic_functions.py +3 -3
- search_funcs/semantic_ingest_functions.py +26 -13
app.py CHANGED
@@ -7,7 +7,7 @@ PandasDataFrame = Type[pd.DataFrame]
 
 from search_funcs.bm25_functions import prepare_bm25_input_data, prepare_bm25, bm25_search
 from search_funcs.semantic_ingest_functions import csv_excel_text_to_docs
-from search_funcs.semantic_functions import load_embedding_model,
+from search_funcs.semantic_functions import load_embedding_model, docs_to_embed_np_array, bge_semantic_search
 from search_funcs.helper_functions import display_info, initial_data_load, put_columns_in_join_df, get_connection_params, output_folder, get_or_create_env_var # Not currently used: get_temp_folder_path, empty_folder,
 from search_funcs.spacy_search_funcs import spacy_fuzzy_search
 from search_funcs.aws_functions import load_data_from_aws
@@ -99,7 +99,7 @@ depends on factors such as the type of documents or queries. Information taken f
 """
 **Thematic/semantic search**
 
-This search type enables you to search for general terms (e.g. happiness, nature) and the search will pick out text passages that are most semantically similar to them. 1. Load in data file (ideally a file with '_cleaned' at the end of the name, a pkl.gz file), with (optionally) the '
+This search type enables you to search for general terms (e.g. happiness, nature) and the search will pick out text passages that are most semantically similar to them. 1. Load in data file (ideally a file with '_cleaned' at the end of the name, a pkl.gz file), with (optionally) the 'embed... .npz' file in the same folder to save loading time. 2. Select the field in your data to search. 3. Wait for the data file to be prepared for search. 4. Enter the search term in the 'Enter semantic search query here' box below and press Enter/click on 'Start semantic search'. 4. Your search results will be saved in a csv file and will be presented in the 'File output' area below.
 """)
 
 with gr.Row():
@@ -202,7 +202,7 @@ depends on factors such as the type of documents or queries. Information taken f
 
     load_semantic_data_button.click(
        csv_excel_text_to_docs, inputs=[semantic_data_state, in_semantic_file, in_semantic_column, in_clean_data, return_intermediate_files], outputs=[semantic_input_document_format, semantic_load_progress, output_file_state], api_name="convert_texts_to_documents").\
-       then(
+       then(docs_to_embed_np_array, inputs=[semantic_input_document_format, in_semantic_file, output_file_state, in_clean_data, embeddings_state, embeddings_model_name_state, embeddings_model_loc_state, return_intermediate_files, embeddings_compress], outputs=[semantic_load_progress, embeddings_state, semantic_output_file, output_file_state, embeddings_model_state], api_name="embed_documents")
 
     # Semantic search query
     semantic_submit.click(bge_semantic_search, inputs=[semantic_query, embeddings_state, semantic_input_document_format, semantic_k_val, semantic_min_distance, embeddings_model_state, embeddings_model_name_state, embeddings_compress, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic_search")
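Note on the event wiring in the hunk above: Gradio lets one handler feed the next through .click(...).then(...), passing intermediate results via gr.State components, which is how csv_excel_text_to_docs hands its prepared documents to docs_to_embed_np_array here. A minimal, self-contained sketch of that pattern follows; the prepare/embed functions and component names are hypothetical stand-ins, not this app's code.

import gradio as gr

def prepare(text):
    # Stand-in for csv_excel_text_to_docs: returns a "documents" object plus a status message.
    return [text.upper()], "Documents prepared"

def embed(docs):
    # Stand-in for docs_to_embed_np_array: consumes the documents produced by the first step.
    return f"Embedded {len(docs)} document(s)"

with gr.Blocks() as demo:
    in_text = gr.Textbox(label="Input text")
    status = gr.Textbox(label="Status")
    result = gr.Textbox(label="Result")
    docs_state = gr.State()
    load_button = gr.Button("Load and embed")

    # The first handler fills docs_state and status; .then() chains the second handler,
    # which reads docs_state, mirroring the click -> then chain in app.py above.
    load_button.click(prepare, inputs=[in_text], outputs=[docs_state, status]).\
        then(embed, inputs=[docs_state], outputs=[result])

if __name__ == "__main__":
    demo.launch()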
search_funcs/helper_functions.py CHANGED
@@ -261,13 +261,13 @@ def initial_data_load(in_file:List[str], progress = gr.Progress(track_tqdm=True)
 
     progress(0.3, desc="Loading in data files")
 
-    data_file_names = [string for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower()
+    data_file_names = [string for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower() or "prep_docs" in string.lower()]
     print("Data file names:", data_file_names)
 
     if not data_file_names:
         out_message = "Please load in at least one csv/Excel/parquet data file."
         print(out_message)
-        return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), pd.DataFrame(), pd.DataFrame(), index_load, embed_load, tokenised_load, out_message, None
+        return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), pd.DataFrame(), pd.DataFrame(), index_load, embed_load, tokenised_load, out_message, None, file_list
 
     # This if you have loaded in a documents object for the semantic search
     if "pkl" in data_file_names[0]:
@@ -288,11 +288,9 @@ def initial_data_load(in_file:List[str], progress = gr.Progress(track_tqdm=True)
         if file_size > file_size_bytes_500mb:
             out_message = "Data file greater than 500mb in size. Please use smaller sizes."
             print(out_message)
-            return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), pd.DataFrame(), pd.DataFrame(), index_load, embed_load, tokenised_load, out_message, None
-
+            return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), pd.DataFrame(), pd.DataFrame(), index_load, embed_load, tokenised_load, out_message, None, file_list
 
         df_new = read_file(file)
-
         df = pd.concat([df, df_new], ignore_index = True)
 
     new_choices = list(df.columns)
@@ -302,22 +300,22 @@ def initial_data_load(in_file:List[str], progress = gr.Progress(track_tqdm=True)
     progress(0.6, desc="Loading in embedding/search index files")
 
     # Check if there is a search index file already
-    index_file_names = [string for string in file_list if ".gz" in string.lower()]
+    index_file_names = [string for string in file_list if "pkl.gz" in string.lower()]
 
     if index_file_names:
         index_file_name = index_file_names[0]
         print("Search index file name found:", index_file_name)
         index_load = read_file(index_file_name)
 
-    embeddings_file_names = [string for string in file_list if "
+    embeddings_file_names = [string for string in file_list if ".npz" in string.lower()]
 
     if embeddings_file_names:
         print("Loading embeddings from file.")
         embed_load = np.load(embeddings_file_names[0])['arr_0']
 
         # If embedding files have 'super_compress' in the title, they have been multiplied by 100 before save
-        if "compress" in embeddings_file_names[0]:
-            embed_load /= 100
+        #if "compress" in embeddings_file_names[0]:
+        #    embed_load /= 100
     else:
         embed_load = np.array([])
 
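For reference on the embedding files handled above: they are written with np.savez_compressed, so np.load returns the array under the default 'arr_0' key, and the now commented-out block notes that 'super_compress'/'compress' files were multiplied by 100 before saving. A minimal sketch of that round trip follows; the file names and the int8 dtype are illustrative assumptions, not the repository's exact behaviour.

import numpy as np

embeddings = np.random.rand(4, 384).astype(np.float32)  # stand-in embeddings array

# Plain save/load: np.savez_compressed stores an unnamed array under 'arr_0'.
np.savez_compressed("example_embeddings.npz", embeddings)
loaded = np.load("example_embeddings.npz")["arr_0"]

# 'compress'-style save: scale by 100 and store in a small integer dtype,
# then divide by 100 after loading to recover approximate float values.
scaled = np.round(embeddings * 100).astype(np.int8)
np.savez_compressed("example_embedding_compress.npz", scaled)
restored = np.load("example_embedding_compress.npz")["arr_0"].astype(np.float32) / 100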
search_funcs/semantic_functions.py CHANGED
@@ -50,7 +50,7 @@ def load_embedding_model(embeddings_name = "BAAI/bge-small-en-v1.5", embedding_l
 
     return embeddings_model, torch_device
 
-def docs_to_bge_embed_np_array(
+def docs_to_embed_np_array(
     docs_out: list,
     in_file: list,
     output_file_state: str,
@@ -136,9 +136,9 @@ def docs_to_bge_embed_np_array(
 
     progress(0.9, desc = "Saving embeddings to file")
     if embeddings_compress == "No":
-        semantic_search_file_name = output_folder + data_file_name_no_ext + '
+        semantic_search_file_name = output_folder + data_file_name_no_ext + '_embeddings.npz'
    else:
-        semantic_search_file_name = output_folder + data_file_name_no_ext + '
+        semantic_search_file_name = output_folder + data_file_name_no_ext + '_embedding_compress.npz'
 
     np.savez_compressed(semantic_search_file_name, embeddings_out)
 
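The new file names make the compression choice visible when embeddings are saved: '_embeddings.npz' for uncompressed output and '_embedding_compress.npz' otherwise, both written with np.savez_compressed into output_folder. A rough sketch of that save step follows, under the assumption that the model is a sentence-transformers model; the default name "BAAI/bge-small-en-v1.5" comes from load_embedding_model's signature, and everything else is illustrative rather than the repository's function.

import numpy as np
from sentence_transformers import SentenceTransformer

def embed_and_save(texts, out_stem, embeddings_compress="No"):
    # Assumed model type; the repo's load_embedding_model may construct it differently.
    model = SentenceTransformer("BAAI/bge-small-en-v1.5")
    embeddings_out = model.encode(texts, convert_to_numpy=True)

    # Mirror the file-naming convention introduced in this commit.
    if embeddings_compress == "No":
        file_name = out_stem + "_embeddings.npz"
    else:
        file_name = out_stem + "_embedding_compress.npz"

    np.savez_compressed(file_name, embeddings_out)
    return file_name, embeddings_out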
search_funcs/semantic_ingest_functions.py CHANGED
@@ -97,6 +97,11 @@ def csv_excel_text_to_docs(df:PandasDataFrame, in_file:List[str], text_column:st
     ensure_output_folder_exists(output_folder)
     output_list = []
 
+    if not isinstance(text_column, str):
+        text_column = str(text_column)
+
+    print("text_column:", text_column)
+
     if not in_file:
         return None, "Please load in at least one file.", output_list
 
@@ -115,12 +120,12 @@ def csv_excel_text_to_docs(df:PandasDataFrame, in_file:List[str], text_column:st
     data_file_name = data_file_names[0]
 
     # Check if file is a document format, and explode out as needed
-    if "
+    if "prep_docs" in data_file_name:
         print("Loading in documents from file.")
 
         doc_sections = df
 
-
+        print("doc_sections:", doc_sections[0])
 
         return doc_sections, "Finished preparing documents", output_list
 
@@ -147,17 +152,29 @@ def csv_excel_text_to_docs(df:PandasDataFrame, in_file:List[str], text_column:st
         clean_toc = time.perf_counter()
         clean_time_out = f"Cleaning the text took {clean_toc - clean_tic:0.1f} seconds."
         print(clean_time_out)
+
+    else:
+        df_list = list(df[text_column])
+        prepared_text_df = pd.DataFrame(data={text_column:df_list})
 
-
+        # Drop original column from input file to reduce file size
+        in_df = df.drop(text_column, axis = 1)
+        df = pd.concat([in_df, prepared_text_df], axis = 1)
 
+    cols = [col for col in df.columns if col != original_text_column]
     df["metadata"] = combine_metadata_columns(df, cols)
 
     progress(0.3, desc = "Converting data to document format")
 
+    #print("text_column name:", text_column)
+    #print("text_column:", df[text_column])
+    #print("metadata", df["metadata"])
+
     # Create a list of Document objects
-    doc_sections = [Document(page_content=row[text_column],
-                             metadata= parse_metadata(row["metadata"]))
+    doc_sections = [Document(page_content=row[text_column], metadata= parse_metadata(row["metadata"]))
                     for index, row in progress.tqdm(df.iterrows(), desc = "Splitting up text", unit = "rows")]
+
+    print("doc_sections:", doc_sections[0])
 
     ingest_toc = time.perf_counter()
 
@@ -169,15 +186,11 @@ def csv_excel_text_to_docs(df:PandasDataFrame, in_file:List[str], text_column:st
     data_file_out_name_no_ext = get_file_path_end(data_file_name)
     file_name = data_file_out_name_no_ext
 
-    if clean == "No":
-
-        with gzip.open(out_doc_file_name, 'wb') as file:
-            pickle.dump(doc_sections, file)
+    if clean == "No": out_doc_file_name = output_folder + file_name + "_prep_docs.pkl.gz"
+    elif clean == "Yes": out_doc_file_name = output_folder + file_name + "_cleaned_prep_docs.pkl.gz"
 
-
-
-        with gzip.open(out_doc_file_name, 'wb') as file:
-            pickle.dump(doc_sections, file)
+    with gzip.open(out_doc_file_name, 'wb') as file:
+        pickle.dump(doc_sections, file)
 
     output_list.append(out_doc_file_name)
     print("Documents saved to file.")