Sean-Case committed
Commit 352c02a · 1 parent: 36a404e
Improved xlsx output formatting. Deals better with cleaning data then analysing in the same session.
Files changed:
- .gitignore (+1, -0)
- app.py (+10, -5)
- search_funcs/bm25_functions.py (+9, -9)
- search_funcs/clean_funcs.py (+13, -54)
- search_funcs/helper_functions.py (+24, -16)
- search_funcs/semantic_functions.py (+83, -101)
- search_funcs/semantic_ingest_functions.py (+1, -1)
.gitignore CHANGED
@@ -16,6 +16,7 @@
 *.pkl
 *.pkl.gz
 *.pem
+docs/*
 build/*
 dist/*
 __pycache__/*
app.py CHANGED
@@ -35,9 +35,13 @@ with block:
     vec_weight = gr.State(1)
 
     corpus_state = gr.State()
-    keyword_data_state = gr.State(pd.DataFrame())
     keyword_data_list_state = gr.State([])
     join_data_state = gr.State(pd.DataFrame())
+
+    orig_keyword_data_state = gr.State(pd.DataFrame())
+    keyword_data_state = gr.State(pd.DataFrame())
+
+    orig_semantic_data_state = gr.State(pd.DataFrame())
     semantic_data_state = gr.State(pd.DataFrame())
 
     in_k1_info = gr.State("""k1: Constant used for influencing the term frequency saturation. After saturation is reached, additional
@@ -157,13 +161,14 @@ depends on factors such as the type of documents or queries. Information taken f
 
     ### BM25 SEARCH ###
     # Update dropdowns upon initial file load
-    in_bm25_file.upload(initial_data_load, inputs=[in_bm25_file], outputs=[in_bm25_column, search_df_join_column, keyword_data_state, search_index_state, embeddings_state, tokenised_state, load_finished_message, current_source])
+    in_bm25_file.upload(initial_data_load, inputs=[in_bm25_file], outputs=[in_bm25_column, search_df_join_column, keyword_data_state, orig_keyword_data_state, search_index_state, embeddings_state, tokenised_state, load_finished_message, current_source])
     in_join_file.upload(put_columns_in_join_df, inputs=[in_join_file], outputs=[in_join_column, join_data_state, in_join_message])
 
     # Load in BM25 data
-    load_bm25_data_button.click(fn=prepare_bm25_input_data, inputs=[in_bm25_file, in_bm25_column, keyword_data_state, tokenised_state, in_clean_data, return_intermediate_files], outputs=[corpus_state, load_finished_message, keyword_data_state, output_file, output_file, keyword_data_list_state]).\
+    load_bm25_data_button.click(fn=prepare_bm25_input_data, inputs=[in_bm25_file, in_bm25_column, keyword_data_state, tokenised_state, in_clean_data, return_intermediate_files], outputs=[corpus_state, load_finished_message, keyword_data_state, output_file, output_file, keyword_data_list_state, in_bm25_column]).\
     then(fn=prepare_bm25, inputs=[corpus_state, in_bm25_file, in_bm25_column, search_index_state, in_clean_data, return_intermediate_files, in_k1, in_b, in_alpha], outputs=[load_finished_message, output_file])#.\
 
+
     # BM25 search functions on click or enter
     keyword_search_button.click(fn=bm25_search, inputs=[keyword_query, in_no_search_results, keyword_data_state, in_bm25_column, join_data_state, in_clean_data, in_join_column, search_df_join_column], outputs=[output_single_text, output_file], api_name="keyword")
     keyword_query.submit(fn=bm25_search, inputs=[keyword_query, in_no_search_results, keyword_data_state, in_bm25_column, join_data_state, in_clean_data, in_join_column, search_df_join_column], outputs=[output_single_text, output_file])
@@ -174,10 +179,10 @@ depends on factors such as the type of documents or queries. Information taken f
     ### SEMANTIC SEARCH ###
 
     # Load in a csv/excel file for semantic search
-    in_semantic_file.upload(initial_data_load, inputs=[in_semantic_file], outputs=[in_semantic_column, search_df_join_column, semantic_data_state, search_index_state, embeddings_state, tokenised_state, semantic_load_progress, current_source_semantic])
+    in_semantic_file.upload(initial_data_load, inputs=[in_semantic_file], outputs=[in_semantic_column, search_df_join_column, semantic_data_state, orig_semantic_data_state, search_index_state, embeddings_state, tokenised_state, semantic_load_progress, current_source_semantic])
     load_semantic_data_button.click(
         csv_excel_text_to_docs, inputs=[semantic_data_state, in_semantic_file, in_semantic_column, in_clean_data, return_intermediate_files], outputs=[ingest_docs, semantic_load_progress]).\
-    then(docs_to_bge_embed_np_array, inputs=[ingest_docs, in_semantic_file, embeddings_state, return_intermediate_files, embedding_super_compress], outputs=[semantic_load_progress, vectorstore_state, semantic_output_file])
+    then(docs_to_bge_embed_np_array, inputs=[ingest_docs, in_semantic_file, embeddings_state, in_clean_data, return_intermediate_files, embedding_super_compress], outputs=[semantic_load_progress, vectorstore_state, semantic_output_file])
 
     # Semantic search query
     semantic_submit.click(bge_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, semantic_min_distance, vec_weight, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic")

(Note: the removed in_semantic_file.upload line was truncated in the page extraction; it is reconstructed here as the new line minus orig_semantic_data_state, matching the parallel keyword-side change.)
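Note: the new orig_keyword_data_state / orig_semantic_data_state states implement a simple pattern: keep a pristine copy of the upload alongside the working copy, so cleaning and then re-analysing in the same session can restart from the original data. A minimal standalone sketch of that pattern (component and column names here are illustrative, not the app's actual wiring):

import gradio as gr
import pandas as pd

def load_data(file_path):
    df = pd.read_csv(file_path)
    # Return the frame twice: the first copy is never modified again
    return df, df.copy()

def clean_data(orig_df):
    # Always clean from the pristine copy, not a previously cleaned one
    cleaned = orig_df.copy()
    cleaned["text"] = cleaned["text"].str.lower().str.strip()
    return cleaned

with gr.Blocks() as block:
    orig_data_state = gr.State(pd.DataFrame())  # untouched upload
    data_state = gr.State(pd.DataFrame())       # working copy
    in_file = gr.File(type="filepath")
    clean_button = gr.Button("Clean")
    in_file.upload(load_data, inputs=[in_file], outputs=[orig_data_state, data_state])
    clean_button.click(clean_data, inputs=[orig_data_state], outputs=[data_state])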
search_funcs/bm25_functions.py CHANGED
@@ -231,11 +231,11 @@ class BM25:
 # These following functions are my own work
 
 def prepare_bm25_input_data(in_file, text_column, data_state, tokenised_state, clean="No", return_intermediate_files = "No", progress=gr.Progress(track_tqdm=True)):
-    print(in_file)
+    #print(in_file)
 
     if not in_file:
        print("No input file found. Please load in at least one file.")
-       return None, "No input file found. Please load in at least one file.", data_state, None, None,
+       return None, "No input file found. Please load in at least one file.", data_state, None, None, [], gr.Dropdown(allow_custom_value=True, value=text_column, choices=data_state.columns.to_list())
 
    progress(0, desc = "Loading in data")
    file_list = [string.name for string in in_file]
@@ -245,10 +245,10 @@ def prepare_bm25_input_data(in_file, text_column, data_state, tokenised_state, c
    data_file_names = [string for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower() and "gz" not in string.lower()]
 
    if not data_file_names:
-       return None, "Please load in at least one csv/Excel/parquet data file.", data_state, None, None,
+       return None, "Please load in at least one csv/Excel/parquet data file.", data_state, None, None, [], gr.Dropdown(allow_custom_value=True, value=text_column, choices=data_state.columns.to_list())
 
    if not text_column:
-       return None, "Please enter a column name to search.", data_state, None, None,
+       return None, "Please enter a column name to search.", data_state, None, None, [], gr.Dropdown(allow_custom_value=True, value=text_column, choices=data_state.columns.to_list())
 
    data_file_name = data_file_names[0]
 
@@ -268,7 +268,7 @@ def prepare_bm25_input_data(in_file, text_column, data_state, tokenised_state, c
        corpus = list(df[text_column])
        message = "Tokenisation skipped - loading search index from file."
        print(message)
-       return corpus, message, df, None, None,
+       return corpus, message, df, None, None, [], gr.Dropdown(allow_custom_value=True, value=text_column, choices=data_state.columns.to_list())
 
 
 
@@ -282,7 +282,7 @@ def prepare_bm25_input_data(in_file, text_column, data_state, tokenised_state, c
        df_list = initial_clean(df_list)
 
        # Save to file if you have cleaned the data
-       out_file_name, text_column = save_prepared_bm25_data(data_file_name, df_list, df, text_column)
+       out_file_name, text_column, df = save_prepared_bm25_data(data_file_name, df_list, df, text_column)
 
        clean_toc = time.perf_counter()
        clean_time_out = f"Cleaning the text took {clean_toc - clean_tic:0.1f} seconds."
@@ -328,9 +328,9 @@ def prepare_bm25_input_data(in_file, text_column, data_state, tokenised_state, c
 
        pd.DataFrame(data={"Corpus":corpus}).to_parquet(tokenised_data_file_name)
 
-       return corpus, message, df, out_file_name, tokenised_data_file_name, df_list
+       return corpus, message, df, out_file_name, tokenised_data_file_name, df_list, gr.Dropdown(allow_custom_value=True, value=text_column, choices=data_state.columns.to_list())
 
-   return corpus, message, df, out_file_name, None, df_list
+   return corpus, message, df, out_file_name, None, df_list, gr.Dropdown(allow_custom_value=True, value=text_column, choices=data_state.columns.to_list())
 
 def save_prepared_bm25_data(in_file_name, prepared_text_list, in_df, in_bm25_column, progress=gr.Progress(track_tqdm=True)):
 
@@ -356,7 +356,7 @@ def save_prepared_bm25_data(in_file_name, prepared_text_list, in_df, in_bm25_col
        prepared_df.to_parquet(file_name)
    else: file_name = None
 
-   return file_name, new_text_column
+   return file_name, new_text_column, prepared_df
 
 def prepare_bm25(corpus, in_file, text_column, search_index, clean, return_intermediate_files, k1=1.5, b = 0.75, alpha=-5, progress=gr.Progress(track_tqdm=True)):
    #bm25.save("saved_df_bm25")
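Note: the extra gr.Dropdown(...) value appended to each return above relies on the Gradio pattern of returning a freshly constructed component from an event handler to update that component's properties in place (Gradio 4.x style). A minimal sketch with made-up data:

import gradio as gr
import pandas as pd

def refresh_columns(df, current_column):
    # allow_custom_value=True keeps a previously typed/selected value valid
    # even if it is not present in the refreshed choices list
    return gr.Dropdown(allow_custom_value=True, value=current_column, choices=df.columns.to_list())

with gr.Blocks() as block:
    data_state = gr.State(pd.DataFrame({"id": [1], "text": ["example"]}))
    in_column = gr.Dropdown(choices=[], label="Column to search")
    load_button = gr.Button("Load data")
    load_button.click(refresh_columns, inputs=[data_state, in_column], outputs=[in_column])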
search_funcs/clean_funcs.py CHANGED
@@ -32,68 +32,27 @@ num_pattern_regex = r'[0-9]+'
 postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
 warning_pattern_regex = r'caution: this email originated from outside of the organization. do not click links or open attachments unless you recognize the sender and know the content is safe.'
 nbsp_pattern_regex = r'&nbsp;'
+multiple_spaces_regex = r'\s{2,}'
 
 # Pre-compiling the regular expressions for efficiency
-email_start_pattern = re.compile(email_start_pattern_regex)
-email_end_pattern = re.compile(email_end_pattern_regex)
-html_pattern = re.compile(html_pattern_regex)
-email_pattern = re.compile(email_end_pattern_regex)
-num_pattern = re.compile(num_pattern_regex)
-postcode_pattern = re.compile(postcode_pattern_regex)
-warning_pattern = re.compile(warning_pattern_regex)
-nbsp_pattern = re.compile(nbsp_pattern_regex)
-
-# def stem_sentence(sentence):
-
-#     words = sentence.split()
-#     stemmed_words = [stemmer.stem(word).lower().rstrip("'") for word in words]
-#     return stemmed_words
-
-# def stem_sentences(sentences, progress=gr.Progress()):
-#     """Stem each sentence in a list of sentences."""
-#     stemmed_sentences = [stem_sentence(sentence) for sentence in progress.tqdm(sentences)]
-#     return stemmed_sentences
-
-# def get_lemma_text(text):
-#     # Tokenize the input string into words
-#     tokens = word_tokenize(text)
-
-#     lemmas = []
-#     for word in tokens:
-#         if len(word) > 3:
-#             lemma = wn.morphy(word)
-#         else:
-#             lemma = None
-
-#         if lemma is None:
-#             lemmas.append(word)
-#         else:
-#             lemmas.append(lemma)
-#     return lemmas
-
-# def get_lemma_tokens(tokens):
-    # Tokenize the input string into words
-
-#     lemmas = []
-#     for word in tokens:
-#         if len(word) > 3:
-#             lemma = wn.morphy(word)
-#         else:
-#             lemma = None
-
-#         if lemma is None:
-#             lemmas.append(word)
-#         else:
-#             lemmas.append(lemma)
-#     return lemmas
+# email_start_pattern = re.compile(email_start_pattern_regex)
+# email_end_pattern = re.compile(email_end_pattern_regex)
+# html_pattern = re.compile(html_pattern_regex)
+# email_pattern = re.compile(email_end_pattern_regex)
+# num_pattern = re.compile(num_pattern_regex)
+# postcode_pattern = re.compile(postcode_pattern_regex)
+# warning_pattern = re.compile(warning_pattern_regex)
+# nbsp_pattern = re.compile(nbsp_pattern_regex)
+
 
 def initial_clean(texts , progress=gr.Progress()):
    texts = pl.Series(texts)#[]
 
-   text = texts.str.replace_all(email_start_pattern_regex, '')
+   text = texts.str.replace_all(html_pattern_regex, '')
+   text = text.str.replace_all(email_start_pattern_regex, '')
    text = text.str.replace_all(email_end_pattern_regex, '')
-   text = text.str.replace_all(html_pattern_regex, '')
    text = text.str.replace_all(email_pattern_regex, '')
+   text = text.str.replace_all(multiple_spaces_regex, ' ')
 
    text = text.to_list()
 

(Note: the nbsp_pattern_regex literal was rendered as a blank by the HTML extraction and is restored here as the '&nbsp;' entity its name implies; the first removed replace_all line was truncated and is reconstructed from the reordered version added below it.)
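Note: initial_clean above runs each regex over the whole column with polars rather than looping in Python. A small self-contained illustration with toy data (the HTML regex here is a simplified stand-in for the real html_pattern_regex):

import polars as pl

html_pattern_regex = r'<[^>]+>'   # simplified stand-in
multiple_spaces_regex = r'\s{2,}'

texts = pl.Series(["Hello  <b>world</b>", "too   many   spaces"])
text = texts.str.replace_all(html_pattern_regex, '')
text = text.str.replace_all(multiple_spaces_regex, ' ')
print(text.to_list())  # ['Hello world', 'too many spaces']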
search_funcs/helper_functions.py CHANGED
@@ -13,7 +13,7 @@ import numpy as np
 from openpyxl import Workbook
 from openpyxl.cell.text import InlineFont
 from openpyxl.cell.rich_text import TextBlock, CellRichText
-from openpyxl.styles import Font
+from openpyxl.styles import Font, Alignment
 
 # Attempt to delete content of gradio temp folder
 def get_temp_folder_path():
@@ -103,6 +103,7 @@ def initial_data_load(in_file):
    tokenised_load =[]
    out_message = ""
    current_source = ""
+   df = pd.DataFrame()
 
    file_list = [string.name for string in in_file]
 
@@ -113,25 +114,25 @@ def initial_data_load(in_file):
    if not data_file_names:
        out_message = "Please load in at least one csv/Excel/parquet data file."
        print(out_message)
-       return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), pd.DataFrame(), index_load, out_message
-
-   df = read_file(data_file_name)
-
-   if "pkl" not in data_file_name:
-
-   # If only the search_index found, need a data file too
-   new_choices = []
-
-   #print(new_choices)
+       return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), pd.DataFrame(), pd.DataFrame(), index_load, out_message
+
+   # This if you have loaded in a documents object for the semantic search
+   if "pkl" in data_file_names[0]:
+       df = read_file(data_file_names[0])
+       new_choices = list(df[0].metadata.keys()) #["Documents"] #["page_contents"] +
+       current_source = get_file_path_end_with_ext(data_file_names[0])
+
+   # This if you have loaded in a csv/parquets/xlsx
+   else:
+       for file in data_file_names:
+
+           current_source = current_source + get_file_path_end_with_ext(file) + " "
+
+           df_new = read_file(file)
+
+           df = pd.concat([df, df_new], ignore_index = True)
+
+       new_choices = list(df.columns)
 
    concat_choices.extend(new_choices)
 
@@ -161,7 +162,7 @@ def initial_data_load(in_file):
    out_message = "Initial data check successful. Next, choose a data column to search in the drop down above, then click 'Load data'"
    print(out_message)
 
-   return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df, index_load, embed_load, tokenised_load, out_message, current_source
+   return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df, df, index_load, embed_load, tokenised_load, out_message, current_source
 
 def put_columns_in_join_df(in_file):
    '''
@@ -291,12 +292,19 @@ def create_highlighted_excel_wb(df, search_text, column_to_highlight):
    for cell in sheet[1]:
        cell.font = Font(bold=True)
 
+   column_width = 150 # Adjust as needed
+   relevant_column_no = (df.columns == column_to_highlight).argmax() + 1
+   print(relevant_column_no)
+   sheet.column_dimensions[sheet.cell(row=1, column=relevant_column_no).column_letter].width = column_width
+
    # Find substrings in cells and highlight
    for r_idx, row in enumerate(df.itertuples(), start=2):
        for c_idx, cell_value in enumerate(row[1:], start=1):
            sheet.cell(row=r_idx, column=c_idx, value=cell_value)
            if df.columns[c_idx - 1] == column_to_highlight:
+
                html_text, combined_positions = highlight_found_text(search_text, cell_value)
                sheet.cell(row=r_idx, column=c_idx).value = create_rich_text_cell_from_positions(cell_value, combined_positions)
+               sheet.cell(row=r_idx, column=c_idx).alignment = Alignment(wrap_text=True)
 
    return wb

(Note: several removed lines in the @@ -113 hunk were lost in the page extraction; only the recoverable ones are shown above.)
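Note: the width/wrap changes above use standard openpyxl calls. A standalone sketch with made-up data showing the same formatting idea (bold header, widened highlight column, wrapped cell text):

from openpyxl import Workbook
from openpyxl.styles import Font, Alignment

wb = Workbook()
sheet = wb.active
sheet.append(["id", "search_text"])
sheet.append([1, "a long matched passage that should wrap rather than overflow"])

for cell in sheet[1]:
    cell.font = Font(bold=True)

# Widen the column holding the highlighted text, then wrap its cells
col_letter = sheet.cell(row=1, column=2).column_letter
sheet.column_dimensions[col_letter].width = 150
sheet.cell(row=2, column=2).alignment = Alignment(wrap_text=True)

wb.save("formatted_example.xlsx")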
search_funcs/semantic_functions.py CHANGED
@@ -25,7 +25,7 @@ else:
 
 print("Device used is: ", torch_device)
 
-
+from search_funcs.helper_functions import create_highlighted_excel_wb
 
 PandasDataFrame = Type[pd.DataFrame]
 
@@ -45,106 +45,10 @@ PandasDataFrame = Type[pd.DataFrame]
 embeddings_name = "BAAI/bge-small-en-v1.5"
 local_embeddings_location = "model/bge/"
 
-#try:
-#     tokenizer = AutoTokenizer.from_pretrained(embeddings_name)
-#     embeddings_model = AutoModel.from_pretrained(local_embeddings_location, local_files_only=True).to(torch_device)
-#except:
-#     tokenizer = AutoTokenizer.from_pretrained(embeddings_name)
-#     embeddings_model = AutoModel.from_pretrained(embeddings_name).to(torch_device)
-
 # Not using SentenceTransformer here
 embeddings_model = SentenceTransformer(embeddings_name)
 
-
-# # Tokenize sentences
-# print("Tokenising")
-# encoded_input = tokenizer(docs, padding=True, truncation=True, return_tensors='pt', max_length=32).to(torch_device)
-
-# # Compute token embeddings
-# print("Calculating embeddings")
-# with torch.no_grad():
-#     model_output = embeddings_model(**encoded_input).to(torch_device)
-# # Perform pooling. In this case, cls pooling.
-# embeddings_out = model_output[0][:, 0]
-# # normalize embeddings
-# embeddings_out = torch.nn.functional.normalize(embeddings_out, p=2, dim=1)
-# #print("Sentence embeddings:", embeddings_out)
-
-# return embeddings_out
-
-
-def docs_to_jina_embed_np_array(docs_out, in_file, embeddings_state, return_intermediate_files = "No", embeddings_super_compress = "No", embeddings = embeddings_model, progress=gr.Progress(track_tqdm=True)):
-    '''
-    Takes a Langchain document class and saves it into a Chroma sqlite file.
-    '''
-    if not in_file:
-        out_message = "No input file found. Please load in at least one file."
-        print(out_message)
-        return out_message, None, None
-
-
-    progress(0.6, desc = "Loading/creating embeddings")
-
-    print(f"> Total split documents: {len(docs_out)}")
-
-    #print(docs_out)
-
-    page_contents = [doc.page_content for doc in docs_out]
-
-    ## Load in pre-embedded file if exists
-    file_list = [string.name for string in in_file]
-
-    #print(file_list)
-
-    embeddings_file_names = [string for string in file_list if "embedding" in string.lower()]
-    data_file_names = [string for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower()]# and "gz" not in string.lower()]
-    data_file_name = data_file_names[0]
-    data_file_name_no_ext = get_file_path_end(data_file_name)
-
-    out_message = "Document processing complete. Ready to search."
-
-    # print("embeddings loaded: ", embeddings_out)
-
-    if embeddings_state.size == 0:
-        tic = time.perf_counter()
-        print("Starting to embed documents.")
-        #embeddings_list = []
-        #for page in progress.tqdm(page_contents, desc = "Preparing search index", unit = "rows"):
-        #    embeddings_list.append(embeddings.encode(sentences=page, max_length=1024).tolist())
-
-        embeddings_out = embeddings.encode(sentences=page_contents, max_length=1024, show_progress_bar = True, batch_size = 32) # For Jina embeddings
-        #embeddings_list = embeddings.encode(sentences=page_contents, normalize_embeddings=True).tolist() # For BGE embeddings
-        #embeddings_list = embeddings.encode(sentences=page_contents).tolist() # For minilm
-
-        toc = time.perf_counter()
-        time_out = f"The embedding took {toc - tic:0.1f} seconds"
-        print(time_out)
-
-        # If you want to save your files for next time
-        if return_intermediate_files == "Yes":
-            progress(0.9, desc = "Saving embeddings to file")
-            if embeddings_super_compress == "No":
-                semantic_search_file_name = data_file_name_no_ext + '_' + 'embeddings.npz'
-                np.savez_compressed(semantic_search_file_name, embeddings_out)
-            else:
-                semantic_search_file_name = data_file_name_no_ext + '_' + 'embedding_compress.npz'
-                embeddings_out_round = np.round(embeddings_out, 3)
-                embeddings_out_round *= 100 # Rounding not currently used
-                np.savez_compressed(semantic_search_file_name, embeddings_out_round)
-
-            return out_message, embeddings_out, semantic_search_file_name
-
-        return out_message, embeddings_out, None
-    else:
-        # Just return existing embeddings if already exist
-        embeddings_out = embeddings_state
-
-    print(out_message)
-
-    return out_message, embeddings_out, None#, None
-
-
-def docs_to_bge_embed_np_array(docs_out, in_file, embeddings_state, return_intermediate_files = "No", embeddings_super_compress = "No", embeddings_model = embeddings_model, progress=gr.Progress(track_tqdm=True)):
+def docs_to_bge_embed_np_array(docs_out, in_file, embeddings_state, clean, return_intermediate_files = "No", embeddings_super_compress = "No", embeddings_model = embeddings_model, progress=gr.Progress(track_tqdm=True)):
     '''
     Takes a Langchain document class and saves it into a Chroma sqlite file.
     '''
@@ -197,6 +101,9 @@ def docs_to_bge_embed_np_array(docs_out, in_file, embeddings_state, return_inter
 
     # If you want to save your files for next time
     if return_intermediate_files == "Yes":
+        if clean == "Yes": data_file_name_no_ext = data_file_name_no_ext + "_cleaned"
+        else: data_file_name_no_ext = data_file_name_no_ext
+
        progress(0.9, desc = "Saving embeddings to file")
        if embeddings_super_compress == "No":
            semantic_search_file_name = data_file_name_no_ext + '_bge_embeddings.npz'
@@ -273,7 +180,7 @@ def process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_c
    # Concatenate the original DataFrame with the expanded metadata DataFrame
    results_df_out = pd.concat([length_more_limit.drop('metadatas', axis=1), df_metadata_expanded], axis=1)
 
-   results_df_out = results_df_out.rename(columns={"documents":
+   results_df_out = results_df_out.rename(columns={"documents":"search_text"})
 
    results_df_out = results_df_out.drop(["page_section", "row", "source", "id"], axis=1, errors="ignore")
    results_df_out['distances'] = round(results_df_out['distances'].astype(float), 3)
@@ -371,7 +278,11 @@ def bge_simple_retrieval(query_str:str, vectorstore, docs, orig_df_col:str, k_va
    print("Saving search output to file")
    progress(0.7, desc = "Saving search output to file")
 
-   results_df_out.to_excel(results_df_name, index= None)
+   # Highlight found text and save to file
+   results_df_out_wb = create_highlighted_excel_wb(results_df_out, query_str, "search_text")
+   results_df_out_wb.save(results_df_name)
+
+   #results_df_out.to_excel(results_df_name, index= None)
    results_first_text = results_df_out.iloc[0, 1]
 
    print("Returning results")
@@ -379,7 +290,77 @@ def bge_simple_retrieval(query_str:str, vectorstore, docs, orig_df_col:str, k_va
    return results_first_text, results_df_name
 
 
-def jina_simple_retrieval(query_str:str, vectorstore, docs, orig_df_col:str, k_val:int, out_passages:int,
+def docs_to_jina_embed_np_array_deprecated(docs_out, in_file, embeddings_state, return_intermediate_files = "No", embeddings_super_compress = "No", embeddings = embeddings_model, progress=gr.Progress(track_tqdm=True)):
+    '''
+    Takes a Langchain document class and saves it into a Chroma sqlite file.
+    '''
+    if not in_file:
+        out_message = "No input file found. Please load in at least one file."
+        print(out_message)
+        return out_message, None, None
+
+
+    progress(0.6, desc = "Loading/creating embeddings")
+
+    print(f"> Total split documents: {len(docs_out)}")
+
+    #print(docs_out)
+
+    page_contents = [doc.page_content for doc in docs_out]
+
+    ## Load in pre-embedded file if exists
+    file_list = [string.name for string in in_file]
+
+    #print(file_list)
+
+    embeddings_file_names = [string for string in file_list if "embedding" in string.lower()]
+    data_file_names = [string for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower()]# and "gz" not in string.lower()]
+    data_file_name = data_file_names[0]
+    data_file_name_no_ext = get_file_path_end(data_file_name)
+
+    out_message = "Document processing complete. Ready to search."
+
+    # print("embeddings loaded: ", embeddings_out)
+
+    if embeddings_state.size == 0:
+        tic = time.perf_counter()
+        print("Starting to embed documents.")
+        #embeddings_list = []
+        #for page in progress.tqdm(page_contents, desc = "Preparing search index", unit = "rows"):
+        #    embeddings_list.append(embeddings.encode(sentences=page, max_length=1024).tolist())
+
+        embeddings_out = embeddings.encode(sentences=page_contents, max_length=1024, show_progress_bar = True, batch_size = 32) # For Jina embeddings
+        #embeddings_list = embeddings.encode(sentences=page_contents, normalize_embeddings=True).tolist() # For BGE embeddings
+        #embeddings_list = embeddings.encode(sentences=page_contents).tolist() # For minilm
+
+        toc = time.perf_counter()
+        time_out = f"The embedding took {toc - tic:0.1f} seconds"
+        print(time_out)
+
+        # If you want to save your files for next time
+        if return_intermediate_files == "Yes":
+            progress(0.9, desc = "Saving embeddings to file")
+            if embeddings_super_compress == "No":
+                semantic_search_file_name = data_file_name_no_ext + '_' + 'embeddings.npz'
+                np.savez_compressed(semantic_search_file_name, embeddings_out)
+            else:
+                semantic_search_file_name = data_file_name_no_ext + '_' + 'embedding_compress.npz'
+                embeddings_out_round = np.round(embeddings_out, 3)
+                embeddings_out_round *= 100 # Rounding not currently used
+                np.savez_compressed(semantic_search_file_name, embeddings_out_round)
+
+            return out_message, embeddings_out, semantic_search_file_name
+
+        return out_message, embeddings_out, None
+    else:
+        # Just return existing embeddings if already exist
+        embeddings_out = embeddings_state
+
+    print(out_message)
+
+    return out_message, embeddings_out, None#, None
+
+def jina_simple_retrieval_deprecated(query_str:str, vectorstore, docs, orig_df_col:str, k_val:int, out_passages:int,
    vec_score_cut_off:float, vec_weight:float, in_join_file, in_join_column = None, search_df_join_column = None, device = torch_device, embeddings = embeddings_model, progress=gr.Progress(track_tqdm=True)): # ,vectorstore, embeddings
 
    # print("vectorstore loaded: ", vectorstore)
@@ -464,6 +445,7 @@ def jina_simple_retrieval(query_str:str, vectorstore, docs, orig_df_col:str, k_v
    #if os.path.isfile(chromadb_file):
    #    os.remove(chromadb_file)
 
+
 def docs_to_chroma_save_deprecated(docs_out, embeddings = embeddings_model, progress=gr.Progress()):
    '''
    Takes a Langchain document class and saves it into a Chroma sqlite file. Not currently used.

(Note: the removed rename line in the @@ -273 hunk was truncated in the page extraction and its original rename target is left as-is; the removed to_excel line in the @@ -371 hunk is reconstructed from the commented-out copy added in its place.)
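Note: the embeddings save paths above come in two flavours. A hedged sketch of the round trip (file names illustrative; np.savez_compressed stores an unnamed array under the key 'arr_0', and the /100 rescale on load is an assumption matching the *100 applied on save):

import numpy as np

embeddings_out = np.random.rand(4, 384).astype(np.float32)

# Standard save
np.savez_compressed("example_bge_embeddings.npz", embeddings_out)

# "Super compress" save: round, then scale up so the values compress better
embeddings_out_round = np.round(embeddings_out, 3) * 100
np.savez_compressed("example_bge_embeddings_compress.npz", embeddings_out_round)

# Load back; undo the scaling for the compressed variant
loaded = np.load("example_bge_embeddings_compress.npz")["arr_0"] / 100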
search_funcs/semantic_ingest_functions.py CHANGED
@@ -309,7 +309,7 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
    elif clean == "Yes":
        #pd.DataFrame(data = {"Documents":page_content_series_string}).to_parquet(file_name + "_prepared_docs_clean.parquet")
 
-       with gzip.open(file_name + "
+       with gzip.open(file_name + "_cleaned_prepared_docs.pkl.gz", 'wb') as file:
            pickle.dump(doc_sections, file)
 
        #pd.Series(doc_sections).to_pickle(file_name + "_prepared_docs_clean.pkl")

(Note: the removed gzip.open line was truncated in the page extraction; its original file suffix is left as-is.)
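Note: the gzip-pickled documents file written above can be read back with the same two standard-library modules. An illustrative round trip (the doc_sections structure here is a stand-in for the real Langchain documents):

import gzip
import pickle

doc_sections = [{"page_content": "example text", "metadata": {"row": 0}}]

# Write the compressed pickle, as the cleaned branch above does
with gzip.open("example_cleaned_prepared_docs.pkl.gz", "wb") as file:
    pickle.dump(doc_sections, file)

# Read it back in a later session
with gzip.open("example_cleaned_prepared_docs.pkl.gz", "rb") as file:
    doc_sections_loaded = pickle.load(file)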
|