Sean-Case committed
Commit 352c02a
Parent: 36a404e

Improved xlsx output formatting. Better handles cleaning data and then analysing it in the same session.

.gitignore CHANGED
@@ -16,6 +16,7 @@
 *.pkl
 *.pkl.gz
 *.pem
+docs/*
 build/*
 dist/*
 __pycache__/*
app.py CHANGED
@@ -35,9 +35,13 @@ with block:
     vec_weight = gr.State(1)
 
     corpus_state = gr.State()
-    keyword_data_state = gr.State(pd.DataFrame())
     keyword_data_list_state = gr.State([])
     join_data_state = gr.State(pd.DataFrame())
+
+    orig_keyword_data_state = gr.State(pd.DataFrame())
+    keyword_data_state = gr.State(pd.DataFrame())
+
+    orig_semantic_data_state = gr.State(pd.DataFrame())
     semantic_data_state = gr.State(pd.DataFrame())
 
     in_k1_info = gr.State("""k1: Constant used for influencing the term frequency saturation. After saturation is reached, additional
@@ -157,13 +161,14 @@ depends on factors such as the type of documents or queries. Information taken f
 
     ### BM25 SEARCH ###
     # Update dropdowns upon initial file load
-    in_bm25_file.upload(initial_data_load, inputs=[in_bm25_file], outputs=[in_bm25_column, search_df_join_column, keyword_data_state, search_index_state, embeddings_state, tokenised_state, load_finished_message, current_source])
+    in_bm25_file.upload(initial_data_load, inputs=[in_bm25_file], outputs=[in_bm25_column, search_df_join_column, keyword_data_state, orig_keyword_data_state, search_index_state, embeddings_state, tokenised_state, load_finished_message, current_source])
     in_join_file.upload(put_columns_in_join_df, inputs=[in_join_file], outputs=[in_join_column, join_data_state, in_join_message])
 
     # Load in BM25 data
-    load_bm25_data_button.click(fn=prepare_bm25_input_data, inputs=[in_bm25_file, in_bm25_column, keyword_data_state, tokenised_state, in_clean_data, return_intermediate_files], outputs=[corpus_state, load_finished_message, keyword_data_state, output_file, output_file, keyword_data_list_state]).\
+    load_bm25_data_button.click(fn=prepare_bm25_input_data, inputs=[in_bm25_file, in_bm25_column, keyword_data_state, tokenised_state, in_clean_data, return_intermediate_files], outputs=[corpus_state, load_finished_message, keyword_data_state, output_file, output_file, keyword_data_list_state, in_bm25_column]).\
     then(fn=prepare_bm25, inputs=[corpus_state, in_bm25_file, in_bm25_column, search_index_state, in_clean_data, return_intermediate_files, in_k1, in_b, in_alpha], outputs=[load_finished_message, output_file])#.\
 
+
     # BM25 search functions on click or enter
     keyword_search_button.click(fn=bm25_search, inputs=[keyword_query, in_no_search_results, keyword_data_state, in_bm25_column, join_data_state, in_clean_data, in_join_column, search_df_join_column], outputs=[output_single_text, output_file], api_name="keyword")
     keyword_query.submit(fn=bm25_search, inputs=[keyword_query, in_no_search_results, keyword_data_state, in_bm25_column, join_data_state, in_clean_data, in_join_column, search_df_join_column], outputs=[output_single_text, output_file])
@@ -174,10 +179,10 @@ depends on factors such as the type of documents or queries. Information taken f
     ### SEMANTIC SEARCH ###
 
     # Load in a csv/excel file for semantic search
-    in_semantic_file.upload(initial_data_load, inputs=[in_semantic_file], outputs=[in_semantic_column, search_df_join_column, semantic_data_state, search_index_state, embeddings_state, tokenised_state, semantic_load_progress, current_source_semantic])
+    in_semantic_file.upload(initial_data_load, inputs=[in_semantic_file], outputs=[in_semantic_column, search_df_join_column, semantic_data_state, orig_semantic_data_state, search_index_state, embeddings_state, tokenised_state, semantic_load_progress, current_source_semantic])
     load_semantic_data_button.click(
         csv_excel_text_to_docs, inputs=[semantic_data_state, in_semantic_file, in_semantic_column, in_clean_data, return_intermediate_files], outputs=[ingest_docs, semantic_load_progress]).\
-    then(docs_to_bge_embed_np_array, inputs=[ingest_docs, in_semantic_file, embeddings_state, return_intermediate_files, embedding_super_compress], outputs=[semantic_load_progress, vectorstore_state, semantic_output_file])
+    then(docs_to_bge_embed_np_array, inputs=[ingest_docs, in_semantic_file, embeddings_state, in_clean_data, return_intermediate_files, embedding_super_compress], outputs=[semantic_load_progress, vectorstore_state, semantic_output_file])
 
     # Semantic search query
     semantic_submit.click(bge_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, semantic_min_distance, vec_weight, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic")
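The new orig_keyword_data_state / orig_semantic_data_state states exist so the cleaning step can overwrite the working copy while the raw upload survives for a second run in the same session. A minimal sketch of that pattern, with illustrative component and function names rather than the app's own:

    import gradio as gr
    import pandas as pd

    def load_data():
        df = pd.DataFrame({"text": ["Some TEXT", "More TEXT"]})
        # Return the frame twice: once as the working copy, once as the untouched original
        return df, df

    def clean_data(df):
        cleaned = df.copy()
        cleaned["text"] = cleaned["text"].str.lower()
        return cleaned  # only the working state is overwritten

    with gr.Blocks() as demo:
        working_state = gr.State(pd.DataFrame())   # cleaned/working data
        orig_state = gr.State(pd.DataFrame())      # original, never modified
        load_btn = gr.Button("Load")
        clean_btn = gr.Button("Clean")
        load_btn.click(load_data, inputs=None, outputs=[working_state, orig_state])
        clean_btn.click(clean_data, inputs=[working_state], outputs=[working_state])

    demo.launch()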
search_funcs/bm25_functions.py CHANGED
@@ -231,11 +231,11 @@ class BM25:
 # These following functions are my own work
 
 def prepare_bm25_input_data(in_file, text_column, data_state, tokenised_state, clean="No", return_intermediate_files = "No", progress=gr.Progress(track_tqdm=True)):
-    print(in_file)
+    #print(in_file)
 
     if not in_file:
         print("No input file found. Please load in at least one file.")
-        return None, "No input file found. Please load in at least one file.", data_state, None, None, None, []
+        return None, "No input file found. Please load in at least one file.", data_state, None, None, [], gr.Dropdown(allow_custom_value=True, value=text_column, choices=data_state.columns.to_list())
 
     progress(0, desc = "Loading in data")
     file_list = [string.name for string in in_file]
@@ -245,10 +245,10 @@ def prepare_bm25_input_data(in_file, text_column, data_state, tokenised_state, c
     data_file_names = [string for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower() and "gz" not in string.lower()]
 
     if not data_file_names:
-        return None, "Please load in at least one csv/Excel/parquet data file.", data_state, None, None, None, []
+        return None, "Please load in at least one csv/Excel/parquet data file.", data_state, None, None, [], gr.Dropdown(allow_custom_value=True, value=text_column, choices=data_state.columns.to_list())
 
     if not text_column:
-        return None, "Please enter a column name to search.", data_state, None, None, None, []
+        return None, "Please enter a column name to search.", data_state, None, None, [], gr.Dropdown(allow_custom_value=True, value=text_column, choices=data_state.columns.to_list())
 
     data_file_name = data_file_names[0]
 
@@ -268,7 +268,7 @@ def prepare_bm25_input_data(in_file, text_column, data_state, tokenised_state, c
         corpus = list(df[text_column])
         message = "Tokenisation skipped - loading search index from file."
         print(message)
-        return corpus, message, df, None, None, None
+        return corpus, message, df, None, None, [], gr.Dropdown(allow_custom_value=True, value=text_column, choices=data_state.columns.to_list())
 
 
 
@@ -282,7 +282,7 @@ def prepare_bm25_input_data(in_file, text_column, data_state, tokenised_state, c
         df_list = initial_clean(df_list)
 
         # Save to file if you have cleaned the data
-        out_file_name, text_column = save_prepared_bm25_data(data_file_name, df_list, df, text_column)
+        out_file_name, text_column, df = save_prepared_bm25_data(data_file_name, df_list, df, text_column)
 
         clean_toc = time.perf_counter()
         clean_time_out = f"Cleaning the text took {clean_toc - clean_tic:0.1f} seconds."
@@ -328,9 +328,9 @@ def prepare_bm25_input_data(in_file, text_column, data_state, tokenised_state, c
 
         pd.DataFrame(data={"Corpus":corpus}).to_parquet(tokenised_data_file_name)
 
-        return corpus, message, df, out_file_name, tokenised_data_file_name, df_list
+        return corpus, message, df, out_file_name, tokenised_data_file_name, df_list, gr.Dropdown(allow_custom_value=True, value=text_column, choices=data_state.columns.to_list())
 
-    return corpus, message, df, out_file_name, None, df_list
+    return corpus, message, df, out_file_name, None, df_list, gr.Dropdown(allow_custom_value=True, value=text_column, choices=data_state.columns.to_list())
 
 def save_prepared_bm25_data(in_file_name, prepared_text_list, in_df, in_bm25_column, progress=gr.Progress(track_tqdm=True)):
 
@@ -356,7 +356,7 @@ def save_prepared_bm25_data(in_file_name, prepared_text_list, in_df, in_bm25_col
         prepared_df.to_parquet(file_name)
     else: file_name = None
 
-    return file_name, new_text_column
+    return file_name, new_text_column, prepared_df
 
 def prepare_bm25(corpus, in_file, text_column, search_index, clean, return_intermediate_files, k1=1.5, b = 0.75, alpha=-5, progress=gr.Progress(track_tqdm=True)):
     #bm25.save("saved_df_bm25")
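The extra in_bm25_column output works because, in Gradio 4, returning a component instance (here gr.Dropdown(allow_custom_value=True, ...)) from an event handler updates that component's choices and value in place; earlier Gradio versions used gr.Dropdown.update for this. A minimal sketch under that assumption, with hypothetical names:

    import gradio as gr
    import pandas as pd

    def refresh_columns(df):
        # Returning a Dropdown instance from a handler refreshes the existing dropdown
        return gr.Dropdown(choices=df.columns.to_list(), value=df.columns[0], allow_custom_value=True)

    with gr.Blocks() as demo:
        data_state = gr.State(pd.DataFrame({"title": ["a"], "body": ["b"]}))
        column_choice = gr.Dropdown(label="Column to search")
        refresh = gr.Button("Refresh columns")
        refresh.click(refresh_columns, inputs=[data_state], outputs=[column_choice])

    demo.launch()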
search_funcs/clean_funcs.py CHANGED
@@ -32,68 +32,27 @@ num_pattern_regex = r'[0-9]+'
 postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
 warning_pattern_regex = r'caution: this email originated from outside of the organization. do not click links or open attachments unless you recognize the sender and know the content is safe.'
 nbsp_pattern_regex = r' '
+multiple_spaces_regex = r'\s{2,}'
 
 # Pre-compiling the regular expressions for efficiency
-email_start_pattern = re.compile(email_start_pattern_regex)
-email_end_pattern = re.compile(email_end_pattern_regex)
-html_pattern = re.compile(html_pattern_regex)
-email_pattern = re.compile(email_end_pattern_regex)
-num_pattern = re.compile(num_pattern_regex)
-postcode_pattern = re.compile(postcode_pattern_regex)
-warning_pattern = re.compile(warning_pattern_regex)
-nbsp_pattern = re.compile(nbsp_pattern_regex)
-
-# def stem_sentence(sentence):
-
-#     words = sentence.split()
-#     stemmed_words = [stemmer.stem(word).lower().rstrip("'") for word in words]
-#     return stemmed_words
-
-# def stem_sentences(sentences, progress=gr.Progress()):
-#     """Stem each sentence in a list of sentences."""
-#     stemmed_sentences = [stem_sentence(sentence) for sentence in progress.tqdm(sentences)]
-#     return stemmed_sentences
-
-# def get_lemma_text(text):
-#     # Tokenize the input string into words
-#     tokens = word_tokenize(text)
-
-#     lemmas = []
-#     for word in tokens:
-#         if len(word) > 3:
-#             lemma = wn.morphy(word)
-#         else:
-#             lemma = None
-
-#         if lemma is None:
-#             lemmas.append(word)
-#         else:
-#             lemmas.append(lemma)
-#     return lemmas
-
-# def get_lemma_tokens(tokens):
-    # Tokenize the input string into words
-
-#     lemmas = []
-#     for word in tokens:
-#         if len(word) > 3:
-#             lemma = wn.morphy(word)
-#         else:
-#             lemma = None
-
-#         if lemma is None:
-#             lemmas.append(word)
-#         else:
-#             lemmas.append(lemma)
-#     return lemmas
+# email_start_pattern = re.compile(email_start_pattern_regex)
+# email_end_pattern = re.compile(email_end_pattern_regex)
+# html_pattern = re.compile(html_pattern_regex)
+# email_pattern = re.compile(email_end_pattern_regex)
+# num_pattern = re.compile(num_pattern_regex)
+# postcode_pattern = re.compile(postcode_pattern_regex)
+# warning_pattern = re.compile(warning_pattern_regex)
+# nbsp_pattern = re.compile(nbsp_pattern_regex)
+
 
 def initial_clean(texts , progress=gr.Progress()):
     texts = pl.Series(texts)#[]
 
-    text = texts.str.replace_all(email_start_pattern_regex, '')
+    text = texts.str.replace_all(html_pattern_regex, '')
+    text = text.str.replace_all(email_start_pattern_regex, '')
     text = text.str.replace_all(email_end_pattern_regex, '')
-    text = text.str.replace_all(html_pattern_regex, '')
     text = text.str.replace_all(email_pattern_regex, '')
+    text = text.str.replace_all(multiple_spaces_regex, ' ')
 
     text = text.to_list()
 
search_funcs/helper_functions.py CHANGED
@@ -13,7 +13,7 @@ import numpy as np
 from openpyxl import Workbook
 from openpyxl.cell.text import InlineFont
 from openpyxl.cell.rich_text import TextBlock, CellRichText
-from openpyxl.styles import Font
+from openpyxl.styles import Font, Alignment
 
 # Attempt to delete content of gradio temp folder
 def get_temp_folder_path():
@@ -103,6 +103,7 @@ def initial_data_load(in_file):
     tokenised_load =[]
     out_message = ""
     current_source = ""
+    df = pd.DataFrame()
 
     file_list = [string.name for string in in_file]
 
@@ -113,25 +114,25 @@
     if not data_file_names:
         out_message = "Please load in at least one csv/Excel/parquet data file."
         print(out_message)
-        return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), pd.DataFrame(), index_load, out_message
+        return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), pd.DataFrame(), pd.DataFrame(), index_load, out_message
 
-    data_file_name = data_file_names[0]
-
-    current_source = get_file_path_end_with_ext(data_file_name)
-
-
-    df = read_file(data_file_name)
-
-    if "pkl" not in data_file_name:
-
-        new_choices = list(df.columns)
-
-    elif "search_index" in data_file_name:
-        # If only the search_index found, need a data file too
-        new_choices = []
-
-    else: new_choices = ["page_contents"] + list(df[0].metadata.keys()) #["Documents"]
-    #print(new_choices)
+    # This if you have loaded in a documents object for the semantic search
+    if "pkl" in data_file_names[0]:
+        df = read_file(data_file_names[0])
+        new_choices = list(df[0].metadata.keys()) #["Documents"] #["page_contents"] +
+        current_source = get_file_path_end_with_ext(data_file_names[0])
+
+    # This if you have loaded in a csv/parquets/xlsx
+    else:
+        for file in data_file_names:
+
+            current_source = current_source + get_file_path_end_with_ext(file) + " "
+
+            df_new = read_file(file)
+
+            df = pd.concat([df, df_new], ignore_index = True)
+
+        new_choices = list(df.columns)
 
     concat_choices.extend(new_choices)
 
@@ -161,7 +162,7 @@
     out_message = "Initial data check successful. Next, choose a data column to search in the drop down above, then click 'Load data'"
     print(out_message)
 
-    return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df, index_load, embed_load, tokenised_load, out_message, current_source
+    return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df, df, index_load, embed_load, tokenised_load, out_message, current_source
 
 def put_columns_in_join_df(in_file):
     '''
@@ -291,12 +292,19 @@ def create_highlighted_excel_wb(df, search_text, column_to_highlight):
     for cell in sheet[1]:
         cell.font = Font(bold=True)
 
+    column_width = 150 # Adjust as needed
+    relevant_column_no = (df.columns == column_to_highlight).argmax() + 1
+    print(relevant_column_no)
+    sheet.column_dimensions[sheet.cell(row=1, column=relevant_column_no).column_letter].width = column_width
+
     # Find substrings in cells and highlight
     for r_idx, row in enumerate(df.itertuples(), start=2):
         for c_idx, cell_value in enumerate(row[1:], start=1):
             sheet.cell(row=r_idx, column=c_idx, value=cell_value)
             if df.columns[c_idx - 1] == column_to_highlight:
+
                 html_text, combined_positions = highlight_found_text(search_text, cell_value)
                 sheet.cell(row=r_idx, column=c_idx).value = create_rich_text_cell_from_positions(cell_value, combined_positions)
+                sheet.cell(row=r_idx, column=c_idx).alignment = Alignment(wrap_text=True)
 
     return wb
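The xlsx formatting change comes down to two openpyxl settings: widening the column that holds the searched text and wrapping its cell contents. A minimal standalone sketch of those two calls (file name, header, and width are illustrative):

    from openpyxl import Workbook
    from openpyxl.styles import Alignment, Font

    wb = Workbook()
    sheet = wb.active
    sheet.append(["search_text"])                        # header row
    sheet.append(["A long passage of matched text..."])  # data row

    sheet["A1"].font = Font(bold=True)

    # Widen the text column and wrap its contents, as create_highlighted_excel_wb now does
    sheet.column_dimensions["A"].width = 150
    sheet["A2"].alignment = Alignment(wrap_text=True)

    wb.save("example_output.xlsx")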
search_funcs/semantic_functions.py CHANGED
@@ -25,7 +25,7 @@ else:
 
 print("Device used is: ", torch_device)
 
-#from search_funcs.helper_functions import get_file_path_end
+from search_funcs.helper_functions import create_highlighted_excel_wb
 
 PandasDataFrame = Type[pd.DataFrame]
 
@@ -45,106 +45,10 @@ PandasDataFrame = Type[pd.DataFrame]
 embeddings_name = "BAAI/bge-small-en-v1.5"
 local_embeddings_location = "model/bge/"
 
-#try:
-#    tokenizer = AutoTokenizer.from_pretrained(embeddings_name)
-#    embeddings_model = AutoModel.from_pretrained(local_embeddings_location, local_files_only=True).to(torch_device)
-#except:
-#    tokenizer = AutoTokenizer.from_pretrained(embeddings_name)
-#    embeddings_model = AutoModel.from_pretrained(embeddings_name).to(torch_device)
-
 # Not using SentenceTransformer here
 embeddings_model = SentenceTransformer(embeddings_name)
 
-# def calc_bge_norm_embeddings(docs, embeddings_model=embeddings_model, tokenizer=tokenizer, progress=gr.Progress(track_tqdm=True)):
-#     # Tokenize sentences
-#     print("Tokenising")
-#     encoded_input = tokenizer(docs, padding=True, truncation=True, return_tensors='pt', max_length=32).to(torch_device)
-
-#     # Compute token embeddings
-#     print("Calculating embeddings")
-#     with torch.no_grad():
-#         model_output = embeddings_model(**encoded_input).to(torch_device)
-#         # Perform pooling. In this case, cls pooling.
-#         embeddings_out = model_output[0][:, 0]
-#     # normalize embeddings
-#     embeddings_out = torch.nn.functional.normalize(embeddings_out, p=2, dim=1)
-#     #print("Sentence embeddings:", embeddings_out)
-
-#     return embeddings_out
-
-
-def docs_to_jina_embed_np_array(docs_out, in_file, embeddings_state, return_intermediate_files = "No", embeddings_super_compress = "No", embeddings = embeddings_model, progress=gr.Progress(track_tqdm=True)):
-    '''
-    Takes a Langchain document class and saves it into a Chroma sqlite file.
-    '''
-    if not in_file:
-        out_message = "No input file found. Please load in at least one file."
-        print(out_message)
-        return out_message, None, None
-
-
-    progress(0.6, desc = "Loading/creating embeddings")
-
-    print(f"> Total split documents: {len(docs_out)}")
-
-    #print(docs_out)
-
-    page_contents = [doc.page_content for doc in docs_out]
-
-    ## Load in pre-embedded file if exists
-    file_list = [string.name for string in in_file]
-
-    #print(file_list)
-
-    embeddings_file_names = [string for string in file_list if "embedding" in string.lower()]
-    data_file_names = [string for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower()]# and "gz" not in string.lower()]
-    data_file_name = data_file_names[0]
-    data_file_name_no_ext = get_file_path_end(data_file_name)
-
-    out_message = "Document processing complete. Ready to search."
-
-    # print("embeddings loaded: ", embeddings_out)
-
-    if embeddings_state.size == 0:
-        tic = time.perf_counter()
-        print("Starting to embed documents.")
-        #embeddings_list = []
-        #for page in progress.tqdm(page_contents, desc = "Preparing search index", unit = "rows"):
-        #    embeddings_list.append(embeddings.encode(sentences=page, max_length=1024).tolist())
-
-        embeddings_out = embeddings.encode(sentences=page_contents, max_length=1024, show_progress_bar = True, batch_size = 32) # For Jina embeddings
-        #embeddings_list = embeddings.encode(sentences=page_contents, normalize_embeddings=True).tolist() # For BGE embeddings
-        #embeddings_list = embeddings.encode(sentences=page_contents).tolist() # For minilm
-
-        toc = time.perf_counter()
-        time_out = f"The embedding took {toc - tic:0.1f} seconds"
-        print(time_out)
-
-        # If you want to save your files for next time
-        if return_intermediate_files == "Yes":
-            progress(0.9, desc = "Saving embeddings to file")
-            if embeddings_super_compress == "No":
-                semantic_search_file_name = data_file_name_no_ext + '_' + 'embeddings.npz'
-                np.savez_compressed(semantic_search_file_name, embeddings_out)
-            else:
-                semantic_search_file_name = data_file_name_no_ext + '_' + 'embedding_compress.npz'
-                embeddings_out_round = np.round(embeddings_out, 3)
-                embeddings_out_round *= 100 # Rounding not currently used
-                np.savez_compressed(semantic_search_file_name, embeddings_out_round)
-
-            return out_message, embeddings_out, semantic_search_file_name
-
-        return out_message, embeddings_out, None
-    else:
-        # Just return existing embeddings if already exist
-        embeddings_out = embeddings_state
-
-    print(out_message)
-
-    return out_message, embeddings_out, None#, None
-
-
-def docs_to_bge_embed_np_array(docs_out, in_file, embeddings_state, return_intermediate_files = "No", embeddings_super_compress = "No", embeddings_model = embeddings_model, progress=gr.Progress(track_tqdm=True)):
+def docs_to_bge_embed_np_array(docs_out, in_file, embeddings_state, clean, return_intermediate_files = "No", embeddings_super_compress = "No", embeddings_model = embeddings_model, progress=gr.Progress(track_tqdm=True)):
     '''
     Takes a Langchain document class and saves it into a Chroma sqlite file.
     '''
@@ -197,6 +101,9 @@ def docs_to_bge_embed_np_array(docs_out, in_file, embeddings_state, return_inter
 
     # If you want to save your files for next time
     if return_intermediate_files == "Yes":
+        if clean == "Yes": data_file_name_no_ext = data_file_name_no_ext + "_cleaned"
+        else: data_file_name_no_ext = data_file_name_no_ext
+
         progress(0.9, desc = "Saving embeddings to file")
         if embeddings_super_compress == "No":
             semantic_search_file_name = data_file_name_no_ext + '_bge_embeddings.npz'
@@ -273,7 +180,7 @@ def process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_c
     # Concatenate the original DataFrame with the expanded metadata DataFrame
     results_df_out = pd.concat([length_more_limit.drop('metadatas', axis=1), df_metadata_expanded], axis=1)
 
-    results_df_out = results_df_out.rename(columns={"documents":orig_df_col})
+    results_df_out = results_df_out.rename(columns={"documents":"search_text"})
 
     results_df_out = results_df_out.drop(["page_section", "row", "source", "id"], axis=1, errors="ignore")
     results_df_out['distances'] = round(results_df_out['distances'].astype(float), 3)
@@ -371,7 +278,11 @@ def bge_simple_retrieval(query_str:str, vectorstore, docs, orig_df_col:str, k_va
     print("Saving search output to file")
     progress(0.7, desc = "Saving search output to file")
 
-    results_df_out.to_excel(results_df_name, index= None)
+    # Highlight found text and save to file
+    results_df_out_wb = create_highlighted_excel_wb(results_df_out, query_str, "search_text")
+    results_df_out_wb.save(results_df_name)
+
+    #results_df_out.to_excel(results_df_name, index= None)
     results_first_text = results_df_out.iloc[0, 1]
 
     print("Returning results")
@@ -379,7 +290,77 @@
     return results_first_text, results_df_name
 
 
-def jina_simple_retrieval(query_str:str, vectorstore, docs, orig_df_col:str, k_val:int, out_passages:int,
+def docs_to_jina_embed_np_array_deprecated(docs_out, in_file, embeddings_state, return_intermediate_files = "No", embeddings_super_compress = "No", embeddings = embeddings_model, progress=gr.Progress(track_tqdm=True)):
+    '''
+    Takes a Langchain document class and saves it into a Chroma sqlite file.
+    '''
+    if not in_file:
+        out_message = "No input file found. Please load in at least one file."
+        print(out_message)
+        return out_message, None, None
+
+
+    progress(0.6, desc = "Loading/creating embeddings")
+
+    print(f"> Total split documents: {len(docs_out)}")
+
+    #print(docs_out)
+
+    page_contents = [doc.page_content for doc in docs_out]
+
+    ## Load in pre-embedded file if exists
+    file_list = [string.name for string in in_file]
+
+    #print(file_list)
+
+    embeddings_file_names = [string for string in file_list if "embedding" in string.lower()]
+    data_file_names = [string for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower()]# and "gz" not in string.lower()]
+    data_file_name = data_file_names[0]
+    data_file_name_no_ext = get_file_path_end(data_file_name)
+
+    out_message = "Document processing complete. Ready to search."
+
+    # print("embeddings loaded: ", embeddings_out)
+
+    if embeddings_state.size == 0:
+        tic = time.perf_counter()
+        print("Starting to embed documents.")
+        #embeddings_list = []
+        #for page in progress.tqdm(page_contents, desc = "Preparing search index", unit = "rows"):
+        #    embeddings_list.append(embeddings.encode(sentences=page, max_length=1024).tolist())
+
+        embeddings_out = embeddings.encode(sentences=page_contents, max_length=1024, show_progress_bar = True, batch_size = 32) # For Jina embeddings
+        #embeddings_list = embeddings.encode(sentences=page_contents, normalize_embeddings=True).tolist() # For BGE embeddings
+        #embeddings_list = embeddings.encode(sentences=page_contents).tolist() # For minilm
+
+        toc = time.perf_counter()
+        time_out = f"The embedding took {toc - tic:0.1f} seconds"
+        print(time_out)
+
+        # If you want to save your files for next time
+        if return_intermediate_files == "Yes":
+            progress(0.9, desc = "Saving embeddings to file")
+            if embeddings_super_compress == "No":
+                semantic_search_file_name = data_file_name_no_ext + '_' + 'embeddings.npz'
+                np.savez_compressed(semantic_search_file_name, embeddings_out)
+            else:
+                semantic_search_file_name = data_file_name_no_ext + '_' + 'embedding_compress.npz'
+                embeddings_out_round = np.round(embeddings_out, 3)
+                embeddings_out_round *= 100 # Rounding not currently used
+                np.savez_compressed(semantic_search_file_name, embeddings_out_round)
+
+            return out_message, embeddings_out, semantic_search_file_name
+
+        return out_message, embeddings_out, None
+    else:
+        # Just return existing embeddings if already exist
+        embeddings_out = embeddings_state
+
+    print(out_message)
+
+    return out_message, embeddings_out, None#, None
+
+def jina_simple_retrieval_deprecated(query_str:str, vectorstore, docs, orig_df_col:str, k_val:int, out_passages:int,
                        vec_score_cut_off:float, vec_weight:float, in_join_file, in_join_column = None, search_df_join_column = None, device = torch_device, embeddings = embeddings_model, progress=gr.Progress(track_tqdm=True)): # ,vectorstore, embeddings
 
     # print("vectorstore loaded: ", vectorstore)
@@ -464,6 +445,7 @@ def jina_simple_retrieval(query_str:str, vectorstore, docs, orig_df_col:str, k_v
     #if os.path.isfile(chromadb_file):
     #    os.remove(chromadb_file)
 
+
 def docs_to_chroma_save_deprecated(docs_out, embeddings = embeddings_model, progress=gr.Progress()):
     '''
     Takes a Langchain document class and saves it into a Chroma sqlite file. Not currently used.
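The new "_cleaned" suffix keeps embeddings generated from cleaned text separate from those generated from raw text, so reloading in the same session picks up the right file. A small sketch of the save/load round trip with np.savez_compressed (array shape and file names are illustrative; bge-small-en-v1.5 produces 384-dimensional vectors):

    import numpy as np

    embeddings_out = np.random.rand(100, 384).astype(np.float32)  # stand-in for encoded passages
    data_file_name_no_ext = "my_data"
    clean = "Yes"

    # Mirror the new naming logic: cleaned data gets its own embeddings file
    if clean == "Yes":
        data_file_name_no_ext = data_file_name_no_ext + "_cleaned"

    semantic_search_file_name = data_file_name_no_ext + '_bge_embeddings.npz'
    np.savez_compressed(semantic_search_file_name, embeddings_out)

    # Loading back: savez_compressed stores unnamed arrays under arr_0, arr_1, ...
    with np.load(semantic_search_file_name) as npz:
        reloaded = npz["arr_0"]
    print(reloaded.shape)  # (100, 384)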
search_funcs/semantic_ingest_functions.py CHANGED
@@ -309,7 +309,7 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
     elif clean == "Yes":
         #pd.DataFrame(data = {"Documents":page_content_series_string}).to_parquet(file_name + "_prepared_docs_clean.parquet")
 
-        with gzip.open(file_name + "cleaned_prepared_docs.pkl.gz", 'wb') as file:
+        with gzip.open(file_name + "_cleaned_prepared_docs.pkl.gz", 'wb') as file:
             pickle.dump(doc_sections, file)
 
         #pd.Series(doc_sections).to_pickle(file_name + "_prepared_docs_clean.pkl")
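The fix adds the missing underscore so the cleaned-documents file follows the "<name>_cleaned_prepared_docs.pkl.gz" naming used elsewhere. For reference, a minimal gzip + pickle round trip of the kind this function performs (stand-in data rather than the app's document objects):

    import gzip
    import pickle

    doc_sections = ["section one", "section two"]  # stand-in for the prepared documents
    file_name = "my_data"

    # Write the prepared documents with the corrected suffix
    with gzip.open(file_name + "_cleaned_prepared_docs.pkl.gz", 'wb') as file:
        pickle.dump(doc_sections, file)

    # Read them back in a later session
    with gzip.open(file_name + "_cleaned_prepared_docs.pkl.gz", 'rb') as file:
        reloaded = pickle.load(file)

    print(reloaded)  # ['section one', 'section two']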