seanpedrickcase committed on
Commit d3b1ac5
1 Parent(s): 6768a6d

Now works correctly with npz. Minor formatting improvements

Files changed (2)
  1. app.py +39 -77
  2. search_funcs/ingest.py +0 -6
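
The npz part of the fix comes down to standard NumPy behaviour: np.savez_compressed stores a positional array under the default key 'arr_0', and np.load on an .npz file returns an archive object rather than the array itself, which is why the loader in docs_to_np_array below now indexes ['arr_0']. A minimal standalone sketch of that round trip (illustrative only, not code from this repo; the file name matches the one used in app.py):

    import numpy as np

    embeddings_out = np.random.rand(3, 4).astype(np.float32)  # stand-in for the real embeddings

    # A positional argument to savez_compressed is stored under the default key 'arr_0'
    np.savez_compressed('semantic_search_embeddings.npz', embeddings_out)

    # np.load on an .npz returns an NpzFile (a lazy mapping of arrays), not an ndarray,
    # so the array has to be pulled out by key -- the ['arr_0'] added in this commit
    loaded = np.load('semantic_search_embeddings.npz')['arr_0']

    assert np.array_equal(loaded, embeddings_out)
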
app.py CHANGED
@@ -19,6 +19,8 @@ import pandas as pd
 import numpy as np
 import os
 import time
+import math
+from itertools import islice
 from chromadb.config import Settings
 
 from transformers import AutoModel
@@ -124,28 +126,14 @@ def prepare_input_data(in_file, text_column, clean="No", progress=gr.Progress())
 
     #df = pd.read_parquet(file_in.name)
    df_list = list(df[text_column].astype(str).str.lower())
-    #df_list = df
 
-    import math
-
-    def get_total_batches(my_list, batch_size):
-        return math.ceil(len(my_list) / batch_size)
-
-    from itertools import islice
-
-    def batch(iterable, batch_size):
-        iterator = iter(iterable)
-        for first in iterator:
-            yield [first] + list(islice(iterator, batch_size - 1))
-
-    #def batch(my_list, batch_size):
-    #    # Splitting the list into batches
-    #    for i in range(0, len(my_list), batch_size):
-    #        batch = my_list[i:i + batch_size]
-
-    #    # Process each batch
-    #    # Replace this with your processing logic
-    #    #print("Processing batch:", batch)
+    # def get_total_batches(my_list, batch_size):
+    #     return math.ceil(len(my_list) / batch_size)
+
+    # def batch(iterable, batch_size):
+    #     iterator = iter(iterable)
+    #     for first in iterator:
+    #         yield [first] + list(islice(iterator, batch_size - 1))
 
    batch_size = 256
 
@@ -157,29 +145,7 @@ def prepare_input_data(in_file, text_column, clean="No", progress=gr.Progress())
         # Save to file if you have cleaned the data
        out_file_name = save_prepared_data(in_file, df_list_clean, df, text_column)
 
-        #corpus = [word_tokenize(doc.lower()) for doc in df_list_clean]
-        #corpus = [word_tokenize(doc.lower()) for doc in progress.tqdm(df_list_clean, desc = "Tokenising text", unit = "rows")]
-
-        #total_batches = get_total_batches(df_list_clean, batch_size)
-        #data_batched = batch(df_list_clean, batch_size)
 
-        #print(data_batched)
-
-        #print(df_list_clean[0])
-
-        # Using encode_batch
-        #encodings = tokenizer.encode_batch(texts)
-
-        # Extracting tokens
-        #tokens_list = [encoding.tokens for encoding in encodings]
-
-        #corpus = [tokenizer(doc.lower()) for doc in progress.tqdm(df_list_clean, desc = "Tokenising text", unit = "rows")]
-        #corpus = [tokenizer.encode(doc_batch) for doc_batch in progress.tqdm(data_batched, desc = "Tokenising text", unit = "batches out of " + str(total_batches))] # for jina
-        # print(df_list_clean)
-        # corpus = tokenizer.batch_encode_plus(df_list_clean).tokens
-
-        #corpus = [[token.text for token in nlp(text)] for text in df_list_clean]
-
         # Tokenize texts in batches
        if not tokenised_df.empty:
            corpus = tokenised_df.iloc[:,0].tolist()
@@ -189,24 +155,11 @@ def prepare_input_data(in_file, text_column, clean="No", progress=gr.Progress())
            corpus = []
            for doc in tokenizer.pipe(progress.tqdm(df_list_clean, desc = "Tokenising text", unit = "rows"), batch_size=batch_size):
                corpus.append([token.text for token in doc])
-            #for doc in nlp.pipe(progress.tqdm(df_list_clean, desc = "Tokenising text", unit = "batches out of " + str(total_batches)), batch_size=batch_size): # You can adjust batch_size based on your requirement
-            #    corpus.append([token.text for token in doc])
-
-
+
    else:
-        #total_batches = get_total_batches(df_list, batch_size)
-        #data_batched = batch(df_list, batch_size)
-
-        #print(data_batched)
-
-        #corpus = [word_tokenize(doc.lower()) for doc in df_list]
-        #corpus = [word_tokenize(doc.lower()) for doc in progress.tqdm(df_list, desc = "Tokenising text", unit = "rows")]
-        #corpus = [tokenizer.encode(doc_batch) for doc_batch in progress.tqdm(data_batched, desc = "Tokenising text", unit = "batches out of " + str(total_batches))] # for jina
-        #corpus = tokenizer.batch_encode_plus(df_list).tokens # for jina
-
+
        print(df_list[0])
-        #corpus = [[token.text for token in nlp(text)] for text in df_list]
-
+
         # Tokenize texts in batches
        if not tokenised_df.empty:
            corpus = tokenised_df.iloc[:,0].tolist()
@@ -216,10 +169,8 @@ def prepare_input_data(in_file, text_column, clean="No", progress=gr.Progress())
 
            corpus = []
            for doc in tokenizer.pipe(progress.tqdm(df_list, desc = "Tokenising text", unit = "rows"), batch_size=batch_size):
-            #for doc in nlp.pipe(progress.tqdm(df_list, desc = "Tokenising text", unit = "batches out of " + str(total_batches)), #batch_size=batch_size): # You can adjust batch_size based on your requirement
                corpus.append([token.text for token in doc])
 
-        #corpus = tokenizer(df_list)
        out_file_name = None
 
        print(corpus[0])
@@ -235,9 +186,10 @@ def prepare_input_data(in_file, text_column, clean="No", progress=gr.Progress())
    else:
        message = "Data loaded. Warning: dataset may be too short to get consistent search results."
 
-    pd.DataFrame(data={"Corpus":corpus}).to_parquet("keyword_search_tokenised_data.parquet")
+    tokenised_data_file_name = "keyword_search_tokenised_data.parquet"
+    pd.DataFrame(data={"Corpus":corpus}).to_parquet(tokenised_data_file_name)
 
-    return corpus, message, df, out_file_name
+    return corpus, message, df, out_file_name, tokenised_data_file_name
 
 def get_file_path_end(file_path):
     # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
@@ -551,12 +503,14 @@ def docs_to_np_array(docs_out, in_file, embeddings = embeddings, progress=gr.Pro
     ## Load in pre-embedded file if exists
    file_list = [string.name for string in in_file]
 
-    print(file_list)
+    #print(file_list)
 
    embeddings_file_names = [string for string in file_list if "embedding" in string]
 
+    out_message = "Document processing complete. Ready to search."
+
    if embeddings_file_names:
-        embeddings_out = np.load(embeddings_file_names[0])
+        embeddings_out = np.load(embeddings_file_names[0])['arr_0']
        print("embeddings loaded: ", embeddings_out)
 
    if not embeddings_file_names:
@@ -568,16 +522,24 @@ def docs_to_np_array(docs_out, in_file, embeddings = embeddings, progress=gr.Pro
        embeddings_out = embeddings.encode(sentences=page_contents, max_length=1024, show_progress_bar = True, batch_size = 32) # For Jina embeddings
         #embeddings_list = embeddings.encode(sentences=page_contents, normalize_embeddings=True).tolist() # For BGE embeddings
         #embeddings_list = embeddings.encode(sentences=page_contents).tolist() # For minilm
+
+        print(embeddings_out)
+        embeddings_out_round = np.round(embeddings_out, 4)
 
        toc = time.perf_counter()
        time_out = f"The embedding took {toc - tic:0.1f} seconds"
 
-        np.savez_compressed('semantic_search_embeddings.npz', embeddings_out)
+        semantic_search_file_name = 'semantic_search_embeddings.npz'
+        semantic_search_rounded_file_name = 'semantic_search_embeddings_rounded.npz'
 
-        out_message = "Document processing complete. Ready to search."
+        np.savez_compressed(semantic_search_file_name, embeddings_out)
+        np.savez_compressed(semantic_search_rounded_file_name, embeddings_out_round)
+
+        return out_message, embeddings_out, semantic_search_file_name, semantic_search_rounded_file_name
+
    print(out_message)
 
-    return out_message, embeddings_out
+    return out_message, embeddings_out, None, None
 
 def process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_cut_off, vec_weight, orig_df_col, in_join_column, search_df_join_column):
 
@@ -787,7 +749,7 @@ depends on factors such as the type of documents or queries. Information taken f
        current_source = gr.Textbox(label="Current data source(s)", value="None")
 
        with gr.Accordion(label = "Load in data", open=True):
-            in_bm25_file = gr.File(label="Upload your search data here", file_count= 'multiple', file_types = ['.parquet', '.csv'])
+            in_bm25_file = gr.File(label="Upload data for keyword search", file_count= 'multiple', file_types = ['.parquet', '.csv'])
            with gr.Row():
                in_bm25_column = gr.Dropdown(label="Enter the name of the text column in the data file to search")
                load_bm25_data_button = gr.Button(value="Load data")
@@ -815,9 +777,9 @@ depends on factors such as the type of documents or queries. Information taken f
 
        with gr.Row():
            in_semantic_column = gr.Dropdown(label="Enter the name of the text column in the data file to search")
-            load_semantic_data_button = gr.Button(value="Load in data file", variant="secondary")
+            load_semantic_data_button = gr.Button(value="Load data", variant="secondary")
 
-        ingest_embed_out = gr.Textbox(label="File/web page preparation progress")
+        semantic_load_progress = gr.Textbox(label="Load progress")
 
        semantic_query = gr.Textbox(label="Enter semantic search query here")
        semantic_submit = gr.Button(value="Start semantic search", variant="secondary", scale = 1)
@@ -865,25 +827,25 @@ depends on factors such as the type of documents or queries. Information taken f
    in_join_file.upload(put_columns_in_join_df, inputs=[in_join_file, in_join_column], outputs=[in_join_column])
 
     # Load in BM25 data
-    load_bm25_data_button.click(fn=prepare_input_data, inputs=[in_bm25_file, in_bm25_column, in_clean_data], outputs=[corpus_state, load_finished_message, data_state, output_file]).\
+    load_bm25_data_button.click(fn=prepare_input_data, inputs=[in_bm25_file, in_bm25_column, in_clean_data], outputs=[corpus_state, load_finished_message, data_state, output_file, output_file]).\
    then(fn=prepare_bm25, inputs=[corpus_state, in_k1, in_b, in_alpha], outputs=[load_finished_message]).\
    then(fn=put_columns_in_df, inputs=[in_bm25_file, in_bm25_column], outputs=[in_bm25_column, in_clean_data, search_df_join_column])
 
     # BM25 search functions on click or enter
-    keyword_search_button.click(fn=bm25_search, inputs=[keyword_query, in_no_search_results, data_state, in_bm25_column, in_clean_data, in_join_file, in_join_column, search_df_join_column], outputs=[output_single_text, output_file, mod_query], api_name="search")
+    keyword_search_button.click(fn=bm25_search, inputs=[keyword_query, in_no_search_results, data_state, in_bm25_column, in_clean_data, in_join_file, in_join_column, search_df_join_column], outputs=[output_single_text, output_file, mod_query], api_name="keyword")
    keyword_query.submit(fn=bm25_search, inputs=[keyword_query, in_no_search_results, data_state, in_bm25_column, in_clean_data, in_join_file, in_join_column, search_df_join_column], outputs=[output_single_text, output_file, mod_query])
 
     ### SEMANTIC SEARCH ###
     # Load in a csv/excel file for semantic search
    in_semantic_file.upload(put_columns_in_df, inputs=[in_semantic_file, in_semantic_column], outputs=[in_semantic_column, in_clean_data, search_df_join_column])
-    load_semantic_data_button.click(ing.parse_csv_or_excel, inputs=[in_semantic_file, in_semantic_column], outputs=[ingest_text, current_source_semantic, ingest_embed_out]).\
-    then(ing.csv_excel_text_to_docs, inputs=[ingest_text, in_semantic_column], outputs=[ingest_docs, ingest_embed_out]).\
-    then(docs_to_np_array, inputs=[ingest_docs, in_semantic_file], outputs=[ingest_embed_out, vectorstore_state])
+    load_semantic_data_button.click(ing.parse_csv_or_excel, inputs=[in_semantic_file, in_semantic_column], outputs=[ingest_text, current_source_semantic, semantic_load_progress]).\
+    then(ing.csv_excel_text_to_docs, inputs=[ingest_text, in_semantic_column], outputs=[ingest_docs, semantic_load_progress]).\
+    then(docs_to_np_array, inputs=[ingest_docs, in_semantic_file], outputs=[semantic_load_progress, vectorstore_state, semantic_output_file, semantic_output_file])
 
     # Semantic search query
    semantic_submit.click(jina_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, vec_score_cut_off, vec_weight, in_join_file, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic")
 
-    semantic_query.submit(jina_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, vec_score_cut_off, vec_weight, in_join_file, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic")
+    semantic_query.submit(jina_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, vec_score_cut_off, vec_weight, in_join_file, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file])
 
     # Dummy functions just to get dropdowns to work correctly with Gradio 3.50
    in_bm25_column.change(dummy_function, in_bm25_column, None)
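
The other functional change in app.py is that prepare_input_data now writes the tokenised corpus to keyword_search_tokenised_data.parquet and returns the file name as an extra output, so the tokenised data can be downloaded and re-uploaded later to skip re-tokenising (the `if not tokenised_df.empty` branch above appears to read it straight back). A rough sketch of that save-and-reuse round trip (illustrative only; assumes pandas with the pyarrow parquet engine and a toy corpus in place of the real one):

    import pandas as pd

    corpus = [["first", "document"], ["second", "document"]]  # stand-in tokenised corpus

    tokenised_data_file_name = "keyword_search_tokenised_data.parquet"
    pd.DataFrame(data={"Corpus": corpus}).to_parquet(tokenised_data_file_name)

    # On a later run the saved file can be read back and reused instead of re-tokenising,
    # mirroring the tokenised_df.iloc[:, 0].tolist() path in prepare_input_data
    tokenised_df = pd.read_parquet(tokenised_data_file_name)
    corpus_reloaded = tokenised_df.iloc[:, 0].tolist()  # with pyarrow, entries come back as arrays rather than lists
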
search_funcs/ingest.py CHANGED
@@ -36,9 +36,6 @@ class Document(BaseModel):
     """
    type: Literal["Document"] = "Document"
 
-
-# -
-
 split_strat = ["\n\n", "\n", ". ", "! ", "? "]
 chunk_size = 500
 chunk_overlap = 0
@@ -221,7 +218,6 @@ def text_to_docs(text_dict: dict, chunk_size: int = chunk_size) -> List[Document
 
    return doc_sections#, page_docs
 
-
 def write_out_metadata_as_string(metadata_in):
     # If metadata_in is a single dictionary, wrap it in a list
    if isinstance(metadata_in, dict):
@@ -301,8 +297,6 @@ def csv_excel_text_to_docs(df, text_column='text', chunk_size=None) -> List[Docu
 
    return doc_sections, message
 
-
-
 def clean_line_breaks(text):
     # Replace \n and \r\n with a space
    return text.replace('\n', ' ').replace('\r', ' ').replace('\r\n', ' ')