Sean-Case committed
Commit 2a8aba8
1 Parent(s): acfac99

Now outputs the correct dataframe for semantic search. Can join on extra details.

Files changed (2):
  1. app.py +68 -155
  2. search_funcs/ingest.py +1 -3
app.py CHANGED
@@ -26,8 +26,14 @@ import search_funcs.chatfuncs as chatf
 
 # Import Chroma and instantiate a client. The default Chroma client is ephemeral, meaning it will not save to disk.
 import chromadb
+#from typing_extensions import Protocol
+#from chromadb import Documents, EmbeddingFunction, Embeddings
 
-#collection = client.create_collection(name="my_collection")
+# Remove the Chroma database file if it exists, as it can cause issues
+chromadb_file = "chroma.sqlite3"
+
+if os.path.isfile(chromadb_file):
+    os.remove(chromadb_file)
 
 def prepare_input_data(in_file, text_column, clean="No", progress=gr.Progress()):
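The deleted comment and the new cleanup code both concern where Chroma keeps its data: the default client is in-memory, while a persistent client writes a chroma.sqlite3 file that can linger between runs. A minimal sketch of the two client types (the collection name is illustrative, not from this repo):

import chromadb

# Ephemeral client: collections live in memory and disappear when the process exits.
client = chromadb.Client()

# Persistent client, for comparison: writes chroma.sqlite3 under the given path,
# which is why a stale file from an earlier run is deleted on startup above.
# client = chromadb.PersistentClient(path=".")

collection = client.create_collection(name="my_collection")
collection.add(ids=["0"], documents=["hello world"])
print(collection.count())  # -> 1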
 
@@ -257,25 +263,13 @@ def dummy_function(gradio_component):
 def display_info(info_component):
     gr.Info(info_component)
 
-embeddings_name = "jinaai/jina-embeddings-v2-small-en"
-
-#embeddings_name = "BAAI/bge-base-en-v1.5"
-import chromadb
-from typing_extensions import Protocol
-from chromadb import Documents, EmbeddingFunction, Embeddings
-
-embeddings_model = AutoModel.from_pretrained(embeddings_name, trust_remote_code=True)
-
-class MyEmbeddingFunction(EmbeddingFunction):
-    def __call__(self, input) -> Embeddings:
-
-        embeddings = []
-        for text in input:
-            embeddings.append(embeddings_model.encode(text))
-
-        return embeddings
+# class MyEmbeddingFunction(EmbeddingFunction):
+#     def __call__(self, input) -> Embeddings:
+#         embeddings = []
+#         for text in input:
+#             embeddings.append(embeddings_model.encode(text))
+
+#         return embeddings
 
 def load_embeddings(embeddings_name = "jinaai/jina-embeddings-v2-small-en"):
     '''
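The class removed above (and kept only as a comment in the new version) follows Chroma's custom embedding-function pattern. A runnable sketch of the same idea, assuming any model object that exposes an encode() method returning one vector per string:

from chromadb import Documents, EmbeddingFunction, Embeddings

class MyEmbeddingFunction(EmbeddingFunction):
    # Wraps a model with an .encode() method so Chroma can embed documents with it.
    def __init__(self, model):
        self.model = model

    def __call__(self, input: Documents) -> Embeddings:
        # Chroma passes a list of strings and expects one list of floats per string.
        return [self.model.encode(text).tolist() for text in input]

# Hypothetical usage:
# collection = client.create_collection("docs", embedding_function=MyEmbeddingFunction(embeddings_model))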
@@ -293,15 +287,17 @@ def load_embeddings(embeddings_name = "jinaai/jina-embeddings-v2-small-en"):
 
     return embeddings
 
+# Load embeddings
+embeddings_name = "jinaai/jina-embeddings-v2-small-en"
+#embeddings_name = "BAAI/bge-base-en-v1.5"
+embeddings_model = AutoModel.from_pretrained(embeddings_name, trust_remote_code=True)
 embeddings = load_embeddings(embeddings_name)
 
-def docs_to_chroma_save(docs_out, embeddings=embeddings, progress=gr.Progress()):
+def docs_to_chroma_save(docs_out, embeddings = embeddings, progress=gr.Progress()):
     '''
     Takes a Langchain document class and saves it into a Chroma sqlite file.
     '''
 
-
-
     print(f"> Total split documents: {len(docs_out)}")
 
     #print(docs_out)
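For context, the embedding model relocated in this hunk is loaded with trust_remote_code=True because the Jina v2 models ship their own encode() implementation. A short sketch (the sample sentence is illustrative):

from transformers import AutoModel

embeddings_model = AutoModel.from_pretrained(
    "jinaai/jina-embeddings-v2-small-en", trust_remote_code=True
)

# encode() comes from the model's remote code and returns a numpy vector.
vector = embeddings_model.encode("How is the weather today?")
print(vector.shape)  # the small-en model produces 512-dimensional embeddings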
@@ -349,126 +345,23 @@ def jina_simple_retrieval(new_question_kworded, vectorstore, docs, k_val, out_pa
     # Calculate cosine similarity with each string in the list
     cosine_similarities = [cos_sim(query, string_vector) for string_vector in vectorstore]
 
-
     print(cosine_similarities)
 
+    return cosine_similarities
 
-    #vectorstore=globals()["vectorstore"]
-    #embeddings=globals()["embeddings"]
-    doc_df = pd.DataFrame()
-
-    docs = vectorstore.similarity_search_with_score(new_question_kworded, k=k_val)
-
-    print("Docs from similarity search:")
-    print(docs)
-
-    # Keep only documents with a certain score
-    docs_len = [len(x[0].page_content) for x in docs]
-    docs_scores = [x[1] for x in docs]
-
-    # Only keep sources that are sufficiently relevant (i.e. similarity search score below threshold below)
-    score_more_limit = pd.Series(docs_scores) < vec_score_cut_off
-    docs_keep = list(compress(docs, score_more_limit))
-
-    if not docs_keep:
-        return [], pd.DataFrame(), []
-
-    # Only keep sources that are at least 100 characters long
-    length_more_limit = pd.Series(docs_len) >= 100
-    docs_keep = list(compress(docs_keep, length_more_limit))
-
-    if not docs_keep:
-        return [], pd.DataFrame(), []
-
-    docs_keep_as_doc = [x[0] for x in docs_keep]
-    docs_keep_length = len(docs_keep_as_doc)
-
-    if docs_keep_length == 1:
-
-        content=[]
-        meta_url=[]
-        score=[]
-
-        for item in docs_keep:
-            content.append(item[0].page_content)
-            meta_url.append(item[0].metadata['source'])
-            score.append(item[1])
-
-        # Create df from 'winning' passages
-
-        doc_df = pd.DataFrame(list(zip(content, meta_url, score)),
-                              columns =['page_content', 'meta_url', 'score'])
-
-        docs_content = doc_df['page_content'].astype(str)
-        docs_url = doc_df['meta_url']
-
-        return docs_keep_as_doc, docs_content, docs_url
-
-    # Check for if more docs are removed than the desired output
-    if out_passages > docs_keep_length:
-        out_passages = docs_keep_length
-        k_val = docs_keep_length
-
-    vec_rank = [*range(1, docs_keep_length+1)]
-    vec_score = [(docs_keep_length/x)*vec_weight for x in vec_rank]
-
-    ## Calculate final score based on three ranking methods
-    final_score = [a for a in zip(vec_score)]
-    final_rank = [sorted(final_score, reverse=True).index(x)+1 for x in final_score]
-    # Force final_rank to increment by 1 each time
-    final_rank = list(pd.Series(final_rank).rank(method='first'))
-
-    #print("final rank: " + str(final_rank))
-    #print("out_passages: " + str(out_passages))
-
-    best_rank_index_pos = []
-
-    for x in range(1,out_passages+1):
-        try:
-            best_rank_index_pos.append(final_rank.index(x))
-        except IndexError: # catch the error
-            pass
-
-    # Adjust best_rank_index_pos to
-
-    best_rank_pos_series = pd.Series(best_rank_index_pos)
-
-    docs_keep_out = [docs_keep[i] for i in best_rank_index_pos]
-
-    # Keep only 'best' options
-    docs_keep_as_doc = [x[0] for x in docs_keep_out]
-
-    # Make df of best options
-    doc_df = create_doc_df(docs_keep_out)
-
-    return docs_keep_as_doc, doc_df, docs_keep_out
-
-def chroma_retrieval(new_question_kworded, vectorstore, docs, k_val, out_passages,
-                     vec_score_cut_off, vec_weight): # ,vectorstore, embeddings
+def chroma_retrieval(new_question_kworded:str, vectorstore, docs, orig_df_col:str, k_val:int, out_passages:int,
+                     vec_score_cut_off:float, vec_weight:float, in_join_file = None, in_join_column = None, search_df_join_column = None): # ,vectorstore, embeddings
 
     query = embeddings.encode(new_question_kworded).tolist()
 
     docs = vectorstore.query(
         query_embeddings=query,
-        n_results= 9999 # No practical limit on number of responses returned
+        n_results= k_val # No practical limit on number of responses returned
         #where={"metadata_field": "is_equal_to_this"},
         #where_document={"$contains":"search_string"}
     )
 
-    # Calculate cosine similarity with each string in the list
-    #cosine_similarities = [cos_sim(query, string_vector) for string_vector in vectorstore]
-
-    #print(docs)
-
-    #vectorstore=globals()["vectorstore"]
-    #embeddings=globals()["embeddings"]
-    df = pd.DataFrame(data={'ids': docs['ids'][0],
+    df_docs = pd.DataFrame(data={'ids': docs['ids'][0],
                             'documents': docs['documents'][0],
                             'metadatas':docs['metadatas'][0],
                             'distances':docs['distances'][0]#,
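Chroma's query() returns each field as a list of lists, one inner list per query embedding, which is why the new df_docs construction indexes every field with [0]. A minimal sketch with a fabricated single-query result:

import pandas as pd

docs = {  # shape of a vectorstore.query() result for one query embedding
    'ids': [['2', '0']],
    'documents': [['second passage', 'first passage']],
    'metadatas': [[{'source': 'a.csv'}, {'source': 'b.csv'}]],
    'distances': [[0.12, 0.34]],
}

df_docs = pd.DataFrame(data={'ids': docs['ids'][0],  # strip the outer per-query list
                             'documents': docs['documents'][0],
                             'metadatas': docs['metadatas'][0],
                             'distances': docs['distances'][0]})
print(df_docs)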
@@ -479,23 +372,18 @@ def chroma_retrieval(new_question_kworded, vectorstore, docs, k_val, out_passage
     dict_out = {'ids' : [df['ids']],
                 'documents': [df['documents']],
                 'metadatas': [df['metadatas']],
-                'distances': [df['distances']],
+                'distances': [round(df['distances'].astype(float), 2)],
                 'embeddings': None
                 }
     return dict_out
 
     # Prepare the DataFrame by transposing
-    df_docs = df#.apply(lambda x: x.explode()).reset_index(drop=True)
-
-    #print(df_docs)
-
+    #df_docs = df#.apply(lambda x: x.explode()).reset_index(drop=True)
 
     # Keep only documents with a certain score
 
     docs_scores = df_docs["distances"] #.astype(float)
 
-    #print(docs_scores)
-
     # Only keep sources that are sufficiently relevant (i.e. similarity search score below threshold below)
     score_more_limit = df_docs.loc[docs_scores < vec_score_cut_off, :]
     docs_keep = create_docs_keep_from_df(score_more_limit) #list(compress(docs, score_more_limit))
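The cutoff applied here keeps rows whose distance is below vec_score_cut_off; smaller distances mean closer matches. Continuing the sketch above with an illustrative threshold:

vec_score_cut_off = 0.3  # illustrative; the app stores its own value in gr.State

docs_scores = df_docs['distances'].astype(float)
score_more_limit = df_docs.loc[docs_scores < vec_score_cut_off, :]
print(score_more_limit)  # only the 0.12-distance row survives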
@@ -510,21 +398,48 @@ def chroma_retrieval(new_question_kworded, vectorstore, docs, k_val, out_passage
     length_more_limit = score_more_limit.loc[docs_len, :] #pd.Series(docs_len) >= 100
     docs_keep = create_docs_keep_from_df(length_more_limit) #list(compress(docs_keep, length_more_limit))
 
-    #print(docs_keep)
-
-    print(length_more_limit)
+    #print(length_more_limit)
 
     if not docs_keep:
         return 'No result found!', ""
 
+    length_more_limit['ids'] = length_more_limit['ids'].astype(int)
+
+    #length_more_limit.to_csv("length_more_limit.csv", index = None)
+
+    # Explode the 'metadatas' dictionary into separate columns
+    df_metadata_expanded = df_docs['metadatas'].apply(pd.Series)
+
+    # Concatenate the original DataFrame with the expanded metadata DataFrame
+    results_df_out = pd.concat([df_docs.drop('metadatas', axis=1), df_metadata_expanded], axis=1)
+
+    results_df_out = results_df_out.rename(columns={"documents":orig_df_col})
+
+    results_df_out = results_df_out.drop(["page_section", "row", "source", "id"], axis=1, errors="ignore")
+    results_df_out['distances'] = round(results_df_out['distances'].astype(float), 2)
+
+    # Join back to original df
+    # results_df_out = orig_df.merge(length_more_limit[['ids', 'distances']], left_index = True, right_on = "ids", how="inner").sort_values("distances")
+
+    # Join on additional files
+    if in_join_file:
+        join_filename = in_join_file.name
+
+        # Import data
+        join_df = read_file(join_filename)
+        join_df[in_join_column] = join_df[in_join_column].astype(str).str.replace("\.0$","", regex=True)
+        results_df_out[search_df_join_column] = results_df_out[search_df_join_column].astype(str).str.replace("\.0$","", regex=True)
+
+        results_df_out = results_df_out.merge(join_df, left_on=search_df_join_column, right_on=in_join_column, how="left").drop(in_join_column, axis=1)
+
     results_df_name = "semantic_search_result.csv"
-    length_more_limit.to_csv(results_df_name, index= None)
-    results_first_text = length_more_limit["documents"][0]
+    results_df_out.to_csv(results_df_name, index= None)
+    results_first_text = results_df_out[orig_df_col][0]
 
-
     return results_first_text, results_df_name
 
-# ## Gradio app - BM25 search
+## Gradio app - BM25 search
 block = gr.Blocks(theme = gr.themes.Base())
 
 with block:
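This hunk is the substance of the commit message: metadata dictionaries are expanded into real columns so the saved CSV is a proper dataframe, and an optional second file can be merged on to add extra details. A condensed sketch of both steps (the column names 'ref' and 'extra' are illustrative):

import pandas as pd

df_docs = pd.DataFrame({'documents': ['first passage', 'second passage'],
                        'metadatas': [{'ref': '101', 'row': 0}, {'ref': '102', 'row': 1}],
                        'distances': [0.12, 0.34]})

# Expand each metadata dict into its own columns, then drop the raw dict column.
df_metadata_expanded = df_docs['metadatas'].apply(pd.Series)
results_df_out = pd.concat([df_docs.drop('metadatas', axis=1), df_metadata_expanded], axis=1)
results_df_out = results_df_out.drop(['row'], axis=1, errors='ignore')

# Left-join extra details from a second table on a shared key column.
join_df = pd.DataFrame({'ref': ['101', '102'], 'extra': ['dept A', 'dept B']})
results_df_out = results_df_out.merge(join_df, on='ref', how='left')
print(results_df_out)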
@@ -535,8 +450,8 @@ with block:
     vectorstore_state = gr.State() # globals()["vectorstore"]
     embeddings_state = gr.State() # globals()["embeddings"]
 
-    k_val = gr.State(100)
-    out_passages = gr.State(100)
+    k_val = gr.State(9999)
+    out_passages = gr.State(9999)
     vec_score_cut_off = gr.State(100)
     vec_weight = gr.State(1)
@@ -564,9 +479,8 @@ depends on factors such as the type of documents or queries. Information taken f
     # Fast text search
     Enter a text query below to search through a text data column and find relevant terms. It will only find terms containing the exact text you enter. Your data should contain at least 20 entries for the search to consistently return results.
     """)
-
 
-    with gr.Tab(label="Search your data"):
+    with gr.Tab(label="Keyword search"):
         with gr.Row():
             current_source = gr.Textbox(label="Current data source(s)", value="None")
@@ -577,11 +491,9 @@ depends on factors such as the type of documents or queries. Information taken f
 
             load_bm25_data_button = gr.Button(value="Load data")
 
-
         with gr.Row():
             load_finished_message = gr.Textbox(label="Load progress", scale = 2)
 
-
         with gr.Accordion(label = "Search data", open=True):
             with gr.Row():
                 in_query = gr.Textbox(label="Enter your search term")
@@ -593,9 +505,11 @@ depends on factors such as the type of documents or queries. Information taken f
             output_single_text = gr.Textbox(label="Top result")
             output_file = gr.File(label="File output")
 
-
     with gr.Tab("Fuzzy/semantic search"):
-        with gr.Accordion("CSV/Excel file", open = True):
+        with gr.Row():
+            current_source_semantic = gr.Textbox(label="Current data source(s)", value="None")
+
+        with gr.Accordion("Load in data", open = True):
             in_semantic_file = gr.File(label="Upload data file for semantic search")
             in_semantic_column = gr.Dropdown(label="Enter the name of the text column in the data file to search")
             load_semantic_data_button = gr.Button(value="Load in CSV/Excel file", variant="secondary", scale=0)
@@ -608,7 +522,6 @@ depends on factors such as the type of documents or queries. Information taken f
             semantic_output_single_text = gr.Textbox(label="Top result")
             semantic_output_file = gr.File(label="File output")
 
-
     with gr.Tab(label="Advanced options"):
         with gr.Accordion(label="Data load / save options", open = False):
             #with gr.Row():
@@ -658,12 +571,12 @@ depends on factors such as the type of documents or queries. Information taken f
 
     # Load in a csv/excel file for semantic search
     in_semantic_file.upload(put_columns_in_df, inputs=[in_semantic_file, in_semantic_column], outputs=[in_semantic_column, in_clean_data, search_df_join_column])
-    load_semantic_data_button.click(ing.parse_csv_or_excel, inputs=[in_semantic_file, in_semantic_column], outputs=[ingest_text, current_source]).\
+    load_semantic_data_button.click(ing.parse_csv_or_excel, inputs=[in_semantic_file, in_semantic_column], outputs=[ingest_text, current_source_semantic]).\
         then(ing.csv_excel_text_to_docs, inputs=[ingest_text, in_semantic_column], outputs=[ingest_docs]).\
        then(docs_to_chroma_save, inputs=[ingest_docs], outputs=[ingest_embed_out, vectorstore_state])
 
     # Semantic search query
-    semantic_submit.click(chroma_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, k_val, out_passages, vec_score_cut_off, vec_weight], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic")
+    semantic_submit.click(chroma_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, vec_score_cut_off, vec_weight, in_join_file, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic")
 
     # Dummy functions just to get dropdowns to work correctly with Gradio 3.50
     in_bm25_column.change(dummy_function, in_bm25_column, None)
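For reference, the rewired events above rely on Gradio's chained listeners: each .then() step starts only after the previous one returns, threading intermediate values through components and gr.State. A stripped-down sketch of the same wiring with placeholder functions:

import gradio as gr

def parse(file, col):    # stand-in for ing.parse_csv_or_excel
    return f"parsed column {col}", "source.csv"

def to_docs(text, col):  # stand-in for ing.csv_excel_text_to_docs
    return [text]

def to_chroma(docs):     # stand-in for docs_to_chroma_save
    return "embedded", {"vectorstore": object()}

with gr.Blocks() as demo:
    in_file = gr.File()
    in_col = gr.Textbox(label="Text column")
    ingest_text = gr.Textbox()
    current_source_semantic = gr.Textbox()
    ingest_docs = gr.State()
    ingest_embed_out = gr.Textbox()
    vectorstore_state = gr.State()
    load_button = gr.Button("Load data")

    # Each .then() fires only after the previous step completes.
    load_button.click(parse, inputs=[in_file, in_col], outputs=[ingest_text, current_source_semantic]).\
        then(to_docs, inputs=[ingest_text, in_col], outputs=[ingest_docs]).\
        then(to_chroma, inputs=[ingest_docs], outputs=[ingest_embed_out, vectorstore_state])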
 
search_funcs/ingest.py CHANGED
@@ -249,9 +249,7 @@ def csv_excel_text_to_docs(df, text_column='text', chunk_size=None) -> List[Docu
             if col != text_column:
                 metadata[col] = value
 
-    metadata_string = write_out_metadata_as_string(metadata)[0]
-
-
+        metadata_string = write_out_metadata_as_string(metadata)[0]
 
     # If chunk_size is provided, split the text into chunks
     if chunk_size:
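The restored line turns a row's non-text columns into a single string via a repo helper whose body is not shown in this diff. A hedged guess at what such a helper does, returned in a list because the caller indexes element [0] (this is a hypothetical reconstruction, not the repo's implementation):

def write_out_metadata_as_string(metadata: dict) -> list:
    # Hypothetical: render {"column": "value"} pairs as one readable string.
    return [". ".join(f"{key}: {value}" for key, value in metadata.items())]

print(write_out_metadata_as_string({"source": "data.csv", "row": 3})[0])
# -> "source: data.csv. row: 3"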
 