Sean-Case committed on
Commit 2a8aba8 • Parent(s): acfac99

Now outputs correct dataframe for semantic search. Can join on extra details

Files changed:
- app.py (+68 -155)
- search_funcs/ingest.py (+1 -3)
app.py CHANGED
@@ -26,8 +26,14 @@ import search_funcs.chatfuncs as chatf
 
 # Import Chroma and instantiate a client. The default Chroma client is ephemeral, meaning it will not save to disk.
 import chromadb
+#from typing_extensions import Protocol
+#from chromadb import Documents, EmbeddingFunction, Embeddings
 
-#
+# Remove the Chroma database file if it exists, as it can cause issues
+chromadb_file = "chroma.sqlite3"
+
+if os.path.isfile(chromadb_file):
+    os.remove(chromadb_file)
 
 def prepare_input_data(in_file, text_column, clean="No", progress=gr.Progress()):
 
@@ -257,25 +263,13 @@ def dummy_function(gradio_component):
 def display_info(info_component):
     gr.Info(info_component)
 
-
-
-#
-
-
-from chromadb import Documents, EmbeddingFunction, Embeddings
-
-embeddings_model = AutoModel.from_pretrained(embeddings_name, trust_remote_code=True)
-
-class MyEmbeddingFunction(EmbeddingFunction):
-    def __call__(self, input) -> Embeddings:
-
-
-        embeddings = []
-        for text in input:
-            embeddings.append(embeddings_model.encode(text))
-
-        return embeddings
+# class MyEmbeddingFunction(EmbeddingFunction):
+#     def __call__(self, input) -> Embeddings:
+#         embeddings = []
+#         for text in input:
+#             embeddings.append(embeddings_model.encode(text))
 
+#         return embeddings
 
 def load_embeddings(embeddings_name = "jinaai/jina-embeddings-v2-small-en"):
     '''
@@ -293,15 +287,17 @@ def load_embeddings(embeddings_name = "jinaai/jina-embeddings-v2-small-en"):
 
     return embeddings
 
+# Load embeddings
+embeddings_name = "jinaai/jina-embeddings-v2-small-en"
+#embeddings_name = "BAAI/bge-base-en-v1.5"
+embeddings_model = AutoModel.from_pretrained(embeddings_name, trust_remote_code=True)
 embeddings = load_embeddings(embeddings_name)
 
-def docs_to_chroma_save(docs_out, embeddings=embeddings, progress=gr.Progress()):
+def docs_to_chroma_save(docs_out, embeddings = embeddings, progress=gr.Progress()):
     '''
     Takes a Langchain document class and saves it into a Chroma sqlite file.
     '''
 
-
-
     print(f"> Total split documents: {len(docs_out)}")
 
     #print(docs_out)
@@ -349,126 +345,23 @@ def jina_simple_retrieval(new_question_kworded, vectorstore, docs, k_val, out_pa
     # Calculate cosine similarity with each string in the list
     cosine_similarities = [cos_sim(query, string_vector) for string_vector in vectorstore]
 
-
-
     print(cosine_similarities)
 
+    return cosine_similarities
 
-
-
-    #embeddings=globals()["embeddings"]
-    doc_df = pd.DataFrame()
-
-
-    docs = vectorstore.similarity_search_with_score(new_question_kworded, k=k_val)
-
-    print("Docs from similarity search:")
-    print(docs)
-
-    # Keep only documents with a certain score
-    docs_len = [len(x[0].page_content) for x in docs]
-    docs_scores = [x[1] for x in docs]
-
-    # Only keep sources that are sufficiently relevant (i.e. similarity search score below threshold below)
-    score_more_limit = pd.Series(docs_scores) < vec_score_cut_off
-    docs_keep = list(compress(docs, score_more_limit))
-
-    if not docs_keep:
-        return [], pd.DataFrame(), []
-
-    # Only keep sources that are at least 100 characters long
-    length_more_limit = pd.Series(docs_len) >= 100
-    docs_keep = list(compress(docs_keep, length_more_limit))
-
-    if not docs_keep:
-        return [], pd.DataFrame(), []
-
-    docs_keep_as_doc = [x[0] for x in docs_keep]
-    docs_keep_length = len(docs_keep_as_doc)
-
-
-
-    if docs_keep_length == 1:
-
-        content=[]
-        meta_url=[]
-        score=[]
-
-        for item in docs_keep:
-            content.append(item[0].page_content)
-            meta_url.append(item[0].metadata['source'])
-            score.append(item[1])
-
-        # Create df from 'winning' passages
-
-        doc_df = pd.DataFrame(list(zip(content, meta_url, score)),
-                              columns =['page_content', 'meta_url', 'score'])
-
-        docs_content = doc_df['page_content'].astype(str)
-        docs_url = doc_df['meta_url']
-
-        return docs_keep_as_doc, docs_content, docs_url
-
-    # Check for if more docs are removed than the desired output
-    if out_passages > docs_keep_length:
-        out_passages = docs_keep_length
-        k_val = docs_keep_length
-
-    vec_rank = [*range(1, docs_keep_length+1)]
-    vec_score = [(docs_keep_length/x)*vec_weight for x in vec_rank]
-
-    ## Calculate final score based on three ranking methods
-    final_score = [a for a in zip(vec_score)]
-    final_rank = [sorted(final_score, reverse=True).index(x)+1 for x in final_score]
-    # Force final_rank to increment by 1 each time
-    final_rank = list(pd.Series(final_rank).rank(method='first'))
-
-    #print("final rank: " + str(final_rank))
-    #print("out_passages: " + str(out_passages))
-
-    best_rank_index_pos = []
-
-    for x in range(1,out_passages+1):
-        try:
-            best_rank_index_pos.append(final_rank.index(x))
-        except IndexError: # catch the error
-            pass
-
-    # Adjust best_rank_index_pos to
-
-    best_rank_pos_series = pd.Series(best_rank_index_pos)
-
-
-    docs_keep_out = [docs_keep[i] for i in best_rank_index_pos]
-
-    # Keep only 'best' options
-    docs_keep_as_doc = [x[0] for x in docs_keep_out]
-
-    # Make df of best options
-    doc_df = create_doc_df(docs_keep_out)
-
-    return docs_keep_as_doc, doc_df, docs_keep_out
-
-def chroma_retrieval(new_question_kworded, vectorstore, docs, k_val, out_passages,
-                     vec_score_cut_off, vec_weight): # ,vectorstore, embeddings
+def chroma_retrieval(new_question_kworded:str, vectorstore, docs, orig_df_col:str, k_val:int, out_passages:int,
+                     vec_score_cut_off:float, vec_weight:float, in_join_file = None, in_join_column = None, search_df_join_column = None): # ,vectorstore, embeddings
 
     query = embeddings.encode(new_question_kworded).tolist()
 
     docs = vectorstore.query(
         query_embeddings=query,
-        n_results=
+        n_results= k_val # No practical limit on number of responses returned
         #where={"metadata_field": "is_equal_to_this"},
         #where_document={"$contains":"search_string"}
     )
 
-
-    #cosine_similarities = [cos_sim(query, string_vector) for string_vector in vectorstore]
-
-    #print(docs)
-
-    #vectorstore=globals()["vectorstore"]
-    #embeddings=globals()["embeddings"]
-    df = pd.DataFrame(data={'ids': docs['ids'][0],
+    df_docs = pd.DataFrame(data={'ids': docs['ids'][0],
                             'documents': docs['documents'][0],
                             'metadatas':docs['metadatas'][0],
                             'distances':docs['distances'][0]#,
@@ -479,23 +372,18 @@ def chroma_retrieval(new_question_kworded, vectorstore, docs, k_val, out_passage
     dict_out = {'ids' : [df['ids']],
                 'documents': [df['documents']],
                 'metadatas': [df['metadatas']],
-                'distances': [df['distances']],
+                'distances': [round(df['distances'].astype(float), 2)],
                 'embeddings': None
                 }
     return dict_out
 
     # Prepare the DataFrame by transposing
-    df_docs = df#.apply(lambda x: x.explode()).reset_index(drop=True)
-
-    #print(df_docs)
-
+    #df_docs = df#.apply(lambda x: x.explode()).reset_index(drop=True)
 
     # Keep only documents with a certain score
 
     docs_scores = df_docs["distances"] #.astype(float)
 
-    #print(docs_scores)
-
     # Only keep sources that are sufficiently relevant (i.e. similarity search score below threshold below)
     score_more_limit = df_docs.loc[docs_scores < vec_score_cut_off, :]
    docs_keep = create_docs_keep_from_df(score_more_limit) #list(compress(docs, score_more_limit))
@@ -510,21 +398,48 @@ def chroma_retrieval(new_question_kworded, vectorstore, docs, k_val, out_passage
     length_more_limit = score_more_limit.loc[docs_len, :] #pd.Series(docs_len) >= 100
     docs_keep = create_docs_keep_from_df(length_more_limit) #list(compress(docs_keep, length_more_limit))
 
-    #print(
-
-    print(length_more_limit)
+    #print(length_more_limit)
 
     if not docs_keep:
         return 'No result found!', ""
 
+    length_more_limit['ids'] = length_more_limit['ids'].astype(int)
+
+    #length_more_limit.to_csv("length_more_limit.csv", index = None)
+
+    # Explode the 'metadatas' dictionary into separate columns
+    df_metadata_expanded = df_docs['metadatas'].apply(pd.Series)
+
+    # Concatenate the original DataFrame with the expanded metadata DataFrame
+    results_df_out = pd.concat([df_docs.drop('metadatas', axis=1), df_metadata_expanded], axis=1)
+
+    results_df_out = results_df_out.rename(columns={"documents":orig_df_col})
+
+    results_df_out = results_df_out.drop(["page_section", "row", "source", "id"], axis=1, errors="ignore")
+    results_df_out['distances'] = round(results_df_out['distances'].astype(float), 2)
+
+    # Join back to original df
+    # results_df_out = orig_df.merge(length_more_limit[['ids', 'distances']], left_index = True, right_on = "ids", how="inner").sort_values("distances")
+
+    # Join on additional files
+    if in_join_file:
+        join_filename = in_join_file.name
+
+        # Import data
+        join_df = read_file(join_filename)
+        join_df[in_join_column] = join_df[in_join_column].astype(str).str.replace("\.0$","", regex=True)
+        results_df_out[search_df_join_column] = results_df_out[search_df_join_column].astype(str).str.replace("\.0$","", regex=True)
+
+        results_df_out = results_df_out.merge(join_df, left_on=search_df_join_column, right_on=in_join_column, how="left").drop(in_join_column, axis=1)
+
+
     results_df_name = "semantic_search_result.csv"
-
-    results_first_text =
+    results_df_out.to_csv(results_df_name, index = None)
+    results_first_text = results_df_out[orig_df_col][0]
 
-
     return results_first_text, results_df_name
 
-
+## Gradio app - BM25 search
 block = gr.Blocks(theme = gr.themes.Base())
 
 with block:
@@ -535,8 +450,8 @@ with block:
     vectorstore_state = gr.State() # globals()["vectorstore"]
     embeddings_state = gr.State() # globals()["embeddings"]
 
-    k_val = gr.State(
-    out_passages = gr.State(
+    k_val = gr.State(9999)
+    out_passages = gr.State(9999)
     vec_score_cut_off = gr.State(100)
     vec_weight = gr.State(1)
 
@@ -564,9 +479,8 @@ depends on factors such as the type of documents or queries. Information taken f
     # Fast text search
    Enter a text query below to search through a text data column and find relevant terms. It will only find terms containing the exact text you enter. Your data should contain at least 20 entries for the search to consistently return results.
    """)
-
 
-    with gr.Tab(label="
+    with gr.Tab(label="Keyword search"):
        with gr.Row():
            current_source = gr.Textbox(label="Current data source(s)", value="None")
 
@@ -577,11 +491,9 @@ depends on factors such as the type of documents or queries. Information taken f
 
            load_bm25_data_button = gr.Button(value="Load data")
 
-
        with gr.Row():
            load_finished_message = gr.Textbox(label="Load progress", scale = 2)
 
-
        with gr.Accordion(label = "Search data", open=True):
            with gr.Row():
                in_query = gr.Textbox(label="Enter your search term")
@@ -593,9 +505,11 @@ depends on factors such as the type of documents or queries. Information taken f
            output_single_text = gr.Textbox(label="Top result")
            output_file = gr.File(label="File output")
 
-
    with gr.Tab("Fuzzy/semantic search"):
-        with gr.
+        with gr.Row():
+            current_source_semantic = gr.Textbox(label="Current data source(s)", value="None")
+
+        with gr.Accordion("Load in data", open = True):
            in_semantic_file = gr.File(label="Upload data file for semantic search")
            in_semantic_column = gr.Dropdown(label="Enter the name of the text column in the data file to search")
            load_semantic_data_button = gr.Button(value="Load in CSV/Excel file", variant="secondary", scale=0)
@@ -608,7 +522,6 @@ depends on factors such as the type of documents or queries. Information taken f
            semantic_output_single_text = gr.Textbox(label="Top result")
            semantic_output_file = gr.File(label="File output")
 
-
    with gr.Tab(label="Advanced options"):
        with gr.Accordion(label="Data load / save options", open = False):
            #with gr.Row():
@@ -658,12 +571,12 @@ depends on factors such as the type of documents or queries. Information taken f
 
    # Load in a csv/excel file for semantic search
    in_semantic_file.upload(put_columns_in_df, inputs=[in_semantic_file, in_semantic_column], outputs=[in_semantic_column, in_clean_data, search_df_join_column])
-    load_semantic_data_button.click(ing.parse_csv_or_excel, inputs=[in_semantic_file, in_semantic_column], outputs=[ingest_text,
+    load_semantic_data_button.click(ing.parse_csv_or_excel, inputs=[in_semantic_file, in_semantic_column], outputs=[ingest_text, current_source_semantic]).\
        then(ing.csv_excel_text_to_docs, inputs=[ingest_text, in_semantic_column], outputs=[ingest_docs]).\
        then(docs_to_chroma_save, inputs=[ingest_docs], outputs=[ingest_embed_out, vectorstore_state])
 
    # Semantic search query
-    semantic_submit.click(chroma_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, k_val,out_passages, vec_score_cut_off, vec_weight], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic")
+    semantic_submit.click(chroma_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, vec_score_cut_off, vec_weight, in_join_file, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic")
 
    # Dummy functions just to get dropdowns to work correctly with Gradio 3.50
    in_bm25_column.change(dummy_function, in_bm25_column, None)
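The substance of the commit is the new tail of chroma_retrieval above: the raw Chroma query response is flattened into a DataFrame, each row's metadatas dict is exploded into its own columns, and an optional join file is merged in on a user-chosen key. Below is a minimal, runnable sketch of that pandas pattern, with a toy query response standing in for vectorstore.query(...); the 'key' and 'extra_detail' names are invented for illustration and are not taken from the repo.

import pandas as pd

# Toy stand-in for docs = vectorstore.query(...): Chroma returns lists of lists,
# one inner list per query, hence the [0] unwrapping on every field.
docs = {'ids': [['0', '1']],
        'documents': [['first passage', 'second passage']],
        'metadatas': [[{'source': 'a.csv', 'row': 0}, {'source': 'a.csv', 'row': 1}]],
        'distances': [[0.1234, 0.3456]]}

df_docs = pd.DataFrame(data={'ids': docs['ids'][0],
                             'documents': docs['documents'][0],
                             'metadatas': docs['metadatas'][0],
                             'distances': docs['distances'][0]})

# Explode the per-row metadata dicts into columns and re-attach them
df_metadata_expanded = df_docs['metadatas'].apply(pd.Series)
results_df_out = pd.concat([df_docs.drop('metadatas', axis=1), df_metadata_expanded], axis=1)
results_df_out['distances'] = round(results_df_out['distances'].astype(float), 2)

# Join extra details from a second table on a shared key ('key' and
# 'extra_detail' are hypothetical names standing in for in_join_column etc.)
join_df = pd.DataFrame({'key': ['0', '1'], 'extra_detail': ['x', 'y']})
results_df_out = results_df_out.merge(join_df, left_on='ids', right_on='key',
                                      how="left").drop('key', axis=1)

print(results_df_out)

The str.replace("\.0$", "", regex=True) calls in the diff support this join step: after both sides are cast to string, they strip a trailing ".0" so float-formatted IDs such as "123.0" still match "123".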
search_funcs/ingest.py CHANGED
@@ -249,9 +249,7 @@ def csv_excel_text_to_docs(df, text_column='text', chunk_size=None) -> List[Docu
            if col != text_column:
                metadata[col] = value
 
-        metadata_string = write_out_metadata_as_string(metadata)[0]
-
-
+        metadata_string = write_out_metadata_as_string(metadata)[0]
 
        # If chunk_size is provided, split the text into chunks
        if chunk_size:
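On the ingest side, note the [0]: write_out_metadata_as_string evidently returns a list of strings, one per metadata record, and the single-row call reads the first entry. A hypothetical sketch of a helper with that shape, for orientation only (the real implementation in search_funcs/ingest.py may differ):

def write_out_metadata_as_string(metadata):
    # Hypothetical: accept one dict or a list of dicts, always return a list
    # of strings, so a single-record call reads its result with [0].
    if isinstance(metadata, dict):
        metadata = [metadata]
    return [". ".join(f"{k}: {v}" for k, v in m.items()) for m in metadata]

metadata = {"source": "data.csv", "row": 3, "page_section": "intro"}
metadata_string = write_out_metadata_as_string(metadata)[0]
print(metadata_string)  # source: data.csv. row: 3. page_section: intro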