seanpedrickcase committed
Commit ceb8617 • Parent(s): 2cb9977

Added semantic search using Jina

Browse files:
- .gitignore: +2 -0
- app.py: +359 -139
- requirements.txt: +1 -0
- search_funcs/clean_funcs.py: +69 -30
- search_funcs/ingest.py: +71 -18
.gitignore
CHANGED
@@ -11,6 +11,8 @@
 *.pkl
 *.spec
 *.ipynb
+*.npy
+*.npz
 build/*
 dist/*
 __pycache__/*
app.py
CHANGED
@@ -10,16 +10,32 @@ from search_funcs.clean_funcs import initial_clean, get_lemma_tokens#, stem_sent
 from nltk import word_tokenize
 #from sentence_transformers import SentenceTransformer
 
+# Try SpaCy alternative tokeniser
+
 PandasDataFrame = TypeVar('pd.core.frame.DataFrame')
 
 import gradio as gr
 import pandas as pd
+import numpy as np
 import os
 import time
 from chromadb.config import Settings
 
 from transformers import AutoModel
 
+# Load the SpaCy model
+from spacy.cli import download
+import spacy
+spacy.prefer_gpu()
+
+#os.system("python -m spacy download en_core_web_sm")
+try:
+    nlp = spacy.load("en_core_web_sm")
+except:
+    download("en_core_web_sm")
+    nlp = spacy.load("en_core_web_sm")
+
+
 # model = AutoModel.from_pretrained('./model_and_tokenizer/int8-model.onnx', use_embedding_runtime=True)
 # sentence_embeddings = model.generate(engine_input)['last_hidden_state:0']
 
@@ -33,7 +49,7 @@ import chromadb
 #from typing_extensions import Protocol
 #from chromadb import Documents, EmbeddingFunction, Embeddings
 
-from torch import cuda, backends
+from torch import cuda, backends, tensor, mm
 
 # Check for torch cuda
 print(cuda.is_available())
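The added imports of `tensor` and `mm` support the GPU cosine-similarity search introduced further down, where a `torch_device` default is also referenced. A minimal device-selection sketch of the kind this implies (the `mps` branch is an assumption for Apple-silicon machines, not part of this commit):

    from torch import cuda, backends

    # Prefer CUDA, then Apple Metal, then CPU
    if cuda.is_available():
        torch_device = "cuda"
    elif backends.mps.is_available():
        torch_device = "mps"
    else:
        torch_device = "cpu"
    print("Using device:", torch_device)
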
@@ -51,17 +67,90 @@ chromadb_file = "chroma.sqlite3"
 if os.path.isfile(chromadb_file):
     os.remove(chromadb_file)
 
+
+def load_embeddings(embeddings_name = "jinaai/jina-embeddings-v2-small-en"):
+    '''
+    Load embeddings model and create a global variable based on it.
+    '''
+
+    # Import Chroma and instantiate a client. The default Chroma client is ephemeral, meaning it will not save to disk.
+
+    #else:
+    embeddings_func = AutoModel.from_pretrained(embeddings_name, trust_remote_code=True, device_map="auto")
+
+    global embeddings
+
+    embeddings = embeddings_func
+
+    return embeddings
+
+# Load embeddings
+embeddings_name = "jinaai/jina-embeddings-v2-small-en"
+embeddings_model = AutoModel.from_pretrained(embeddings_name, trust_remote_code=True, device_map="auto")
+#embeddings_model = SentenceTransformer("BAAI/bge-small-en-v1.5")
+#embeddings_model = SentenceTransformer("paraphrase-MiniLM-L3-v2")
+
+#tokenizer = AutoTokenizer.from_pretrained(embeddings_name, device_map = "auto")#to(torch_device) # From Jina
+# Construction 2 - from SpaCy - https://spacy.io/api/tokenizer
+
+#from spacy.lang.en import English
+#nlp = #English()
+# Create a Tokenizer with the default settings for English
+# including punctuation rules and exceptions
+tokenizer = nlp.tokenizer
+
+embeddings = embeddings_model#load_embeddings(embeddings_name)
+
+
 def prepare_input_data(in_file, text_column, clean="No", progress=gr.Progress()):
 
+    file_list = [string.name for string in in_file]
+
+    print(file_list)
+
+    data_file_names = [string for string in file_list if "tokenised" not in string]
+
+    df = read_file(data_file_names[0])
+
+    ## Load in pre-tokenised corpus if exists
+    tokenised_df = pd.DataFrame()
 
+    tokenised_file_names = [string for string in file_list if "tokenised" in string]
+
+    if tokenised_file_names:
+        tokenised_df = read_file(tokenised_file_names[0])
+        print("Tokenised df is: ", tokenised_df.head())
 
     #df = pd.read_parquet(file_in.name)
-    df_list = list(df[text_column].astype(str))
+    df_list = list(df[text_column].astype(str).str.lower())
     #df_list = df
 
+    import math
+
+    def get_total_batches(my_list, batch_size):
+        return math.ceil(len(my_list) / batch_size)
+
+    from itertools import islice
+
+    def batch(iterable, batch_size):
+        iterator = iter(iterable)
+        for first in iterator:
+            yield [first] + list(islice(iterator, batch_size - 1))
+
+    #def batch(my_list, batch_size):
+    #    # Splitting the list into batches
+    #    for i in range(0, len(my_list), batch_size):
+    #        batch = my_list[i:i + batch_size]
+
+    #        # Process each batch
+    #        # Replace this with your processing logic
+    #        #print("Processing batch:", batch)
+
+    batch_size = 256
+
+    tic = time.perf_counter()
+
     if clean == "Yes":
         df_list_clean = initial_clean(df_list)
@@ -69,23 +158,84 @@ def prepare_input_data(in_file, text_column, clean="No", progress=gr.Progress())
 
         out_file_name = save_prepared_data(in_file, df_list_clean, df, text_column)
 
         #corpus = [word_tokenize(doc.lower()) for doc in df_list_clean]
-        corpus = [word_tokenize(doc.lower()) for doc in progress.tqdm(df_list_clean, desc = "Tokenising text", unit = "rows")]
+        #corpus = [word_tokenize(doc.lower()) for doc in progress.tqdm(df_list_clean, desc = "Tokenising text", unit = "rows")]
+
+        #total_batches = get_total_batches(df_list_clean, batch_size)
+        #data_batched = batch(df_list_clean, batch_size)
 
+        #print(data_batched)
+
+        #print(df_list_clean[0])
+
+        # Using encode_batch
+        #encodings = tokenizer.encode_batch(texts)
+
+        # Extracting tokens
+        #tokens_list = [encoding.tokens for encoding in encodings]
+
+        #corpus = [tokenizer(doc.lower()) for doc in progress.tqdm(df_list_clean, desc = "Tokenising text", unit = "rows")]
+        #corpus = [tokenizer.encode(doc_batch) for doc_batch in progress.tqdm(data_batched, desc = "Tokenising text", unit = "batches out of " + str(total_batches))] # for jina
+        # print(df_list_clean)
+        # corpus = tokenizer.batch_encode_plus(df_list_clean).tokens
+
+        #corpus = [[token.text for token in nlp(text)] for text in df_list_clean]
+
+        # Tokenize texts in batches
+        if not tokenised_df.empty:
+            corpus = tokenised_df.iloc[:,0].tolist()
+            print("Corpus is: ", corpus[0:5])
+
+        else:
+            corpus = []
+            for doc in tokenizer.pipe(progress.tqdm(df_list_clean, desc = "Tokenising text", unit = "rows"), batch_size=batch_size):
+                corpus.append([token.text for token in doc])
+            #for doc in nlp.pipe(progress.tqdm(df_list_clean, desc = "Tokenising text", unit = "batches out of " + str(total_batches)), batch_size=batch_size): # You can adjust batch_size based on your requirement
+            #    corpus.append([token.text for token in doc])
 
 
     else:
+        #total_batches = get_total_batches(df_list, batch_size)
+        #data_batched = batch(df_list, batch_size)
+
+        #print(data_batched)
+
         #corpus = [word_tokenize(doc.lower()) for doc in df_list]
-        corpus = [word_tokenize(doc.lower()) for doc in progress.tqdm(df_list, desc = "Tokenising text", unit = "rows")]
+        #corpus = [word_tokenize(doc.lower()) for doc in progress.tqdm(df_list, desc = "Tokenising text", unit = "rows")]
+        #corpus = [tokenizer.encode(doc_batch) for doc_batch in progress.tqdm(data_batched, desc = "Tokenising text", unit = "batches out of " + str(total_batches))] # for jina
+        #corpus = tokenizer.batch_encode_plus(df_list).tokens # for jina
+
+        print(df_list[0])
+        #corpus = [[token.text for token in nlp(text)] for text in df_list]
+
+        # Tokenize texts in batches
+        if not tokenised_df.empty:
+            corpus = tokenised_df.iloc[:,0].tolist()
+            print("Corpus is: ", corpus[0:5])
+
+        else:
+
+            corpus = []
+            for doc in tokenizer.pipe(progress.tqdm(df_list, desc = "Tokenising text", unit = "rows"), batch_size=batch_size):
+                #for doc in nlp.pipe(progress.tqdm(df_list, desc = "Tokenising text", unit = "batches out of " + str(total_batches)), #batch_size=batch_size): # You can adjust batch_size based on your requirement
+                corpus.append([token.text for token in doc])
+
+        #corpus = tokenizer(df_list)
         out_file_name = None
 
+    print(corpus[0])
+
 
-    print("Finished data clean")
+    toc = time.perf_counter()
+    tokenizer_time_out = f"Tokenising the text took {toc - tic:0.1f} seconds"
 
+    print("Finished data clean. " + tokenizer_time_out)
 
     if len(df_list) >= 20:
         message = "Data loaded"
     else:
         message = "Data loaded. Warning: dataset may be too short to get consistent search results."
+
+    pd.DataFrame(data={"Corpus":corpus}).to_parquet("keyword_search_tokenised_data.parquet")
 
     return corpus, message, df, out_file_name
 
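The tokenisation above replaces NLTK's per-row word_tokenize with spaCy's rule-based tokeniser streaming over the column in batches. A self-contained sketch of the same pattern (assumes en_core_web_sm is installed):

    import spacy

    nlp = spacy.load("en_core_web_sm")
    tokenizer = nlp.tokenizer  # tokeniser only - no tagger or parser is run

    texts = ["The fox jumped over the lazy dog.", "Semantic search with Jina."]

    # tokenizer.pipe streams texts in batches, far faster than calling nlp(text) per row
    corpus = [[token.text for token in doc] for doc in tokenizer.pipe(texts, batch_size=256)]
    print(corpus[0])  # ['The', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog', '.']
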
@@ -236,11 +386,17 @@ def put_columns_in_df(in_file, in_bm25_column):
     When file is loaded, update the column dropdown choices and change 'clean data' dropdown option to 'no'.
     '''
 
+    file_list = [string.name for string in in_file]
+
+    print(file_list)
+
+    data_file_names = [string for string in file_list if "tokenised" not in string]
+
     new_choices = []
     concat_choices = []
 
 
-    df = read_file(
+    df = read_file(data_file_names[0])
     new_choices = list(df.columns)
 
     #print(new_choices)
@@ -279,38 +435,6 @@ def dummy_function(gradio_component):
 def display_info(info_component):
     gr.Info(info_component)
 
-# class MyEmbeddingFunction(EmbeddingFunction):
-#     def __call__(self, input) -> Embeddings:
-#         embeddings = []
-#         for text in input:
-#             embeddings.append(embeddings_model.encode(text))
-
-#         return embeddings
-
-def load_embeddings(embeddings_name = "jinaai/jina-embeddings-v2-small-en"):
-    '''
-    Load embeddings model and create a global variable based on it.
-    '''
-
-    # Import Chroma and instantiate a client. The default Chroma client is ephemeral, meaning it will not save to disk.
-
-    #else:
-    embeddings_func = AutoModel.from_pretrained(embeddings_name, trust_remote_code=True, device_map="auto")
-
-    global embeddings
-
-    embeddings = embeddings_func
-
-    return embeddings
-
-# Load embeddings
-#embeddings_name =
-embeddings_model = AutoModel.from_pretrained("jinaai/jina-embeddings-v2-small-en", trust_remote_code=True, device_map="auto")
-#embeddings_model = SentenceTransformer("BAAI/bge-small-en-v1.5")
-#embeddings_model = SentenceTransformer("paraphrase-MiniLM-L3-v2")
-
-embeddings = embeddings_model#load_embeddings(embeddings_name)
-
 def docs_to_chroma_save(docs_out, embeddings = embeddings, progress=gr.Progress()):
     '''
     Takes a Langchain document class and saves it into a Chroma sqlite file.
@@ -318,7 +442,7 @@ def docs_to_chroma_save(docs_out, embeddings = embeddings, progress=gr.Progress(
 
     print(f"> Total split documents: {len(docs_out)}")
 
-    print(docs_out)
+    #print(docs_out)
 
     page_contents = [doc.page_content for doc in docs_out]
     page_meta = [doc.metadata for doc in docs_out]
@@ -330,13 +454,15 @@ def docs_to_chroma_save(docs_out, embeddings = embeddings, progress=gr.Progress(
     #for page in progress.tqdm(page_contents, desc = "Preparing search index", unit = "rows"):
     #    embeddings_list.append(embeddings.encode(sentences=page, max_length=1024).tolist())
 
-    embeddings_list = embeddings.encode(sentences=page_contents, max_length=256).tolist() # For Jina embeddings
+    embeddings_list = embeddings.encode(sentences=page_contents, max_length=256, show_progress_bar = True, batch_size = 32).tolist() # For Jina embeddings
     #embeddings_list = embeddings.encode(sentences=page_contents, normalize_embeddings=True).tolist() # For BGE embeddings
     #embeddings_list = embeddings.encode(sentences=page_contents).tolist() # For minilm
 
     toc = time.perf_counter()
     time_out = f"The embedding took {toc - tic:0.1f} seconds"
 
+    #pd.Series(embeddings_list).to_csv("embeddings_out.csv")
+
     # Jina tiny
     # This takes about 300 seconds for 240,000 records = 800 / second, 1024 max length
     # For 50k records:
@@ -345,6 +471,8 @@ def docs_to_chroma_save(docs_out, embeddings = embeddings, progress=gr.Progress(
     # 43 seconds at 256 max length
     # 31 seconds at 128 max length
 
+    # The embedding took 1372.5 seconds at 256 max length for 655,020 case notes
+
     # BGE small
     # 96 seconds for 50k records at 512 length
 
@@ -360,7 +488,7 @@ def docs_to_chroma_save(docs_out, embeddings = embeddings, progress=gr.Progress(
     chroma_tic = time.perf_counter()
 
     # Create a new Chroma collection to store the documents and metadata. We don't need to specify an embedding function, and the default will be used.
-    client = chromadb.PersistentClient(path="./
+    client = chromadb.PersistentClient(path="./last_year", settings=Settings(
         anonymized_telemetry=False))
 
     try:
@@ -408,117 +536,203 @@ def docs_to_chroma_save(docs_out, embeddings = embeddings, progress=gr.Progress(
 
     return out_message, collection
 
+def docs_to_np_array(docs_out, in_file, embeddings = embeddings, progress=gr.Progress()):
+    '''
+    Takes a Langchain document class, embeds the page contents, and saves them as a compressed numpy array.
+    '''
+
+    print(f"> Total split documents: {len(docs_out)}")
+
+    #print(docs_out)
+
+    page_contents = [doc.page_content for doc in docs_out]
+
+    ## Load in pre-embedded file if exists
+    file_list = [string.name for string in in_file]
+
+    print(file_list)
+
+    embeddings_file_names = [string for string in file_list if "embedding" in string]
+
+    if embeddings_file_names:
+        embeddings_out = np.load(embeddings_file_names[0])
+        print("embeddings loaded: ", embeddings_out)
+
+    if not embeddings_file_names:
+        tic = time.perf_counter()
+        #embeddings_list = []
+        #for page in progress.tqdm(page_contents, desc = "Preparing search index", unit = "rows"):
+        #    embeddings_list.append(embeddings.encode(sentences=page, max_length=1024).tolist())
+
+        embeddings_out = embeddings.encode(sentences=page_contents, max_length=1024, show_progress_bar = True, batch_size = 32) # For Jina embeddings
+        #embeddings_list = embeddings.encode(sentences=page_contents, normalize_embeddings=True).tolist() # For BGE embeddings
+        #embeddings_list = embeddings.encode(sentences=page_contents).tolist() # For minilm
+
+        toc = time.perf_counter()
+        time_out = f"The embedding took {toc - tic:0.1f} seconds"
+
+    np.savez_compressed('semantic_search_embeddings.npz', embeddings_out)
+
+    out_message = "Document processing complete. Ready to search."
+    print(out_message)
+
+    return out_message, embeddings_out
+
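A note on the .npz cache written by docs_to_np_array: np.savez_compressed stores positional arrays under the key 'arr_0', so a reload needs that key (a round-trip sketch; the plain np.load call above returns an NpzFile container rather than the array for .npz inputs):

    import numpy as np

    embeddings_out = np.random.rand(100, 512).astype(np.float32)  # stand-in for Jina embeddings

    # Arrays passed positionally are saved under 'arr_0', 'arr_1', ...
    np.savez_compressed('semantic_search_embeddings.npz', embeddings_out)

    # np.load on an .npz returns an NpzFile container, not the array itself
    with np.load('semantic_search_embeddings.npz') as data:
        restored = data['arr_0']

    assert restored.shape == (100, 512)
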
+def process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_cut_off, vec_weight, orig_df_col, in_join_column, search_df_join_column):
+
+    def create_docs_keep_from_df(df):
+        dict_out = {'ids' : [df['ids']],
+                    'documents': [df['documents']],
+                    'metadatas': [df['metadatas']],
+                    'distances': [round(df['distances'].astype(float), 3)],
+                    'embeddings': None
+                    }
+        return dict_out
+
+    # Prepare the DataFrame by transposing
+    #df_docs = df#.apply(lambda x: x.explode()).reset_index(drop=True)
+
+    # Keep only documents with a certain score
+
+    #print(df_docs)
+
+    docs_scores = df_docs["distances"] #.astype(float)
+
+    # Only keep sources that are sufficiently relevant (i.e. similarity score above the threshold below)
+    score_more_limit = df_docs.loc[docs_scores > vec_score_cut_off, :]
+    #docs_keep = create_docs_keep_from_df(score_more_limit) #list(compress(docs, score_more_limit))
+
+    #print(docs_keep)
+
+    if score_more_limit.empty:
+        return 'No result found!', None
+
+    # Only keep sources that are at least 100 characters long
+    docs_len = score_more_limit["documents"].str.len() >= 100
+
+    #print(docs_len)
+
+    length_more_limit = score_more_limit.loc[docs_len == True, :] #pd.Series(docs_len) >= 100
+    #docs_keep = create_docs_keep_from_df(length_more_limit) #list(compress(docs_keep, length_more_limit))
+
+    #print(length_more_limit)
+
+    if length_more_limit.empty:
+        return 'No result found!', None
+
+    length_more_limit['ids'] = length_more_limit['ids'].astype(int)
+
+    #length_more_limit.to_csv("length_more_limit.csv", index = None)
+
+    # Explode the 'metadatas' dictionary into separate columns
+    df_metadata_expanded = length_more_limit['metadatas'].apply(pd.Series)
+
+    #print(length_more_limit)
+    #print(df_metadata_expanded)
+
+    # Concatenate the original DataFrame with the expanded metadata DataFrame
+    results_df_out = pd.concat([length_more_limit.drop('metadatas', axis=1), df_metadata_expanded], axis=1)
+
+    results_df_out = results_df_out.rename(columns={"documents":orig_df_col})
+
+    results_df_out = results_df_out.drop(["page_section", "row", "source", "id"], axis=1, errors="ignore")
+    results_df_out['distances'] = round(results_df_out['distances'].astype(float), 3)
+
+    # Join back to original df
+    # results_df_out = orig_df.merge(length_more_limit[['ids', 'distances']], left_index = True, right_on = "ids", how="inner").sort_values("distances")
+
+    # Join on additional files
+    if in_join_file:
+        join_filename = in_join_file.name
+
+        # Import data
+        join_df = read_file(join_filename)
+        join_df[in_join_column] = join_df[in_join_column].astype(str).str.replace("\.0$","", regex=True)
+
+        # Duplicates dropped so as not to expand out dataframe
+        join_df = join_df.drop_duplicates(in_join_column)
+
+        results_df_out[search_df_join_column] = results_df_out[search_df_join_column].astype(str).str.replace("\.0$","", regex=True)
+
+        results_df_out = results_df_out.merge(join_df,left_on=search_df_join_column, right_on=in_join_column, how="left").drop(in_join_column, axis=1)
+
+    return results_df_out
+
+def jina_simple_retrieval(new_question_kworded, vectorstore, docs, orig_df_col:str, k_val:int, out_passages:int,
+                          vec_score_cut_off:float, vec_weight:float, in_join_file = None, in_join_column = None, search_df_join_column = None, device = torch_device, embeddings = embeddings, progress=gr.Progress()): # ,vectorstore, embeddings
+
+    print("vectorstore loaded: ", vectorstore)
+
+    # Convert it to a PyTorch tensor and transfer to GPU
+    vectorstore_tensor = tensor(vectorstore).to(device)
+
+    # Load the sentence transformer model and move it to GPU
+    embeddings = embeddings.to(device)
+
+    # Encode the query using the sentence transformer and convert to a PyTorch tensor
+    query = embeddings.encode(new_question_kworded)
+    query_tensor = tensor(query).to(device)
+
+    if query_tensor.dim() == 1:
+        query_tensor = query_tensor.unsqueeze(0)  # Reshape to 2D with one row
+
+    # Normalize the query tensor and vectorstore tensor
+    query_norm = query_tensor / query_tensor.norm(dim=1, keepdim=True)
+    vectorstore_norm = vectorstore_tensor / vectorstore_tensor.norm(dim=1, keepdim=True)
+
+    # Calculate cosine similarities (batch processing)
+    cosine_similarities = mm(query_norm, vectorstore_norm.T)
+
+    # Flatten the tensor to a 1D array
+    cosine_similarities = cosine_similarities.flatten()
+
+    # Convert to a NumPy array if it's still a PyTorch tensor
+    cosine_similarities = cosine_similarities.cpu().numpy()
+
+    # Create a Pandas Series
+    cosine_similarities_series = pd.Series(cosine_similarities)
+
+    # Pull out relevant info from docs
+    page_contents = [doc.page_content for doc in docs]
+    page_meta = [doc.metadata for doc in docs]
+    ids_range = range(0,len(page_contents))
+    ids = [str(element) for element in ids_range]
+
+    df_docs = pd.DataFrame(data={"ids": ids,
+                                 "documents": page_contents,
+                                 "metadatas":page_meta,
+                                 "distances":cosine_similarities_series}).sort_values("distances", ascending=False).iloc[0:k_val,:]
+
+    results_df_out = process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_cut_off, vec_weight, orig_df_col, in_join_column, search_df_join_column)
+
+    results_df_name = "semantic_search_result.csv"
+    results_df_out.to_csv(results_df_name, index= None)
+    results_first_text = results_df_out.iloc[0, 1]
+
+    return results_first_text, results_df_name
+
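The normalise-then-matmul step in jina_simple_retrieval is a batched cosine similarity; a self-contained sketch on toy tensors:

    import torch

    query = torch.randn(1, 512)      # one encoded query
    corpus = torch.randn(1000, 512)  # pre-computed document embeddings

    # L2-normalise rows so that the dot product equals cosine similarity
    query_norm = query / query.norm(dim=1, keepdim=True)
    corpus_norm = corpus / corpus.norm(dim=1, keepdim=True)

    scores = torch.mm(query_norm, corpus_norm.T).flatten()  # shape: (1000,)
    top_vals, top_idx = scores.topk(5)  # the five most similar documents
    print(top_idx.tolist())
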
+def chroma_retrieval(new_question_kworded:str, vectorstore, docs, orig_df_col:str, k_val:int, out_passages:int,
+                     vec_score_cut_off:float, vec_weight:float, in_join_file = None, in_join_column = None, search_df_join_column = None): # ,vectorstore, embeddings
+
+    query = embeddings.encode(new_question_kworded).tolist()
+
+    docs = vectorstore.query(
+        query_embeddings=query,
+        n_results= k_val # No practical limit on number of responses returned
+        #where={"metadata_field": "is_equal_to_this"},
+        #where_document={"$contains":"search_string"}
+    )
+
+    df_docs = pd.DataFrame(data={'ids': docs['ids'][0],
+                                 'documents': docs['documents'][0],
+                                 'metadatas':docs['metadatas'][0],
+                                 'distances':docs['distances'][0]#,
+                                 #'embeddings': docs['embeddings']
+                                 })
+
+    results_df_out = process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_cut_off, vec_weight, orig_df_col, in_join_column, search_df_join_column)
 
     results_df_name = "semantic_search_result.csv"
     results_df_out.to_csv(results_df_name, index= None)
@@ -526,6 +740,7 @@ def chroma_retrieval(new_question_kworded:str, vectorstore, docs, orig_df_col:st
 
     return results_first_text, results_df_name
 
+
 ## Gradio app - BM25 search
 block = gr.Blocks(theme = gr.themes.Base())
 
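For comparison with the numpy path, the Chroma route queries a persisted collection with pre-computed query embeddings; a minimal end-to-end sketch (the collection name and toy vectors are illustrative, not from this commit):

    import chromadb
    from chromadb.config import Settings

    client = chromadb.PersistentClient(path="./chroma_store", settings=Settings(anonymized_telemetry=False))
    collection = client.get_or_create_collection("my_docs")

    # Supplying embeddings directly bypasses Chroma's default embedding function
    collection.add(ids=["0", "1"], documents=["first doc", "second doc"],
                   embeddings=[[0.1, 0.2], [0.3, 0.4]])

    results = collection.query(query_embeddings=[[0.1, 0.2]], n_results=2)
    print(results["documents"][0], results["distances"][0])
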
@@ -539,7 +754,7 @@ with block:
 
     k_val = gr.State(9999)
     out_passages = gr.State(9999)
-    vec_score_cut_off = gr.State(
+    vec_score_cut_off = gr.State(0.7)
     vec_weight = gr.State(1)
 
     docs_keep_as_doc_state = gr.State()
@@ -572,10 +787,9 @@ depends on factors such as the type of documents or queries. Information taken f
     current_source = gr.Textbox(label="Current data source(s)", value="None")
 
     with gr.Accordion(label = "Load in data", open=True):
-        in_bm25_file = gr.File(label="Upload your search data here")
+        in_bm25_file = gr.File(label="Upload your search data here", file_count= 'multiple', file_types = ['.parquet', '.csv'])
         with gr.Row():
             in_bm25_column = gr.Dropdown(label="Enter the name of the text column in the data file to search")
-
             load_bm25_data_button = gr.Button(value="Load data")
 
     with gr.Row():
@@ -583,10 +797,10 @@ depends on factors such as the type of documents or queries. Information taken f
 
     with gr.Accordion(label = "Search data", open=True):
         with gr.Row():
+            keyword_query = gr.Textbox(label="Enter your search term")
             mod_query = gr.Textbox(label="Cleaned search term (the terms that are passed to the search engine)")
 
+        keyword_search_button = gr.Button(value="Search text")
 
     with gr.Row():
         output_single_text = gr.Textbox(label="Top result")
@@ -597,11 +811,14 @@ depends on factors such as the type of documents or queries. Information taken f
     current_source_semantic = gr.Textbox(label="Current data source(s)", value="None")
 
     with gr.Accordion("Load in data", open = True):
-        in_semantic_file = gr.File(label="Upload data file for semantic search")
-
-        ingest_embed_out = gr.Textbox(label="File/web page preparation progress")
+        in_semantic_file = gr.File(label="Upload data file for semantic search", file_count= 'multiple', file_types = ['.parquet', '.csv', '.npy', '.npz'])
+
+        with gr.Row():
+            in_semantic_column = gr.Dropdown(label="Enter the name of the text column in the data file to search")
+            load_semantic_data_button = gr.Button(value="Load in data file", variant="secondary")
+
+        ingest_embed_out = gr.Textbox(label="File/web page preparation progress")
 
         semantic_query = gr.Textbox(label="Enter semantic search query here")
         semantic_submit = gr.Button(value="Start semantic search", variant="secondary", scale = 1)
@@ -642,7 +859,7 @@ depends on factors such as the type of documents or queries. Information taken f
     in_alpha_button.click(display_info, inputs=in_alpha_info)
     in_no_search_results_button.click(display_info, inputs=in_no_search_info)
 
+    ### BM25 SEARCH ###
     # Update dropdowns upon initial file load
     in_bm25_file.upload(put_columns_in_df, inputs=[in_bm25_file, in_bm25_column], outputs=[in_bm25_column, in_clean_data, search_df_join_column])
     in_join_file.upload(put_columns_in_join_df, inputs=[in_join_file, in_join_column], outputs=[in_join_column])
@@ -653,17 +870,20 @@ depends on factors such as the type of documents or queries. Information taken f
     then(fn=put_columns_in_df, inputs=[in_bm25_file, in_bm25_column], outputs=[in_bm25_column, in_clean_data, search_df_join_column])
 
     # BM25 search functions on click or enter
+    keyword_search_button.click(fn=bm25_search, inputs=[in_query, in_no_search_results, data_state, in_bm25_column, in_clean_data, in_join_file, in_join_column, search_df_join_column], outputs=[output_single_text, output_file, mod_query], api_name="search")
+    keyword_query.submit(fn=bm25_search, inputs=[in_query, in_no_search_results, data_state, in_bm25_column, in_clean_data, in_join_file, in_join_column, search_df_join_column], outputs=[output_single_text, output_file, mod_query])
 
+    ### SEMANTIC SEARCH ###
     # Load in a csv/excel file for semantic search
     in_semantic_file.upload(put_columns_in_df, inputs=[in_semantic_file, in_semantic_column], outputs=[in_semantic_column, in_clean_data, search_df_join_column])
-    load_semantic_data_button.click(ing.parse_csv_or_excel, inputs=[in_semantic_file, in_semantic_column], outputs=[ingest_text, current_source_semantic]).\
-        then(ing.csv_excel_text_to_docs, inputs=[ingest_text, in_semantic_column], outputs=[ingest_docs,
-        then(
+    load_semantic_data_button.click(ing.parse_csv_or_excel, inputs=[in_semantic_file, in_semantic_column], outputs=[ingest_text, current_source_semantic, ingest_embed_out]).\
+        then(ing.csv_excel_text_to_docs, inputs=[ingest_text, in_semantic_column], outputs=[ingest_docs, ingest_embed_out]).\
+        then(docs_to_np_array, inputs=[ingest_docs, in_semantic_file], outputs=[ingest_embed_out, vectorstore_state])
 
     # Semantic search query
-    semantic_submit.click(
+    semantic_submit.click(jina_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, vec_score_cut_off, vec_weight, in_join_file, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic")
+
+    semantic_query.submit(jina_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, vec_score_cut_off, vec_weight, in_join_file, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic")
 
     # Dummy functions just to get dropdowns to work correctly with Gradio 3.50
     in_bm25_column.change(dummy_function, in_bm25_column, None)
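A note on the event wiring: Gradio's .click(...).then(...) chain runs each handler only after the previous one returns, which is how the load button above parses the file, converts it to documents, and then embeds it in sequence. A stripped-down sketch of the pattern (function and component names here are illustrative):

    import gradio as gr

    def parse_step(x):
        return f"parsed {x}", "parsing done"

    def embed_step(parsed):
        return f"embedded: {parsed}"

    with gr.Blocks() as demo:
        inp = gr.Textbox(label="Input")
        status = gr.Textbox(label="Progress")
        parsed_state = gr.State()
        run = gr.Button("Run")

        # The second handler fires only after the first returns
        run.click(parse_step, inputs=inp, outputs=[parsed_state, status]).\
            then(embed_step, inputs=parsed_state, outputs=status)

    demo.launch()
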
requirements.txt
CHANGED
@@ -8,4 +8,5 @@ chromadb
 torch
 accelerate
 sentence-transformers
+spacy
 gradio==3.50.0
search_funcs/clean_funcs.py
CHANGED
@@ -19,6 +19,7 @@
 import nltk
 import re
 import string
+import polars as pl
 from nltk.stem import WordNetLemmatizer
 from nltk.stem import PorterStemmer
 from nltk.corpus import wordnet as wn
@@ -122,15 +123,25 @@ def initial_clean(texts):
     clean_texts.append(text)
     return clean_texts
 '''
+
+email_start_pattern_regex = r'.*importance:|.*subject:'
+email_end_pattern_regex = r'kind regards.*|many thanks.*|sincerely.*'
+html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0|&nbsp;'
+email_pattern_regex = r'\S*@\S*\s?'
+num_pattern_regex = r'[0-9]+'
+postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
+warning_pattern_regex = r'caution: this email originated from outside of the organization. do not click links or open attachments unless you recognize the sender and know the content is safe.'
+nbsp_pattern_regex = r'&nbsp;'
+
 # Pre-compiling the regular expressions for efficiency
-email_start_pattern = re.compile(
-email_end_pattern = re.compile(
-html_pattern = re.compile(
-email_pattern = re.compile(
-num_pattern = re.compile(
-postcode_pattern = re.compile(
-warning_pattern = re.compile(
-nbsp_pattern = re.compile(
+email_start_pattern = re.compile(email_start_pattern_regex)
+email_end_pattern = re.compile(email_end_pattern_regex)
+html_pattern = re.compile(html_pattern_regex)
+email_pattern = re.compile(email_pattern_regex)
+num_pattern = re.compile(num_pattern_regex)
+postcode_pattern = re.compile(postcode_pattern_regex)
+warning_pattern = re.compile(warning_pattern_regex)
+nbsp_pattern = re.compile(nbsp_pattern_regex)
 
 def stem_sentence(sentence):
@@ -143,8 +154,6 @@ def stem_sentences(sentences, progress=gr.Progress()):
 
     stemmed_sentences = [stem_sentence(sentence) for sentence in progress.tqdm(sentences)]
     return stemmed_sentences
 
-
-
 def get_lemma_text(text):
     # Tokenize the input string into words
     tokens = word_tokenize(text)
@@ -178,30 +187,60 @@ def get_lemma_tokens(tokens):
     lemmas.append(lemma)
     return lemmas
 
+# def initial_clean(texts , progress=gr.Progress()):
+#     clean_texts = []
+
+#     i = 1
+#     #progress(0, desc="Cleaning texts")
+#     for text in progress.tqdm(texts, desc = "Cleaning data", unit = "rows"):
+#         #print("Cleaning row: ", i)
+#         text = re.sub(email_start_pattern, '', text)
+#         text = re.sub(email_end_pattern, '', text)
+#         text = re.sub(postcode_pattern, '', text)
+#         text = remove_hyphens(text)
+#         text = re.sub(html_pattern, '', text)
+#         text = re.sub(email_pattern, '', text)
+#         text = re.sub(nbsp_pattern, '', text)
+#         #text = re.sub(warning_pattern, '', text)
+#         #text = stem_sentence(text)
+#         text = get_lemma_text(text)
+#         text = ' '.join(text)
+#         # Uncomment the next line if you want to remove numbers as well
+#         # text = re.sub(num_pattern, '', text)
+#         clean_texts.append(text)
+
+#         i += 1
+#     return clean_texts
+
+
 def initial_clean(texts , progress=gr.Progress()):
-    clean_texts = []
+    texts = pl.Series(texts)#[]
 
-    i = 1
+    #i = 1
     #progress(0, desc="Cleaning texts")
-    for text in progress.tqdm(texts, desc = "Cleaning data", unit = "rows"):
+    #for text in progress.tqdm(texts, desc = "Cleaning data", unit = "rows"):
+    #print("Cleaning row: ", i)
+    text = texts.str.replace_all(email_start_pattern_regex, '')
+    text = text.str.replace_all(email_end_pattern_regex, '')
+    #text = re.sub(postcode_pattern, '', text)
+    #text = remove_hyphens(text)
+    text = text.str.replace_all(html_pattern_regex, '')
+    text = text.str.replace_all(email_pattern_regex, '')
+    #text = re.sub(nbsp_pattern, '', text)
+    #text = re.sub(warning_pattern, '', text)
+    #text = stem_sentence(text)
+    #text = get_lemma_text(text)
+    #text = ' '.join(text)
+    # Uncomment the next line if you want to remove numbers as well
+    # text = re.sub(num_pattern, '', text)
+    #clean_texts.append(text)
+
+    #i += 1
+
+    text = text.to_list()
+
+    return text
 
-    i += 1
-    return clean_texts
 
 # Sample execution
 #sample_texts = [
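The polars rewrite of initial_clean applies each regex once over the whole column instead of looping row by row, which is where the speed-up comes from. A small sketch of the vectorised pattern:

    import polars as pl

    texts = pl.Series(["subject: hello world", "contact me at someone@example.com please"])

    # replace_all applies the regex to every element of the Series at once
    cleaned = texts.str.replace_all(r'\S*@\S*\s?', '')
    print(cleaned.to_list())  # ['subject: hello world', 'contact me at please']
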
search_funcs/ingest.py
CHANGED
|
3 |
import os
|
4 |
import time
|
5 |
import re
|
6 |
+
import ast
|
7 |
import pandas as pd
|
8 |
import gradio as gr
|
9 |
from typing import Type, List, Literal
|
10 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
11 |
|
12 |
from pydantic import BaseModel, Field
|
13 |
|
|
|
134 |
|
135 |
#out_df = pd.DataFrame()
|
136 |
|
137 |
+
file_list = [string.name for string in file_path]
|
138 |
+
|
139 |
+
print(file_list)
|
140 |
+
|
141 |
+
data_file_names = [string for string in file_list if "tokenised" not in string]
|
142 |
+
|
143 |
+
|
144 |
#for file_path in file_paths:
|
145 |
+
file_extension = determine_file_type(data_file_names[0])
|
146 |
+
file_name = get_file_path_end(data_file_names[0])
|
147 |
file_names = [file_name]
|
148 |
|
149 |
+
print(file_extension)
|
150 |
+
|
151 |
if file_extension == ".csv":
|
152 |
+
df = pd.read_csv(data_file_names[0], low_memory=False)
|
153 |
if text_column not in df.columns: return pd.DataFrame(), ['Please choose a valid column name']
|
154 |
df['source'] = file_name
|
155 |
df['page_section'] = ""
|
156 |
elif file_extension == ".xlsx":
|
157 |
+
df = pd.read_excel(data_file_names[0], engine='openpyxl')
|
158 |
if text_column not in df.columns: return pd.DataFrame(), ['Please choose a valid column name']
|
159 |
df['source'] = file_name
|
160 |
df['page_section'] = ""
|
161 |
elif file_extension == ".parquet":
|
162 |
+
df = pd.read_parquet(data_file_names[0])
|
163 |
if text_column not in df.columns: return pd.DataFrame(), ['Please choose a valid column name']
|
164 |
df['source'] = file_name
|
165 |
df['page_section'] = ""
|
166 |
else:
|
167 |
print(f"Unsupported file type: {file_extension}")
|
168 |
return pd.DataFrame(), ['Please choose a valid file type']
|
169 |
+
|
170 |
+
message = "Loaded in file. Now converting to document format."
|
171 |
+
print(message)
|
172 |
+
|
173 |
+
return df, file_names, message
|
174 |
|
175 |
def get_file_path_end(file_path):
|
176 |
match = re.search(r'(.*[\/\\])?(.+)$', file_path)
|
|
|
236 |
df['blank_column'] = ""
|
237 |
|
238 |
for n, col in enumerate(cols):
|
239 |
+
df[col] = df[col].astype(str).str.replace('"',"'").str.replace('\n', ' ').str.replace('\r', ' ').str.replace('\r\n', ' ').str.cat(df['blank_column'].astype(str), sep="")
|
240 |
|
241 |
df['metadatas'] = df['metadatas'] + '"' + cols[n] + '": "' + df[col] + '", '
|
242 |
|
243 |
|
244 |
+
df['metadatas'] = (df['metadatas'] + "}").str.replace(', }', '}')
|
245 |
|
246 |
return df['metadatas']
|
247 |
|
248 |
def csv_excel_text_to_docs(df, text_column='text', chunk_size=None) -> List[Document]:
|
249 |
"""Converts a DataFrame's content to a list of Documents with metadata."""
|
250 |
|
251 |
+
#print(df.head())
|
252 |
+
|
253 |
+
print("Converting to documents.")
|
254 |
+
|
255 |
doc_sections = []
|
256 |
df[text_column] = df[text_column].astype(str) # Ensure column is a string column
|
257 |
|
|
|
266 |
if col != text_column:
|
267 |
metadata[col] = value
|
268 |
|
269 |
+
metadata_string = write_out_metadata_as_string(metadata)[0]
|
270 |
|
271 |
# If chunk_size is provided, split the text into chunks
|
272 |
if chunk_size:
|
273 |
# Assuming you have a text splitter function similar to the PDF handling
|
274 |
text_splitter = RecursiveCharacterTextSplitter(
|
275 |
+
chunk_size=chunk_size,
|
276 |
+
chunk_overlap=chunk_overlap,
|
277 |
+
split_strat=split_strat,
|
278 |
+
start_index=start_index
|
279 |
+
) #Other arguments as required by the splitter
|
280 |
+
|
281 |
sections = text_splitter.split_text(doc_content)
|
282 |
|
283 |
|
284 |
# For each section, create a Document object
|
285 |
for i, section in enumerate(sections):
|
286 |
+
section = '. '.join([metadata_string, section])
|
287 |
doc = Document(page_content=section,
|
288 |
+
metadata={**metadata, "section": i, "row_section": f"{metadata['row']}-{i}"})
|
289 |
doc_sections.append(doc)
|
290 |
+
|
291 |
+
#print("Chunking currently disabled")
|
292 |
+
|
293 |
else:
|
294 |
# If no chunk_size is provided, create a single Document object for the row
|
295 |
#doc_content = '. '.join([metadata_string, doc_content])
|
296 |
doc = Document(page_content=doc_content, metadata=metadata)
|
297 |
doc_sections.append(doc)
|
|
|
|
|
298 |
|
299 |
+
message = "Data converted to document format. Now creating/loading document embeddings."
|
300 |
+
print(message)
|
301 |
+
|
302 |
+
return doc_sections, message
|
303 |
+
|
304 |
+
|
305 |
+
|
306 |
+
def clean_line_breaks(text):
|
307 |
+
# Replace \n and \r\n with a space
|
308 |
+
return text.replace('\n', ' ').replace('\r', ' ').replace('\r\n', ' ')
|
309 |
+
|
310 |
+
def parse_metadata(row):
|
311 |
+
try:
|
312 |
+
# Ensure the 'title' field is a string and clean line breaks
|
313 |
+
#if 'TITLE' in row:
|
314 |
+
# row['TITLE'] = clean_line_breaks(row['TITLE'])
|
315 |
+
|
316 |
+
# Convert the row to a string if it's not already
|
317 |
+
row_str = str(row) if not isinstance(row, str) else row
|
318 |
+
|
319 |
+
row_str.replace('\n', ' ').replace('\r', ' ').replace('\r\n', ' ')
|
320 |
+
|
321 |
+
# Parse the string
|
322 |
+
metadata = ast.literal_eval(row_str)
|
323 |
+
# Process metadata
|
324 |
+
return metadata
|
325 |
+
except SyntaxError as e:
|
326 |
+
print(f"Failed to parse metadata: {row_str}")
|
327 |
+
print(f"Error: {e}")
|
328 |
+
# Handle the error or log it
|
329 |
+
return None # or some default value
|
330 |
|
331 |
def csv_excel_text_to_docs(df, text_column='text', chunk_size=None, progress=gr.Progress()) -> List[Document]:
|
332 |
"""Converts a DataFrame's content to a list of dictionaries in the 'Document' format, containing page_content and associated metadata."""
|
|
|
349 |
|
350 |
# Create a list of Document objects
|
351 |
doc_sections = [Document(page_content=row['page_content'],
|
352 |
+
metadata= parse_metadata(row["metadata"]))
|
353 |
for index, row in progress.tqdm(df.iterrows(), desc = "Splitting up text", unit = "rows")]
|
354 |
|
355 |
ingest_toc = time.perf_counter()
|
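parse_metadata relies on ast.literal_eval, which safely parses a string containing a Python literal (here, the dict serialised by combine_metadata_columns) without executing arbitrary code; a quick illustration:

    import ast

    row_str = '{"source": "notes.csv", "row": "42", "title": "An example title"}'

    metadata = ast.literal_eval(row_str)  # returns a real dict; raises on non-literal input
    print(metadata["row"])  # 42

    # Malformed input raises SyntaxError/ValueError rather than executing anything
    try:
        ast.literal_eval('{"broken": ')
    except SyntaxError as e:
        print("failed to parse:", e)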