seanpedrickcase committed
Commit ceb8617
1 Parent(s): 2cb9977

Added semantic search using Jina

Files changed (5)
  1. .gitignore +2 -0
  2. app.py +359 -139
  3. requirements.txt +1 -0
  4. search_funcs/clean_funcs.py +69 -30
  5. search_funcs/ingest.py +71 -18
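For orientation, below is a minimal sketch of the retrieval flow that app.py gains in this commit: documents and the query are encoded with the Jina embeddings model, scored by cosine similarity, and results above a cut-off are kept. The model name, the .npz caching and the 0.7 cut-off come from the diff; the sample documents, query and variable names are illustrative only.

import numpy as np
from transformers import AutoModel

# Embedding model used by the commit (loaded with trust_remote_code so .encode() is available)
model = AutoModel.from_pretrained("jinaai/jina-embeddings-v2-small-en", trust_remote_code=True)

docs = ["first case note", "second case note"]                   # illustrative documents
doc_vecs = np.asarray(model.encode(docs, max_length=1024))       # one vector per document
np.savez_compressed("semantic_search_embeddings.npz", doc_vecs)  # cached and reused on later loads

query_vec = np.asarray(model.encode(["example search query"]))[0]

# Cosine similarity of the query against every document vector
sims = doc_vecs @ query_vec / (np.linalg.norm(doc_vecs, axis=1) * np.linalg.norm(query_vec))
ranked = np.argsort(-sims)                                       # best matches first
keep = [i for i in ranked if sims[i] > 0.7]                      # vec_score_cut_off in the app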
.gitignore CHANGED
@@ -11,6 +11,8 @@
  *.pkl
  *.spec
  *.ipynb
+ *.npy
+ *.npz
  build/*
  dist/*
  __pycache__/*
app.py CHANGED
@@ -10,16 +10,32 @@ from search_funcs.clean_funcs import initial_clean, get_lemma_tokens#, stem_sent
10
  from nltk import word_tokenize
11
  #from sentence_transformers import SentenceTransformer
12
 
 
 
13
  PandasDataFrame = TypeVar('pd.core.frame.DataFrame')
14
 
15
  import gradio as gr
16
  import pandas as pd
 
17
  import os
18
  import time
19
  from chromadb.config import Settings
20
 
21
  from transformers import AutoModel
22
23
  # model = AutoModel.from_pretrained('./model_and_tokenizer/int8-model.onnx', use_embedding_runtime=True)
24
  # sentence_embeddings = model.generate(engine_input)['last_hidden_state:0']
25
 
@@ -33,7 +49,7 @@ import chromadb
33
  #from typing_extensions import Protocol
34
  #from chromadb import Documents, EmbeddingFunction, Embeddings
35
 
36
- from torch import cuda, backends
37
 
38
  # Check for torch cuda
39
  print(cuda.is_available())
@@ -51,17 +67,90 @@ chromadb_file = "chroma.sqlite3"
51
  if os.path.isfile(chromadb_file):
52
  os.remove(chromadb_file)
53
54
  def prepare_input_data(in_file, text_column, clean="No", progress=gr.Progress()):
55
 
56
- filename = in_file.name
57
- # Import data
58
 
59
- df = read_file(filename)
60
 
61
  #df = pd.read_parquet(file_in.name)
62
- df_list = list(df[text_column].astype(str))
63
  #df_list = df
64
65
  if clean == "Yes":
66
  df_list_clean = initial_clean(df_list)
67
 
@@ -69,23 +158,84 @@ def prepare_input_data(in_file, text_column, clean="No", progress=gr.Progress())
69
  out_file_name = save_prepared_data(in_file, df_list_clean, df, text_column)
70
 
71
  #corpus = [word_tokenize(doc.lower()) for doc in df_list_clean]
72
- corpus = [word_tokenize(doc.lower()) for doc in progress.tqdm(df_list_clean, desc = "Tokenising text", unit = "rows")]
 
 
 
73
74
 
75
 
76
  else:
77
  #corpus = [word_tokenize(doc.lower()) for doc in df_list]
78
- corpus = [word_tokenize(doc.lower()) for doc in progress.tqdm(df_list, desc = "Tokenising text", unit = "rows")]
79
  out_file_name = None
80
 
 
 
81
 
 
 
82
 
83
- print("Finished data clean")
84
 
85
  if len(df_list) >= 20:
86
  message = "Data loaded"
87
  else:
88
  message = "Data loaded. Warning: dataset may be too short to get consistent search results."
 
 
89
 
90
  return corpus, message, df, out_file_name
91
 
@@ -236,11 +386,17 @@ def put_columns_in_df(in_file, in_bm25_column):
236
  When file is loaded, update the column dropdown choices and change 'clean data' dropdown option to 'no'.
237
  '''
238
239
  new_choices = []
240
  concat_choices = []
241
 
242
 
243
- df = read_file(in_file.name)
244
  new_choices = list(df.columns)
245
 
246
  #print(new_choices)
@@ -279,38 +435,6 @@ def dummy_function(gradio_component):
279
  def display_info(info_component):
280
  gr.Info(info_component)
281
 
282
- # class MyEmbeddingFunction(EmbeddingFunction):
283
- # def __call__(self, input) -> Embeddings:
284
- # embeddings = []
285
- # for text in input:
286
- # embeddings.append(embeddings_model.encode(text))
287
-
288
- # return embeddings
289
-
290
- def load_embeddings(embeddings_name = "jinaai/jina-embeddings-v2-small-en"):
291
- '''
292
- Load embeddings model and create a global variable based on it.
293
- '''
294
-
295
- # Import Chroma and instantiate a client. The default Chroma client is ephemeral, meaning it will not save to disk.
296
-
297
- #else:
298
- embeddings_func = AutoModel.from_pretrained(embeddings_name, trust_remote_code=True, device_map="auto")
299
-
300
- global embeddings
301
-
302
- embeddings = embeddings_func
303
-
304
- return embeddings
305
-
306
- # Load embeddings
307
- #embeddings_name =
308
- embeddings_model = AutoModel.from_pretrained("jinaai/jina-embeddings-v2-small-en", trust_remote_code=True, device_map="auto")
309
- #embeddings_model = SentenceTransformer("BAAI/bge-small-en-v1.5")
310
- #embeddings_model = SentenceTransformer("paraphrase-MiniLM-L3-v2")
311
-
312
- embeddings = embeddings_model#load_embeddings(embeddings_name)
313
-
314
  def docs_to_chroma_save(docs_out, embeddings = embeddings, progress=gr.Progress()):
315
  '''
316
  Takes a Langchain document class and saves it into a Chroma sqlite file.
@@ -318,7 +442,7 @@ def docs_to_chroma_save(docs_out, embeddings = embeddings, progress=gr.Progress(
318
 
319
  print(f"> Total split documents: {len(docs_out)}")
320
 
321
- print(docs_out)
322
 
323
  page_contents = [doc.page_content for doc in docs_out]
324
  page_meta = [doc.metadata for doc in docs_out]
@@ -330,13 +454,15 @@ def docs_to_chroma_save(docs_out, embeddings = embeddings, progress=gr.Progress(
330
  #for page in progress.tqdm(page_contents, desc = "Preparing search index", unit = "rows"):
331
  # embeddings_list.append(embeddings.encode(sentences=page, max_length=1024).tolist())
332
 
333
- embeddings_list = embeddings.encode(sentences=page_contents, max_length=256).tolist() # For Jina embeddings
334
  #embeddings_list = embeddings.encode(sentences=page_contents, normalize_embeddings=True).tolist() # For BGE embeddings
335
  #embeddings_list = embeddings.encode(sentences=page_contents).tolist() # For minilm
336
 
337
  toc = time.perf_counter()
338
  time_out = f"The embedding took {toc - tic:0.1f} seconds"
339
 
 
 
340
  # Jina tiny
341
  # This takes about 300 seconds for 240,000 records = 800 / second, 1024 max length
342
  # For 50k records:
@@ -345,6 +471,8 @@ def docs_to_chroma_save(docs_out, embeddings = embeddings, progress=gr.Progress(
345
  # 43 seconds at 256 max length
346
  # 31 seconds at 128 max length
347
 
 
 
348
  # BGE small
349
  # 96 seconds for 50k records at 512 length
350
 
@@ -360,7 +488,7 @@ def docs_to_chroma_save(docs_out, embeddings = embeddings, progress=gr.Progress(
360
  chroma_tic = time.perf_counter()
361
 
362
  # Create a new Chroma collection to store the documents and metadata. We don't need to specify an embedding function, and the default will be used.
363
- client = chromadb.PersistentClient(path="./db", settings=Settings(
364
  anonymized_telemetry=False))
365
 
366
  try:
@@ -408,117 +536,203 @@ def docs_to_chroma_save(docs_out, embeddings = embeddings, progress=gr.Progress(
408
 
409
  return out_message, collection
410
 
411
- def jina_simple_retrieval(new_question_kworded, vectorstore, docs, k_val, out_passages,
412
- vec_score_cut_off, vec_weight): # ,vectorstore, embeddings
 
 
413
 
414
- from numpy.linalg import norm
415
 
416
- cos_sim = lambda a,b: (a @ b.T) / (norm(a)*norm(b))
417
 
418
- query = embeddings.encode(new_question_kworded)
 
419
 
420
- # Calculate cosine similarity with each string in the list
421
- cosine_similarities = [cos_sim(query, string_vector) for string_vector in vectorstore]
422
 
423
- print(cosine_similarities)
424
 
425
- return cosine_similarities
426
 
427
- def chroma_retrieval(new_question_kworded:str, vectorstore, docs, orig_df_col:str, k_val:int, out_passages:int,
428
- vec_score_cut_off:float, vec_weight:float, in_join_file = None, in_join_column = None, search_df_join_column = None): # ,vectorstore, embeddings
 
429
 
430
- query = embeddings.encode(new_question_kworded).tolist()
 
 
 
 
431
 
432
- docs = vectorstore.query(
433
- query_embeddings=query,
434
- n_results= k_val # No practical limit on number of responses returned
435
- #where={"metadata_field": "is_equal_to_this"},
436
- #where_document={"$contains":"search_string"}
437
- )
438
 
439
- df_docs = pd.DataFrame(data={'ids': docs['ids'][0],
440
- 'documents': docs['documents'][0],
441
- 'metadatas':docs['metadatas'][0],
442
- 'distances':docs['distances'][0]#,
443
- #'embeddings': docs['embeddings']
444
- })
445
-
446
- def create_docs_keep_from_df(df):
447
- dict_out = {'ids' : [df['ids']],
448
- 'documents': [df['documents']],
449
- 'metadatas': [df['metadatas']],
450
- 'distances': [round(df['distances'].astype(float), 2)],
451
- 'embeddings': None
452
- }
453
- return dict_out
454
-
455
- # Prepare the DataFrame by transposing
456
- #df_docs = df#.apply(lambda x: x.explode()).reset_index(drop=True)
457
 
458
- # Keep only documents with a certain score
459
 
460
- print(df_docs)
461
-
462
- docs_scores = df_docs["distances"] #.astype(float)
 
463
 
464
- # Only keep sources that are sufficiently relevant (i.e. similarity search score below threshold below)
465
- score_more_limit = df_docs.loc[docs_scores < vec_score_cut_off, :]
466
- #docs_keep = create_docs_keep_from_df(score_more_limit) #list(compress(docs, score_more_limit))
467
 
468
- #print(docs_keep)
469
 
470
- if score_more_limit.empty:
471
- return 'No result found!', None
472
 
473
- # Only keep sources that are at least 100 characters long
474
- docs_len = score_more_limit["documents"].str.len() >= 100
 
475
 
476
- print(docs_len)
 
 
477
 
478
- length_more_limit = score_more_limit.loc[docs_len == True, :] #pd.Series(docs_len) >= 100
479
- #docs_keep = create_docs_keep_from_df(length_more_limit) #list(compress(docs_keep, length_more_limit))
480
 
481
- #print(length_more_limit)
 
482
 
483
- if length_more_limit.empty:
484
- return 'No result found!', None
485
-
486
- length_more_limit['ids'] = length_more_limit['ids'].astype(int)
487
 
488
- #length_more_limit.to_csv("length_more_limit.csv", index = None)
489
 
490
- # Explode the 'metadatas' dictionary into separate columns
491
- df_metadata_expanded = length_more_limit['metadatas'].apply(pd.Series)
492
 
493
- print(length_more_limit)
494
- print(df_metadata_expanded)
495
 
496
- # Concatenate the original DataFrame with the expanded metadata DataFrame
497
- results_df_out = pd.concat([length_more_limit.drop('metadatas', axis=1), df_metadata_expanded], axis=1)
498
 
499
- results_df_out = results_df_out.rename(columns={"documents":orig_df_col})
 
500
 
501
- results_df_out = results_df_out.drop(["page_section", "row", "source", "id"], axis=1, errors="ignore")
502
- results_df_out['distances'] = round(results_df_out['distances'].astype(float), 2)
503
 
504
- # Join back to original df
505
- # results_df_out = orig_df.merge(length_more_limit[['ids', 'distances']], left_index = True, right_on = "ids", how="inner").sort_values("distances")
506
 
507
- # Join on additional files
508
- if in_join_file:
509
- join_filename = in_join_file.name
510
 
511
- # Import data
512
- join_df = read_file(join_filename)
513
- join_df[in_join_column] = join_df[in_join_column].astype(str).str.replace("\.0$","", regex=True)
514
 
515
- # Duplicates dropped so as not to expand out dataframe
516
- join_df = join_df.drop_duplicates(in_join_column)
517
 
518
- results_df_out[search_df_join_column] = results_df_out[search_df_join_column].astype(str).str.replace("\.0$","", regex=True)
519
 
520
- results_df_out = results_df_out.merge(join_df,left_on=search_df_join_column, right_on=in_join_column, how="left").drop(in_join_column, axis=1)
 
521
522
 
523
  results_df_name = "semantic_search_result.csv"
524
  results_df_out.to_csv(results_df_name, index= None)
@@ -526,6 +740,7 @@ def chroma_retrieval(new_question_kworded:str, vectorstore, docs, orig_df_col:st
526
 
527
  return results_first_text, results_df_name
528
 
 
529
  ## Gradio app - BM25 search
530
  block = gr.Blocks(theme = gr.themes.Base())
531
 
@@ -539,7 +754,7 @@ with block:
539
 
540
  k_val = gr.State(9999)
541
  out_passages = gr.State(9999)
542
- vec_score_cut_off = gr.State(70)
543
  vec_weight = gr.State(1)
544
 
545
  docs_keep_as_doc_state = gr.State()
@@ -572,10 +787,9 @@ depends on factors such as the type of documents or queries. Information taken f
572
  current_source = gr.Textbox(label="Current data source(s)", value="None")
573
 
574
  with gr.Accordion(label = "Load in data", open=True):
575
- in_bm25_file = gr.File(label="Upload your search data here")
576
  with gr.Row():
577
- in_bm25_column = gr.Dropdown(label="Enter the name of the text column in the data file to search")
578
-
579
  load_bm25_data_button = gr.Button(value="Load data")
580
 
581
  with gr.Row():
@@ -583,10 +797,10 @@ depends on factors such as the type of documents or queries. Information taken f
583
 
584
  with gr.Accordion(label = "Search data", open=True):
585
  with gr.Row():
586
- in_query = gr.Textbox(label="Enter your search term")
587
  mod_query = gr.Textbox(label="Cleaned search term (the terms that are passed to the search engine)")
588
 
589
- search_button = gr.Button(value="Search text")
590
 
591
  with gr.Row():
592
  output_single_text = gr.Textbox(label="Top result")
@@ -597,11 +811,14 @@ depends on factors such as the type of documents or queries. Information taken f
597
  current_source_semantic = gr.Textbox(label="Current data source(s)", value="None")
598
 
599
  with gr.Accordion("Load in data", open = True):
600
- in_semantic_file = gr.File(label="Upload data file for semantic search")
601
- in_semantic_column = gr.Dropdown(label="Enter the name of the text column in the data file to search")
602
- load_semantic_data_button = gr.Button(value="Load in data file", variant="secondary", scale=0)
 
 
 
 
603
 
604
- ingest_embed_out = gr.Textbox(label="File/web page preparation progress")
605
  semantic_query = gr.Textbox(label="Enter semantic search query here")
606
  semantic_submit = gr.Button(value="Start semantic search", variant="secondary", scale = 1)
607
 
@@ -642,7 +859,7 @@ depends on factors such as the type of documents or queries. Information taken f
642
  in_alpha_button.click(display_info, inputs=in_alpha_info)
643
  in_no_search_results_button.click(display_info, inputs=in_no_search_info)
644
 
645
-
646
  # Update dropdowns upon initial file load
647
  in_bm25_file.upload(put_columns_in_df, inputs=[in_bm25_file, in_bm25_column], outputs=[in_bm25_column, in_clean_data, search_df_join_column])
648
  in_join_file.upload(put_columns_in_join_df, inputs=[in_join_file, in_join_column], outputs=[in_join_column])
@@ -653,17 +870,20 @@ depends on factors such as the type of documents or queries. Information taken f
653
  then(fn=put_columns_in_df, inputs=[in_bm25_file, in_bm25_column], outputs=[in_bm25_column, in_clean_data, search_df_join_column])
654
 
655
  # BM25 search functions on click or enter
656
- search_button.click(fn=bm25_search, inputs=[in_query, in_no_search_results, data_state, in_bm25_column, in_clean_data, in_join_file, in_join_column, search_df_join_column], outputs=[output_single_text, output_file, mod_query], api_name="search")
657
- in_query.submit(fn=bm25_search, inputs=[in_query, in_no_search_results, data_state, in_bm25_column, in_clean_data, in_join_file, in_join_column, search_df_join_column], outputs=[output_single_text, output_file, mod_query])
658
 
 
659
  # Load in a csv/excel file for semantic search
660
  in_semantic_file.upload(put_columns_in_df, inputs=[in_semantic_file, in_semantic_column], outputs=[in_semantic_column, in_clean_data, search_df_join_column])
661
- load_semantic_data_button.click(ing.parse_csv_or_excel, inputs=[in_semantic_file, in_semantic_column], outputs=[ingest_text, current_source_semantic]).\
662
- then(ing.csv_excel_text_to_docs, inputs=[ingest_text, in_semantic_column], outputs=[ingest_docs, load_finished_message]).\
663
- then(docs_to_chroma_save, inputs=[ingest_docs], outputs=[ingest_embed_out, vectorstore_state])
664
 
665
  # Semantic search query
666
- semantic_submit.click(chroma_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, vec_score_cut_off, vec_weight, in_join_file, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic")
 
 
667
 
668
  # Dummy functions just to get dropdowns to work correctly with Gradio 3.50
669
  in_bm25_column.change(dummy_function, in_bm25_column, None)
 
10
  from nltk import word_tokenize
11
  #from sentence_transformers import SentenceTransformer
12
 
13
+ # Try SpaCy alternative tokeniser
14
+
15
  PandasDataFrame = TypeVar('pd.core.frame.DataFrame')
16
 
17
  import gradio as gr
18
  import pandas as pd
19
+ import numpy as np
20
  import os
21
  import time
22
  from chromadb.config import Settings
23
 
24
  from transformers import AutoModel
25
 
26
+ # Load the SpaCy model
27
+ from spacy.cli import download
28
+ import spacy
29
+ spacy.prefer_gpu()
30
+
31
+ #os.system("python -m spacy download en_core_web_sm")
32
+ try:
33
+ nlp = spacy.load("en_core_web_sm")
34
+ except:
35
+ download("en_core_web_sm")
36
+ nlp = spacy.load("en_core_web_sm")
37
+
38
+
39
  # model = AutoModel.from_pretrained('./model_and_tokenizer/int8-model.onnx', use_embedding_runtime=True)
40
  # sentence_embeddings = model.generate(engine_input)['last_hidden_state:0']
41
 
 
49
  #from typing_extensions import Protocol
50
  #from chromadb import Documents, EmbeddingFunction, Embeddings
51
 
52
+ from torch import cuda, backends, tensor, mm
53
 
54
  # Check for torch cuda
55
  print(cuda.is_available())
 
67
  if os.path.isfile(chromadb_file):
68
  os.remove(chromadb_file)
69
 
70
+
71
+ def load_embeddings(embeddings_name = "jinaai/jina-embeddings-v2-small-en"):
72
+ '''
73
+ Load embeddings model and create a global variable based on it.
74
+ '''
75
+
76
+ # Import Chroma and instantiate a client. The default Chroma client is ephemeral, meaning it will not save to disk.
77
+
78
+ #else:
79
+ embeddings_func = AutoModel.from_pretrained(embeddings_name, trust_remote_code=True, device_map="auto")
80
+
81
+ global embeddings
82
+
83
+ embeddings = embeddings_func
84
+
85
+ return embeddings
86
+
87
+ # Load embeddings
88
+ embeddings_name = "jinaai/jina-embeddings-v2-small-en"
89
+ embeddings_model = AutoModel.from_pretrained(embeddings_name, trust_remote_code=True, device_map="auto")
90
+ #embeddings_model = SentenceTransformer("BAAI/bge-small-en-v1.5")
91
+ #embeddings_model = SentenceTransformer("paraphrase-MiniLM-L3-v2")
92
+
93
+ #tokenizer = AutoTokenizer.from_pretrained(embeddings_name, device_map = "auto")#to(torch_device) # From Jina
94
+ # Construction 2 - from SpaCy - https://spacy.io/api/tokenizer
95
+
96
+
97
+ #from spacy.lang.en import English
98
+ #nlp = #English()
99
+ # Create a Tokenizer with the default settings for English
100
+ # including punctuation rules and exceptions
101
+ tokenizer = nlp.tokenizer
102
+
103
+ embeddings = embeddings_model#load_embeddings(embeddings_name)
104
+
105
+
106
  def prepare_input_data(in_file, text_column, clean="No", progress=gr.Progress()):
107
 
108
+ file_list = [string.name for string in in_file]
109
+
110
+ print(file_list)
111
+
112
+ data_file_names = [string for string in file_list if "tokenised" not in string]
113
+
114
+ df = read_file(data_file_names[0])
115
+
116
+ ## Load in pre-tokenised corpus if exists
117
+ tokenised_df = pd.DataFrame()
118
 
119
+ tokenised_file_names = [string for string in file_list if "tokenised" in string]
120
+
121
+ if tokenised_file_names:
122
+ tokenised_df = read_file(tokenised_file_names[0])
123
+ print("Tokenised df is: ", tokenised_df.head())
124
 
125
  #df = pd.read_parquet(file_in.name)
126
+ df_list = list(df[text_column].astype(str).str.lower())
127
  #df_list = df
128
 
129
+ import math
130
+
131
+ def get_total_batches(my_list, batch_size):
132
+ return math.ceil(len(my_list) / batch_size)
133
+
134
+ from itertools import islice
135
+
136
+ def batch(iterable, batch_size):
137
+ iterator = iter(iterable)
138
+ for first in iterator:
139
+ yield [first] + list(islice(iterator, batch_size - 1))
140
+
141
+ #def batch(my_list, batch_size):
142
+ # Splitting the list into batches
143
+ # for i in range(0, len(my_list), batch_size):
144
+ # batch = my_list[i:i + batch_size]
145
+
146
+ # Process each batch
147
+ # Replace this with your processing logic
148
+ #print("Processing batch:", batch)
149
+
150
+ batch_size = 256
151
+
152
+ tic = time.perf_counter()
153
+
154
  if clean == "Yes":
155
  df_list_clean = initial_clean(df_list)
156
 
 
158
  out_file_name = save_prepared_data(in_file, df_list_clean, df, text_column)
159
 
160
  #corpus = [word_tokenize(doc.lower()) for doc in df_list_clean]
161
+ #corpus = [word_tokenize(doc.lower()) for doc in progress.tqdm(df_list_clean, desc = "Tokenising text", unit = "rows")]
162
+
163
+ #total_batches = get_total_batches(df_list_clean, batch_size)
164
+ #data_batched = batch(df_list_clean, batch_size)
165
 
166
+ #print(data_batched)
167
+
168
+ #print(df_list_clean[0])
169
+
170
+ # Using encode_batch
171
+ #encodings = tokenizer.encode_batch(texts)
172
+
173
+ # Extracting tokens
174
+ #tokens_list = [encoding.tokens for encoding in encodings]
175
+
176
+ #corpus = [tokenizer(doc.lower()) for doc in progress.tqdm(df_list_clean, desc = "Tokenising text", unit = "rows")]
177
+ #corpus = [tokenizer.encode(doc_batch) for doc_batch in progress.tqdm(data_batched, desc = "Tokenising text", unit = "batches out of " + str(total_batches))] # for jina
178
+ # print(df_list_clean)
179
+ # corpus = tokenizer.batch_encode_plus(df_list_clean).tokens
180
+
181
+ #corpus = [[token.text for token in nlp(text)] for text in df_list_clean]
182
+
183
+ # Tokenize texts in batches
184
+ if not tokenised_df.empty:
185
+ corpus = tokenised_df.iloc[:,0].tolist()
186
+ print("Corpus is: ", corpus[0:5])
187
+
188
+ else:
189
+ corpus = []
190
+ for doc in tokenizer.pipe(progress.tqdm(df_list_clean, desc = "Tokenising text", unit = "rows"), batch_size=batch_size):
191
+ corpus.append([token.text for token in doc])
192
+ #for doc in nlp.pipe(progress.tqdm(df_list_clean, desc = "Tokenising text", unit = "batches out of " + str(total_batches)), batch_size=batch_size): # You can adjust batch_size based on your requirement
193
+ # corpus.append([token.text for token in doc])
194
 
195
 
196
  else:
197
+ #total_batches = get_total_batches(df_list, batch_size)
198
+ #data_batched = batch(df_list, batch_size)
199
+
200
+ #print(data_batched)
201
+
202
  #corpus = [word_tokenize(doc.lower()) for doc in df_list]
203
+ #corpus = [word_tokenize(doc.lower()) for doc in progress.tqdm(df_list, desc = "Tokenising text", unit = "rows")]
204
+ #corpus = [tokenizer.encode(doc_batch) for doc_batch in progress.tqdm(data_batched, desc = "Tokenising text", unit = "batches out of " + str(total_batches))] # for jina
205
+ #corpus = tokenizer.batch_encode_plus(df_list).tokens # for jina
206
+
207
+ print(df_list[0])
208
+ #corpus = [[token.text for token in nlp(text)] for text in df_list]
209
+
210
+ # Tokenize texts in batches
211
+ if not tokenised_df.empty:
212
+ corpus = tokenised_df.iloc[:,0].tolist()
213
+ print("Corpus is: ", corpus[0:5])
214
+
215
+ else:
216
+
217
+ corpus = []
218
+ for doc in tokenizer.pipe(progress.tqdm(df_list, desc = "Tokenising text", unit = "rows"), batch_size=batch_size):
219
+ #for doc in nlp.pipe(progress.tqdm(df_list, desc = "Tokenising text", unit = "batches out of " + str(total_batches)), #batch_size=batch_size): # You can adjust batch_size based on your requirement
220
+ corpus.append([token.text for token in doc])
221
+
222
+ #corpus = tokenizer(df_list)
223
  out_file_name = None
224
 
225
+ print(corpus[0])
226
+
227
 
228
+ toc = time.perf_counter()
229
+ tokenizer_time_out = f"Tokenising the text took {toc - tic:0.1f} seconds"
230
 
231
+ print("Finished data clean. " + tokenizer_time_out)
232
 
233
  if len(df_list) >= 20:
234
  message = "Data loaded"
235
  else:
236
  message = "Data loaded. Warning: dataset may be too short to get consistent search results."
237
+
238
+ pd.DataFrame(data={"Corpus":corpus}).to_parquet("keyword_search_tokenised_data.parquet")
239
 
240
  return corpus, message, df, out_file_name
241
 
 
386
  When file is loaded, update the column dropdown choices and change 'clean data' dropdown option to 'no'.
387
  '''
388
 
389
+ file_list = [string.name for string in in_file]
390
+
391
+ print(file_list)
392
+
393
+ data_file_names = [string for string in file_list if "tokenised" not in string]
394
+
395
  new_choices = []
396
  concat_choices = []
397
 
398
 
399
+ df = read_file(data_file_names[0])
400
  new_choices = list(df.columns)
401
 
402
  #print(new_choices)
 
435
  def display_info(info_component):
436
  gr.Info(info_component)
437
 
438
  def docs_to_chroma_save(docs_out, embeddings = embeddings, progress=gr.Progress()):
439
  '''
440
  Takes a Langchain document class and saves it into a Chroma sqlite file.
 
442
 
443
  print(f"> Total split documents: {len(docs_out)}")
444
 
445
+ #print(docs_out)
446
 
447
  page_contents = [doc.page_content for doc in docs_out]
448
  page_meta = [doc.metadata for doc in docs_out]
 
454
  #for page in progress.tqdm(page_contents, desc = "Preparing search index", unit = "rows"):
455
  # embeddings_list.append(embeddings.encode(sentences=page, max_length=1024).tolist())
456
 
457
+ embeddings_list = embeddings.encode(sentences=page_contents, max_length=256, show_progress_bar = True, batch_size = 32).tolist() # For Jina embeddings
458
  #embeddings_list = embeddings.encode(sentences=page_contents, normalize_embeddings=True).tolist() # For BGE embeddings
459
  #embeddings_list = embeddings.encode(sentences=page_contents).tolist() # For minilm
460
 
461
  toc = time.perf_counter()
462
  time_out = f"The embedding took {toc - tic:0.1f} seconds"
463
 
464
+ #pd.Series(embeddings_list).to_csv("embeddings_out.csv")
465
+
466
  # Jina tiny
467
  # This takes about 300 seconds for 240,000 records = 800 / second, 1024 max length
468
  # For 50k records:
 
471
  # 43 seconds at 256 max length
472
  # 31 seconds at 128 max length
473
 
474
+ # The embedding took 1372.5 seconds at 256 max length for 655,020 case notes
475
+
476
  # BGE small
477
  # 96 seconds for 50k records at 512 length
478
 
 
488
  chroma_tic = time.perf_counter()
489
 
490
  # Create a new Chroma collection to store the documents and metadata. We don't need to specify an embedding function, and the default will be used.
491
+ client = chromadb.PersistentClient(path="./last_year", settings=Settings(
492
  anonymized_telemetry=False))
493
 
494
  try:
 
536
 
537
  return out_message, collection
538
 
539
+ def docs_to_np_array(docs_out, in_file, embeddings = embeddings, progress=gr.Progress()):
540
+ '''
541
+ Takes a list of Langchain documents and either loads pre-computed embeddings from an uploaded file or encodes the documents with the embedding model, saving the result to a compressed NumPy (.npz) file.
542
+ '''
543
 
544
+ print(f"> Total split documents: {len(docs_out)}")
545
 
546
+ #print(docs_out)
547
 
548
+ page_contents = [doc.page_content for doc in docs_out]
549
+
550
 
551
+ ## Load in pre-embedded file if exists
552
+ file_list = [string.name for string in in_file]
553
 
554
+ print(file_list)
555
 
556
+ embeddings_file_names = [string for string in file_list if "embedding" in string]
557
 
558
+ if embeddings_file_names:
559
+ embeddings_out = np.load(embeddings_file_names[0])
560
+ print("embeddings loaded: ", embeddings_out)
561
 
562
+ if not embeddings_file_names:
563
+ tic = time.perf_counter()
564
+ #embeddings_list = []
565
+ #for page in progress.tqdm(page_contents, desc = "Preparing search index", unit = "rows"):
566
+ # embeddings_list.append(embeddings.encode(sentences=page, max_length=1024).tolist())
567
 
568
+ embeddings_out = embeddings.encode(sentences=page_contents, max_length=1024, show_progress_bar = True, batch_size = 32) # For Jina embeddings
569
+ #embeddings_list = embeddings.encode(sentences=page_contents, normalize_embeddings=True).tolist() # For BGE embeddings
570
+ #embeddings_list = embeddings.encode(sentences=page_contents).tolist() # For minilm
 
 
 
571
 
572
+ toc = time.perf_counter()
573
+ time_out = f"The embedding took {toc - tic:0.1f} seconds"
574
 
575
+ np.savez_compressed('semantic_search_embeddings.npz', embeddings_out)
576
 
577
+ out_message = "Document processing complete. Ready to search."
578
+ print(out_message)
579
+
580
+ return out_message, embeddings_out
581
 
582
+ def process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_cut_off, vec_weight, orig_df_col, in_join_column, search_df_join_column):
 
 
583
 
584
+ def create_docs_keep_from_df(df):
585
+ dict_out = {'ids' : [df['ids']],
586
+ 'documents': [df['documents']],
587
+ 'metadatas': [df['metadatas']],
588
+ 'distances': [round(df['distances'].astype(float), 3)],
589
+ 'embeddings': None
590
+ }
591
+ return dict_out
592
+
593
+ # Prepare the DataFrame by transposing
594
+ #df_docs = df#.apply(lambda x: x.explode()).reset_index(drop=True)
595
 
596
+ # Keep only documents with a certain score
 
597
 
598
+ #print(df_docs)
599
+
600
+ docs_scores = df_docs["distances"] #.astype(float)
601
 
602
+ # Only keep sources that are sufficiently relevant (i.e. similarity search score below threshold below)
603
+ score_more_limit = df_docs.loc[docs_scores > vec_score_cut_off, :]
604
+ #docs_keep = create_docs_keep_from_df(score_more_limit) #list(compress(docs, score_more_limit))
605
 
606
+ #print(docs_keep)
 
607
 
608
+ if score_more_limit.empty:
609
+ return 'No result found!', None
610
 
611
+ # Only keep sources that are at least 100 characters long
612
+ docs_len = score_more_limit["documents"].str.len() >= 100
 
 
613
 
614
+ #print(docs_len)
615
 
616
+ length_more_limit = score_more_limit.loc[docs_len == True, :] #pd.Series(docs_len) >= 100
617
+ #docs_keep = create_docs_keep_from_df(length_more_limit) #list(compress(docs_keep, length_more_limit))
618
 
619
+ #print(length_more_limit)
 
620
 
621
+ if length_more_limit.empty:
622
+ return 'No result found!', None
623
+
624
+ length_more_limit['ids'] = length_more_limit['ids'].astype(int)
625
+
626
+ #length_more_limit.to_csv("length_more_limit.csv", index = None)
627
+
628
+ # Explode the 'metadatas' dictionary into separate columns
629
+ df_metadata_expanded = length_more_limit['metadatas'].apply(pd.Series)
630
 
631
+ #print(length_more_limit)
632
+ #print(df_metadata_expanded)
633
 
634
+ # Concatenate the original DataFrame with the expanded metadata DataFrame
635
+ results_df_out = pd.concat([length_more_limit.drop('metadatas', axis=1), df_metadata_expanded], axis=1)
636
 
637
+ results_df_out = results_df_out.rename(columns={"documents":orig_df_col})
 
638
 
639
+ results_df_out = results_df_out.drop(["page_section", "row", "source", "id"], axis=1, errors="ignore")
640
+ results_df_out['distances'] = round(results_df_out['distances'].astype(float), 3)
 
641
 
642
+ # Join back to original df
643
+ # results_df_out = orig_df.merge(length_more_limit[['ids', 'distances']], left_index = True, right_on = "ids", how="inner").sort_values("distances")
 
644
 
645
+ # Join on additional files
646
+ if in_join_file:
647
+ join_filename = in_join_file.name
648
+
649
+ # Import data
650
+ join_df = read_file(join_filename)
651
+ join_df[in_join_column] = join_df[in_join_column].astype(str).str.replace("\.0$","", regex=True)
652
+
653
+ # Duplicates dropped so as not to expand out dataframe
654
+ join_df = join_df.drop_duplicates(in_join_column)
655
+
656
+ results_df_out[search_df_join_column] = results_df_out[search_df_join_column].astype(str).str.replace("\.0$","", regex=True)
657
+
658
+ results_df_out = results_df_out.merge(join_df,left_on=search_df_join_column, right_on=in_join_column, how="left").drop(in_join_column, axis=1)
659
 
660
+ return results_df_out
661
 
662
+ def jina_simple_retrieval(new_question_kworded, vectorstore, docs, orig_df_col:str, k_val:int, out_passages:int,
663
+ vec_score_cut_off:float, vec_weight:float, in_join_file = None, in_join_column = None, search_df_join_column = None, device = torch_device, embeddings = embeddings, progress=gr.Progress()): # ,vectorstore, embeddings
664
 
665
+ print("vectorstore loaded: ", vectorstore)
666
+
667
+ # Convert it to a PyTorch tensor and transfer to GPU
668
+ vectorstore_tensor = tensor(vectorstore).to(device)
669
+
670
+ # Load the sentence transformer model and move it to GPU
671
+ embeddings = embeddings.to(device)
672
+
673
+ # Encode the query using the sentence transformer and convert to a PyTorch tensor
674
+ query = embeddings.encode(new_question_kworded)
675
+ query_tensor = tensor(query).to(device)
676
+
677
+ if query_tensor.dim() == 1:
678
+ query_tensor = query_tensor.unsqueeze(0) # Reshape to 2D with one row
679
+
680
+ # Normalize the query tensor and vectorstore tensor
681
+ query_norm = query_tensor / query_tensor.norm(dim=1, keepdim=True)
682
+ vectorstore_norm = vectorstore_tensor / vectorstore_tensor.norm(dim=1, keepdim=True)
683
+
684
+ # Calculate cosine similarities (batch processing)
685
+ cosine_similarities = mm(query_norm, vectorstore_norm.T)
686
+
687
+ # Flatten the tensor to a 1D array
688
+ cosine_similarities = cosine_similarities.flatten()
689
+
690
+ # Convert to a NumPy array if it's still a PyTorch tensor
691
+ cosine_similarities = cosine_similarities.cpu().numpy()
692
+
693
+ # Create a Pandas Series
694
+ cosine_similarities_series = pd.Series(cosine_similarities)
695
+
696
+ # Pull out relevant info from docs
697
+ page_contents = [doc.page_content for doc in docs]
698
+ page_meta = [doc.metadata for doc in docs]
699
+ ids_range = range(0,len(page_contents))
700
+ ids = [str(element) for element in ids_range]
701
+
702
+ df_docs = pd.DataFrame(data={"ids": ids,
703
+ "documents": page_contents,
704
+ "metadatas":page_meta,
705
+ "distances":cosine_similarities_series}).sort_values("distances", ascending=False).iloc[0:k_val,:]
706
+
707
+
708
+ results_df_out = process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_cut_off, vec_weight, orig_df_col, in_join_column, search_df_join_column)
709
+
710
+ results_df_name = "semantic_search_result.csv"
711
+ results_df_out.to_csv(results_df_name, index= None)
712
+ results_first_text = results_df_out.iloc[0, 1]
713
+
714
+ return results_first_text, results_df_name
715
+
716
+ def chroma_retrieval(new_question_kworded:str, vectorstore, docs, orig_df_col:str, k_val:int, out_passages:int,
717
+ vec_score_cut_off:float, vec_weight:float, in_join_file = None, in_join_column = None, search_df_join_column = None): # ,vectorstore, embeddings
718
+
719
+ query = embeddings.encode(new_question_kworded).tolist()
720
+
721
+ docs = vectorstore.query(
722
+ query_embeddings=query,
723
+ n_results= k_val # No practical limit on number of responses returned
724
+ #where={"metadata_field": "is_equal_to_this"},
725
+ #where_document={"$contains":"search_string"}
726
+ )
727
+
728
+ df_docs = pd.DataFrame(data={'ids': docs['ids'][0],
729
+ 'documents': docs['documents'][0],
730
+ 'metadatas':docs['metadatas'][0],
731
+ 'distances':docs['distances'][0]#,
732
+ #'embeddings': docs['embeddings']
733
+ })
734
+
735
+ results_df_out = process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_cut_off, vec_weight, orig_df_col, in_join_column, search_df_join_column)
736
 
737
  results_df_name = "semantic_search_result.csv"
738
  results_df_out.to_csv(results_df_name, index= None)
 
740
 
741
  return results_first_text, results_df_name
742
 
743
+
744
  ## Gradio app - BM25 search
745
  block = gr.Blocks(theme = gr.themes.Base())
746
 
 
754
 
755
  k_val = gr.State(9999)
756
  out_passages = gr.State(9999)
757
+ vec_score_cut_off = gr.State(0.7)
758
  vec_weight = gr.State(1)
759
 
760
  docs_keep_as_doc_state = gr.State()
 
787
  current_source = gr.Textbox(label="Current data source(s)", value="None")
788
 
789
  with gr.Accordion(label = "Load in data", open=True):
790
+ in_bm25_file = gr.File(label="Upload your search data here", file_count= 'multiple', file_types = ['.parquet', '.csv'])
791
  with gr.Row():
792
+ in_bm25_column = gr.Dropdown(label="Enter the name of the text column in the data file to search")
 
793
  load_bm25_data_button = gr.Button(value="Load data")
794
 
795
  with gr.Row():
 
797
 
798
  with gr.Accordion(label = "Search data", open=True):
799
  with gr.Row():
800
+ keyword_query = gr.Textbox(label="Enter your search term")
801
  mod_query = gr.Textbox(label="Cleaned search term (the terms that are passed to the search engine)")
802
 
803
+ keyword_search_button = gr.Button(value="Search text")
804
 
805
  with gr.Row():
806
  output_single_text = gr.Textbox(label="Top result")
 
811
  current_source_semantic = gr.Textbox(label="Current data source(s)", value="None")
812
 
813
  with gr.Accordion("Load in data", open = True):
814
+ in_semantic_file = gr.File(label="Upload data file for semantic search", file_count= 'multiple', file_types = ['.parquet', '.csv', '.npy', '.npz'])
815
+
816
+ with gr.Row():
817
+ in_semantic_column = gr.Dropdown(label="Enter the name of the text column in the data file to search")
818
+ load_semantic_data_button = gr.Button(value="Load in data file", variant="secondary")
819
+
820
+ ingest_embed_out = gr.Textbox(label="File/web page preparation progress")
821
 
 
822
  semantic_query = gr.Textbox(label="Enter semantic search query here")
823
  semantic_submit = gr.Button(value="Start semantic search", variant="secondary", scale = 1)
824
 
 
859
  in_alpha_button.click(display_info, inputs=in_alpha_info)
860
  in_no_search_results_button.click(display_info, inputs=in_no_search_info)
861
 
862
+ ### BM25 SEARCH ###
863
  # Update dropdowns upon initial file load
864
  in_bm25_file.upload(put_columns_in_df, inputs=[in_bm25_file, in_bm25_column], outputs=[in_bm25_column, in_clean_data, search_df_join_column])
865
  in_join_file.upload(put_columns_in_join_df, inputs=[in_join_file, in_join_column], outputs=[in_join_column])
 
870
  then(fn=put_columns_in_df, inputs=[in_bm25_file, in_bm25_column], outputs=[in_bm25_column, in_clean_data, search_df_join_column])
871
 
872
  # BM25 search functions on click or enter
873
+ keyword_search_button.click(fn=bm25_search, inputs=[keyword_query, in_no_search_results, data_state, in_bm25_column, in_clean_data, in_join_file, in_join_column, search_df_join_column], outputs=[output_single_text, output_file, mod_query], api_name="search")
874
+ keyword_query.submit(fn=bm25_search, inputs=[keyword_query, in_no_search_results, data_state, in_bm25_column, in_clean_data, in_join_file, in_join_column, search_df_join_column], outputs=[output_single_text, output_file, mod_query])
875
 
876
+ ### SEMANTIC SEARCH ###
877
  # Load in a csv/excel file for semantic search
878
  in_semantic_file.upload(put_columns_in_df, inputs=[in_semantic_file, in_semantic_column], outputs=[in_semantic_column, in_clean_data, search_df_join_column])
879
+ load_semantic_data_button.click(ing.parse_csv_or_excel, inputs=[in_semantic_file, in_semantic_column], outputs=[ingest_text, current_source_semantic, ingest_embed_out]).\
880
+ then(ing.csv_excel_text_to_docs, inputs=[ingest_text, in_semantic_column], outputs=[ingest_docs, ingest_embed_out]).\
881
+ then(docs_to_np_array, inputs=[ingest_docs, in_semantic_file], outputs=[ingest_embed_out, vectorstore_state])
882
 
883
  # Semantic search query
884
+ semantic_submit.click(jina_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, vec_score_cut_off, vec_weight, in_join_file, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic")
885
+
886
+ semantic_query.submit(jina_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, vec_score_cut_off, vec_weight, in_join_file, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic")
887
 
888
  # Dummy functions just to get dropdowns to work correctly with Gradio 3.50
889
  in_bm25_column.change(dummy_function, in_bm25_column, None)
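As a usage note for the tokenisation change above, prepare_input_data now batches rows through spaCy's rule-based tokeniser instead of nltk's word_tokenize. A small sketch, assuming en_core_web_sm is installed and using illustrative rows; the batch size of 256 matches the code.

import spacy

nlp = spacy.load("en_core_web_sm")
tokenizer = nlp.tokenizer                            # tokenisation only, no tagging or parsing

rows = ["first row of text", "second row of text"]   # illustrative rows
corpus = []
for doc in tokenizer.pipe(rows, batch_size=256):
    corpus.append([token.text for token in doc])
# corpus -> [['first', 'row', 'of', 'text'], ['second', 'row', 'of', 'text']]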
requirements.txt CHANGED
@@ -8,4 +8,5 @@ chromadb
8
  torch
9
  accelerate
10
  sentence-transformers
 
11
  gradio==3.50.0
 
8
  torch
9
  accelerate
10
  sentence-transformers
11
+ spacy
12
  gradio==3.50.0
search_funcs/clean_funcs.py CHANGED
@@ -19,6 +19,7 @@
19
  import nltk
20
  import re
21
  import string
 
22
  from nltk.stem import WordNetLemmatizer
23
  from nltk.stem import PorterStemmer
24
  from nltk.corpus import wordnet as wn
@@ -122,15 +123,25 @@ def initial_clean(texts):
122
  clean_texts.append(text)
123
  return clean_texts
124
  '''
125
  # Pre-compiling the regular expressions for efficiency
126
- email_start_pattern = re.compile('.*importance:|.*subject:')
127
- email_end_pattern = re.compile('kind regards.*|many thanks.*|sincerely.*')
128
- html_pattern = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0')
129
- email_pattern = re.compile('\S*@\S*\s?')
130
- num_pattern = re.compile(r'[0-9]+')
131
- postcode_pattern = re.compile(r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)')
132
- warning_pattern = re.compile('caution: this email originated from outside of the organization. do not click links or open attachments unless you recognize the sender and know the content is safe.')
133
- nbsp_pattern = re.compile(r'&nbsp;')
134
 
135
  def stem_sentence(sentence):
136
 
@@ -143,8 +154,6 @@ def stem_sentences(sentences, progress=gr.Progress()):
143
  stemmed_sentences = [stem_sentence(sentence) for sentence in progress.tqdm(sentences)]
144
  return stemmed_sentences
145
 
146
-
147
-
148
  def get_lemma_text(text):
149
  # Tokenize the input string into words
150
  tokens = word_tokenize(text)
@@ -178,30 +187,60 @@ def get_lemma_tokens(tokens):
178
  lemmas.append(lemma)
179
  return lemmas
180
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  def initial_clean(texts , progress=gr.Progress()):
182
- clean_texts = []
183
 
184
- i = 1
185
  #progress(0, desc="Cleaning texts")
186
- for text in progress.tqdm(texts, desc = "Cleaning data", unit = "rows"):
187
- #print("Cleaning row: ", i)
188
- text = re.sub(email_start_pattern, '', text)
189
- text = re.sub(email_end_pattern, '', text)
190
- text = re.sub(postcode_pattern, '', text)
191
- text = remove_hyphens(text)
192
- text = re.sub(html_pattern, '', text)
193
- text = re.sub(email_pattern, '', text)
194
- text = re.sub(nbsp_pattern, '', text)
195
- #text = re.sub(warning_pattern, '', text)
196
- #text = stem_sentence(text)
197
- text = get_lemma_text(text)
198
- text = ' '.join(text)
199
- # Uncomment the next line if you want to remove numbers as well
200
- # text = re.sub(num_pattern, '', text)
201
- clean_texts.append(text)
 
 
 
 
 
 
202
 
203
- i += 1
204
- return clean_texts
205
 
206
  # Sample execution
207
  #sample_texts = [
 
19
  import nltk
20
  import re
21
  import string
22
+ import polars as pl
23
  from nltk.stem import WordNetLemmatizer
24
  from nltk.stem import PorterStemmer
25
  from nltk.corpus import wordnet as wn
 
123
  clean_texts.append(text)
124
  return clean_texts
125
  '''
126
+
127
+ email_start_pattern_regex = r'.*importance:|.*subject:'
128
+ email_end_pattern_regex = r'kind regards.*|many thanks.*|sincerely.*'
129
+ html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0|&nbsp;'
130
+ email_pattern_regex = r'\S*@\S*\s?'
131
+ num_pattern_regex = r'[0-9]+'
132
+ postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
133
+ warning_pattern_regex = r'caution: this email originated from outside of the organization. do not click links or open attachments unless you recognize the sender and know the content is safe.'
134
+ nbsp_pattern_regex = r'&nbsp;'
135
+
136
  # Pre-compiling the regular expressions for efficiency
137
+ email_start_pattern = re.compile(email_start_pattern_regex)
138
+ email_end_pattern = re.compile(email_end_pattern_regex)
139
+ html_pattern = re.compile(html_pattern_regex)
140
+ email_pattern = re.compile(email_pattern_regex)
141
+ num_pattern = re.compile(num_pattern_regex)
142
+ postcode_pattern = re.compile(postcode_pattern_regex)
143
+ warning_pattern = re.compile(warning_pattern_regex)
144
+ nbsp_pattern = re.compile(nbsp_pattern_regex)
145
 
146
  def stem_sentence(sentence):
147
 
 
154
  stemmed_sentences = [stem_sentence(sentence) for sentence in progress.tqdm(sentences)]
155
  return stemmed_sentences
156
 
 
 
157
  def get_lemma_text(text):
158
  # Tokenize the input string into words
159
  tokens = word_tokenize(text)
 
187
  lemmas.append(lemma)
188
  return lemmas
189
 
190
+ # def initial_clean(texts , progress=gr.Progress()):
191
+ # clean_texts = []
192
+
193
+ # i = 1
194
+ # #progress(0, desc="Cleaning texts")
195
+ # for text in progress.tqdm(texts, desc = "Cleaning data", unit = "rows"):
196
+ # #print("Cleaning row: ", i)
197
+ # text = re.sub(email_start_pattern, '', text)
198
+ # text = re.sub(email_end_pattern, '', text)
199
+ # text = re.sub(postcode_pattern, '', text)
200
+ # text = remove_hyphens(text)
201
+ # text = re.sub(html_pattern, '', text)
202
+ # text = re.sub(email_pattern, '', text)
203
+ # text = re.sub(nbsp_pattern, '', text)
204
+ # #text = re.sub(warning_pattern, '', text)
205
+ # #text = stem_sentence(text)
206
+ # text = get_lemma_text(text)
207
+ # text = ' '.join(text)
208
+ # # Uncomment the next line if you want to remove numbers as well
209
+ # # text = re.sub(num_pattern, '', text)
210
+ # clean_texts.append(text)
211
+
212
+ # i += 1
213
+ # return clean_texts
214
+
215
+
216
  def initial_clean(texts , progress=gr.Progress()):
217
+ texts = pl.Series(texts)#[]
218
 
219
+ #i = 1
220
  #progress(0, desc="Cleaning texts")
221
+ #for text in progress.tqdm(texts, desc = "Cleaning data", unit = "rows"):
222
+ #print("Cleaning row: ", i)
223
+ text = texts.str.replace_all(email_start_pattern_regex, '')
224
+ text = text.str.replace_all(email_end_pattern_regex, '')
225
+ #text = re.sub(postcode_pattern, '', text)
226
+ #text = remove_hyphens(text)
227
+ text = text.str.replace_all(html_pattern_regex, '')
228
+ text = text.str.replace_all(email_pattern_regex, '')
229
+ #text = re.sub(nbsp_pattern, '', text)
230
+ #text = re.sub(warning_pattern, '', text)
231
+ #text = stem_sentence(text)
232
+ #text = get_lemma_text(text)
233
+ #text = ' '.join(text)
234
+ # Uncomment the next line if you want to remove numbers as well
235
+ # text = re.sub(num_pattern, '', text)
236
+ #clean_texts.append(text)
237
+
238
+ #i += 1
239
+
240
+ text = text.to_list()
241
+
242
+ return text
243
 
 
 
244
 
245
  # Sample execution
246
  #sample_texts = [
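For the cleaning change above, the new initial_clean applies the regex patterns to a whole Polars Series at once rather than looping row by row. A brief sketch with illustrative input strings; the patterns are the ones defined in clean_funcs.py.

import polars as pl

texts = pl.Series(["subject: hello <b>world</b>", "many thanks, someone@example.com"])
texts = texts.str.replace_all(r'.*importance:|.*subject:', '')                       # email headers
texts = texts.str.replace_all(r'kind regards.*|many thanks.*|sincerely.*', '')       # sign-offs
texts = texts.str.replace_all(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0|&nbsp;', '')  # HTML
texts = texts.str.replace_all(r'\S*@\S*\s?', '')                                     # email addresses
clean_texts = texts.to_list()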
search_funcs/ingest.py CHANGED
@@ -3,9 +3,11 @@
3
  import os
4
  import time
5
  import re
 
6
  import pandas as pd
7
  import gradio as gr
8
  from typing import Type, List, Literal
 
9
 
10
  from pydantic import BaseModel, Field
11
 
@@ -132,30 +134,43 @@ def parse_csv_or_excel(file_path, text_column = "text"):
132
 
133
  #out_df = pd.DataFrame()
134
 
 
 
 
 
 
 
 
135
  #for file_path in file_paths:
136
- file_extension = determine_file_type(file_path.name)
137
- file_name = get_file_path_end(file_path.name)
138
  file_names = [file_name]
139
 
 
 
140
  if file_extension == ".csv":
141
- df = pd.read_csv(file_path.name, low_memory=False)
142
  if text_column not in df.columns: return pd.DataFrame(), ['Please choose a valid column name']
143
  df['source'] = file_name
144
  df['page_section'] = ""
145
  elif file_extension == ".xlsx":
146
- df = pd.read_excel(file_path.name, engine='openpyxl')
147
  if text_column not in df.columns: return pd.DataFrame(), ['Please choose a valid column name']
148
  df['source'] = file_name
149
  df['page_section'] = ""
150
  elif file_extension == ".parquet":
151
- df = pd.read_parquet(file_path.name)
152
  if text_column not in df.columns: return pd.DataFrame(), ['Please choose a valid column name']
153
  df['source'] = file_name
154
  df['page_section'] = ""
155
  else:
156
  print(f"Unsupported file type: {file_extension}")
157
  return pd.DataFrame(), ['Please choose a valid file type']
158
- return df, file_names
 
 
 
 
159
 
160
  def get_file_path_end(file_path):
161
  match = re.search(r'(.*[\/\\])?(.+)$', file_path)
@@ -221,18 +236,22 @@ def combine_metadata_columns(df, cols):
221
  df['blank_column'] = ""
222
 
223
  for n, col in enumerate(cols):
224
- df[col] = df[col].astype(str).str.replace('"',"'").str.cat(df['blank_column'].astype(str), sep="")
225
 
226
  df['metadatas'] = df['metadatas'] + '"' + cols[n] + '": "' + df[col] + '", '
227
 
228
 
229
- df['metadatas'] = (df['metadatas'] + "}").str.replace(", }", "}")
230
 
231
  return df['metadatas']
232
 
233
  def csv_excel_text_to_docs(df, text_column='text', chunk_size=None) -> List[Document]:
234
  """Converts a DataFrame's content to a list of Documents with metadata."""
235
 
 
 
 
 
236
  doc_sections = []
237
  df[text_column] = df[text_column].astype(str) # Ensure column is a string column
238
 
@@ -247,33 +266,67 @@ def csv_excel_text_to_docs(df, text_column='text', chunk_size=None) -> List[Docu
247
  if col != text_column:
248
  metadata[col] = value
249
 
250
- # metadata_string = write_out_metadata_as_string(metadata)[0]
251
 
252
  # If chunk_size is provided, split the text into chunks
253
  if chunk_size:
254
  # Assuming you have a text splitter function similar to the PDF handling
255
  text_splitter = RecursiveCharacterTextSplitter(
256
- chunk_size=chunk_size,
257
- # Other arguments as required by the splitter
258
- )
 
 
 
259
  sections = text_splitter.split_text(doc_content)
260
 
261
 
262
  # For each section, create a Document object
263
  for i, section in enumerate(sections):
264
- #section = '. '.join([metadata_string, section])
265
  doc = Document(page_content=section,
266
- metadata={**metadata, "section": i, "row_section": f"{metadata['row']}-{i}"})
267
  doc_sections.append(doc)
 
 
 
268
  else:
269
  # If no chunk_size is provided, create a single Document object for the row
270
  #doc_content = '. '.join([metadata_string, doc_content])
271
  doc = Document(page_content=doc_content, metadata=metadata)
272
  doc_sections.append(doc)
273
-
274
- return doc_sections
275
 
276
- import ast
 
277
 
278
  def csv_excel_text_to_docs(df, text_column='text', chunk_size=None, progress=gr.Progress()) -> List[Document]:
279
  """Converts a DataFrame's content to a list of dictionaries in the 'Document' format, containing page_content and associated metadata."""
@@ -296,7 +349,7 @@ def csv_excel_text_to_docs(df, text_column='text', chunk_size=None, progress=gr.
296
 
297
  # Create a list of Document objects
298
  doc_sections = [Document(page_content=row['page_content'],
299
- metadata= ast.literal_eval(row["metadata"]))
300
  for index, row in progress.tqdm(df.iterrows(), desc = "Splitting up text", unit = "rows")]
301
 
302
  ingest_toc = time.perf_counter()
 
3
  import os
4
  import time
5
  import re
6
+ import ast
7
  import pandas as pd
8
  import gradio as gr
9
  from typing import Type, List, Literal
10
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
11
 
12
  from pydantic import BaseModel, Field
13
 
 
134
 
135
  #out_df = pd.DataFrame()
136
 
137
+ file_list = [string.name for string in file_path]
138
+
139
+ print(file_list)
140
+
141
+ data_file_names = [string for string in file_list if "tokenised" not in string]
142
+
143
+
144
  #for file_path in file_paths:
145
+ file_extension = determine_file_type(data_file_names[0])
146
+ file_name = get_file_path_end(data_file_names[0])
147
  file_names = [file_name]
148
 
149
+ print(file_extension)
150
+
151
  if file_extension == ".csv":
152
+ df = pd.read_csv(data_file_names[0], low_memory=False)
153
  if text_column not in df.columns: return pd.DataFrame(), ['Please choose a valid column name']
154
  df['source'] = file_name
155
  df['page_section'] = ""
156
  elif file_extension == ".xlsx":
157
+ df = pd.read_excel(data_file_names[0], engine='openpyxl')
158
  if text_column not in df.columns: return pd.DataFrame(), ['Please choose a valid column name']
159
  df['source'] = file_name
160
  df['page_section'] = ""
161
  elif file_extension == ".parquet":
162
+ df = pd.read_parquet(data_file_names[0])
163
  if text_column not in df.columns: return pd.DataFrame(), ['Please choose a valid column name']
164
  df['source'] = file_name
165
  df['page_section'] = ""
166
  else:
167
  print(f"Unsupported file type: {file_extension}")
168
  return pd.DataFrame(), ['Please choose a valid file type']
169
+
170
+ message = "Loaded in file. Now converting to document format."
171
+ print(message)
172
+
173
+ return df, file_names, message
174
 
175
  def get_file_path_end(file_path):
176
  match = re.search(r'(.*[\/\\])?(.+)$', file_path)
 
236
  df['blank_column'] = ""
237
 
238
  for n, col in enumerate(cols):
239
+ df[col] = df[col].astype(str).str.replace('"',"'").str.replace('\n', ' ').str.replace('\r', ' ').str.replace('\r\n', ' ').str.cat(df['blank_column'].astype(str), sep="")
240
 
241
  df['metadatas'] = df['metadatas'] + '"' + cols[n] + '": "' + df[col] + '", '
242
 
243
 
244
+ df['metadatas'] = (df['metadatas'] + "}").str.replace(', }', '}')
245
 
246
  return df['metadatas']
247
 
248
  def csv_excel_text_to_docs(df, text_column='text', chunk_size=None) -> List[Document]:
249
  """Converts a DataFrame's content to a list of Documents with metadata."""
250
 
251
+ #print(df.head())
252
+
253
+ print("Converting to documents.")
254
+
255
  doc_sections = []
256
  df[text_column] = df[text_column].astype(str) # Ensure column is a string column
257
 
 
266
  if col != text_column:
267
  metadata[col] = value
268
 
269
+ metadata_string = write_out_metadata_as_string(metadata)[0]
270
 
271
  # If chunk_size is provided, split the text into chunks
272
  if chunk_size:
273
  # Assuming you have a text splitter function similar to the PDF handling
274
  text_splitter = RecursiveCharacterTextSplitter(
275
+ chunk_size=chunk_size,
276
+ chunk_overlap=chunk_overlap,
277
+ split_strat=split_strat,
278
+ start_index=start_index
279
+ ) #Other arguments as required by the splitter
280
+
281
  sections = text_splitter.split_text(doc_content)
282
 
283
 
284
  # For each section, create a Document object
285
  for i, section in enumerate(sections):
286
+ section = '. '.join([metadata_string, section])
287
  doc = Document(page_content=section,
288
+ metadata={**metadata, "section": i, "row_section": f"{metadata['row']}-{i}"})
289
  doc_sections.append(doc)
290
+
291
+ #print("Chunking currently disabled")
292
+
293
  else:
294
  # If no chunk_size is provided, create a single Document object for the row
295
  #doc_content = '. '.join([metadata_string, doc_content])
296
  doc = Document(page_content=doc_content, metadata=metadata)
297
  doc_sections.append(doc)
 
 
298
 
299
+ message = "Data converted to document format. Now creating/loading document embeddings."
300
+ print(message)
301
+
302
+ return doc_sections, message
303
+
304
+
305
+
306
+ def clean_line_breaks(text):
307
+ # Replace \n and \r\n with a space
308
+ return text.replace('\n', ' ').replace('\r', ' ').replace('\r\n', ' ')
309
+
310
+ def parse_metadata(row):
311
+ try:
312
+ # Ensure the 'title' field is a string and clean line breaks
313
+ #if 'TITLE' in row:
314
+ # row['TITLE'] = clean_line_breaks(row['TITLE'])
315
+
316
+ # Convert the row to a string if it's not already
317
+ row_str = str(row) if not isinstance(row, str) else row
318
+
319
+ row_str = row_str.replace('\n', ' ').replace('\r', ' ').replace('\r\n', ' ')
320
+
321
+ # Parse the string
322
+ metadata = ast.literal_eval(row_str)
323
+ # Process metadata
324
+ return metadata
325
+ except SyntaxError as e:
326
+ print(f"Failed to parse metadata: {row_str}")
327
+ print(f"Error: {e}")
328
+ # Handle the error or log it
329
+ return None # or some default value
330
 
331
  def csv_excel_text_to_docs(df, text_column='text', chunk_size=None, progress=gr.Progress()) -> List[Document]:
332
  """Converts a DataFrame's content to a list of dictionaries in the 'Document' format, containing page_content and associated metadata."""
 
349
 
350
  # Create a list of Document objects
351
  doc_sections = [Document(page_content=row['page_content'],
352
+ metadata= parse_metadata(row["metadata"]))
353
  for index, row in progress.tqdm(df.iterrows(), desc = "Splitting up text", unit = "rows")]
354
 
355
  ingest_toc = time.perf_counter()
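A short, hypothetical usage example for parse_metadata above: the metadata column is expected to hold a stringified Python dict, which is parsed back with ast.literal_eval, with None returned if the literal cannot be parsed.

# Illustrative values only; these keys are not taken from any real dataset
row = "{'row': 3, 'source': 'notes.csv', 'page_section': ''}"
metadata = parse_metadata(row)
# metadata -> {'row': 3, 'source': 'notes.csv', 'page_section': ''}
bad = parse_metadata("{'row': 3,")   # malformed literal: prints the error and returns None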