seanpedrickcase committed
Commit: 2bcd818
1 parent: 4ce2224

Updated to Gradio 4.16.0. Now works correctly with BGE embeddings.

.gitignore CHANGED
@@ -21,4 +21,5 @@ __pycache__/*
 db/*
 experiments/*
 model/*
-build_deps/*
+build_deps/*
+build_deps_old/*
app.py CHANGED
@@ -1,10 +1,4 @@
 from typing import Type
-from search_funcs.bm25_functions import prepare_bm25_input_data, prepare_bm25, bm25_search
-#from search_funcs.semantic_ingest_functions import parse_csv_or_excel, csv_excel_text_to_docs
-#from search_funcs.semantic_functions import docs_to_jina_embed_np_array, jina_simple_retrieval
-from search_funcs.helper_functions import dummy_function, display_info, initial_data_load, put_columns_in_join_df, get_temp_folder_path, empty_folder
-from search_funcs.spacy_search_funcs import spacy_fuzzy_search
-
 
 import gradio as gr
 import pandas as pd
@@ -12,6 +6,12 @@ import numpy as np
 
 PandasDataFrame = Type[pd.DataFrame]
 
+from search_funcs.bm25_functions import prepare_bm25_input_data, prepare_bm25, bm25_search
+from search_funcs.semantic_ingest_functions import csv_excel_text_to_docs
+from search_funcs.semantic_functions import docs_to_bge_embed_np_array, bge_simple_retrieval
+from search_funcs.helper_functions import display_info, initial_data_load, put_columns_in_join_df, get_temp_folder_path, empty_folder
+from search_funcs.spacy_search_funcs import spacy_fuzzy_search
+
 # Attempt to delete temporary files generated by previous use of the app (as the files can be very big!)
 temp_folder_path = get_temp_folder_path()
 empty_folder(temp_folder_path)
@@ -20,6 +20,7 @@ empty_folder(temp_folder_path)
 block = gr.Blocks(theme = gr.themes.Base())
 
 with block:
+print("Please don't close this window! Open the below link in the web browser of your choice.")
 
 ingest_text = gr.State()
 ingest_metadata = gr.State()
@@ -79,38 +80,40 @@ depends on factors such as the type of documents or queries. Information taken f
 with gr.Accordion(label = "Search data", open=True):
 keyword_query = gr.Textbox(label="Enter your search term")
 with gr.Row():
-keyword_search_button = gr.Button(value="Keyword search", variant="primary")
-fuzzy_search_button = gr.Button(value="Fuzzy search (much slower)", variant="secondary")
+keyword_search_button = gr.Button(value="Keyword search", variant="primary", scale=1)
+fuzzy_search_button = gr.Button(value="Fuzzy search (slow, < 10k rows)", variant="secondary", scale = 0)
 with gr.Row():
 output_single_text = gr.Textbox(label="Top result")
 output_file = gr.File(label="File output")
 
 
-# with gr.Tab("Semantic search"):
-# gr.Markdown(
-# """
-# **Thematic/semantic search**
+with gr.Tab("Semantic search"):
+gr.Markdown(
+"""
+**Thematic/semantic search**
 
-# This search type enables you to search for broader themes (e.g. happiness, nature) and the search will pick out text passages that relate to these themes even if they don't contain the exact words. 1. Load in data file (ideally a file with '_cleaned' at the end of the name, a pkl.gz file), with (optionally) the 'embeddings... .npz' file in the same folder to save loading time. 2. Select the field in your data to search. If you loaded in a documents pkl.gz file, this will be 'page_contents'. 3. Wait for the data file to be prepared for search. 4. Enter the search term in the 'Enter semantic search query here' box below and press Enter/click on 'Start semantic search'. 4. Your search results will be saved in a csv file and will be presented in the 'File output' area below.
-# """)
-# with gr.Row():
-# current_source_semantic = gr.Textbox(label="Current data source(s)", value="None")
+This search type enables you to search for broader themes (e.g. happiness, nature) and the search will pick out text passages that relate to these themes even if they don't contain the exact words. 1. Load in data file (ideally a file with '_cleaned' at the end of the name, a pkl.gz file), with (optionally) the 'embeddings... .npz' file in the same folder to save loading time. 2. Select the field in your data to search. 3. Wait for the data file to be prepared for search. 4. Enter the search term in the 'Enter semantic search query here' box below and press Enter/click on 'Start semantic search'. 4. Your search results will be saved in a csv file and will be presented in the 'File output' area below.
+""")
+
 
-# with gr.Accordion("Load in data", open = True):
-# in_semantic_file = gr.File(label="Upload data file for semantic search", file_count= 'multiple', file_types = ['.parquet', '.csv', '.npy', '.npz', '.pkl', '.pkl.gz'])
+with gr.Row():
+current_source_semantic = gr.Textbox(label="Current data source(s)", value="None")
+
+with gr.Accordion("Load in data", open = True):
+in_semantic_file = gr.File(label="Upload data file for semantic search", file_count= 'multiple', file_types = ['.parquet', '.csv', '.npy', '.npz', '.pkl', '.pkl.gz'])
 
-# with gr.Row():
-# in_semantic_column = gr.Dropdown(label="Enter the name of the text column in the data file to search")
-# load_semantic_data_button = gr.Button(value="Load data", variant="secondary")
+with gr.Row():
+in_semantic_column = gr.Dropdown(label="Enter the name of the text column in the data file to search")
+load_semantic_data_button = gr.Button(value="Load data", variant="secondary")
 
-# semantic_load_progress = gr.Textbox(label="Load progress")
+semantic_load_progress = gr.Textbox(label="Load progress")
 
-# semantic_query = gr.Textbox(label="Enter semantic search query here")
-# semantic_submit = gr.Button(value="Start semantic search", variant="secondary", scale = 1)
+semantic_query = gr.Textbox(label="Enter semantic search query here")
+semantic_submit = gr.Button(value="Start semantic search", variant="primary")
 
-# with gr.Row():
-# semantic_output_single_text = gr.Textbox(label="Top result")
-# semantic_output_file = gr.File(label="File output")
+with gr.Row():
+semantic_output_single_text = gr.Textbox(label="Top result")
+semantic_output_file = gr.File(label="File output")
 
 with gr.Tab(label="Advanced options"):
 with gr.Accordion(label="Data load / save options", open = True):
@@ -136,8 +139,8 @@ depends on factors such as the type of documents or queries. Information taken f
 in_search_param_button = gr.Button(value="Load search parameters (Need to click this if you changed anything above)")
 with gr.Accordion(label="Fuzzy search options", open = False):
 no_spelling_mistakes = gr.Slider(label = "Number of spelling mistakes allowed in fuzzy search", value = 1, minimum=1, maximum=4, step=1)
-# with gr.Accordion(label="Semantic search options", open = False):
-# semantic_min_distance = gr.Slider(label = "Minimum distance score for search result to be included", value = 0.75, minimum=0, maximum=0.95, step=0.01)
+with gr.Accordion(label="Semantic search options", open = False):
+semantic_min_distance = gr.Slider(label = "Minimum distance score for search result to be included", value = 0.6, minimum=0, maximum=0.95, step=0.01)
 with gr.Accordion(label = "Join on additional dataframes to results", open = False):
 in_join_file = gr.File(label="Upload your data to join here")
 in_join_message = gr.Textbox(label="Join file load progress")
@@ -166,26 +169,19 @@ depends on factors such as the type of documents or queries. Information taken f
 keyword_query.submit(fn=bm25_search, inputs=[keyword_query, in_no_search_results, keyword_data_state, in_bm25_column, join_data_state, in_clean_data, in_join_column, search_df_join_column], outputs=[output_single_text, output_file])
 
 # Fuzzy search functions on click
-
 fuzzy_search_button.click(fn=spacy_fuzzy_search, inputs=[keyword_query, keyword_data_list_state, keyword_data_state, in_bm25_column, join_data_state, search_df_join_column, in_join_column, no_spelling_mistakes], outputs=[output_single_text, output_file], api_name="fuzzy")
 
 ### SEMANTIC SEARCH ###
+
 # Load in a csv/excel file for semantic search
-# in_semantic_file.upload(initial_data_load, inputs=[in_semantic_file, in_semantic_column], outputs=[in_semantic_column, search_df_join_column, semantic_data_state, search_index_state, embeddings_state, semantic_load_progress, current_source])
-# load_semantic_data_button.click(parse_csv_or_excel, inputs=[in_semantic_file, semantic_data_state, in_semantic_column], outputs=[ingest_text, current_source_semantic, semantic_load_progress]).\
-# then(csv_excel_text_to_docs, inputs=[ingest_text, in_semantic_file, in_semantic_column, in_clean_data, return_intermediate_files], outputs=[ingest_docs, semantic_load_progress]).\
-# then(docs_to_jina_embed_np_array, inputs=[ingest_docs, in_semantic_file, embeddings_state, return_intermediate_files, embedding_super_compress], outputs=[semantic_load_progress, vectorstore_state, semantic_output_file])
+in_semantic_file.upload(initial_data_load, inputs=[in_semantic_file, in_semantic_column], outputs=[in_semantic_column, search_df_join_column, semantic_data_state, search_index_state, embeddings_state, tokenised_state, semantic_load_progress, current_source_semantic])
+load_semantic_data_button.click(
+csv_excel_text_to_docs, inputs=[semantic_data_state, in_semantic_file, in_semantic_column, in_clean_data, return_intermediate_files], outputs=[ingest_docs, semantic_load_progress]).\
+then(docs_to_bge_embed_np_array, inputs=[ingest_docs, in_semantic_file, embeddings_state, return_intermediate_files, embedding_super_compress], outputs=[semantic_load_progress, vectorstore_state, semantic_output_file])
 
-# # Semantic search query
-# semantic_submit.click(jina_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, semantic_min_distance, vec_weight, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic")
-# semantic_query.submit(jina_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, semantic_min_distance, vec_weight, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file])
-
-# Dummy functions just to get dropdowns to work correctly with Gradio 3.50
-in_bm25_column.change(dummy_function, in_bm25_column, None)
-search_df_join_column.change(dummy_function, search_df_join_column, None)
-in_join_column.change(dummy_function, in_join_column, None)
-# in_semantic_column.change(dummy_function, in_join_column, None)
+# Semantic search query
+semantic_submit.click(bge_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, semantic_min_distance, vec_weight, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic")
+semantic_query.submit(bge_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, semantic_min_distance, vec_weight, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file])
 
 block.queue().launch(debug=True)
 
-
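Note: a minimal, self-contained sketch of the Gradio event-chaining pattern that the new semantic search wiring above relies on. This is not the app's code; the component and function names below are illustrative only. A button click runs one step, and .then() runs the next step only once the first has finished, passing intermediate results through gr.State:

import gradio as gr

def prepare(text):
    # First step: return an intermediate value plus a status message
    return text.strip(), "Prepared"

def embed(prepared):
    # Second step: consume the intermediate value produced by prepare()
    return f"Embedded: {prepared}"

with gr.Blocks() as demo:
    inp = gr.Textbox(label="Input")
    status = gr.Textbox(label="Status")
    out = gr.Textbox(label="Result")
    prepared_state = gr.State()
    btn = gr.Button("Load data")
    # Mirrors the load_semantic_data_button.click(...).then(...) chain above
    btn.click(prepare, inputs=inp, outputs=[prepared_state, status]).\
        then(embed, inputs=prepared_state, outputs=out)

demo.queue().launch()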
 
 
how_to_create_exe_dist.txt CHANGED
@@ -6,24 +6,27 @@
 
 NOTE: for ensuring that spaCy models are loaded into the program correctly in requirements.txt, follow this guide: https://spacy.io/usage/models#models-download
 
-4. In file explorer, navigate to the miniconda/envs/new_env/Lib/site-packages/gradio-client/ folder
+6. If necessary, create hook- files to tell pyinstaller to include specific packages in the exe build. Examples are provided for gradio and en_core_web_sm (a spaCy model). Put these in the build_deps\ subfolder
 
-5. Copy types.json from the gradio_client folder to the folder containing the data_text_search.py file
+7. pip install pyinstaller
 
-6. If necessary, create hook- files to tell pyinstaller to include specific packages in the exe build. Examples are provided for gradio and en_core_web_sm (a spaCy model).
+8. In command line, cd to the folder that contains app.py.
 
-7. pip install pyinstaller
+9.Run the following, assuming you want to make one single .exe file (This helped me: https://github.com/pyinstaller/pyinstaller/issues/8108):
+
+a) In command line: pyi-makespec --additional-hooks-dir="build_deps\\" --collect-data=gradio_client --collect-data=gradio --hidden-import pyarrow.vendored.version --onefile --name DataSearchApp_0.2.3 app.py
 
-8. In command line, cd to the folder that contains app.py. Then run the following:
+b) Open the created spec file in Notepad. Add the following to the end of the Analysis section then save:
 
-For one single file:
-python -m PyInstaller --additional-hooks-dir="build_deps\\" --hidden-import pyarrow.vendored.version --add-data="build_deps\\types.json;gradio_client" --add-data "model;model" --onefile --clean --noconfirm --name DataSearchApp_0.2.2 app.py
+a = Analysis(
+...
+module_collection_mode={
+'gradio': 'py', # Collect gradio package as source .py files
+}
+)
 
-If not using embedding model:
-python -m PyInstaller --additional-hooks-dir="build_deps\\" --hidden-import pyarrow.vendored.version --add-data="build_deps\\types.json;gradio_client" --onefile --clean --noconfirm --name DataSearchApp_0.2.2_keyword app.py
+c) Back in command line, run this: pyinstaller --clean --noconfirm DataSearchApp_0.2.3.spec
 
-For a small exe with a folder of dependencies:
-python -m PyInstaller --additional-hooks-dir="build_deps\\" --hidden-import pyarrow.vendored.version --add-data="build_deps\\types.json;gradio_client" --add-data "model;model" --clean --noconfirm --name DataSearchApp_0.2.2 app.py
 
 9. A 'dist' folder will be created with the executable inside along with all dependencies('dist\data_text_search').
 
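Note: the hook- files mentioned in step 6 are usually only a few lines. Below is a hypothetical build_deps/hook-en_core_web_sm.py sketch (the repo's actual hooks may differ) using PyInstaller's standard hook helpers:

from PyInstaller.utils.hooks import collect_data_files, collect_submodules

# Bundle the spaCy model package's data files and any lazily imported submodules
datas = collect_data_files("en_core_web_sm")
hiddenimports = collect_submodules("en_core_web_sm")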
 
requirements.txt CHANGED
@@ -1,10 +1,11 @@
-pandas==2.1.4
+pandas==2.2.0
 polars==0.20.3
 pyarrow==14.0.2
 openpyxl==3.1.2
-# transformers==4.32.1
-# accelerate==0.26.0
-# torch==2.1.2
+#transformers==4.37.2
+#accelerate==0.26.0
+torch==2.1.2
 spacy==3.7.2
 en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
-gradio==4.16.0
+gradio==4.16.0
+sentence_transformers==2.3.1

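Note: a quick, hypothetical sanity check (not part of the repo) that the pinned stack above resolved correctly after pip install -r requirements.txt:

import gradio, sentence_transformers, spacy, torch

print(gradio.__version__)                 # expect 4.16.0
print(sentence_transformers.__version__)  # expect 2.3.1
print(spacy.__version__)                  # expect 3.7.2
print(torch.__version__)                  # expect 2.1.2
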
search_funcs/bm25_functions.py CHANGED
@@ -6,7 +6,6 @@ import sys
 import gzip
 import time
 import pandas as pd
-import numpy as np
 from numpy import inf
 import gradio as gr
 
@@ -15,7 +14,7 @@ from datetime import datetime
 today_rev = datetime.now().strftime("%Y%m%d")
 
 from search_funcs.clean_funcs import initial_clean # get_lemma_tokens, stem_sentence
-from search_funcs.helper_functions import read_file, get_file_path_end_with_ext, get_file_path_end
+from search_funcs.helper_functions import get_file_path_end_with_ext, get_file_path_end
 
 # Load the SpaCy model
 from spacy.cli.download import download

search_funcs/helper_functions.py CHANGED
@@ -30,8 +30,6 @@ def empty_folder(directory_path):
 #print(f'Failed to delete {file_path}. Reason: {e}')
 print('')
 
-
-
 def get_file_path_end(file_path):
 # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
 basename = os.path.basename(file_path)

search_funcs/semantic_functions.py CHANGED
@@ -5,14 +5,14 @@ from typing import Type
 import gradio as gr
 import numpy as np
 from datetime import datetime
-import accelerate
+#from transformers import AutoModel, AutoTokenizer
+from search_funcs.helper_functions import get_file_path_end
+#import torch
+from torch import cuda, backends#, tensor, mm, utils
+from sentence_transformers import SentenceTransformer
 
 today_rev = datetime.now().strftime("%Y%m%d")
 
-from transformers import AutoModel
-
-from torch import cuda, backends, tensor, mm
-
 # Check for torch cuda
 print("Is CUDA enabled? ", cuda.is_available())
 print("Is a CUDA device available on this computer?", backends.cudnn.enabled)
@@ -29,47 +29,122 @@ print("Device used is: ", torch_device)
 
 PandasDataFrame = Type[pd.DataFrame]
 
-# Load embeddings
+# Load embeddings - Jina - deprecated
 # Pinning a Jina revision for security purposes: https://www.baseten.co/blog/pinning-ml-model-revisions-for-compatibility-and-security/
 # Save Jina model locally as described here: https://huggingface.co/jinaai/jina-embeddings-v2-base-en/discussions/29
-embeddings_name = "jinaai/jina-embeddings-v2-small-en"
-local_embeddings_location = "model/jina/"
-revision_choice = "b811f03af3d4d7ea72a7c25c802b21fc675a5d99"
+# embeddings_name = "jinaai/jina-embeddings-v2-small-en"
+# local_embeddings_location = "model/jina/"
+# revision_choice = "b811f03af3d4d7ea72a7c25c802b21fc675a5d99"
 
-try:
-embeddings_model = AutoModel.from_pretrained(local_embeddings_location, revision = revision_choice, trust_remote_code=True,local_files_only=True, device_map="auto")
-except:
-embeddings_model = AutoModel.from_pretrained(embeddings_name, revision = revision_choice, trust_remote_code=True, device_map="auto")
+# try:
+# embeddings_model = AutoModel.from_pretrained(local_embeddings_location, revision = revision_choice, trust_remote_code=True,local_files_only=True, device_map="auto")
+# except:
+# embeddings_model = AutoModel.from_pretrained(embeddings_name, revision = revision_choice, trust_remote_code=True, device_map="auto")
 
-
-def get_file_path_end(file_path):
-# First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
-basename = os.path.basename(file_path)
+# Load embeddings
+embeddings_name = "BAAI/bge-small-en-v1.5"
+local_embeddings_location = "model/bge/"
+
+#try:
+# tokenizer = AutoTokenizer.from_pretrained(embeddings_name)
+# embeddings_model = AutoModel.from_pretrained(local_embeddings_location, local_files_only=True).to(torch_device)
+#except:
+# tokenizer = AutoTokenizer.from_pretrained(embeddings_name)
+# embeddings_model = AutoModel.from_pretrained(embeddings_name).to(torch_device)
+
+# Not using SentenceTransformer here
+embeddings_model = SentenceTransformer(embeddings_name)
 
-# Then, split the basename and its extension and return only the basename without the extension
-filename_without_extension, _ = os.path.splitext(basename)
+# def calc_bge_norm_embeddings(docs, embeddings_model=embeddings_model, tokenizer=tokenizer, progress=gr.Progress(track_tqdm=True)):
+# # Tokenize sentences
+# print("Tokenising")
+# encoded_input = tokenizer(docs, padding=True, truncation=True, return_tensors='pt', max_length=32).to(torch_device)
 
-#print(filename_without_extension)
-
-return filename_without_extension
+# # Compute token embeddings
+# print("Calculating embeddings")
+# with torch.no_grad():
+# model_output = embeddings_model(**encoded_input).to(torch_device)
+# # Perform pooling. In this case, cls pooling.
+# embeddings_out = model_output[0][:, 0]
+# # normalize embeddings
+# embeddings_out = torch.nn.functional.normalize(embeddings_out, p=2, dim=1)
+# #print("Sentence embeddings:", embeddings_out)
+
+# return embeddings_out
 
-def load_embeddings(embeddings_name = embeddings_name):
+
+def docs_to_jina_embed_np_array(docs_out, in_file, embeddings_state, return_intermediate_files = "No", embeddings_super_compress = "No", embeddings = embeddings_model, progress=gr.Progress(track_tqdm=True)):
 '''
-Load embeddings model and create a global variable based on it.
+Takes a Langchain document class and saves it into a Chroma sqlite file.
 '''
+if not in_file:
+out_message = "No input file found. Please load in at least one file."
+print(out_message)
+return out_message, None, None
+
 
-# Import Chroma and instantiate a client. The default Chroma client is ephemeral, meaning it will not save to disk.
-
-#else:
-embeddings_func = AutoModel.from_pretrained(embeddings_name, trust_remote_code=True, device_map="auto")
+progress(0.6, desc = "Loading/creating embeddings")
+
+print(f"> Total split documents: {len(docs_out)}")
 
-global embeddings
+#print(docs_out)
 
-embeddings = embeddings_func
+page_contents = [doc.page_content for doc in docs_out]
 
-return embeddings
+## Load in pre-embedded file if exists
+file_list = [string.name for string in in_file]
 
-def docs_to_jina_embed_np_array(docs_out, in_file, embeddings_state, return_intermediate_files = "No", embeddings_super_compress = "No", embeddings = embeddings_model, progress=gr.Progress(track_tqdm=True)):
+#print(file_list)
+
+embeddings_file_names = [string for string in file_list if "embedding" in string.lower()]
+data_file_names = [string for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower()]# and "gz" not in string.lower()]
+data_file_name = data_file_names[0]
+data_file_name_no_ext = get_file_path_end(data_file_name)
+
+out_message = "Document processing complete. Ready to search."
+
+# print("embeddings loaded: ", embeddings_out)
+
+if embeddings_state.size == 0:
+tic = time.perf_counter()
+print("Starting to embed documents.")
+#embeddings_list = []
+#for page in progress.tqdm(page_contents, desc = "Preparing search index", unit = "rows"):
+# embeddings_list.append(embeddings.encode(sentences=page, max_length=1024).tolist())
+
+embeddings_out = embeddings.encode(sentences=page_contents, max_length=1024, show_progress_bar = True, batch_size = 32) # For Jina embeddings
+#embeddings_list = embeddings.encode(sentences=page_contents, normalize_embeddings=True).tolist() # For BGE embeddings
+#embeddings_list = embeddings.encode(sentences=page_contents).tolist() # For minilm
+
+toc = time.perf_counter()
+time_out = f"The embedding took {toc - tic:0.1f} seconds"
+print(time_out)
+
+# If you want to save your files for next time
+if return_intermediate_files == "Yes":
+progress(0.9, desc = "Saving embeddings to file")
+if embeddings_super_compress == "No":
+semantic_search_file_name = data_file_name_no_ext + '_' + 'embeddings.npz'
+np.savez_compressed(semantic_search_file_name, embeddings_out)
+else:
+semantic_search_file_name = data_file_name_no_ext + '_' + 'embedding_compress.npz'
+embeddings_out_round = np.round(embeddings_out, 3)
+embeddings_out_round *= 100 # Rounding not currently used
+np.savez_compressed(semantic_search_file_name, embeddings_out_round)
+
+return out_message, embeddings_out, semantic_search_file_name
+
+return out_message, embeddings_out, None
+else:
+# Just return existing embeddings if already exist
+embeddings_out = embeddings_state
+
+print(out_message)
+
+return out_message, embeddings_out, None#, None
+
+
+def docs_to_bge_embed_np_array(docs_out, in_file, embeddings_state, return_intermediate_files = "No", embeddings_super_compress = "No", embeddings_model = embeddings_model, progress=gr.Progress(track_tqdm=True)):
 '''
 Takes a Langchain document class and saves it into a Chroma sqlite file.
 '''
@@ -79,7 +154,7 @@ def docs_to_jina_embed_np_array(docs_out, in_file, embeddings_state, return_inte
 return out_message, None, None
 
 
-progress(0.7, desc = "Loading/creating embeddings")
+progress(0.6, desc = "Loading/creating embeddings")
 
 print(f"> Total split documents: {len(docs_out)}")
 
@@ -108,7 +183,11 @@ def docs_to_jina_embed_np_array(docs_out, in_file, embeddings_state, return_inte
 #for page in progress.tqdm(page_contents, desc = "Preparing search index", unit = "rows"):
 # embeddings_list.append(embeddings.encode(sentences=page, max_length=1024).tolist())
 
-embeddings_out = embeddings.encode(sentences=page_contents, max_length=1024, show_progress_bar = True, batch_size = 32) # For Jina embeddings
+
+
+#embeddings_out = calc_bge_norm_embeddings(page_contents, embeddings_model, tokenizer)
+
+embeddings_out = embeddings_model.encode(sentences=page_contents, show_progress_bar = True, batch_size = 32, normalize_embeddings=True) # For BGE
 #embeddings_list = embeddings.encode(sentences=page_contents, normalize_embeddings=True).tolist() # For BGE embeddings
 #embeddings_list = embeddings.encode(sentences=page_contents).tolist() # For minilm
 
@@ -120,10 +199,10 @@ def docs_to_jina_embed_np_array(docs_out, in_file, embeddings_state, return_inte
 if return_intermediate_files == "Yes":
 progress(0.9, desc = "Saving embeddings to file")
 if embeddings_super_compress == "No":
-semantic_search_file_name = data_file_name_no_ext + '_' + 'embeddings.npz'
+semantic_search_file_name = data_file_name_no_ext + '_bge_embeddings.npz'
 np.savez_compressed(semantic_search_file_name, embeddings_out)
 else:
-semantic_search_file_name = data_file_name_no_ext + '_' + 'embedding_compress.npz'
+semantic_search_file_name = data_file_name_no_ext + '_bge_embedding_compress.npz'
 embeddings_out_round = np.round(embeddings_out, 3)
 embeddings_out_round *= 100 # Rounding not currently used
 np.savez_compressed(semantic_search_file_name, embeddings_out_round)
@@ -218,6 +297,88 @@ def process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_c
 
 return results_df_out
 
+def bge_simple_retrieval(query_str:str, vectorstore, docs, orig_df_col:str, k_val:int, out_passages:int,
+vec_score_cut_off:float, vec_weight:float, in_join_file, in_join_column = None, search_df_join_column = None, device = torch_device, embeddings = embeddings_model, progress=gr.Progress(track_tqdm=True)): # ,vectorstore, embeddings
+
+# print("vectorstore loaded: ", vectorstore)
+progress(0, desc = "Conducting semantic search")
+
+print("Searching")
+
+# Convert it to a PyTorch tensor and transfer to GPU
+#vectorstore_tensor = tensor(vectorstore).to(device)
+
+# Load the sentence transformer model and move it to GPU
+embeddings = embeddings.to(device)
+
+# Encode the query using the sentence transformer and convert to a PyTorch tensor
+query = embeddings.encode(query_str, normalize_embeddings=True)
+
+# query = calc_bge_norm_embeddings(query_str, embeddings_model=embeddings_model, tokenizer=tokenizer)
+
+#query_tensor = tensor(query).to(device)
+
+# if query_tensor.dim() == 1:
+# query_tensor = query_tensor.unsqueeze(0) # Reshape to 2D with one row
+
+# Sentence transformers method, not used:
+cosine_similarities = query @ vectorstore.T
+#cosine_similarities = util.cos_sim(query_tensor, vectorstore_tensor)[0]
+#top_results = torch.topk(cos_scores, k=top_k)
+
+
+# Normalize the query tensor and vectorstore tensor
+#query_norm = query_tensor / query_tensor.norm(dim=1, keepdim=True)
+#vectorstore_norm = vectorstore_tensor / vectorstore_tensor.norm(dim=1, keepdim=True)
+
+# Calculate cosine similarities (batch processing)
+#cosine_similarities = mm(query_norm, vectorstore_norm.T)
+#cosine_similarities = mm(query_tensor, vectorstore_tensor.T)
+
+# Flatten the tensor to a 1D array
+cosine_similarities = cosine_similarities.flatten()
+
+# Convert to a NumPy array if it's still a PyTorch tensor
+#cosine_similarities = cosine_similarities.cpu().numpy()
+
+# Create a Pandas Series
+cosine_similarities_series = pd.Series(cosine_similarities)
+
+# Pull out relevent info from docs
+page_contents = [doc.page_content for doc in docs]
+page_meta = [doc.metadata for doc in docs]
+ids_range = range(0,len(page_contents))
+ids = [str(element) for element in ids_range]
+
+df_docs = pd.DataFrame(data={"ids": ids,
+"documents": page_contents,
+"metadatas":page_meta,
+"distances":cosine_similarities_series}).sort_values("distances", ascending=False).iloc[0:k_val,:]
+
+
+results_df_out = process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_cut_off, vec_weight, orig_df_col, in_join_column, search_df_join_column)
+
+print("Search complete")
+
+# If nothing found, return error message
+if results_df_out.empty:
+return 'No result found!', None
+
+query_str_file = query_str.replace(" ", "_")
+
+results_df_name = "semantic_search_result_" + today_rev + "_" + query_str_file + ".xlsx"
+
+print("Saving search output to file")
+progress(0.7, desc = "Saving search output to file")
+
+results_df_out.to_excel(results_df_name, index= None)
+results_first_text = results_df_out.iloc[0, 1]
+
+print("Returning results")
+
+return results_first_text, results_df_name
+
+
 def jina_simple_retrieval(query_str:str, vectorstore, docs, orig_df_col:str, k_val:int, out_passages:int,
 vec_score_cut_off:float, vec_weight:float, in_join_file, in_join_column = None, search_df_join_column = None, device = torch_device, embeddings = embeddings_model, progress=gr.Progress(track_tqdm=True)): # ,vectorstore, embeddings
 
384
 
search_funcs/semantic_ingest_functions.py CHANGED
@@ -1,12 +1,11 @@
-# Install/ import stuff we need
-
-import os
+# Install/ import packages
 import time
 import re
 import ast
 import gzip
 import pandas as pd
 import gradio as gr
+import pickle
 from typing import Type, List, Literal
 #from langchain.text_splitter import RecursiveCharacterTextSplitter
 
@@ -36,19 +35,6 @@ from search_funcs.helper_functions import get_file_path_end_with_ext, detect_fil
 from search_funcs.bm25_functions import save_prepared_bm25_data
 from search_funcs.clean_funcs import initial_clean
 
-## Parse files
-# def detect_file_type(file_path):
-# """
-# Determine the file type based on its extension.
-
-# Parameters:
-# file_path (str): Path to the file.
-
-# Returns:
-# str: File extension (e.g., '.pdf', '.docx', '.txt', '.html').
-# """
-# return os.path.splitext(file_path)[1].lower()
-
 def parse_file_not_used(file_paths, text_column='text'):
 """
 Accepts a list of file paths, determines each file's type based on its extension,
@@ -124,8 +110,6 @@ def parse_csv_or_excel(file_path, data_state, text_column = "text"):
 Pandas DataFrame: Dataframe output from file read
 """
 
-#out_df = pd.DataFrame()
-
 file_list = [string.name for string in file_path]
 
 #print(file_list)
@@ -137,40 +121,10 @@ def parse_csv_or_excel(file_path, data_state, text_column = "text"):
 #for file_path in file_paths:
 file_name = get_file_path_end_with_ext(data_file_name)
 
-#print(file_extension)
-
-# if file_extension == "csv":
-# df = pd.read_csv(data_file_names[0], low_memory=False)
-# if text_column not in df.columns: return pd.DataFrame(), ['Please choose a valid column name']
-# df['source'] = file_name
-# df['page_section'] = ""
-# elif file_extension == "xlsx":
-# df = pd.read_excel(data_file_names[0], engine='openpyxl')
-# if text_column not in df.columns: return pd.DataFrame(), ['Please choose a valid column name']
-# df['source'] = file_name
-# df['page_section'] = ""
-# elif file_extension == "parquet":
-# df = pd.read_parquet(data_file_names[0])
-# if text_column not in df.columns: return pd.DataFrame(), ['Please choose a valid column name']
-# df['source'] = file_name
-# df['page_section'] = ""
-# else:
-# print(f"Unsupported file type: {file_extension}")
-# return pd.DataFrame(), ['Please choose a valid file type']
-
-df = data_state
-#df['source'] = file_name
-#df['page_section'] = ""
-
 message = "Loaded in file. Now converting to document format."
 print(message)
 
-return df, file_name, message
-
-
-# +
-# Convert parsed text to docs
-# -
+return data_state, file_name, message
 
 def write_out_metadata_as_string(metadata_in):
 # If metadata_in is a single dictionary, wrap it in a list
@@ -241,63 +195,10 @@ def parse_metadata(row):
 # Handle the error or log it
 return None # or some default value
 
-# def csv_excel_text_to_docs_deprecated(df, text_column='text', chunk_size=None) -> List[Document]:
-# """Converts a DataFrame's content to a list of Documents with metadata."""
-
-# print("Converting to documents.")
-
-# doc_sections = []
-# df[text_column] = df[text_column].astype(str) # Ensure column is a string column
-
-# # For each row in the dataframe
-# for idx, row in df.iterrows():
-# # Extract the text content for the document
-# doc_content = row[text_column]
-
-# # Generate metadata containing other columns' data
-# metadata = {"row": idx + 1}
-# for col, value in row.items():
-# if col != text_column:
-# metadata[col] = value
-
-# metadata_string = write_out_metadata_as_string(metadata)[0]
-
-# # If chunk_size is provided, split the text into chunks
-# if chunk_size:
-# sections = split_string_into_chunks(doc_content, chunk_size, split_strat)
-
-# # Langchain usage deprecated
-# # text_splitter = RecursiveCharacterTextSplitter(
-# # chunk_size=chunk_size,
-# # chunk_overlap=chunk_overlap,
-# # split_strat=split_strat,
-# # start_index=start_index
-# # ) #Other arguments as required by the splitter

-# # sections = text_splitter.split_text(doc_content)
-
-# # For each section, create a Document object
-# for i, section in enumerate(sections):
-# section = '. '.join([metadata_string, section])
-# doc = Document(page_content=section,
-# metadata={**metadata, "section": i, "row_section": f"{metadata['row']}-{i}"})
-# doc_sections.append(doc)
-
-# else:
-# # If no chunk_size is provided, create a single Document object for the row
-# #doc_content = '. '.join([metadata_string, doc_content])
-# doc = Document(page_content=doc_content, metadata=metadata)
-# doc_sections.append(doc)
-
-# message = "Data converted to document format. Now creating/loading document embeddings."
-# print(message)
-
-# return doc_sections, message
-
 def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_intermediate_files = "No", chunk_size=None, progress=gr.Progress(track_tqdm=True)) -> List[Document]:
 """Converts a DataFrame's content to a list of dictionaries in the 'Document' format, containing page_content and associated metadata."""
 if not in_file:
-return None, "Please load in at least one file.", data_state, None, None, None
+return None, "Please load in at least one file.", df, None, None, None
 
 progress(0, desc = "Loading in data")
 
@@ -309,7 +210,7 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
 return doc_sections, "Please load in at least one csv/Excel/parquet data file."
 
 if not text_column:
-return None, "Please enter a column name to search", data_state, None, None, None
+return None, "Please enter a column name to search", df, None, None, None
 
 data_file_name = data_file_names[0]
 
@@ -336,6 +237,8 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
 doc_sections = []
 df[text_column] = df[text_column].astype(str).str.strip() # Ensure column is a string column
 
+original_text_column = text_column
+
 if clean == "Yes":
 progress(0.1, desc = "Cleaning data")
 clean_tic = time.perf_counter()
@@ -343,21 +246,29 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
 
 #df = df.drop_duplicates(text_column)
 
-df[text_column] = initial_clean(df[text_column])
 df_list = list(df[text_column])
+df_list = initial_clean(df_list)
+
+# Get rid of old data and keep only the new
+#df = df.drop(text_column, axis = 1)
+
+
 
-# Save to file if you have cleaned the data
+# Save to file if you have cleaned the data. Text column has now been renamed with '_cleaned' at the send
 out_file_name, text_column = save_prepared_bm25_data(data_file_name, df_list, df, text_column)
 
+df[text_column] = df_list
+
+
 clean_toc = time.perf_counter()
 clean_time_out = f"Cleaning the text took {clean_toc - clean_tic:0.1f} seconds."
 print(clean_time_out)
 
-cols = [col for col in df.columns if col != text_column]
+cols = [col for col in df.columns if col != original_text_column]
 
 df["metadata"] = combine_metadata_columns(df, cols)
 
-df = df.rename(columns={text_column:"page_content"})
+#df = df.rename(columns={text_column:"page_content"})
 
 #print(df[["page_content", "metadata"]].to_dict(orient='records'))
 
@@ -367,7 +278,7 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
 progress(0.3, desc = "Converting data to document format")
 
 # Create a list of Document objects
-doc_sections = [Document(page_content=row['page_content'],
+doc_sections = [Document(page_content=row[text_column],
 metadata= parse_metadata(row["metadata"]))
 for index, row in progress.tqdm(df.iterrows(), desc = "Splitting up text", unit = "rows")]
 
@@ -387,7 +298,6 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
 #print(page_content_series_string[0])
 #metadata_series_string = pd.Series(doc_sections[1]).astype(str)
 
-import pickle
 
 if clean == "No":
 #pd.DataFrame(data = {"Documents":page_content_series_string}).to_parquet(file_name + "_prepared_docs.parquet")
@@ -399,7 +309,7 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
 elif clean == "Yes":
 #pd.DataFrame(data = {"Documents":page_content_series_string}).to_parquet(file_name + "_prepared_docs_clean.parquet")
 
-with gzip.open(file_name + "_prepared_docs_clean.pkl.gz", 'wb') as file:
+with gzip.open(file_name + "cleaned_prepared_docs.pkl.gz", 'wb') as file:
 pickle.dump(doc_sections, file)
 
 #pd.Series(doc_sections).to_pickle(file_name + "_prepared_docs_clean.pkl")
@@ -407,7 +317,6 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
 
 return doc_sections, "Finished preparing documents."
 
-
 def document_to_dataframe(documents):
 '''
 Convert an object in document format to pandas dataframe
@@ -429,12 +338,3 @@ def document_to_dataframe(documents):
 # Create a DataFrame from the list of rows
 df = pd.DataFrame(rows)
 return df
-
-# Example usage
-#documents = [
-# Document(page_content="Example content 1", metadata={"author": "Author 1", "year": 2021}),
-# Document(page_content="Example content 2", metadata={"author": "Author 2", "year": 2022})
-#]
-
-#df = document_to_dataframe(documents)
-#df

search_funcs/spacy_search_funcs.py CHANGED
@@ -1,4 +1,6 @@
 import spacy
+spacy.prefer_gpu()
+from spacy.cli.download import download
 from spacy.matcher import Matcher
 import numpy as np
 import gradio as gr
@@ -10,15 +12,27 @@ PandasDataFrame = Type[pd.DataFrame]
 
 today_rev = datetime.now().strftime("%Y%m%d")
 
-nlp = spacy.load("en_core_web_sm")
-
-string_query = "knife attack run fast"
-df_list = ["Last week someone was grievously injured in a knife attack on Exmoor road. Running away. They ran as fast as possible. I run.","This is the 3rd knifing in the area in as many weeks; knives everywhere.", "attacks of this kind have been increasing for years. Knife attack or knife attack.", "Nothing happened here"]
+# Load the SpaCy model
 
+#os.system("python -m spacy download en_core_web_sm")
+try:
+import en_core_web_sm
+nlp = en_core_web_sm.load()
+print("Successfully imported spaCy model")
+#nlp = spacy.load("en_core_web_sm")
+#print(nlp._path)
+except:
+download("en_core_web_sm")
+nlp = spacy.load("en_core_web_sm")
+print("Successfully imported spaCy model")
 
 def spacy_fuzzy_search(string_query:str, df_list: List[str], original_data: PandasDataFrame, text_column:str, in_join_file: PandasDataFrame, search_df_join_column:str, in_join_column:str, no_spelling_mistakes:int = 1, progress=gr.Progress(track_tqdm=True)):
 ''' Conduct fuzzy match on a list of data.'''
 
+if len(df_list) > 10000:
+out_message = "Your data has more than 10,000 rows and will take more than three minutes to do a fuzzy search. Please try keyword or semantic search for data of this size."
+return out_message, None
+
 query = nlp(string_query)
 tokenised_query = [token.text for token in query]
 print(tokenised_query)
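
Note: a minimal sketch of the spaCy Matcher fuzzy-matching idea (available since spaCy 3.5 via the FUZZY predicates, where FUZZY1 allows one edit per token). This is illustrative only and assumes that mechanism; the repo's spacy_fuzzy_search builds its own patterns from the tokenised query and the no_spelling_mistakes setting and may differ in detail:

import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# FUZZY1 allows up to one edit (insertion/deletion/substitution) per token
matcher.add("query", [[{"LOWER": {"FUZZY1": "knife"}}, {"LOWER": {"FUZZY1": "attack"}}]])

doc = nlp("There was another knfe atack reported last week.")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)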