seanpedrickcase committed • Commit 2bcd818 • 1 Parent(s): 4ce2224

Updated to Gradio 4.16.0. Now works correctly with BGE embeddings

Browse files:
- .gitignore (+2 -1)
- app.py (+40 -44)
- how_to_create_exe_dist.txt (+14 -11)
- requirements.txt (+6 -5)
- search_funcs/bm25_functions.py (+1 -2)
- search_funcs/helper_functions.py (+0 -2)
- search_funcs/semantic_functions.py (+197 -36)
- search_funcs/semantic_ingest_functions.py (+21 -121)
- search_funcs/spacy_search_funcs.py (+18 -4)
.gitignore
CHANGED
@@ -21,4 +21,5 @@ __pycache__/*
 db/*
 experiments/*
 model/*
-build_deps/*
+build_deps/*
+build_deps_old/*
app.py
CHANGED
@@ -1,10 +1,4 @@
 from typing import Type
-from search_funcs.bm25_functions import prepare_bm25_input_data, prepare_bm25, bm25_search
-#from search_funcs.semantic_ingest_functions import parse_csv_or_excel, csv_excel_text_to_docs
-#from search_funcs.semantic_functions import docs_to_jina_embed_np_array, jina_simple_retrieval
-from search_funcs.helper_functions import dummy_function, display_info, initial_data_load, put_columns_in_join_df, get_temp_folder_path, empty_folder
-from search_funcs.spacy_search_funcs import spacy_fuzzy_search
-
 
 import gradio as gr
 import pandas as pd
@@ -12,6 +6,12 @@ import numpy as np
 
 PandasDataFrame = Type[pd.DataFrame]
 
+from search_funcs.bm25_functions import prepare_bm25_input_data, prepare_bm25, bm25_search
+from search_funcs.semantic_ingest_functions import csv_excel_text_to_docs
+from search_funcs.semantic_functions import docs_to_bge_embed_np_array, bge_simple_retrieval
+from search_funcs.helper_functions import display_info, initial_data_load, put_columns_in_join_df, get_temp_folder_path, empty_folder
+from search_funcs.spacy_search_funcs import spacy_fuzzy_search
+
 # Attempt to delete temporary files generated by previous use of the app (as the files can be very big!)
 temp_folder_path = get_temp_folder_path()
 empty_folder(temp_folder_path)
@@ -20,6 +20,7 @@ empty_folder(temp_folder_path)
 block = gr.Blocks(theme = gr.themes.Base())
 
 with block:
+    print("Please don't close this window! Open the below link in the web browser of your choice.")
 
     ingest_text = gr.State()
     ingest_metadata = gr.State()
@@ -79,38 +80,40 @@ depends on factors such as the type of documents or queries. Information taken f
        with gr.Accordion(label = "Search data", open=True):
            keyword_query = gr.Textbox(label="Enter your search term")
            with gr.Row():
-               keyword_search_button = gr.Button(value="Keyword search", variant="primary")
-               fuzzy_search_button = gr.Button(value="Fuzzy search (
+               keyword_search_button = gr.Button(value="Keyword search", variant="primary", scale=1)
+               fuzzy_search_button = gr.Button(value="Fuzzy search (slow, < 10k rows)", variant="secondary", scale = 0)
            with gr.Row():
                output_single_text = gr.Textbox(label="Top result")
                output_file = gr.File(label="File output")
 
-    # current_source_semantic = gr.Textbox(label="Current data source(s)", value="None")
+    with gr.Tab("Semantic search"):
+        gr.Markdown(
+        """
+        **Thematic/semantic search**
 
+        This search type enables you to search for broader themes (e.g. happiness, nature) and the search will pick out text passages that relate to these themes even if they don't contain the exact words. 1. Load in data file (ideally a file with '_cleaned' at the end of the name, a pkl.gz file), with (optionally) the 'embeddings... .npz' file in the same folder to save loading time. 2. Select the field in your data to search. 3. Wait for the data file to be prepared for search. 4. Enter the search term in the 'Enter semantic search query here' box below and press Enter/click on 'Start semantic search'. 4. Your search results will be saved in a csv file and will be presented in the 'File output' area below.
+        """)
+
+        with gr.Row():
+            current_source_semantic = gr.Textbox(label="Current data source(s)", value="None")
+
+        with gr.Accordion("Load in data", open = True):
+            in_semantic_file = gr.File(label="Upload data file for semantic search", file_count= 'multiple', file_types = ['.parquet', '.csv', '.npy', '.npz', '.pkl', '.pkl.gz'])
+
+            with gr.Row():
+                in_semantic_column = gr.Dropdown(label="Enter the name of the text column in the data file to search")
+                load_semantic_data_button = gr.Button(value="Load data", variant="secondary")
+
+            semantic_load_progress = gr.Textbox(label="Load progress")
+
+        semantic_query = gr.Textbox(label="Enter semantic search query here")
+        semantic_submit = gr.Button(value="Start semantic search", variant="primary")
+
+        with gr.Row():
+            semantic_output_single_text = gr.Textbox(label="Top result")
+            semantic_output_file = gr.File(label="File output")
 
    with gr.Tab(label="Advanced options"):
        with gr.Accordion(label="Data load / save options", open = True):
@@ -136,8 +139,8 @@ depends on factors such as the type of documents or queries. Information taken f
            in_search_param_button = gr.Button(value="Load search parameters (Need to click this if you changed anything above)")
        with gr.Accordion(label="Fuzzy search options", open = False):
            no_spelling_mistakes = gr.Slider(label = "Number of spelling mistakes allowed in fuzzy search", value = 1, minimum=1, maximum=4, step=1)
-
-
+        with gr.Accordion(label="Semantic search options", open = False):
+            semantic_min_distance = gr.Slider(label = "Minimum distance score for search result to be included", value = 0.6, minimum=0, maximum=0.95, step=0.01)
        with gr.Accordion(label = "Join on additional dataframes to results", open = False):
            in_join_file = gr.File(label="Upload your data to join here")
            in_join_message = gr.Textbox(label="Join file load progress")
@@ -166,26 +169,19 @@ depends on factors such as the type of documents or queries. Information taken f
    keyword_query.submit(fn=bm25_search, inputs=[keyword_query, in_no_search_results, keyword_data_state, in_bm25_column, join_data_state, in_clean_data, in_join_column, search_df_join_column], outputs=[output_single_text, output_file])
 
    # Fuzzy search functions on click
-
    fuzzy_search_button.click(fn=spacy_fuzzy_search, inputs=[keyword_query, keyword_data_list_state, keyword_data_state, in_bm25_column, join_data_state, search_df_join_column, in_join_column, no_spelling_mistakes], outputs=[output_single_text, output_file], api_name="fuzzy")
 
    ### SEMANTIC SEARCH ###
+
    # Load in a csv/excel file for semantic search
-    #
-    # Dummy functions just to get dropdowns to work correctly with Gradio 3.50
-    in_bm25_column.change(dummy_function, in_bm25_column, None)
-    search_df_join_column.change(dummy_function, search_df_join_column, None)
-    in_join_column.change(dummy_function, in_join_column, None)
-    # in_semantic_column.change(dummy_function, in_join_column, None)
+    in_semantic_file.upload(initial_data_load, inputs=[in_semantic_file, in_semantic_column], outputs=[in_semantic_column, search_df_join_column, semantic_data_state, search_index_state, embeddings_state, tokenised_state, semantic_load_progress, current_source_semantic])
+    load_semantic_data_button.click(
+        csv_excel_text_to_docs, inputs=[semantic_data_state, in_semantic_file, in_semantic_column, in_clean_data, return_intermediate_files], outputs=[ingest_docs, semantic_load_progress]).\
+        then(docs_to_bge_embed_np_array, inputs=[ingest_docs, in_semantic_file, embeddings_state, return_intermediate_files, embedding_super_compress], outputs=[semantic_load_progress, vectorstore_state, semantic_output_file])
+
+    # Semantic search query
+    semantic_submit.click(bge_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, semantic_min_distance, vec_weight, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic")
+    semantic_query.submit(bge_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, semantic_min_distance, vec_weight, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file])
 
 block.queue().launch(debug=True)
-
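Note on the wiring above: the semantic search tab relies on Gradio's event chaining, where the 'Load data' click first converts the loaded dataframe to documents and then runs the embedding step via .then(), and both the button click and textbox submit trigger bge_simple_retrieval. A minimal, hypothetical sketch of that .click(...).then(...) pattern (the component and function names below are illustrative, not the app's own):

# Minimal sketch of Gradio event chaining, assuming Gradio 4.x.
import gradio as gr

def prepare_docs(text):
    # Hypothetical ingest step: split the input into "documents"
    return text.split("."), "Documents prepared"

def embed_docs(docs):
    # Hypothetical embedding step: pretend to embed each document
    return f"Embedded {len(docs)} documents"

with gr.Blocks() as demo:
    docs_state = gr.State()
    in_text = gr.Textbox(label="Paste text")
    load_button = gr.Button("Load data")
    progress_box = gr.Textbox(label="Progress")

    # The first event stores the docs in State; the chained step then consumes them
    load_button.click(prepare_docs, inputs=in_text, outputs=[docs_state, progress_box]).\
        then(embed_docs, inputs=docs_state, outputs=progress_box)

demo.launch()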
how_to_create_exe_dist.txt
CHANGED
@@ -6,24 +6,27 @@
 
 NOTE: for ensuring that spaCy models are loaded into the program correctly in requirements.txt, follow this guide: https://spacy.io/usage/models#models-download
 
-python -m PyInstaller --additional-hooks-dir="build_deps\\" --hidden-import pyarrow.vendored.version --add-data="build_deps\\types.json;gradio_client" --onefile --clean --noconfirm --name DataSearchApp_0.2.2_keyword app.py
-
-For a small exe with a folder of dependencies:
-python -m PyInstaller --additional-hooks-dir="build_deps\\" --hidden-import pyarrow.vendored.version --add-data="build_deps\\types.json;gradio_client" --add-data "model;model" --clean --noconfirm --name DataSearchApp_0.2.2 app.py
+6. If necessary, create hook- files to tell pyinstaller to include specific packages in the exe build. Examples are provided for gradio and en_core_web_sm (a spaCy model). Put these in the build_deps\ subfolder
+
+7. pip install pyinstaller
+
+8. In command line, cd to the folder that contains app.py.
+
+9. Run the following, assuming you want to make one single .exe file (This helped me: https://github.com/pyinstaller/pyinstaller/issues/8108):
+
+a) In command line: pyi-makespec --additional-hooks-dir="build_deps\\" --collect-data=gradio_client --collect-data=gradio --hidden-import pyarrow.vendored.version --onefile --name DataSearchApp_0.2.3 app.py
+
+b) Open the created spec file in Notepad. Add the following to the end of the Analysis section then save:
+
+a = Analysis(
+    ...
+    module_collection_mode={
+        'gradio': 'py',  # Collect gradio package as source .py files
+    }
+)
+
+c) Back in command line, run this: pyinstaller --clean --noconfirm DataSearchApp_0.2.3.spec
 
 9. A 'dist' folder will be created with the executable inside along with all dependencies('dist\data_text_search').
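Step 6 above refers to "hook-" files placed in build_deps\. The repo's actual hook contents are not shown in this diff, but a minimal PyInstaller hook for the spaCy model might look like the following sketch, using PyInstaller's documented hook helpers (the file name and exact collected package are assumptions):

# Hypothetical build_deps/hook-en_core_web_sm.py
from PyInstaller.utils.hooks import collect_data_files, collect_submodules

# Bundle the spaCy model package's data files and submodules into the build
datas = collect_data_files("en_core_web_sm")
hiddenimports = collect_submodules("en_core_web_sm")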
requirements.txt
CHANGED
@@ -1,10 +1,11 @@
-pandas==2.
+pandas==2.2.0
 polars==0.20.3
 pyarrow==14.0.2
 openpyxl==3.1.2
-#
-#
-
+#transformers==4.37.2
+#accelerate==0.26.0
+torch==2.1.2
 spacy==3.7.2
 en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
-gradio==4.16.0
+gradio==4.16.0
+sentence_transformers==2.3.1
search_funcs/bm25_functions.py
CHANGED
@@ -6,7 +6,6 @@ import sys
 import gzip
 import time
 import pandas as pd
-import numpy as np
 from numpy import inf
 import gradio as gr
 
@@ -15,7 +14,7 @@ from datetime import datetime
 today_rev = datetime.now().strftime("%Y%m%d")
 
 from search_funcs.clean_funcs import initial_clean # get_lemma_tokens, stem_sentence
-from search_funcs.helper_functions import
+from search_funcs.helper_functions import get_file_path_end_with_ext, get_file_path_end
 
 # Load the SpaCy model
 from spacy.cli.download import download
search_funcs/helper_functions.py
CHANGED
@@ -30,8 +30,6 @@ def empty_folder(directory_path):
            #print(f'Failed to delete {file_path}. Reason: {e}')
            print('')
 
-
-
 def get_file_path_end(file_path):
    # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
    basename = os.path.basename(file_path)
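The diff only shows the first lines of get_file_path_end, and get_file_path_end_with_ext is imported elsewhere without its body appearing here. A plausible sketch of what these two helpers do, inferred from the visible comment and usage (an assumption, the real implementations may differ):

import os

def get_file_path_end(file_path):
    # "/path/to/example.txt" -> "example" (basename without its extension)
    basename = os.path.basename(file_path)
    return os.path.splitext(basename)[0]

def get_file_path_end_with_ext(file_path):
    # "/path/to/example.txt" -> "example.txt" (basename including its extension)
    return os.path.basename(file_path)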
search_funcs/semantic_functions.py
CHANGED
@@ -5,14 +5,14 @@ from typing import Type
 import gradio as gr
 import numpy as np
 from datetime import datetime
-import
+#from transformers import AutoModel, AutoTokenizer
+from search_funcs.helper_functions import get_file_path_end
+#import torch
+from torch import cuda, backends#, tensor, mm, utils
+from sentence_transformers import SentenceTransformer
 
 today_rev = datetime.now().strftime("%Y%m%d")
 
-from transformers import AutoModel
-
-from torch import cuda, backends, tensor, mm
-
 # Check for torch cuda
 print("Is CUDA enabled? ", cuda.is_available())
 print("Is a CUDA device available on this computer?", backends.cudnn.enabled)
@@ -29,47 +29,122 @@ print("Device used is: ", torch_device)
 
 PandasDataFrame = Type[pd.DataFrame]
 
-# Load embeddings
+# Load embeddings - Jina - deprecated
 # Pinning a Jina revision for security purposes: https://www.baseten.co/blog/pinning-ml-model-revisions-for-compatibility-and-security/
 # Save Jina model locally as described here: https://huggingface.co/jinaai/jina-embeddings-v2-base-en/discussions/29
-embeddings_name = "jinaai/jina-embeddings-v2-small-en"
-local_embeddings_location = "model/jina/"
-revision_choice = "b811f03af3d4d7ea72a7c25c802b21fc675a5d99"
+# embeddings_name = "jinaai/jina-embeddings-v2-small-en"
+# local_embeddings_location = "model/jina/"
+# revision_choice = "b811f03af3d4d7ea72a7c25c802b21fc675a5d99"
 
-try:
-except:
-embeddings_func = AutoModel.from_pretrained(embeddings_name, trust_remote_code=True, device_map="auto")
+# try:
+#     embeddings_model = AutoModel.from_pretrained(local_embeddings_location, revision = revision_choice, trust_remote_code=True,local_files_only=True, device_map="auto")
+# except:
+#     embeddings_model = AutoModel.from_pretrained(embeddings_name, revision = revision_choice, trust_remote_code=True, device_map="auto")
+
+# Load embeddings
+embeddings_name = "BAAI/bge-small-en-v1.5"
+local_embeddings_location = "model/bge/"
+
+#try:
+#    tokenizer = AutoTokenizer.from_pretrained(embeddings_name)
+#    embeddings_model = AutoModel.from_pretrained(local_embeddings_location, local_files_only=True).to(torch_device)
+#except:
+#    tokenizer = AutoTokenizer.from_pretrained(embeddings_name)
+#    embeddings_model = AutoModel.from_pretrained(embeddings_name).to(torch_device)
+
+# Not using SentenceTransformer here
+embeddings_model = SentenceTransformer(embeddings_name)
+
+# def calc_bge_norm_embeddings(docs, embeddings_model=embeddings_model, tokenizer=tokenizer, progress=gr.Progress(track_tqdm=True)):
+#     # Tokenize sentences
+#     print("Tokenising")
+#     encoded_input = tokenizer(docs, padding=True, truncation=True, return_tensors='pt', max_length=32).to(torch_device)
+
+#     # Compute token embeddings
+#     print("Calculating embeddings")
+#     with torch.no_grad():
+#         model_output = embeddings_model(**encoded_input).to(torch_device)
+#     # Perform pooling. In this case, cls pooling.
+#     embeddings_out = model_output[0][:, 0]
+#     # normalize embeddings
+#     embeddings_out = torch.nn.functional.normalize(embeddings_out, p=2, dim=1)
+#     #print("Sentence embeddings:", embeddings_out)
+
+#     return embeddings_out
+
+def docs_to_jina_embed_np_array(docs_out, in_file, embeddings_state, return_intermediate_files = "No", embeddings_super_compress = "No", embeddings = embeddings_model, progress=gr.Progress(track_tqdm=True)):
     '''
+    Takes a Langchain document class and saves it into a Chroma sqlite file.
     '''
+    if not in_file:
+        out_message = "No input file found. Please load in at least one file."
+        print(out_message)
+        return out_message, None, None
+
+    progress(0.6, desc = "Loading/creating embeddings")
+
+    print(f"> Total split documents: {len(docs_out)}")
+
+    #print(docs_out)
+
+    page_contents = [doc.page_content for doc in docs_out]
+
+    ## Load in pre-embedded file if exists
+    file_list = [string.name for string in in_file]
+
+    #print(file_list)
+
+    embeddings_file_names = [string for string in file_list if "embedding" in string.lower()]
+    data_file_names = [string for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower()]# and "gz" not in string.lower()]
+    data_file_name = data_file_names[0]
+    data_file_name_no_ext = get_file_path_end(data_file_name)
+
+    out_message = "Document processing complete. Ready to search."
+
+    # print("embeddings loaded: ", embeddings_out)
+
+    if embeddings_state.size == 0:
+        tic = time.perf_counter()
+        print("Starting to embed documents.")
+        #embeddings_list = []
+        #for page in progress.tqdm(page_contents, desc = "Preparing search index", unit = "rows"):
+        #    embeddings_list.append(embeddings.encode(sentences=page, max_length=1024).tolist())
+
+        embeddings_out = embeddings.encode(sentences=page_contents, max_length=1024, show_progress_bar = True, batch_size = 32) # For Jina embeddings
+        #embeddings_list = embeddings.encode(sentences=page_contents, normalize_embeddings=True).tolist() # For BGE embeddings
+        #embeddings_list = embeddings.encode(sentences=page_contents).tolist() # For minilm
+
+        toc = time.perf_counter()
+        time_out = f"The embedding took {toc - tic:0.1f} seconds"
+        print(time_out)
+
+        # If you want to save your files for next time
+        if return_intermediate_files == "Yes":
+            progress(0.9, desc = "Saving embeddings to file")
+            if embeddings_super_compress == "No":
+                semantic_search_file_name = data_file_name_no_ext + '_' + 'embeddings.npz'
+                np.savez_compressed(semantic_search_file_name, embeddings_out)
+            else:
+                semantic_search_file_name = data_file_name_no_ext + '_' + 'embedding_compress.npz'
+                embeddings_out_round = np.round(embeddings_out, 3)
+                embeddings_out_round *= 100 # Rounding not currently used
+                np.savez_compressed(semantic_search_file_name, embeddings_out_round)
+
+            return out_message, embeddings_out, semantic_search_file_name
+
+        return out_message, embeddings_out, None
+    else:
+        # Just return existing embeddings if already exist
+        embeddings_out = embeddings_state
+
+    print(out_message)
+
+    return out_message, embeddings_out, None#, None
+
+
+def docs_to_bge_embed_np_array(docs_out, in_file, embeddings_state, return_intermediate_files = "No", embeddings_super_compress = "No", embeddings_model = embeddings_model, progress=gr.Progress(track_tqdm=True)):
     '''
     Takes a Langchain document class and saves it into a Chroma sqlite file.
     '''
@@ -79,7 +154,7 @@ def docs_to_jina_embed_np_array(docs_out, in_file, embeddings_state, return_inte
        return out_message, None, None
 
-    progress(0.
+    progress(0.6, desc = "Loading/creating embeddings")
 
    print(f"> Total split documents: {len(docs_out)}")
 
@@ -108,7 +183,11 @@ def docs_to_jina_embed_np_array(docs_out, in_file, embeddings_state, return_inte
        #for page in progress.tqdm(page_contents, desc = "Preparing search index", unit = "rows"):
        #    embeddings_list.append(embeddings.encode(sentences=page, max_length=1024).tolist())
 
-
+
+        #embeddings_out = calc_bge_norm_embeddings(page_contents, embeddings_model, tokenizer)
+
+        embeddings_out = embeddings_model.encode(sentences=page_contents, show_progress_bar = True, batch_size = 32, normalize_embeddings=True) # For BGE
        #embeddings_list = embeddings.encode(sentences=page_contents, normalize_embeddings=True).tolist() # For BGE embeddings
        #embeddings_list = embeddings.encode(sentences=page_contents).tolist() # For minilm
 
@@ -120,10 +199,10 @@ def docs_to_jina_embed_np_array(docs_out, in_file, embeddings_state, return_inte
        if return_intermediate_files == "Yes":
            progress(0.9, desc = "Saving embeddings to file")
            if embeddings_super_compress == "No":
-                semantic_search_file_name = data_file_name_no_ext + '
+                semantic_search_file_name = data_file_name_no_ext + '_bge_embeddings.npz'
                np.savez_compressed(semantic_search_file_name, embeddings_out)
            else:
-                semantic_search_file_name = data_file_name_no_ext + '
+                semantic_search_file_name = data_file_name_no_ext + '_bge_embedding_compress.npz'
                embeddings_out_round = np.round(embeddings_out, 3)
                embeddings_out_round *= 100 # Rounding not currently used
                np.savez_compressed(semantic_search_file_name, embeddings_out_round)
@@ -218,6 +297,88 @@ def process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_c
 
    return results_df_out
 
+def bge_simple_retrieval(query_str:str, vectorstore, docs, orig_df_col:str, k_val:int, out_passages:int,
+                         vec_score_cut_off:float, vec_weight:float, in_join_file, in_join_column = None, search_df_join_column = None, device = torch_device, embeddings = embeddings_model, progress=gr.Progress(track_tqdm=True)): # ,vectorstore, embeddings
+
+    # print("vectorstore loaded: ", vectorstore)
+    progress(0, desc = "Conducting semantic search")
+
+    print("Searching")
+
+    # Convert it to a PyTorch tensor and transfer to GPU
+    #vectorstore_tensor = tensor(vectorstore).to(device)
+
+    # Load the sentence transformer model and move it to GPU
+    embeddings = embeddings.to(device)
+
+    # Encode the query using the sentence transformer and convert to a PyTorch tensor
+    query = embeddings.encode(query_str, normalize_embeddings=True)
+
+    # query = calc_bge_norm_embeddings(query_str, embeddings_model=embeddings_model, tokenizer=tokenizer)
+
+    #query_tensor = tensor(query).to(device)
+
+    # if query_tensor.dim() == 1:
+    #     query_tensor = query_tensor.unsqueeze(0)  # Reshape to 2D with one row
+
+    # Sentence transformers method, not used:
+    cosine_similarities = query @ vectorstore.T
+    #cosine_similarities = util.cos_sim(query_tensor, vectorstore_tensor)[0]
+    #top_results = torch.topk(cos_scores, k=top_k)
+
+    # Normalize the query tensor and vectorstore tensor
+    #query_norm = query_tensor / query_tensor.norm(dim=1, keepdim=True)
+    #vectorstore_norm = vectorstore_tensor / vectorstore_tensor.norm(dim=1, keepdim=True)
+
+    # Calculate cosine similarities (batch processing)
+    #cosine_similarities = mm(query_norm, vectorstore_norm.T)
+    #cosine_similarities = mm(query_tensor, vectorstore_tensor.T)
+
+    # Flatten the tensor to a 1D array
+    cosine_similarities = cosine_similarities.flatten()
+
+    # Convert to a NumPy array if it's still a PyTorch tensor
+    #cosine_similarities = cosine_similarities.cpu().numpy()
+
+    # Create a Pandas Series
+    cosine_similarities_series = pd.Series(cosine_similarities)
+
+    # Pull out relevent info from docs
+    page_contents = [doc.page_content for doc in docs]
+    page_meta = [doc.metadata for doc in docs]
+    ids_range = range(0,len(page_contents))
+    ids = [str(element) for element in ids_range]
+
+    df_docs = pd.DataFrame(data={"ids": ids,
+                                 "documents": page_contents,
+                                 "metadatas":page_meta,
+                                 "distances":cosine_similarities_series}).sort_values("distances", ascending=False).iloc[0:k_val,:]
+
+    results_df_out = process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_cut_off, vec_weight, orig_df_col, in_join_column, search_df_join_column)
+
+    print("Search complete")
+
+    # If nothing found, return error message
+    if results_df_out.empty:
+        return 'No result found!', None
+
+    query_str_file = query_str.replace(" ", "_")
+
+    results_df_name = "semantic_search_result_" + today_rev + "_" + query_str_file + ".xlsx"
+
+    print("Saving search output to file")
+    progress(0.7, desc = "Saving search output to file")
+
+    results_df_out.to_excel(results_df_name, index= None)
+    results_first_text = results_df_out.iloc[0, 1]
+
+    print("Returning results")
+
+    return results_first_text, results_df_name
+
+
 def jina_simple_retrieval(query_str:str, vectorstore, docs, orig_df_col:str, k_val:int, out_passages:int,
                          vec_score_cut_off:float, vec_weight:float, in_join_file, in_join_column = None, search_df_join_column = None, device = torch_device, embeddings = embeddings_model, progress=gr.Progress(track_tqdm=True)): # ,vectorstore, embeddings
 
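The core of the new BGE path above is that both the documents and the query are encoded with normalize_embeddings=True, so a plain dot product between them equals cosine similarity; bge_simple_retrieval then ranks by that score, keeps the top k_val rows, and (via process_data_from_scores_df) applies the minimum-distance cut-off. A standalone sketch of that idea, using the model name from the diff but otherwise illustrative names and data:

import pandas as pd
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("BAAI/bge-small-en-v1.5")

passages = [
    "The park was full of people enjoying the sunshine.",
    "Quarterly revenue fell short of expectations.",
    "Walking in the forest always lifts my mood.",
]

# Embed the corpus once; normalising here means query @ corpus.T is cosine similarity
corpus_emb = model.encode(passages, normalize_embeddings=True, batch_size=32)

def search(query, k=2, min_score=0.6):
    # 0.6 mirrors the default of the semantic_min_distance slider in app.py
    query_emb = model.encode(query, normalize_embeddings=True)
    scores = query_emb @ corpus_emb.T
    results = (pd.DataFrame({"documents": passages, "distances": scores})
                 .sort_values("distances", ascending=False)
                 .head(k))
    return results[results["distances"] >= min_score]

print(search("being happy in nature"))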
search_funcs/semantic_ingest_functions.py
CHANGED
@@ -1,12 +1,11 @@
-# Install/ import
-
-import os
+# Install/ import packages
 import time
 import re
 import ast
 import gzip
 import pandas as pd
 import gradio as gr
+import pickle
 from typing import Type, List, Literal
 #from langchain.text_splitter import RecursiveCharacterTextSplitter
 
@@ -36,19 +35,6 @@ from search_funcs.helper_functions import get_file_path_end_with_ext, detect_fil
 from search_funcs.bm25_functions import save_prepared_bm25_data
 from search_funcs.clean_funcs import initial_clean
 
-## Parse files
-# def detect_file_type(file_path):
-#     """
-#     Determine the file type based on its extension.
-
-#     Parameters:
-#     file_path (str): Path to the file.
-
-#     Returns:
-#     str: File extension (e.g., '.pdf', '.docx', '.txt', '.html').
-#     """
-#     return os.path.splitext(file_path)[1].lower()
-
 def parse_file_not_used(file_paths, text_column='text'):
    """
    Accepts a list of file paths, determines each file's type based on its extension,
@@ -124,8 +110,6 @@ def parse_csv_or_excel(file_path, data_state, text_column = "text"):
        Pandas DataFrame: Dataframe output from file read
    """
 
-    #out_df = pd.DataFrame()
-
    file_list = [string.name for string in file_path]
 
    #print(file_list)
@@ -137,40 +121,10 @@ def parse_csv_or_excel(file_path, data_state, text_column = "text"):
    #for file_path in file_paths:
    file_name = get_file_path_end_with_ext(data_file_name)
 
-    #print(file_extension)
-
-    # if file_extension == "csv":
-    #     df = pd.read_csv(data_file_names[0], low_memory=False)
-    #     if text_column not in df.columns: return pd.DataFrame(), ['Please choose a valid column name']
-    #     df['source'] = file_name
-    #     df['page_section'] = ""
-    # elif file_extension == "xlsx":
-    #     df = pd.read_excel(data_file_names[0], engine='openpyxl')
-    #     if text_column not in df.columns: return pd.DataFrame(), ['Please choose a valid column name']
-    #     df['source'] = file_name
-    #     df['page_section'] = ""
-    # elif file_extension == "parquet":
-    #     df = pd.read_parquet(data_file_names[0])
-    #     if text_column not in df.columns: return pd.DataFrame(), ['Please choose a valid column name']
-    #     df['source'] = file_name
-    #     df['page_section'] = ""
-    # else:
-    #     print(f"Unsupported file type: {file_extension}")
-    #     return pd.DataFrame(), ['Please choose a valid file type']
-
-    df = data_state
-    #df['source'] = file_name
-    #df['page_section'] = ""
-
    message = "Loaded in file. Now converting to document format."
    print(message)
 
-    return
-
-
-# +
-# Convert parsed text to docs
-# -
+    return data_state, file_name, message
 
 def write_out_metadata_as_string(metadata_in):
    # If metadata_in is a single dictionary, wrap it in a list
@@ -241,63 +195,10 @@ def parse_metadata(row):
        # Handle the error or log it
        return None # or some default value
 
-# def csv_excel_text_to_docs_deprecated(df, text_column='text', chunk_size=None) -> List[Document]:
-#     """Converts a DataFrame's content to a list of Documents with metadata."""
-
-#     print("Converting to documents.")
-
-#     doc_sections = []
-#     df[text_column] = df[text_column].astype(str) # Ensure column is a string column
-
-#     # For each row in the dataframe
-#     for idx, row in df.iterrows():
-#         # Extract the text content for the document
-#         doc_content = row[text_column]
-
-#         # Generate metadata containing other columns' data
-#         metadata = {"row": idx + 1}
-#         for col, value in row.items():
-#             if col != text_column:
-#                 metadata[col] = value
-
-#         metadata_string = write_out_metadata_as_string(metadata)[0]
-
-#         # If chunk_size is provided, split the text into chunks
-#         if chunk_size:
-#             sections = split_string_into_chunks(doc_content, chunk_size, split_strat)
-
-#             # Langchain usage deprecated
-#             # text_splitter = RecursiveCharacterTextSplitter(
-#             #     chunk_size=chunk_size,
-#             #     chunk_overlap=chunk_overlap,
-#             #     split_strat=split_strat,
-#             #     start_index=start_index
-#             # ) #Other arguments as required by the splitter
-
-#             # sections = text_splitter.split_text(doc_content)
-
-#             # For each section, create a Document object
-#             for i, section in enumerate(sections):
-#                 section = '. '.join([metadata_string, section])
-#                 doc = Document(page_content=section,
-#                                metadata={**metadata, "section": i, "row_section": f"{metadata['row']}-{i}"})
-#                 doc_sections.append(doc)
-
-#         else:
-#             # If no chunk_size is provided, create a single Document object for the row
-#             #doc_content = '. '.join([metadata_string, doc_content])
-#             doc = Document(page_content=doc_content, metadata=metadata)
-#             doc_sections.append(doc)
-
-#     message = "Data converted to document format. Now creating/loading document embeddings."
-#     print(message)
-
-#     return doc_sections, message
-
 def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_intermediate_files = "No", chunk_size=None, progress=gr.Progress(track_tqdm=True)) -> List[Document]:
    """Converts a DataFrame's content to a list of dictionaries in the 'Document' format, containing page_content and associated metadata."""
    if not in_file:
-        return None, "Please load in at least one file.",
+        return None, "Please load in at least one file.", df, None, None, None
 
    progress(0, desc = "Loading in data")
 
@@ -309,7 +210,7 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
        return doc_sections, "Please load in at least one csv/Excel/parquet data file."
 
    if not text_column:
-        return None, "Please enter a column name to search",
+        return None, "Please enter a column name to search", df, None, None, None
 
    data_file_name = data_file_names[0]
 
@@ -336,6 +237,8 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
    doc_sections = []
    df[text_column] = df[text_column].astype(str).str.strip() # Ensure column is a string column
 
+    original_text_column = text_column
+
    if clean == "Yes":
        progress(0.1, desc = "Cleaning data")
        clean_tic = time.perf_counter()
@@ -343,21 +246,29 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
 
        #df = df.drop_duplicates(text_column)
 
-        df[text_column] = initial_clean(df[text_column])
        df_list = list(df[text_column])
+        df_list = initial_clean(df_list)
+
+        # Get rid of old data and keep only the new
+        #df = df.drop(text_column, axis = 1)
+
 
-        # Save to file if you have cleaned the data
+        # Save to file if you have cleaned the data. Text column has now been renamed with '_cleaned' at the send
        out_file_name, text_column = save_prepared_bm25_data(data_file_name, df_list, df, text_column)
 
+        df[text_column] = df_list
+
+
        clean_toc = time.perf_counter()
        clean_time_out = f"Cleaning the text took {clean_toc - clean_tic:0.1f} seconds."
        print(clean_time_out)
 
-    cols = [col for col in df.columns if col !=
+    cols = [col for col in df.columns if col != original_text_column]
 
    df["metadata"] = combine_metadata_columns(df, cols)
 
-    df = df.rename(columns={text_column:"page_content"})
+    #df = df.rename(columns={text_column:"page_content"})
 
    #print(df[["page_content", "metadata"]].to_dict(orient='records'))
 
@@ -367,7 +278,7 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
    progress(0.3, desc = "Converting data to document format")
 
    # Create a list of Document objects
-    doc_sections = [Document(page_content=row[
+    doc_sections = [Document(page_content=row[text_column],
                             metadata= parse_metadata(row["metadata"]))
                    for index, row in progress.tqdm(df.iterrows(), desc = "Splitting up text", unit = "rows")]
 
@@ -387,7 +298,6 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
    #print(page_content_series_string[0])
    #metadata_series_string = pd.Series(doc_sections[1]).astype(str)
 
-    import pickle
 
    if clean == "No":
        #pd.DataFrame(data = {"Documents":page_content_series_string}).to_parquet(file_name + "_prepared_docs.parquet")
@@ -399,7 +309,7 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
    elif clean == "Yes":
        #pd.DataFrame(data = {"Documents":page_content_series_string}).to_parquet(file_name + "_prepared_docs_clean.parquet")
 
-        with gzip.open(file_name + "
+        with gzip.open(file_name + "cleaned_prepared_docs.pkl.gz", 'wb') as file:
            pickle.dump(doc_sections, file)
 
        #pd.Series(doc_sections).to_pickle(file_name + "_prepared_docs_clean.pkl")
@@ -407,7 +317,6 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
 
    return doc_sections, "Finished preparing documents."
 
-
 def document_to_dataframe(documents):
    '''
    Convert an object in document format to pandas dataframe
@@ -429,12 +338,3 @@ def document_to_dataframe(documents):
    # Create a DataFrame from the list of rows
    df = pd.DataFrame(rows)
    return df
-
-# Example usage
-#documents = [
-#    Document(page_content="Example content 1", metadata={"author": "Author 1", "year": 2021}),
-#    Document(page_content="Example content 2", metadata={"author": "Author 2", "year": 2022})
-#]
-
-#df = document_to_dataframe(documents)
-#df
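csv_excel_text_to_docs turns each dataframe row into a Document (the text column as page_content, the remaining columns as metadata) and, when cleaning is on, pickles the list into a gzipped file so it can be reloaded later without re-processing. A small sketch of that gzip + pickle round trip (the file name and document structure below are illustrative, not the repo's exact objects):

import gzip
import pickle

doc_sections = [
    {"page_content": "Example content 1", "metadata": {"row": 1, "source": "example.csv"}},
    {"page_content": "Example content 2", "metadata": {"row": 2, "source": "example.csv"}},
]

# Save the prepared documents to a compressed pickle file
with gzip.open("example_prepared_docs.pkl.gz", "wb") as f:
    pickle.dump(doc_sections, f)

# Reload them later without redoing the cleaning/conversion step
with gzip.open("example_prepared_docs.pkl.gz", "rb") as f:
    reloaded = pickle.load(f)

print(reloaded[0]["page_content"])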
search_funcs/spacy_search_funcs.py
CHANGED
@@ -1,4 +1,6 @@
 import spacy
+spacy.prefer_gpu()
+from spacy.cli.download import download
 from spacy.matcher import Matcher
 import numpy as np
 import gradio as gr
@@ -10,15 +12,27 @@ PandasDataFrame = Type[pd.DataFrame]
 
 today_rev = datetime.now().strftime("%Y%m%d")
 
-
-
-string_query = "knife attack run fast"
-df_list = ["Last week someone was grievously injured in a knife attack on Exmoor road. Running away. They ran as fast as possible. I run.","This is the 3rd knifing in the area in as many weeks; knives everywhere.", "attacks of this kind have been increasing for years. Knife attack or knife attack.", "Nothing happened here"]
+# Load the SpaCy model
 
+#os.system("python -m spacy download en_core_web_sm")
+try:
+    import en_core_web_sm
+    nlp = en_core_web_sm.load()
+    print("Successfully imported spaCy model")
+    #nlp = spacy.load("en_core_web_sm")
+    #print(nlp._path)
+except:
+    download("en_core_web_sm")
+    nlp = spacy.load("en_core_web_sm")
+    print("Successfully imported spaCy model")
 
 def spacy_fuzzy_search(string_query:str, df_list: List[str], original_data: PandasDataFrame, text_column:str, in_join_file: PandasDataFrame, search_df_join_column:str, in_join_column:str, no_spelling_mistakes:int = 1, progress=gr.Progress(track_tqdm=True)):
    ''' Conduct fuzzy match on a list of data.'''
 
+    if len(df_list) > 10000:
+        out_message = "Your data has more than 10,000 rows and will take more than three minutes to do a fuzzy search. Please try keyword or semantic search for data of this size."
+        return out_message, None
+
    query = nlp(string_query)
    tokenised_query = [token.text for token in query]
    print(tokenised_query)
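The body of spacy_fuzzy_search beyond tokenising the query is not shown in this diff, but the Matcher imported above supports fuzzy token patterns in recent spaCy releases. A hypothetical sketch of that idea, assuming the FUZZY pattern attribute available in spaCy >= 3.5 (requirements.txt pins spacy==3.7.2); this is not the repo's actual implementation, just the general mechanism, with the allowed edit count playing the role of no_spelling_mistakes:

import spacy
from spacy.matcher import Matcher

# Assumes en_core_web_sm is installed (it is pinned in requirements.txt)
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# FUZZY1 allows up to 1 edit (insertion/deletion/substitution) when matching "knife"
matcher.add("fuzzy_knife", [[{"LOWER": {"FUZZY1": "knife"}}]])

# "knive" is one edit away from "knife", so it is matched
doc = nlp("There was a knive attack on Exmoor road last week.")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)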