seanpedrickcase committed • Commit 2bcd818 • 1 Parent(s): 4ce2224

Updated to Gradio 4.16.0. Now works correctly with BGE embeddings

Browse files:
- .gitignore (+2 -1)
- app.py (+40 -44)
- how_to_create_exe_dist.txt (+14 -11)
- requirements.txt (+6 -5)
- search_funcs/bm25_functions.py (+1 -2)
- search_funcs/helper_functions.py (+0 -2)
- search_funcs/semantic_functions.py (+197 -36)
- search_funcs/semantic_ingest_functions.py (+21 -121)
- search_funcs/spacy_search_funcs.py (+18 -4)
.gitignore
CHANGED
@@ -21,4 +21,5 @@ __pycache__/*
 db/*
 experiments/*
 model/*
-build_deps/*
+build_deps/*
+build_deps_old/*
app.py
CHANGED
@@ -1,10 +1,4 @@
 from typing import Type
-from search_funcs.bm25_functions import prepare_bm25_input_data, prepare_bm25, bm25_search
-#from search_funcs.semantic_ingest_functions import parse_csv_or_excel, csv_excel_text_to_docs
-#from search_funcs.semantic_functions import docs_to_jina_embed_np_array, jina_simple_retrieval
-from search_funcs.helper_functions import dummy_function, display_info, initial_data_load, put_columns_in_join_df, get_temp_folder_path, empty_folder
-from search_funcs.spacy_search_funcs import spacy_fuzzy_search
-
 
 import gradio as gr
 import pandas as pd
@@ -12,6 +6,12 @@ import numpy as np
 
 PandasDataFrame = Type[pd.DataFrame]
 
+from search_funcs.bm25_functions import prepare_bm25_input_data, prepare_bm25, bm25_search
+from search_funcs.semantic_ingest_functions import csv_excel_text_to_docs
+from search_funcs.semantic_functions import docs_to_bge_embed_np_array, bge_simple_retrieval
+from search_funcs.helper_functions import display_info, initial_data_load, put_columns_in_join_df, get_temp_folder_path, empty_folder
+from search_funcs.spacy_search_funcs import spacy_fuzzy_search
+
 # Attempt to delete temporary files generated by previous use of the app (as the files can be very big!)
 temp_folder_path = get_temp_folder_path()
 empty_folder(temp_folder_path)
@@ -20,6 +20,7 @@ empty_folder(temp_folder_path)
 block = gr.Blocks(theme = gr.themes.Base())
 
 with block:
+    print("Please don't close this window! Open the below link in the web browser of your choice.")
 
     ingest_text = gr.State()
     ingest_metadata = gr.State()
@@ -79,38 +80,40 @@ depends on factors such as the type of documents or queries. Information taken f
        with gr.Accordion(label = "Search data", open=True):
            keyword_query = gr.Textbox(label="Enter your search term")
            with gr.Row():
-               keyword_search_button = gr.Button(value="Keyword search", variant="primary")
-               fuzzy_search_button = gr.Button(value="Fuzzy search (
+               keyword_search_button = gr.Button(value="Keyword search", variant="primary", scale=1)
+               fuzzy_search_button = gr.Button(value="Fuzzy search (slow, < 10k rows)", variant="secondary", scale = 0)
            with gr.Row():
                output_single_text = gr.Textbox(label="Top result")
                output_file = gr.File(label="File output")
 
-    # current_source_semantic = gr.Textbox(label="Current data source(s)", value="None")
+    with gr.Tab("Semantic search"):
+        gr.Markdown(
+        """
+        **Thematic/semantic search**
 
+        This search type enables you to search for broader themes (e.g. happiness, nature) and the search will pick out text passages that relate to these themes even if they don't contain the exact words. 1. Load in data file (ideally a file with '_cleaned' at the end of the name, a pkl.gz file), with (optionally) the 'embeddings... .npz' file in the same folder to save loading time. 2. Select the field in your data to search. 3. Wait for the data file to be prepared for search. 4. Enter the search term in the 'Enter semantic search query here' box below and press Enter/click on 'Start semantic search'. 4. Your search results will be saved in a csv file and will be presented in the 'File output' area below.
+        """)
+
+        with gr.Row():
+            current_source_semantic = gr.Textbox(label="Current data source(s)", value="None")
+
+        with gr.Accordion("Load in data", open = True):
+            in_semantic_file = gr.File(label="Upload data file for semantic search", file_count= 'multiple', file_types = ['.parquet', '.csv', '.npy', '.npz', '.pkl', '.pkl.gz'])
+
+            with gr.Row():
+                in_semantic_column = gr.Dropdown(label="Enter the name of the text column in the data file to search")
+                load_semantic_data_button = gr.Button(value="Load data", variant="secondary")
+
+            semantic_load_progress = gr.Textbox(label="Load progress")
+
+        semantic_query = gr.Textbox(label="Enter semantic search query here")
+        semantic_submit = gr.Button(value="Start semantic search", variant="primary")
+
+        with gr.Row():
+            semantic_output_single_text = gr.Textbox(label="Top result")
+            semantic_output_file = gr.File(label="File output")
 
    with gr.Tab(label="Advanced options"):
        with gr.Accordion(label="Data load / save options", open = True):
@@ -136,8 +139,8 @@ depends on factors such as the type of documents or queries. Information taken f
            in_search_param_button = gr.Button(value="Load search parameters (Need to click this if you changed anything above)")
        with gr.Accordion(label="Fuzzy search options", open = False):
            no_spelling_mistakes = gr.Slider(label = "Number of spelling mistakes allowed in fuzzy search", value = 1, minimum=1, maximum=4, step=1)
-
-
+        with gr.Accordion(label="Semantic search options", open = False):
+            semantic_min_distance = gr.Slider(label = "Minimum distance score for search result to be included", value = 0.6, minimum=0, maximum=0.95, step=0.01)
        with gr.Accordion(label = "Join on additional dataframes to results", open = False):
            in_join_file = gr.File(label="Upload your data to join here")
            in_join_message = gr.Textbox(label="Join file load progress")
@@ -166,26 +169,19 @@ depends on factors such as the type of documents or queries. Information taken f
    keyword_query.submit(fn=bm25_search, inputs=[keyword_query, in_no_search_results, keyword_data_state, in_bm25_column, join_data_state, in_clean_data, in_join_column, search_df_join_column], outputs=[output_single_text, output_file])
 
    # Fuzzy search functions on click
-
    fuzzy_search_button.click(fn=spacy_fuzzy_search, inputs=[keyword_query, keyword_data_list_state, keyword_data_state, in_bm25_column, join_data_state, search_df_join_column, in_join_column, no_spelling_mistakes], outputs=[output_single_text, output_file], api_name="fuzzy")
 
    ### SEMANTIC SEARCH ###
+
    # Load in a csv/excel file for semantic search
-    #
-    # Dummy functions just to get dropdowns to work correctly with Gradio 3.50
-    in_bm25_column.change(dummy_function, in_bm25_column, None)
-    search_df_join_column.change(dummy_function, search_df_join_column, None)
-    in_join_column.change(dummy_function, in_join_column, None)
-    # in_semantic_column.change(dummy_function, in_join_column, None)
+    in_semantic_file.upload(initial_data_load, inputs=[in_semantic_file, in_semantic_column], outputs=[in_semantic_column, search_df_join_column, semantic_data_state, search_index_state, embeddings_state, tokenised_state, semantic_load_progress, current_source_semantic])
+    load_semantic_data_button.click(
+        csv_excel_text_to_docs, inputs=[semantic_data_state, in_semantic_file, in_semantic_column, in_clean_data, return_intermediate_files], outputs=[ingest_docs, semantic_load_progress]).\
+        then(docs_to_bge_embed_np_array, inputs=[ingest_docs, in_semantic_file, embeddings_state, return_intermediate_files, embedding_super_compress], outputs=[semantic_load_progress, vectorstore_state, semantic_output_file])
+
+    # Semantic search query
+    semantic_submit.click(bge_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, semantic_min_distance, vec_weight, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic")
+    semantic_query.submit(bge_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, semantic_min_distance, vec_weight, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file])
 
 block.queue().launch(debug=True)
-
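Note on the wiring above: the semantic search tab relies on Gradio's event chaining, where the 'Load data' click first converts the loaded dataframe to documents and then runs the embedding step via .then(), and both the button click and textbox submit trigger bge_simple_retrieval. A minimal, hypothetical sketch of that .click(...).then(...) pattern (the component and function names below are illustrative, not the app's own):

# Minimal sketch of Gradio event chaining, assuming Gradio 4.x.
import gradio as gr

def prepare_docs(text):
    # Hypothetical ingest step: split the input into "documents"
    return text.split("."), "Documents prepared"

def embed_docs(docs):
    # Hypothetical embedding step: pretend to embed each document
    return f"Embedded {len(docs)} documents"

with gr.Blocks() as demo:
    docs_state = gr.State()
    in_text = gr.Textbox(label="Paste text")
    load_button = gr.Button("Load data")
    progress_box = gr.Textbox(label="Progress")

    # The first event stores the docs in State; the chained step then consumes them
    load_button.click(prepare_docs, inputs=in_text, outputs=[docs_state, progress_box]).\
        then(embed_docs, inputs=docs_state, outputs=progress_box)

demo.launch()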
how_to_create_exe_dist.txt
CHANGED
@@ -6,24 +6,27 @@
 
 NOTE: for ensuring that spaCy models are loaded into the program correctly in requirements.txt, follow this guide: https://spacy.io/usage/models#models-download
 
-python -m PyInstaller --additional-hooks-dir="build_deps\\" --hidden-import pyarrow.vendored.version --add-data="build_deps\\types.json;gradio_client" --onefile --clean --noconfirm --name DataSearchApp_0.2.2_keyword app.py
-
-For a small exe with a folder of dependencies:
-python -m PyInstaller --additional-hooks-dir="build_deps\\" --hidden-import pyarrow.vendored.version --add-data="build_deps\\types.json;gradio_client" --add-data "model;model" --clean --noconfirm --name DataSearchApp_0.2.2 app.py
+6. If necessary, create hook- files to tell pyinstaller to include specific packages in the exe build. Examples are provided for gradio and en_core_web_sm (a spaCy model). Put these in the build_deps\ subfolder
+
+7. pip install pyinstaller
+
+8. In command line, cd to the folder that contains app.py.
+
+9. Run the following, assuming you want to make one single .exe file (This helped me: https://github.com/pyinstaller/pyinstaller/issues/8108):
+
+a) In command line: pyi-makespec --additional-hooks-dir="build_deps\\" --collect-data=gradio_client --collect-data=gradio --hidden-import pyarrow.vendored.version --onefile --name DataSearchApp_0.2.3 app.py
+
+b) Open the created spec file in Notepad. Add the following to the end of the Analysis section then save:
+
+a = Analysis(
+    ...
+    module_collection_mode={
+        'gradio': 'py',  # Collect gradio package as source .py files
+    }
+)
+
+c) Back in command line, run this: pyinstaller --clean --noconfirm DataSearchApp_0.2.3.spec
 
 9. A 'dist' folder will be created with the executable inside along with all dependencies('dist\data_text_search').
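Step 6 above refers to "hook-" files placed in build_deps\. The repo's actual hook contents are not shown in this diff, but a minimal PyInstaller hook for the spaCy model might look like the following sketch, using PyInstaller's documented hook helpers (the file name and exact collected package are assumptions):

# Hypothetical build_deps/hook-en_core_web_sm.py
from PyInstaller.utils.hooks import collect_data_files, collect_submodules

# Bundle the spaCy model package's data files and submodules into the build
datas = collect_data_files("en_core_web_sm")
hiddenimports = collect_submodules("en_core_web_sm")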
requirements.txt
CHANGED
@@ -1,10 +1,11 @@
-pandas==2.
+pandas==2.2.0
 polars==0.20.3
 pyarrow==14.0.2
 openpyxl==3.1.2
-#
-#
-
+#transformers==4.37.2
+#accelerate==0.26.0
+torch==2.1.2
 spacy==3.7.2
 en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
-gradio==4.16.0
+gradio==4.16.0
+sentence_transformers==2.3.1
search_funcs/bm25_functions.py
CHANGED
@@ -6,7 +6,6 @@ import sys
 import gzip
 import time
 import pandas as pd
-import numpy as np
 from numpy import inf
 import gradio as gr
 
@@ -15,7 +14,7 @@ from datetime import datetime
 today_rev = datetime.now().strftime("%Y%m%d")
 
 from search_funcs.clean_funcs import initial_clean # get_lemma_tokens, stem_sentence
-from search_funcs.helper_functions import
+from search_funcs.helper_functions import get_file_path_end_with_ext, get_file_path_end
 
 # Load the SpaCy model
 from spacy.cli.download import download
search_funcs/helper_functions.py
CHANGED
@@ -30,8 +30,6 @@ def empty_folder(directory_path):
            #print(f'Failed to delete {file_path}. Reason: {e}')
            print('')
 
-
-
 def get_file_path_end(file_path):
    # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
    basename = os.path.basename(file_path)
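The diff only shows the first lines of get_file_path_end, and get_file_path_end_with_ext is imported elsewhere without its body appearing here. A plausible sketch of what these two helpers do, inferred from the visible comment and usage (an assumption, the real implementations may differ):

import os

def get_file_path_end(file_path):
    # "/path/to/example.txt" -> "example" (basename without its extension)
    basename = os.path.basename(file_path)
    return os.path.splitext(basename)[0]

def get_file_path_end_with_ext(file_path):
    # "/path/to/example.txt" -> "example.txt" (basename including its extension)
    return os.path.basename(file_path)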
search_funcs/semantic_functions.py
CHANGED
@@ -5,14 +5,14 @@ from typing import Type
 import gradio as gr
 import numpy as np
 from datetime import datetime
-import
+#from transformers import AutoModel, AutoTokenizer
+from search_funcs.helper_functions import get_file_path_end
+#import torch
+from torch import cuda, backends#, tensor, mm, utils
+from sentence_transformers import SentenceTransformer
 
 today_rev = datetime.now().strftime("%Y%m%d")
 
-from transformers import AutoModel
-
-from torch import cuda, backends, tensor, mm
-
 # Check for torch cuda
 print("Is CUDA enabled? ", cuda.is_available())
 print("Is a CUDA device available on this computer?", backends.cudnn.enabled)
@@ -29,47 +29,122 @@ print("Device used is: ", torch_device)
 
 PandasDataFrame = Type[pd.DataFrame]
 
-# Load embeddings
+# Load embeddings - Jina - deprecated
 # Pinning a Jina revision for security purposes: https://www.baseten.co/blog/pinning-ml-model-revisions-for-compatibility-and-security/
 # Save Jina model locally as described here: https://huggingface.co/jinaai/jina-embeddings-v2-base-en/discussions/29
-embeddings_name = "jinaai/jina-embeddings-v2-small-en"
-local_embeddings_location = "model/jina/"
-revision_choice = "b811f03af3d4d7ea72a7c25c802b21fc675a5d99"
+# embeddings_name = "jinaai/jina-embeddings-v2-small-en"
+# local_embeddings_location = "model/jina/"
+# revision_choice = "b811f03af3d4d7ea72a7c25c802b21fc675a5d99"
 
-try:
-except:
-embeddings_func = AutoModel.from_pretrained(embeddings_name, trust_remote_code=True, device_map="auto")
+# try:
+#     embeddings_model = AutoModel.from_pretrained(local_embeddings_location, revision = revision_choice, trust_remote_code=True,local_files_only=True, device_map="auto")
+# except:
+#     embeddings_model = AutoModel.from_pretrained(embeddings_name, revision = revision_choice, trust_remote_code=True, device_map="auto")
+
+# Load embeddings
+embeddings_name = "BAAI/bge-small-en-v1.5"
+local_embeddings_location = "model/bge/"
+
+#try:
+#    tokenizer = AutoTokenizer.from_pretrained(embeddings_name)
+#    embeddings_model = AutoModel.from_pretrained(local_embeddings_location, local_files_only=True).to(torch_device)
+#except:
+#    tokenizer = AutoTokenizer.from_pretrained(embeddings_name)
+#    embeddings_model = AutoModel.from_pretrained(embeddings_name).to(torch_device)
+
+# Not using SentenceTransformer here
+embeddings_model = SentenceTransformer(embeddings_name)
+
+# def calc_bge_norm_embeddings(docs, embeddings_model=embeddings_model, tokenizer=tokenizer, progress=gr.Progress(track_tqdm=True)):
+#     # Tokenize sentences
+#     print("Tokenising")
+#     encoded_input = tokenizer(docs, padding=True, truncation=True, return_tensors='pt', max_length=32).to(torch_device)
+
+#     # Compute token embeddings
+#     print("Calculating embeddings")
+#     with torch.no_grad():
+#         model_output = embeddings_model(**encoded_input).to(torch_device)
+#     # Perform pooling. In this case, cls pooling.
+#     embeddings_out = model_output[0][:, 0]
+#     # normalize embeddings
+#     embeddings_out = torch.nn.functional.normalize(embeddings_out, p=2, dim=1)
+#     #print("Sentence embeddings:", embeddings_out)
+
+#     return embeddings_out
+
+def docs_to_jina_embed_np_array(docs_out, in_file, embeddings_state, return_intermediate_files = "No", embeddings_super_compress = "No", embeddings = embeddings_model, progress=gr.Progress(track_tqdm=True)):
     '''
+    Takes a Langchain document class and saves it into a Chroma sqlite file.
     '''
+    if not in_file:
+        out_message = "No input file found. Please load in at least one file."
+        print(out_message)
+        return out_message, None, None
+
+    progress(0.6, desc = "Loading/creating embeddings")
+
+    print(f"> Total split documents: {len(docs_out)}")
+
+    #print(docs_out)
+
+    page_contents = [doc.page_content for doc in docs_out]
+
+    ## Load in pre-embedded file if exists
+    file_list = [string.name for string in in_file]
+
+    #print(file_list)
+
+    embeddings_file_names = [string for string in file_list if "embedding" in string.lower()]
+    data_file_names = [string for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower()]# and "gz" not in string.lower()]
+    data_file_name = data_file_names[0]
+    data_file_name_no_ext = get_file_path_end(data_file_name)
+
+    out_message = "Document processing complete. Ready to search."
+
+    # print("embeddings loaded: ", embeddings_out)
+
+    if embeddings_state.size == 0:
+        tic = time.perf_counter()
+        print("Starting to embed documents.")
+        #embeddings_list = []
+        #for page in progress.tqdm(page_contents, desc = "Preparing search index", unit = "rows"):
+        #    embeddings_list.append(embeddings.encode(sentences=page, max_length=1024).tolist())
+
+        embeddings_out = embeddings.encode(sentences=page_contents, max_length=1024, show_progress_bar = True, batch_size = 32) # For Jina embeddings
+        #embeddings_list = embeddings.encode(sentences=page_contents, normalize_embeddings=True).tolist() # For BGE embeddings
+        #embeddings_list = embeddings.encode(sentences=page_contents).tolist() # For minilm
+
+        toc = time.perf_counter()
+        time_out = f"The embedding took {toc - tic:0.1f} seconds"
+        print(time_out)
+
+        # If you want to save your files for next time
+        if return_intermediate_files == "Yes":
+            progress(0.9, desc = "Saving embeddings to file")
+            if embeddings_super_compress == "No":
+                semantic_search_file_name = data_file_name_no_ext + '_' + 'embeddings.npz'
+                np.savez_compressed(semantic_search_file_name, embeddings_out)
+            else:
+                semantic_search_file_name = data_file_name_no_ext + '_' + 'embedding_compress.npz'
+                embeddings_out_round = np.round(embeddings_out, 3)
+                embeddings_out_round *= 100 # Rounding not currently used
+                np.savez_compressed(semantic_search_file_name, embeddings_out_round)
+
+            return out_message, embeddings_out, semantic_search_file_name
+
+        return out_message, embeddings_out, None
+    else:
+        # Just return existing embeddings if already exist
+        embeddings_out = embeddings_state
+
+    print(out_message)
+
+    return out_message, embeddings_out, None#, None
+
+
+def docs_to_bge_embed_np_array(docs_out, in_file, embeddings_state, return_intermediate_files = "No", embeddings_super_compress = "No", embeddings_model = embeddings_model, progress=gr.Progress(track_tqdm=True)):
     '''
     Takes a Langchain document class and saves it into a Chroma sqlite file.
     '''
@@ -79,7 +154,7 @@ def docs_to_jina_embed_np_array(docs_out, in_file, embeddings_state, return_inte
        return out_message, None, None
 
-    progress(0.
+    progress(0.6, desc = "Loading/creating embeddings")
 
    print(f"> Total split documents: {len(docs_out)}")
 
@@ -108,7 +183,11 @@ def docs_to_jina_embed_np_array(docs_out, in_file, embeddings_state, return_inte
        #for page in progress.tqdm(page_contents, desc = "Preparing search index", unit = "rows"):
        #    embeddings_list.append(embeddings.encode(sentences=page, max_length=1024).tolist())
 
-
+
+        #embeddings_out = calc_bge_norm_embeddings(page_contents, embeddings_model, tokenizer)
+
+        embeddings_out = embeddings_model.encode(sentences=page_contents, show_progress_bar = True, batch_size = 32, normalize_embeddings=True) # For BGE
        #embeddings_list = embeddings.encode(sentences=page_contents, normalize_embeddings=True).tolist() # For BGE embeddings
        #embeddings_list = embeddings.encode(sentences=page_contents).tolist() # For minilm
 
@@ -120,10 +199,10 @@ def docs_to_jina_embed_np_array(docs_out, in_file, embeddings_state, return_inte
        if return_intermediate_files == "Yes":
            progress(0.9, desc = "Saving embeddings to file")
            if embeddings_super_compress == "No":
-                semantic_search_file_name = data_file_name_no_ext + '
+                semantic_search_file_name = data_file_name_no_ext + '_bge_embeddings.npz'
                np.savez_compressed(semantic_search_file_name, embeddings_out)
            else:
-                semantic_search_file_name = data_file_name_no_ext + '
+                semantic_search_file_name = data_file_name_no_ext + '_bge_embedding_compress.npz'
                embeddings_out_round = np.round(embeddings_out, 3)
                embeddings_out_round *= 100 # Rounding not currently used
                np.savez_compressed(semantic_search_file_name, embeddings_out_round)
@@ -218,6 +297,88 @@ def process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_c
 
    return results_df_out
 
+def bge_simple_retrieval(query_str:str, vectorstore, docs, orig_df_col:str, k_val:int, out_passages:int,
+                         vec_score_cut_off:float, vec_weight:float, in_join_file, in_join_column = None, search_df_join_column = None, device = torch_device, embeddings = embeddings_model, progress=gr.Progress(track_tqdm=True)): # ,vectorstore, embeddings
+
+    # print("vectorstore loaded: ", vectorstore)
+    progress(0, desc = "Conducting semantic search")
+
+    print("Searching")
+
+    # Convert it to a PyTorch tensor and transfer to GPU
+    #vectorstore_tensor = tensor(vectorstore).to(device)
+
+    # Load the sentence transformer model and move it to GPU
+    embeddings = embeddings.to(device)
+
+    # Encode the query using the sentence transformer and convert to a PyTorch tensor
+    query = embeddings.encode(query_str, normalize_embeddings=True)
+
+    # query = calc_bge_norm_embeddings(query_str, embeddings_model=embeddings_model, tokenizer=tokenizer)
+
+    #query_tensor = tensor(query).to(device)
+
+    # if query_tensor.dim() == 1:
+    #     query_tensor = query_tensor.unsqueeze(0)  # Reshape to 2D with one row
+
+    # Sentence transformers method, not used:
+    cosine_similarities = query @ vectorstore.T
+    #cosine_similarities = util.cos_sim(query_tensor, vectorstore_tensor)[0]
+    #top_results = torch.topk(cos_scores, k=top_k)
+
+    # Normalize the query tensor and vectorstore tensor
+    #query_norm = query_tensor / query_tensor.norm(dim=1, keepdim=True)
+    #vectorstore_norm = vectorstore_tensor / vectorstore_tensor.norm(dim=1, keepdim=True)
+
+    # Calculate cosine similarities (batch processing)
+    #cosine_similarities = mm(query_norm, vectorstore_norm.T)
+    #cosine_similarities = mm(query_tensor, vectorstore_tensor.T)
+
+    # Flatten the tensor to a 1D array
+    cosine_similarities = cosine_similarities.flatten()
+
+    # Convert to a NumPy array if it's still a PyTorch tensor
+    #cosine_similarities = cosine_similarities.cpu().numpy()
+
+    # Create a Pandas Series
+    cosine_similarities_series = pd.Series(cosine_similarities)
+
+    # Pull out relevent info from docs
+    page_contents = [doc.page_content for doc in docs]
+    page_meta = [doc.metadata for doc in docs]
+    ids_range = range(0,len(page_contents))
+    ids = [str(element) for element in ids_range]
+
+    df_docs = pd.DataFrame(data={"ids": ids,
+                                 "documents": page_contents,
+                                 "metadatas":page_meta,
+                                 "distances":cosine_similarities_series}).sort_values("distances", ascending=False).iloc[0:k_val,:]
+
+    results_df_out = process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_cut_off, vec_weight, orig_df_col, in_join_column, search_df_join_column)
+
+    print("Search complete")
+
+    # If nothing found, return error message
+    if results_df_out.empty:
+        return 'No result found!', None
+
+    query_str_file = query_str.replace(" ", "_")
+
+    results_df_name = "semantic_search_result_" + today_rev + "_" + query_str_file + ".xlsx"
+
+    print("Saving search output to file")
+    progress(0.7, desc = "Saving search output to file")
+
+    results_df_out.to_excel(results_df_name, index= None)
+    results_first_text = results_df_out.iloc[0, 1]
+
+    print("Returning results")
+
+    return results_first_text, results_df_name
+
+
 def jina_simple_retrieval(query_str:str, vectorstore, docs, orig_df_col:str, k_val:int, out_passages:int,
                          vec_score_cut_off:float, vec_weight:float, in_join_file, in_join_column = None, search_df_join_column = None, device = torch_device, embeddings = embeddings_model, progress=gr.Progress(track_tqdm=True)): # ,vectorstore, embeddings
 
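The core of the new BGE path above is that both the documents and the query are encoded with normalize_embeddings=True, so a plain dot product between them equals cosine similarity; bge_simple_retrieval then ranks by that score, keeps the top k_val rows, and (via process_data_from_scores_df) applies the minimum-distance cut-off. A standalone sketch of that idea, using the model name from the diff but otherwise illustrative names and data:

import pandas as pd
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("BAAI/bge-small-en-v1.5")

passages = [
    "The park was full of people enjoying the sunshine.",
    "Quarterly revenue fell short of expectations.",
    "Walking in the forest always lifts my mood.",
]

# Embed the corpus once; normalising here means query @ corpus.T is cosine similarity
corpus_emb = model.encode(passages, normalize_embeddings=True, batch_size=32)

def search(query, k=2, min_score=0.6):
    # 0.6 mirrors the default of the semantic_min_distance slider in app.py
    query_emb = model.encode(query, normalize_embeddings=True)
    scores = query_emb @ corpus_emb.T
    results = (pd.DataFrame({"documents": passages, "distances": scores})
                 .sort_values("distances", ascending=False)
                 .head(k))
    return results[results["distances"] >= min_score]

print(search("being happy in nature"))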
search_funcs/semantic_ingest_functions.py
CHANGED
@@ -1,12 +1,11 @@
-# Install/ import
-
-import os
+# Install/ import packages
 import time
 import re
 import ast
 import gzip
 import pandas as pd
 import gradio as gr
+import pickle
 from typing import Type, List, Literal
 #from langchain.text_splitter import RecursiveCharacterTextSplitter
 
@@ -36,19 +35,6 @@ from search_funcs.helper_functions import get_file_path_end_with_ext, detect_fil
 from search_funcs.bm25_functions import save_prepared_bm25_data
 from search_funcs.clean_funcs import initial_clean
 
-## Parse files
-# def detect_file_type(file_path):
-#     """
-#     Determine the file type based on its extension.
-
-#     Parameters:
-#     file_path (str): Path to the file.
-
-#     Returns:
-#     str: File extension (e.g., '.pdf', '.docx', '.txt', '.html').
-#     """
-#     return os.path.splitext(file_path)[1].lower()
-
 def parse_file_not_used(file_paths, text_column='text'):
    """
    Accepts a list of file paths, determines each file's type based on its extension,
@@ -124,8 +110,6 @@ def parse_csv_or_excel(file_path, data_state, text_column = "text"):
        Pandas DataFrame: Dataframe output from file read
    """
 
-    #out_df = pd.DataFrame()
-
    file_list = [string.name for string in file_path]
 
    #print(file_list)
@@ -137,40 +121,10 @@ def parse_csv_or_excel(file_path, data_state, text_column = "text"):
    #for file_path in file_paths:
    file_name = get_file_path_end_with_ext(data_file_name)
 
-    #print(file_extension)
-
-    # if file_extension == "csv":
-    #     df = pd.read_csv(data_file_names[0], low_memory=False)
-    #     if text_column not in df.columns: return pd.DataFrame(), ['Please choose a valid column name']
-    #     df['source'] = file_name
-    #     df['page_section'] = ""
-    # elif file_extension == "xlsx":
-    #     df = pd.read_excel(data_file_names[0], engine='openpyxl')
-    #     if text_column not in df.columns: return pd.DataFrame(), ['Please choose a valid column name']
-    #     df['source'] = file_name
-    #     df['page_section'] = ""
-    # elif file_extension == "parquet":
-    #     df = pd.read_parquet(data_file_names[0])
-    #     if text_column not in df.columns: return pd.DataFrame(), ['Please choose a valid column name']
-    #     df['source'] = file_name
-    #     df['page_section'] = ""
-    # else:
-    #     print(f"Unsupported file type: {file_extension}")
-    #     return pd.DataFrame(), ['Please choose a valid file type']
-
-    df = data_state
-    #df['source'] = file_name
-    #df['page_section'] = ""
-
    message = "Loaded in file. Now converting to document format."
    print(message)
 
-    return
-
-
-# +
-# Convert parsed text to docs
-# -
+    return data_state, file_name, message
 
 def write_out_metadata_as_string(metadata_in):
    # If metadata_in is a single dictionary, wrap it in a list
@@ -241,63 +195,10 @@ def parse_metadata(row):
        # Handle the error or log it
        return None # or some default value
 
-# def csv_excel_text_to_docs_deprecated(df, text_column='text', chunk_size=None) -> List[Document]:
-#     """Converts a DataFrame's content to a list of Documents with metadata."""
-
-#     print("Converting to documents.")
-
-#     doc_sections = []
-#     df[text_column] = df[text_column].astype(str) # Ensure column is a string column
-
-#     # For each row in the dataframe
-#     for idx, row in df.iterrows():
-#         # Extract the text content for the document
-#         doc_content = row[text_column]
-
-#         # Generate metadata containing other columns' data
-#         metadata = {"row": idx + 1}
-#         for col, value in row.items():
-#             if col != text_column:
-#                 metadata[col] = value
-
-#         metadata_string = write_out_metadata_as_string(metadata)[0]
-
-#         # If chunk_size is provided, split the text into chunks
-#         if chunk_size:
-#             sections = split_string_into_chunks(doc_content, chunk_size, split_strat)
-
-#             # Langchain usage deprecated
-#             # text_splitter = RecursiveCharacterTextSplitter(
-#             #     chunk_size=chunk_size,
-#             #     chunk_overlap=chunk_overlap,
-#             #     split_strat=split_strat,
-#             #     start_index=start_index
-#             # ) #Other arguments as required by the splitter
-
-#             # sections = text_splitter.split_text(doc_content)
-
-#             # For each section, create a Document object
-#             for i, section in enumerate(sections):
-#                 section = '. '.join([metadata_string, section])
-#                 doc = Document(page_content=section,
-#                                metadata={**metadata, "section": i, "row_section": f"{metadata['row']}-{i}"})
-#                 doc_sections.append(doc)
-
-#         else:
-#             # If no chunk_size is provided, create a single Document object for the row
-#             #doc_content = '. '.join([metadata_string, doc_content])
-#             doc = Document(page_content=doc_content, metadata=metadata)
-#             doc_sections.append(doc)
-
-#     message = "Data converted to document format. Now creating/loading document embeddings."
-#     print(message)
-
-#     return doc_sections, message
-
 def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_intermediate_files = "No", chunk_size=None, progress=gr.Progress(track_tqdm=True)) -> List[Document]:
    """Converts a DataFrame's content to a list of dictionaries in the 'Document' format, containing page_content and associated metadata."""
    if not in_file:
-        return None, "Please load in at least one file.",
+        return None, "Please load in at least one file.", df, None, None, None
 
    progress(0, desc = "Loading in data")
 
@@ -309,7 +210,7 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
        return doc_sections, "Please load in at least one csv/Excel/parquet data file."
 
    if not text_column:
-        return None, "Please enter a column name to search",
+        return None, "Please enter a column name to search", df, None, None, None
 
    data_file_name = data_file_names[0]
 
@@ -336,6 +237,8 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
    doc_sections = []
    df[text_column] = df[text_column].astype(str).str.strip() # Ensure column is a string column
 
+    original_text_column = text_column
+
    if clean == "Yes":
        progress(0.1, desc = "Cleaning data")
        clean_tic = time.perf_counter()
@@ -343,21 +246,29 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
 
        #df = df.drop_duplicates(text_column)
 
-        df[text_column] = initial_clean(df[text_column])
        df_list = list(df[text_column])
+        df_list = initial_clean(df_list)
+
+        # Get rid of old data and keep only the new
+        #df = df.drop(text_column, axis = 1)
+
 
-        # Save to file if you have cleaned the data
+        # Save to file if you have cleaned the data. Text column has now been renamed with '_cleaned' at the send
        out_file_name, text_column = save_prepared_bm25_data(data_file_name, df_list, df, text_column)
 
+        df[text_column] = df_list
+
+
        clean_toc = time.perf_counter()
        clean_time_out = f"Cleaning the text took {clean_toc - clean_tic:0.1f} seconds."
        print(clean_time_out)
 
-    cols = [col for col in df.columns if col !=
+    cols = [col for col in df.columns if col != original_text_column]
 
    df["metadata"] = combine_metadata_columns(df, cols)
 
-    df = df.rename(columns={text_column:"page_content"})
+    #df = df.rename(columns={text_column:"page_content"})
 
    #print(df[["page_content", "metadata"]].to_dict(orient='records'))
 
@@ -367,7 +278,7 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
    progress(0.3, desc = "Converting data to document format")
 
    # Create a list of Document objects
-    doc_sections = [Document(page_content=row[
+    doc_sections = [Document(page_content=row[text_column],
                             metadata= parse_metadata(row["metadata"]))
                    for index, row in progress.tqdm(df.iterrows(), desc = "Splitting up text", unit = "rows")]
 
@@ -387,7 +298,6 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
    #print(page_content_series_string[0])
    #metadata_series_string = pd.Series(doc_sections[1]).astype(str)
 
-    import pickle
 
    if clean == "No":
        #pd.DataFrame(data = {"Documents":page_content_series_string}).to_parquet(file_name + "_prepared_docs.parquet")
@@ -399,7 +309,7 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
    elif clean == "Yes":
        #pd.DataFrame(data = {"Documents":page_content_series_string}).to_parquet(file_name + "_prepared_docs_clean.parquet")
 
-        with gzip.open(file_name + "
+        with gzip.open(file_name + "cleaned_prepared_docs.pkl.gz", 'wb') as file:
            pickle.dump(doc_sections, file)
 
        #pd.Series(doc_sections).to_pickle(file_name + "_prepared_docs_clean.pkl")
@@ -407,7 +317,6 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
 
    return doc_sections, "Finished preparing documents."
 
-
 def document_to_dataframe(documents):
    '''
    Convert an object in document format to pandas dataframe
@@ -429,12 +338,3 @@ def document_to_dataframe(documents):
    # Create a DataFrame from the list of rows
    df = pd.DataFrame(rows)
    return df
-
-# Example usage
-#documents = [
-#    Document(page_content="Example content 1", metadata={"author": "Author 1", "year": 2021}),
-#    Document(page_content="Example content 2", metadata={"author": "Author 2", "year": 2022})
-#]
-
-#df = document_to_dataframe(documents)
-#df
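csv_excel_text_to_docs turns each dataframe row into a Document (the text column as page_content, the remaining columns as metadata) and, when cleaning is on, pickles the list into a gzipped file so it can be reloaded later without re-processing. A small sketch of that gzip + pickle round trip (the file name and document structure below are illustrative, not the repo's exact objects):

import gzip
import pickle

doc_sections = [
    {"page_content": "Example content 1", "metadata": {"row": 1, "source": "example.csv"}},
    {"page_content": "Example content 2", "metadata": {"row": 2, "source": "example.csv"}},
]

# Save the prepared documents to a compressed pickle file
with gzip.open("example_prepared_docs.pkl.gz", "wb") as f:
    pickle.dump(doc_sections, f)

# Reload them later without redoing the cleaning/conversion step
with gzip.open("example_prepared_docs.pkl.gz", "rb") as f:
    reloaded = pickle.load(f)

print(reloaded[0]["page_content"])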
search_funcs/spacy_search_funcs.py
CHANGED
@@ -1,4 +1,6 @@
 import spacy
+spacy.prefer_gpu()
+from spacy.cli.download import download
 from spacy.matcher import Matcher
 import numpy as np
 import gradio as gr
@@ -10,15 +12,27 @@ PandasDataFrame = Type[pd.DataFrame]
 
 today_rev = datetime.now().strftime("%Y%m%d")
 
-
-
-string_query = "knife attack run fast"
-df_list = ["Last week someone was grievously injured in a knife attack on Exmoor road. Running away. They ran as fast as possible. I run.","This is the 3rd knifing in the area in as many weeks; knives everywhere.", "attacks of this kind have been increasing for years. Knife attack or knife attack.", "Nothing happened here"]
+# Load the SpaCy model
 
+#os.system("python -m spacy download en_core_web_sm")
+try:
+    import en_core_web_sm
+    nlp = en_core_web_sm.load()
+    print("Successfully imported spaCy model")
+    #nlp = spacy.load("en_core_web_sm")
+    #print(nlp._path)
+except:
+    download("en_core_web_sm")
+    nlp = spacy.load("en_core_web_sm")
+    print("Successfully imported spaCy model")
 
 def spacy_fuzzy_search(string_query:str, df_list: List[str], original_data: PandasDataFrame, text_column:str, in_join_file: PandasDataFrame, search_df_join_column:str, in_join_column:str, no_spelling_mistakes:int = 1, progress=gr.Progress(track_tqdm=True)):
    ''' Conduct fuzzy match on a list of data.'''
 
+    if len(df_list) > 10000:
+        out_message = "Your data has more than 10,000 rows and will take more than three minutes to do a fuzzy search. Please try keyword or semantic search for data of this size."
+        return out_message, None
+
    query = nlp(string_query)
    tokenised_query = [token.text for token in query]
    print(tokenised_query)
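The body of spacy_fuzzy_search beyond tokenising the query is not shown in this diff, but the Matcher imported above supports fuzzy token patterns in recent spaCy releases. A hypothetical sketch of that idea, assuming the FUZZY pattern attribute available in spaCy >= 3.5 (requirements.txt pins spacy==3.7.2); this is not the repo's actual implementation, just the general mechanism, with the allowed edit count playing the role of no_spelling_mistakes:

import spacy
from spacy.matcher import Matcher

# Assumes en_core_web_sm is installed (it is pinned in requirements.txt)
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# FUZZY1 allows up to 1 edit (insertion/deletion/substitution) when matching "knife"
matcher.add("fuzzy_knife", [[{"LOWER": {"FUZZY1": "knife"}}]])

# "knive" is one edit away from "knife", so it is matched
doc = nlp("There was a knive attack on Exmoor road last week.")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)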