Spaces:
Sleeping
Sleeping
Commit
·
ea0dd40
1
Parent(s):
2806807
Changed embedding model to MiniLM-L6 as it is faster. Compressed embeddings are now int8. General improvements to API mode
Browse files- Dockerfile +9 -13
- app.py +16 -11
- download_model.py +15 -0
- output/36de65711121889ccdcb768b85e97e386d8fe4bd/keyword_search_result_20240702_school.xlsm +0 -0
- requirements.txt +1 -1
- requirements_gpu.txt +1 -1
- search_funcs/bm25_functions.py +53 -48
- search_funcs/clean_funcs.py +38 -17
- search_funcs/helper_functions.py +19 -4
- search_funcs/semantic_functions.py +149 -71
- search_funcs/semantic_ingest_functions.py +32 -157
- search_funcs/spacy_search_funcs.py +3 -14
Dockerfile
CHANGED
@@ -1,11 +1,8 @@
|
|
1 |
# First stage: build dependencies
|
2 |
-
|
3 |
|
4 |
-
#
|
5 |
-
|
6 |
-
|
7 |
-
# Install Lambda web adapter in case you want to run with with an AWS Lamba function URL
|
8 |
-
COPY --from=public.ecr.aws/awsguru/aws-lambda-adapter:0.8.3 /lambda-adapter /opt/extensions/lambda-adapter
|
9 |
|
10 |
# Install wget
|
11 |
RUN apt-get update && apt-get install -y wget
|
@@ -20,14 +17,12 @@ COPY requirements.txt .
|
|
20 |
RUN pip install --no-cache-dir -r requirements.txt
|
21 |
|
22 |
# Gradio needs to be installed after due to conflict with spacy in requirements
|
23 |
-
RUN pip install --no-cache-dir gradio==4.
|
24 |
|
25 |
-
# Download the BGE embedding model during the build process
|
26 |
-
RUN
|
27 |
-
|
28 |
-
RUN
|
29 |
-
RUN git clone https://huggingface.co/BAAI/bge-small-en-v1.5 /model/bge
|
30 |
-
RUN rm -rf /model/bge/.git
|
31 |
|
32 |
# Set up a new user named "user" with user ID 1000
|
33 |
RUN useradd -m -u 1000 user
|
@@ -47,6 +42,7 @@ ENV HOME=/home/user \
|
|
47 |
PATH=/home/user/.local/bin:$PATH \
|
48 |
PYTHONPATH=$HOME/app \
|
49 |
PYTHONUNBUFFERED=1 \
|
|
|
50 |
GRADIO_ALLOW_FLAGGING=never \
|
51 |
GRADIO_NUM_PORTS=1 \
|
52 |
GRADIO_SERVER_NAME=0.0.0.0 \
|
|
|
1 |
# First stage: build dependencies
|
2 |
+
FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm
|
3 |
|
4 |
+
# Optional - install Lambda web adapter in case you want to run with an AWS Lambda function URL
|
5 |
+
# COPY --from=public.ecr.aws/awsguru/aws-lambda-adapter:0.8.3 /lambda-adapter /opt/extensions/lambda-adapter
|
|
|
|
|
|
|
6 |
|
7 |
# Install wget
|
8 |
RUN apt-get update && apt-get install -y wget
|
|
|
17 |
RUN pip install --no-cache-dir -r requirements.txt
|
18 |
|
19 |
# Gradio needs to be installed after due to conflict with spacy in requirements
|
20 |
+
RUN pip install --no-cache-dir gradio==4.37.2
|
21 |
|
22 |
+
# Download the MiniLM embedding model (all-MiniLM-L6-v2) during the build process. Create a directory for the model and download specific files using huggingface_hub
|
23 |
+
RUN mkdir -p /model/minilm
|
24 |
+
COPY download_model.py /src/download_model.py
|
25 |
+
RUN python /src/download_model.py
|
|
|
|
|
26 |
|
27 |
# Set up a new user named "user" with user ID 1000
|
28 |
RUN useradd -m -u 1000 user
|
|
|
42 |
PATH=/home/user/.local/bin:$PATH \
|
43 |
PYTHONPATH=$HOME/app \
|
44 |
PYTHONUNBUFFERED=1 \
|
45 |
+
PYTHONDONTWRITEBYTECODE=1 \
|
46 |
GRADIO_ALLOW_FLAGGING=never \
|
47 |
GRADIO_NUM_PORTS=1 \
|
48 |
GRADIO_SERVER_NAME=0.0.0.0 \
|
app.py
CHANGED
@@ -7,7 +7,7 @@ PandasDataFrame = Type[pd.DataFrame]
|
|
7 |
|
8 |
from search_funcs.bm25_functions import prepare_bm25_input_data, prepare_bm25, bm25_search
|
9 |
from search_funcs.semantic_ingest_functions import csv_excel_text_to_docs
|
10 |
-
from search_funcs.semantic_functions import docs_to_bge_embed_np_array, bge_semantic_search
|
11 |
from search_funcs.helper_functions import display_info, initial_data_load, put_columns_in_join_df, get_temp_folder_path, empty_folder, get_connection_params, output_folder
|
12 |
from search_funcs.spacy_search_funcs import spacy_fuzzy_search
|
13 |
from search_funcs.aws_functions import load_data_from_aws
|
@@ -24,24 +24,29 @@ with app:
|
|
24 |
|
25 |
# BM25 state objects
|
26 |
orig_keyword_data_state = gr.State(pd.DataFrame()) # Original data that is not changed #gr.Dataframe(pd.DataFrame(),visible=False) #gr.State(pd.DataFrame())
|
|
|
27 |
prepared_keyword_data_state = gr.State(pd.DataFrame()) # Data frame the contains modified data #gr.Dataframe(pd.DataFrame(),visible=False) #gr.State(pd.DataFrame())
|
28 |
#tokenised_prepared_keyword_data_state = gr.State([]) # This is data that has been loaded in as tokens #gr.Dataframe(pd.DataFrame(),visible=False) #gr.State()
|
29 |
tokenised_prepared_keyword_data_state = gr.State([]) # Data that has been prepared for search (tokenised) #gr.Dataframe(np.array([]), type="array", visible=False) #gr.State([])
|
30 |
-
bm25_search_index_state = gr.State()
|
31 |
-
|
32 |
|
33 |
# Semantic search state objects
|
34 |
orig_semantic_data_state = gr.State(pd.DataFrame()) #gr.Dataframe(pd.DataFrame(),visible=False) # gr.State(pd.DataFrame())
|
35 |
semantic_data_state = gr.State(pd.DataFrame()) #gr.Dataframe(pd.DataFrame(),visible=False) # gr.State(pd.DataFrame())
|
36 |
semantic_input_document_format = gr.State([])
|
|
|
|
|
|
|
37 |
embeddings_state = gr.State(np.array([])) #gr.Dataframe(np.array([]), type="numpy", visible=False) #gr.State(np.array([])) # globals()["embeddings"]
|
|
|
|
|
38 |
semantic_k_val = gr.Number(9999, visible=False)
|
39 |
|
40 |
# State objects for app in general
|
41 |
session_hash_state = gr.State("")
|
42 |
s3_output_folder_state = gr.State("")
|
43 |
join_data_state = gr.State(pd.DataFrame()) #gr.Dataframe(pd.DataFrame(), visible=False) #gr.State(pd.DataFrame())
|
44 |
-
output_file_state = gr.
|
45 |
|
46 |
# Informational state objects
|
47 |
in_k1_info = gr.State("""k1: Constant used for influencing the term frequency saturation. After saturation is reached, additional
|
@@ -95,7 +100,7 @@ depends on factors such as the type of documents or queries. Information taken f
|
|
95 |
"""
|
96 |
**Thematic/semantic search**
|
97 |
|
98 |
-
This search type enables you to search for
|
99 |
""")
|
100 |
|
101 |
with gr.Row():
|
@@ -122,7 +127,7 @@ depends on factors such as the type of documents or queries. Information taken f
|
|
122 |
with gr.Row():
|
123 |
in_clean_data = gr.Dropdown(label = "Clean text during load (remove html tags). For large files this may take some time!", value="No", choices=["Yes", "No"])
|
124 |
return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="No", choices=["Yes", "No"])
|
125 |
-
|
126 |
#save_clean_data_button = gr.Button(value = "Save loaded data to file", scale = 1)
|
127 |
with gr.Accordion(label="Keyword search options", open = False):
|
128 |
with gr.Row():
|
@@ -194,12 +199,12 @@ depends on factors such as the type of documents or queries. Information taken f
|
|
194 |
# Load in a csv/excel file for semantic search
|
195 |
in_semantic_file.change(initial_data_load, inputs=[in_semantic_file], outputs=[in_semantic_column, search_df_join_column, semantic_data_state, orig_semantic_data_state, bm25_search_index_state, embeddings_state, tokenised_prepared_keyword_data_state, semantic_load_progress, current_source_semantic])
|
196 |
load_semantic_data_button.click(
|
197 |
-
csv_excel_text_to_docs, inputs=[semantic_data_state, in_semantic_file, in_semantic_column, in_clean_data, return_intermediate_files], outputs=[semantic_input_document_format, semantic_load_progress, output_file_state]).\
|
198 |
-
then(docs_to_bge_embed_np_array, inputs=[semantic_input_document_format, in_semantic_file, embeddings_state,
|
199 |
-
|
200 |
# Semantic search query
|
201 |
-
semantic_submit.click(bge_semantic_search, inputs=[semantic_query, embeddings_state, semantic_input_document_format, semantic_k_val, semantic_min_distance, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic_search")
|
202 |
-
semantic_query.submit(bge_semantic_search, inputs=[semantic_query, embeddings_state, semantic_input_document_format, semantic_k_val, semantic_min_distance, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file])
|
203 |
|
204 |
app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state])
|
205 |
|
|
|
7 |
|
8 |
from search_funcs.bm25_functions import prepare_bm25_input_data, prepare_bm25, bm25_search
|
9 |
from search_funcs.semantic_ingest_functions import csv_excel_text_to_docs
|
10 |
+
from search_funcs.semantic_functions import load_embedding_model, docs_to_bge_embed_np_array, bge_semantic_search
|
11 |
from search_funcs.helper_functions import display_info, initial_data_load, put_columns_in_join_df, get_temp_folder_path, empty_folder, get_connection_params, output_folder
|
12 |
from search_funcs.spacy_search_funcs import spacy_fuzzy_search
|
13 |
from search_funcs.aws_functions import load_data_from_aws
|
|
|
24 |
|
25 |
# BM25 state objects
|
26 |
orig_keyword_data_state = gr.State(pd.DataFrame()) # Original data that is not changed #gr.Dataframe(pd.DataFrame(),visible=False) #gr.State(pd.DataFrame())
|
27 |
+
#orig_keyword_data_state = gr.State(pd.DataFrame()) # Original data that is not changed #gr.Dataframe(pd.DataFrame(),visible=False) #gr.State(pd.DataFrame())
|
28 |
prepared_keyword_data_state = gr.State(pd.DataFrame()) # Data frame the contains modified data #gr.Dataframe(pd.DataFrame(),visible=False) #gr.State(pd.DataFrame())
|
29 |
#tokenised_prepared_keyword_data_state = gr.State([]) # This is data that has been loaded in as tokens #gr.Dataframe(pd.DataFrame(),visible=False) #gr.State()
|
30 |
tokenised_prepared_keyword_data_state = gr.State([]) # Data that has been prepared for search (tokenised) #gr.Dataframe(np.array([]), type="array", visible=False) #gr.State([])
|
31 |
+
bm25_search_index_state = gr.State()
|
|
|
32 |
|
33 |
# Semantic search state objects
|
34 |
orig_semantic_data_state = gr.State(pd.DataFrame()) #gr.Dataframe(pd.DataFrame(),visible=False) # gr.State(pd.DataFrame())
|
35 |
semantic_data_state = gr.State(pd.DataFrame()) #gr.Dataframe(pd.DataFrame(),visible=False) # gr.State(pd.DataFrame())
|
36 |
semantic_input_document_format = gr.State([])
|
37 |
+
|
38 |
+
embeddings_model_name_state = gr.State("sentence-transformers/all-MiniLM-L6-v2")#"BAAI/bge-small-en-v1.5")
|
39 |
+
embeddings_model_loc_state = gr.State("minilm/")#"bge/")
|
40 |
embeddings_state = gr.State(np.array([])) #gr.Dataframe(np.array([]), type="numpy", visible=False) #gr.State(np.array([])) # globals()["embeddings"]
|
41 |
+
embeddings_model_state = gr.State()
|
42 |
+
torch_device_state = gr.State("cpu")
|
43 |
semantic_k_val = gr.Number(9999, visible=False)
|
44 |
|
45 |
# State objects for app in general
|
46 |
session_hash_state = gr.State("")
|
47 |
s3_output_folder_state = gr.State("")
|
48 |
join_data_state = gr.State(pd.DataFrame()) #gr.Dataframe(pd.DataFrame(), visible=False) #gr.State(pd.DataFrame())
|
49 |
+
output_file_state = gr.State([]) #gr.Dataframe(type="array", visible=False) #gr.State([])
|
50 |
|
51 |
# Informational state objects
|
52 |
in_k1_info = gr.State("""k1: Constant used for influencing the term frequency saturation. After saturation is reached, additional
|
|
|
100 |
"""
|
101 |
**Thematic/semantic search**
|
102 |
|
103 |
+
This search type enables you to search for general terms (e.g. happiness, nature) and the search will pick out text passages that are most semantically similar to them. 1. Load in data file (ideally a file with '_cleaned' at the end of the name, a pkl.gz file), with (optionally) the 'embeddings... .npz' file in the same folder to save loading time. 2. Select the field in your data to search. 3. Wait for the data file to be prepared for search. 4. Enter the search term in the 'Enter semantic search query here' box below and press Enter/click on 'Start semantic search'. 4. Your search results will be saved in a csv file and will be presented in the 'File output' area below.
|
104 |
""")
|
105 |
|
106 |
with gr.Row():
|
|
|
127 |
with gr.Row():
|
128 |
in_clean_data = gr.Dropdown(label = "Clean text during load (remove html tags). For large files this may take some time!", value="No", choices=["Yes", "No"])
|
129 |
return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="No", choices=["Yes", "No"])
|
130 |
+
embeddings_compress = gr.Dropdown(label = "Round embeddings to int8 precision for smaller files with less accuracy.", value="Yes", choices=["Yes", "No"])
|
131 |
#save_clean_data_button = gr.Button(value = "Save loaded data to file", scale = 1)
|
132 |
with gr.Accordion(label="Keyword search options", open = False):
|
133 |
with gr.Row():
|
|
|
199 |
# Load in a csv/excel file for semantic search
|
200 |
in_semantic_file.change(initial_data_load, inputs=[in_semantic_file], outputs=[in_semantic_column, search_df_join_column, semantic_data_state, orig_semantic_data_state, bm25_search_index_state, embeddings_state, tokenised_prepared_keyword_data_state, semantic_load_progress, current_source_semantic])
|
201 |
load_semantic_data_button.click(
|
202 |
+
csv_excel_text_to_docs, inputs=[semantic_data_state, in_semantic_file, in_semantic_column, in_clean_data, return_intermediate_files], outputs=[semantic_input_document_format, semantic_load_progress, output_file_state], api_name="convert_texts_to_documents").\
|
203 |
+
then(docs_to_bge_embed_np_array, inputs=[semantic_input_document_format, in_semantic_file, output_file_state, in_clean_data, embeddings_state, embeddings_model_name_state, embeddings_model_loc_state, return_intermediate_files, embeddings_compress], outputs=[semantic_load_progress, embeddings_state, semantic_output_file, output_file_state, embeddings_model_state], api_name="embed_documents")
|
204 |
+
|
205 |
# Semantic search query
|
206 |
+
semantic_submit.click(bge_semantic_search, inputs=[semantic_query, embeddings_state, semantic_input_document_format, semantic_k_val, semantic_min_distance, embeddings_model_state, embeddings_model_name_state, embeddings_compress, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic_search")
|
207 |
+
semantic_query.submit(bge_semantic_search, inputs=[semantic_query, embeddings_state, semantic_input_document_format, semantic_k_val, semantic_min_distance, embeddings_model_state, embeddings_model_name_state, embeddings_compress, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file])
|
208 |
|
209 |
app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state])
|
210 |
|
download_model.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from huggingface_hub import hf_hub_download
|
2 |
+
|
3 |
+
# Define the repository and files to download
|
4 |
+
repo_id = "sentence-transformers/all-MiniLM-L6-v2" #"BAAI/bge-small-en-v1.5"
|
5 |
+
files_to_download = [
|
6 |
+
"config.json",
|
7 |
+
"pytorch_model.bin",
|
8 |
+
"tokenizer_config.json",
|
9 |
+
"vocab.txt"
|
10 |
+
]
|
11 |
+
|
12 |
+
# Download each file and save it to the /model/minilm directory
|
13 |
+
for file_name in files_to_download:
|
14 |
+
print("Checking for file", file_name)
|
15 |
+
hf_hub_download(repo_id=repo_id, filename=file_name, local_dir="/model/minilm") #"/model/bge"
|
output/36de65711121889ccdcb768b85e97e386d8fe4bd/keyword_search_result_20240702_school.xlsm
ADDED
Binary file (9.92 kB). View file
|
|
requirements.txt
CHANGED
@@ -2,7 +2,7 @@ pandas==2.2.2
|
|
2 |
polars==0.20.3
|
3 |
pyarrow==14.0.2
|
4 |
openpyxl==3.1.3
|
5 |
-
torch==2.3.1
|
6 |
spacy
|
7 |
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
|
8 |
gradio
|
|
|
2 |
polars==0.20.3
|
3 |
pyarrow==14.0.2
|
4 |
openpyxl==3.1.3
|
5 |
+
torch==2.3.1 --index-url https://download.pytorch.org/whl/cpu
|
6 |
spacy
|
7 |
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
|
8 |
gradio
|
requirements_gpu.txt
CHANGED
@@ -2,7 +2,7 @@ pandas==2.2.2
|
|
2 |
polars==0.20.3
|
3 |
pyarrow==14.0.2
|
4 |
openpyxl==3.1.3
|
5 |
-
torch==2.
|
6 |
spacy
|
7 |
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
|
8 |
gradio
|
|
|
2 |
polars==0.20.3
|
3 |
pyarrow==14.0.2
|
4 |
openpyxl==3.1.3
|
5 |
+
torch==2.4.0 --index-url https://download.pytorch.org/whl/nightly/cu121
|
6 |
spacy
|
7 |
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
|
8 |
gradio
|
search_funcs/bm25_functions.py
CHANGED
@@ -15,28 +15,7 @@ from datetime import datetime
|
|
15 |
today_rev = datetime.now().strftime("%Y%m%d")
|
16 |
|
17 |
from search_funcs.clean_funcs import initial_clean # get_lemma_tokens, stem_sentence
|
18 |
-
from search_funcs.helper_functions import get_file_path_end_with_ext, get_file_path_end, create_highlighted_excel_wb, ensure_output_folder_exists, output_folder
|
19 |
-
|
20 |
-
# Load the SpaCy model
|
21 |
-
from spacy.cli.download import download
|
22 |
-
import spacy
|
23 |
-
spacy.prefer_gpu()
|
24 |
-
|
25 |
-
#os.system("python -m spacy download en_core_web_sm")
|
26 |
-
try:
|
27 |
-
import en_core_web_sm
|
28 |
-
nlp = en_core_web_sm.load()
|
29 |
-
print("Successfully imported spaCy model")
|
30 |
-
#nlp = spacy.load("en_core_web_sm")
|
31 |
-
#print(nlp._path)
|
32 |
-
except:
|
33 |
-
download("en_core_web_sm")
|
34 |
-
nlp = spacy.load("en_core_web_sm")
|
35 |
-
print("Successfully imported spaCy model")
|
36 |
-
#print(nlp._path)
|
37 |
-
|
38 |
-
# including punctuation rules and exceptions
|
39 |
-
tokenizer = nlp.tokenizer
|
40 |
|
41 |
PARAM_K1 = 1.5
|
42 |
PARAM_B = 0.75
|
@@ -230,6 +209,35 @@ class BM25:
|
|
230 |
with open(f"{output_folder}{filename}.pkl", "rb") as fsave:
|
231 |
return pickle.load(fsave)
|
232 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
233 |
def prepare_bm25_input_data(
|
234 |
in_file: list,
|
235 |
text_column: str,
|
@@ -348,9 +356,8 @@ def prepare_bm25_input_data(
|
|
348 |
else:
|
349 |
tokeniser_tic = time.perf_counter()
|
350 |
prepared_search_text_list = []
|
351 |
-
|
352 |
-
|
353 |
-
prepared_search_text_list.append([token.text for token in doc])
|
354 |
|
355 |
tokeniser_toc = time.perf_counter()
|
356 |
tokenizer_time_out = f"Tokenising the text took {tokeniser_toc - tokeniser_tic:0.1f} seconds."
|
@@ -519,26 +526,18 @@ def prepare_bm25(
|
|
519 |
|
520 |
return message, None, bm25, prepared_search_text_list
|
521 |
|
522 |
-
def convert_bm25_query_to_tokens(free_text_query
|
523 |
-
|
524 |
-
|
525 |
-
|
526 |
-
|
527 |
-
if clean=="Yes":
|
528 |
-
split_query = tokenizer(free_text_query.lower())
|
529 |
-
out_query = [token.text for token in split_query]
|
530 |
-
#out_query = stem_sentence(out_query)
|
531 |
-
else:
|
532 |
-
split_query = tokenizer(free_text_query.lower())
|
533 |
-
out_query = [token.text for token in split_query]
|
534 |
|
535 |
-
|
|
|
536 |
|
537 |
-
|
538 |
-
print("Converting string")
|
539 |
-
out_query = [out_query]
|
540 |
|
541 |
-
|
542 |
|
543 |
def bm25_search(
|
544 |
free_text_query: str,
|
@@ -596,9 +595,11 @@ def bm25_search(
|
|
596 |
Returns
|
597 |
-------
|
598 |
tuple
|
599 |
-
A tuple containing a message
|
600 |
"""
|
601 |
|
|
|
|
|
602 |
progress(0, desc = "Conducting keyword search")
|
603 |
|
604 |
print("in_join_file at start of bm25_search:", in_join_file)
|
@@ -611,10 +612,7 @@ def bm25_search(
|
|
611 |
# print("bm25:", bm25)
|
612 |
|
613 |
# Prepare query
|
614 |
-
|
615 |
-
token_query = convert_bm25_query_to_tokens(free_text_query, clean="Yes")
|
616 |
-
else:
|
617 |
-
token_query = convert_bm25_query_to_tokens(free_text_query, clean="No")
|
618 |
|
619 |
# Perform search
|
620 |
print("Searching")
|
@@ -685,6 +683,13 @@ def bm25_search(
|
|
685 |
|
686 |
results_first_text = results_df_out[text_column].iloc[0]
|
687 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
688 |
print("Returning results")
|
689 |
|
690 |
-
return results_first_text,
|
|
|
15 |
today_rev = datetime.now().strftime("%Y%m%d")
|
16 |
|
17 |
from search_funcs.clean_funcs import initial_clean # get_lemma_tokens, stem_sentence
|
18 |
+
from search_funcs.helper_functions import get_file_path_end_with_ext, get_file_path_end, create_highlighted_excel_wb, ensure_output_folder_exists, output_folder, load_spacy_model
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
|
20 |
PARAM_K1 = 1.5
|
21 |
PARAM_B = 0.75
|
|
|
209 |
with open(f"{output_folder}{filename}.pkl", "rb") as fsave:
|
210 |
return pickle.load(fsave)
|
211 |
|
212 |
+
|
213 |
+
def tokenise_text_spacy(prepared_text_as_list:List[str], progress = gr.Progress(track_tqdm=True)):
|
214 |
+
'''
|
215 |
+
Tokenise a list of texts using the spaCy package and the en_core_web_sm model.
|
216 |
+
'''
|
217 |
+
|
218 |
+
# Load spaCy model
|
219 |
+
nlp = load_spacy_model()
|
220 |
+
|
221 |
+
prepared_search_text_list = []
|
222 |
+
batch_size = 256
|
223 |
+
for doc in nlp.tokenizer.pipe(progress.tqdm(prepared_text_as_list, desc = "Tokenising text", unit = "rows"), batch_size=batch_size):
|
224 |
+
prepared_search_text_list.append([token.text for token in doc])
|
225 |
+
|
226 |
+
return prepared_search_text_list
|
227 |
+
|
228 |
+
def tokenise_text_nltk(prepared_text_as_list: List[str], progress= gr.Progress(track_tqdm=True)):
|
229 |
+
"""
|
230 |
+
Tokenise a list of texts using the NLTK package.
|
231 |
+
"""
|
232 |
+
import nltk
|
233 |
+
nltk.download('punkt', quiet=True) # Download the necessary resource if not already present
|
234 |
+
|
235 |
+
prepared_search_text_list = []
|
236 |
+
for text in progress.tqdm(prepared_text_as_list, desc="Tokenising text", unit="rows"):
|
237 |
+
prepared_search_text_list.append(nltk.word_tokenize(text.lower())) # Lowercase for consistency
|
238 |
+
|
239 |
+
return prepared_search_text_list
|
240 |
+
|
241 |
def prepare_bm25_input_data(
|
242 |
in_file: list,
|
243 |
text_column: str,
|
|
|
356 |
else:
|
357 |
tokeniser_tic = time.perf_counter()
|
358 |
prepared_search_text_list = []
|
359 |
+
|
360 |
+
prepared_search_text_list = tokenise_text_spacy(prepared_text_as_list)
|
|
|
361 |
|
362 |
tokeniser_toc = time.perf_counter()
|
363 |
tokenizer_time_out = f"Tokenising the text took {tokeniser_toc - tokeniser_tic:0.1f} seconds."
|
|
|
526 |
|
527 |
return message, None, bm25, prepared_search_text_list
|
528 |
|
529 |
+
def convert_bm25_query_to_tokens(free_text_query:str):
|
530 |
+
"""
|
531 |
+
Split open text query into tokens.
|
532 |
+
"""
|
533 |
+
split_query = tokenise_text_spacy([free_text_query.lower()])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
534 |
|
535 |
+
# Flatten the list of lists into a single list
|
536 |
+
flattened_query = [token for sublist in split_query for token in sublist]
|
537 |
|
538 |
+
print("Search query out is:", flattened_query)
|
|
|
|
|
539 |
|
540 |
+
return flattened_query
|
541 |
|
542 |
def bm25_search(
|
543 |
free_text_query: str,
|
|
|
595 |
Returns
|
596 |
-------
|
597 |
tuple
|
598 |
+
A tuple containing a message and the output search results files (if any).
|
599 |
"""
|
600 |
|
601 |
+
output_files = []
|
602 |
+
|
603 |
progress(0, desc = "Conducting keyword search")
|
604 |
|
605 |
print("in_join_file at start of bm25_search:", in_join_file)
|
|
|
612 |
# print("bm25:", bm25)
|
613 |
|
614 |
# Prepare query
|
615 |
+
token_query = convert_bm25_query_to_tokens(free_text_query)
|
|
|
|
|
|
|
616 |
|
617 |
# Perform search
|
618 |
print("Searching")
|
|
|
683 |
|
684 |
results_first_text = results_df_out[text_column].iloc[0]
|
685 |
|
686 |
+
output_files.append(results_df_name)
|
687 |
+
|
688 |
+
csv_output_file = output_folder + "keyword_search_result_" + today_rev + "_" + query_str_file + ".csv"
|
689 |
+
results_df_out.to_csv(csv_output_file, index=None)
|
690 |
+
|
691 |
+
output_files.append(csv_output_file)
|
692 |
+
|
693 |
print("Returning results")
|
694 |
|
695 |
+
return results_first_text, output_files
|
search_funcs/clean_funcs.py
CHANGED
@@ -1,13 +1,9 @@
|
|
1 |
# ## Some functions to clean text
|
2 |
-
|
3 |
import re
|
4 |
import string
|
5 |
-
import polars as pl
|
6 |
|
7 |
# Add calendar months onto stop words
|
8 |
import calendar
|
9 |
-
#from tqdm import tqdm
|
10 |
-
import gradio as gr
|
11 |
|
12 |
from typing import List
|
13 |
|
@@ -15,7 +11,6 @@ from typing import List
|
|
15 |
custom_words = []
|
16 |
my_stop_words = custom_words
|
17 |
|
18 |
-
|
19 |
cal_month = (list(calendar.month_name))
|
20 |
cal_month = [x.lower() for x in cal_month]
|
21 |
|
@@ -24,7 +19,6 @@ cal_month = [i for i in cal_month if i]
|
|
24 |
#print(cal_month)
|
25 |
custom_words.extend(cal_month)
|
26 |
|
27 |
-
|
28 |
# #### Some of my cleaning functions
|
29 |
replace_backslash = r'\\'
|
30 |
email_start_pattern_regex = r'.*importance:|.*subject:'
|
@@ -37,19 +31,19 @@ warning_pattern_regex = r'caution: this email originated from outside of the org
|
|
37 |
nbsp_pattern_regex = r' '
|
38 |
multiple_spaces_regex = r'\s{2,}'
|
39 |
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
# html_pattern = re.compile(html_pattern_regex)
|
44 |
-
# email_pattern = re.compile(email_end_pattern_regex)
|
45 |
-
# num_pattern = re.compile(num_pattern_regex)
|
46 |
-
# postcode_pattern = re.compile(postcode_pattern_regex)
|
47 |
-
# warning_pattern = re.compile(warning_pattern_regex)
|
48 |
-
# nbsp_pattern = re.compile(nbsp_pattern_regex)
|
49 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
|
51 |
-
|
52 |
-
texts = pl.Series(texts)#[]
|
53 |
|
54 |
text = texts.str.replace_all(replace_backslash, '/')
|
55 |
text = text.str.replace_all(html_pattern_regex, '')
|
@@ -62,6 +56,33 @@ def initial_clean(texts:List[str] , progress=gr.Progress()):
|
|
62 |
|
63 |
return text
|
64 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
def remove_hyphens(text_text):
|
66 |
return re.sub(r'(\w+)-(\w+)-?(\w)?', r'\1 \2 \3', text_text)
|
67 |
|
|
|
1 |
# ## Some functions to clean text
|
|
|
2 |
import re
|
3 |
import string
|
|
|
4 |
|
5 |
# Add calendar months onto stop words
|
6 |
import calendar
|
|
|
|
|
7 |
|
8 |
from typing import List
|
9 |
|
|
|
11 |
custom_words = []
|
12 |
my_stop_words = custom_words
|
13 |
|
|
|
14 |
cal_month = (list(calendar.month_name))
|
15 |
cal_month = [x.lower() for x in cal_month]
|
16 |
|
|
|
19 |
#print(cal_month)
|
20 |
custom_words.extend(cal_month)
|
21 |
|
|
|
22 |
# #### Some of my cleaning functions
|
23 |
replace_backslash = r'\\'
|
24 |
email_start_pattern_regex = r'.*importance:|.*subject:'
|
|
|
31 |
nbsp_pattern_regex = r' '
|
32 |
multiple_spaces_regex = r'\s{2,}'
|
33 |
|
34 |
+
def initial_clean(texts:List[str]):
|
35 |
+
"""
|
36 |
+
This function cleans a list of text strings by performing various replacements using polars.
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
|
38 |
+
Args:
|
39 |
+
texts (List[str]): A list of strings to clean.
|
40 |
+
|
41 |
+
Returns:
|
42 |
+
List[str]: A list of cleaned strings.
|
43 |
+
"""
|
44 |
+
import polars as pl
|
45 |
|
46 |
+
texts = pl.Series(texts)
|
|
|
47 |
|
48 |
text = texts.str.replace_all(replace_backslash, '/')
|
49 |
text = text.str.replace_all(html_pattern_regex, '')
|
|
|
56 |
|
57 |
return text
|
58 |
|
59 |
+
|
60 |
+
def initial_clean_pandas(texts: List[str]):
|
61 |
+
"""
|
62 |
+
This function cleans a list of text strings by performing various replacements using pandas.
|
63 |
+
|
64 |
+
Args:
|
65 |
+
texts (List[str]): A list of strings to clean.
|
66 |
+
|
67 |
+
Returns:
|
68 |
+
List[str]: A list of cleaned strings.
|
69 |
+
"""
|
70 |
+
import pandas as pd
|
71 |
+
|
72 |
+
# Create a pandas Series from the text list for easier manipulation
|
73 |
+
text_series = pd.Series(texts)
|
74 |
+
|
75 |
+
# Replace patterns with pandas string methods (`.str.replace`)
|
76 |
+
text_series = text_series.astype(str).str.replace(replace_backslash, '/', regex=True)
|
77 |
+
text_series = text_series.astype(str).str.replace(html_pattern_regex, '', regex=True)
|
78 |
+
text_series = text_series.astype(str).str.replace(email_start_pattern_regex, '', regex=True)
|
79 |
+
text_series = text_series.astype(str).str.replace(email_end_pattern_regex, '', regex=True)
|
80 |
+
text_series = text_series.astype(str).str.replace(email_pattern_regex, '', regex=True)
|
81 |
+
text_series = text_series.astype(str).str.replace(multiple_spaces_regex, ' ', regex=True)
|
82 |
+
|
83 |
+
# Convert cleaned Series back to a list
|
84 |
+
return text_series.tolist()
|
85 |
+
|
86 |
def remove_hyphens(text_text):
|
87 |
return re.sub(r'(\w+)-(\w+)-?(\w)?', r'\1 \2 \3', text_text)
|
88 |
|
search_funcs/helper_functions.py
CHANGED
@@ -67,7 +67,7 @@ def get_connection_params(request: gr.Request):
|
|
67 |
#print("Query parameters:", dict(request.query_params))
|
68 |
# To get the underlying FastAPI items you would need to use await and some fancy @ stuff for a live query: https://fastapi.tiangolo.com/vi/reference/request/
|
69 |
#print("Request dictionary to object:", request.request.body())
|
70 |
-
print("Session hash:", request.session_hash)
|
71 |
|
72 |
if 'x-cognito-id' in request.headers:
|
73 |
out_session_hash = request.headers['x-cognito-id']
|
@@ -77,11 +77,11 @@ def get_connection_params(request: gr.Request):
|
|
77 |
else:
|
78 |
out_session_hash = request.session_hash
|
79 |
base_folder = "temp-files/"
|
80 |
-
print("Cognito ID not found. Using session hash as save folder.")
|
81 |
|
82 |
output_folder = base_folder + out_session_hash + "/"
|
83 |
-
if bucket_name:
|
84 |
-
|
85 |
|
86 |
return out_session_hash, output_folder
|
87 |
else:
|
@@ -281,6 +281,21 @@ def put_columns_in_join_df(in_file:str):
|
|
281 |
|
282 |
return gr.Dropdown(choices=concat_choices), new_df, out_message
|
283 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
284 |
|
285 |
def display_info(info_component):
|
286 |
gr.Info(info_component)
|
|
|
67 |
#print("Query parameters:", dict(request.query_params))
|
68 |
# To get the underlying FastAPI items you would need to use await and some fancy @ stuff for a live query: https://fastapi.tiangolo.com/vi/reference/request/
|
69 |
#print("Request dictionary to object:", request.request.body())
|
70 |
+
#print("Session hash:", request.session_hash)
|
71 |
|
72 |
if 'x-cognito-id' in request.headers:
|
73 |
out_session_hash = request.headers['x-cognito-id']
|
|
|
77 |
else:
|
78 |
out_session_hash = request.session_hash
|
79 |
base_folder = "temp-files/"
|
80 |
+
#print("Cognito ID not found. Using session hash as save folder.")
|
81 |
|
82 |
output_folder = base_folder + out_session_hash + "/"
|
83 |
+
#if bucket_name:
|
84 |
+
# print("S3 output folder is: " + "s3://" + bucket_name + "/" + output_folder)
|
85 |
|
86 |
return out_session_hash, output_folder
|
87 |
else:
|
|
|
281 |
|
282 |
return gr.Dropdown(choices=concat_choices), new_df, out_message
|
283 |
|
284 |
+
def load_spacy_model():
|
285 |
+
# Load the SpaCy model
|
286 |
+
from spacy.cli.download import download
|
287 |
+
import spacy
|
288 |
+
spacy.prefer_gpu()
|
289 |
+
|
290 |
+
try:
|
291 |
+
import en_core_web_sm
|
292 |
+
nlp = en_core_web_sm.load()
|
293 |
+
print("Successfully imported spaCy model")
|
294 |
+
except:
|
295 |
+
download("en_core_web_sm")
|
296 |
+
nlp = spacy.load("en_core_web_sm")
|
297 |
+
print("Successfully imported spaCy model")
|
298 |
+
return nlp
|
299 |
|
300 |
def display_info(info_component):
|
301 |
gr.Info(info_component)
|
search_funcs/semantic_functions.py
CHANGED
@@ -6,58 +6,61 @@ import gradio as gr
|
|
6 |
import numpy as np
|
7 |
from datetime import datetime
|
8 |
from search_funcs.helper_functions import get_file_path_end, create_highlighted_excel_wb, ensure_output_folder_exists, output_folder
|
9 |
-
from torch import cuda, backends
|
10 |
-
from sentence_transformers import SentenceTransformer
|
11 |
PandasDataFrame = Type[pd.DataFrame]
|
12 |
-
|
13 |
today_rev = datetime.now().strftime("%Y%m%d")
|
14 |
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
|
|
|
|
|
|
|
|
|
|
51 |
|
52 |
def docs_to_bge_embed_np_array(
|
53 |
docs_out: list,
|
54 |
in_file: list,
|
55 |
-
embeddings_state: np.ndarray,
|
56 |
output_file_state: str,
|
57 |
-
clean: str,
|
|
|
|
|
|
|
58 |
return_intermediate_files: str = "No",
|
59 |
-
|
60 |
-
embeddings_model: SentenceTransformer = embeddings_model,
|
61 |
progress: gr.Progress = gr.Progress(track_tqdm=True)
|
62 |
) -> tuple:
|
63 |
"""
|
@@ -66,18 +69,20 @@ def docs_to_bge_embed_np_array(
|
|
66 |
Parameters:
|
67 |
- docs_out (list): List of documents to be embedded.
|
68 |
- in_file (list): List of input files.
|
69 |
-
- embeddings_state (np.ndarray): Current state of embeddings.
|
70 |
- output_file_state (str): State of the output file.
|
71 |
- clean (str): Indicates if the data should be cleaned.
|
|
|
|
|
|
|
72 |
- return_intermediate_files (str, optional): Whether to return intermediate files. Default is "No".
|
73 |
-
-
|
74 |
-
- embeddings_model (SentenceTransformer, optional): The embeddings model to use. Default is embeddings_model.
|
75 |
- progress (gr.Progress, optional): Progress tracker for the function. Default is gr.Progress(track_tqdm=True).
|
76 |
|
77 |
Returns:
|
78 |
- tuple: A tuple containing the output message, embeddings, and output file state.
|
79 |
"""
|
80 |
|
|
|
81 |
|
82 |
ensure_output_folder_exists(output_folder)
|
83 |
|
@@ -102,12 +107,29 @@ def docs_to_bge_embed_np_array(
|
|
102 |
|
103 |
out_message = "Document processing complete. Ready to search."
|
104 |
|
105 |
-
|
106 |
if embeddings_state.size == 0:
|
107 |
tic = time.perf_counter()
|
108 |
print("Starting to embed documents.")
|
109 |
|
110 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
111 |
|
112 |
toc = time.perf_counter()
|
113 |
time_out = f"The embedding took {toc - tic:0.1f} seconds"
|
@@ -119,27 +141,25 @@ def docs_to_bge_embed_np_array(
|
|
119 |
else: data_file_name_no_ext = data_file_name_no_ext
|
120 |
|
121 |
progress(0.9, desc = "Saving embeddings to file")
|
122 |
-
if
|
123 |
semantic_search_file_name = output_folder + data_file_name_no_ext + '_bge_embeddings.npz'
|
124 |
-
np.savez_compressed(semantic_search_file_name, embeddings_out)
|
125 |
else:
|
126 |
semantic_search_file_name = output_folder + data_file_name_no_ext + '_bge_embedding_compress.npz'
|
127 |
-
|
128 |
-
|
129 |
-
np.savez_compressed(semantic_search_file_name, embeddings_out_round)
|
130 |
|
131 |
output_file_state.append(semantic_search_file_name)
|
132 |
|
133 |
-
return out_message, embeddings_out, output_file_state, output_file_state
|
134 |
|
135 |
-
return out_message, embeddings_out, output_file_state, output_file_state
|
136 |
else:
|
137 |
# Just return existing embeddings if already exist
|
138 |
embeddings_out = embeddings_state
|
139 |
|
140 |
print(out_message)
|
141 |
|
142 |
-
return out_message, embeddings_out, output_file_state, output_file_state
|
143 |
|
144 |
def process_data_from_scores_df(
|
145 |
df_docs: pd.DataFrame,
|
@@ -226,14 +246,15 @@ def bge_semantic_search(
|
|
226 |
embeddings: np.ndarray,
|
227 |
documents: list,
|
228 |
k_val: int,
|
229 |
-
vec_score_cut_off: float,
|
|
|
|
|
|
|
230 |
in_join_file: pd.DataFrame,
|
231 |
in_join_column: str = None,
|
232 |
-
search_df_join_column: str = None,
|
233 |
-
device: str = torch_device,
|
234 |
-
embeddings_model: SentenceTransformer = embeddings_model,
|
235 |
progress: gr.Progress = gr.Progress(track_tqdm=True)
|
236 |
-
) ->
|
237 |
"""
|
238 |
Perform a semantic search using the BGE model.
|
239 |
|
@@ -243,33 +264,83 @@ def bge_semantic_search(
|
|
243 |
- documents (list): The list of documents to search.
|
244 |
- k_val (int): The number of top results to return.
|
245 |
- vec_score_cut_off (float): The score cutoff for filtering results.
|
|
|
|
|
|
|
246 |
- in_join_file (pd.DataFrame): The DataFrame to join with the search results.
|
247 |
- in_join_column (str, optional): The column name in the join DataFrame to join on. Default is None.
|
248 |
-
- search_df_join_column (str, optional): The column name in the search DataFrame to join on. Default is None.
|
249 |
-
- device (str, optional): The device to run the model on. Default is torch_device.
|
250 |
-
- embeddings_model (SentenceTransformer, optional): The embeddings model to use. Default is embeddings_model.
|
251 |
- progress (gr.Progress, optional): Progress tracker for the function. Default is gr.Progress(track_tqdm=True).
|
252 |
|
253 |
Returns:
|
254 |
-
-
|
255 |
"""
|
256 |
|
257 |
progress(0, desc = "Conducting semantic search")
|
258 |
|
|
|
|
|
259 |
ensure_output_folder_exists(output_folder)
|
260 |
|
261 |
print("Searching")
|
262 |
|
263 |
-
|
264 |
-
embeddings_model = embeddings_model.to(device)
|
265 |
|
266 |
# Encode the query using the sentence transformer and convert to a PyTorch tensor
|
267 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
268 |
|
269 |
-
|
270 |
-
|
271 |
|
272 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
273 |
cosine_similarities = cosine_similarities.flatten()
|
274 |
|
275 |
# Create a Pandas Series
|
@@ -309,6 +380,13 @@ def bge_semantic_search(
|
|
309 |
#results_df_out.to_excel(results_df_name, index= None)
|
310 |
results_first_text = results_df_out.iloc[0, 1]
|
311 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
312 |
print("Returning results")
|
313 |
|
314 |
return results_first_text, results_df_name
|
|
|
6 |
import numpy as np
|
7 |
from datetime import datetime
|
8 |
from search_funcs.helper_functions import get_file_path_end, create_highlighted_excel_wb, ensure_output_folder_exists, output_folder
|
|
|
|
|
9 |
PandasDataFrame = Type[pd.DataFrame]
|
|
|
10 |
today_rev = datetime.now().strftime("%Y%m%d")
|
11 |
|
12 |
+
def load_embedding_model(embeddings_name = "BAAI/bge-small-en-v1.5", embedding_loc="bge/"):
    '''
    Load a SentenceTransformer embedding model and move it to the GPU when CUDA is available, otherwise the CPU.

    Local copies under "model/", "/model/" and "/home/user/app/model/" are tried in that order.
    If none of them loads, the model is loaded by name instead.

    Parameters:
    - embeddings_name (str, optional): Model name passed to SentenceTransformer when no local copy loads. Default is "BAAI/bge-small-en-v1.5".
    - embedding_loc (str, optional): Sub-folder appended to each local model directory. Default is "bge/".

    Returns:
    - tuple: The loaded model (already moved to the chosen device) and the device name, "cuda" or "cpu".
    '''
    from torch import cuda, backends
    from sentence_transformers import SentenceTransformer

    # Report GPU support, then choose the device
    print("Is CUDA enabled? ", cuda.is_available())
    print("Is a CUDA device available on this computer?", backends.cudnn.enabled)
    torch_device = "cuda" if cuda.is_available() else "cpu"
    print("Device used is: ", torch_device)

    # Directories that may hold a local copy: relative path first, then the two Docker paths
    candidate_roots = ("model/", "/model/", "/home/user/app/model/")

    embeddings_model = None
    for root in candidate_roots:
        location = root + embedding_loc
        try:
            embeddings_model = SentenceTransformer(location)
        except Exception as e:
            print(f"Failed to load model from {location}: {e}")
        else:
            print(f"Found local model installation at: {location}")
            break

    # No local copy could be loaded, so load the model by its name instead
    if embeddings_model is None:
        embeddings_model = SentenceTransformer(embeddings_name)
        print("Could not find local model installation. Downloading from Huggingface")

    embeddings_model = embeddings_model.to(torch_device)

    return embeddings_model, torch_device
|
53 |
|
54 |
def docs_to_bge_embed_np_array(
|
55 |
docs_out: list,
|
56 |
in_file: list,
|
|
|
57 |
output_file_state: str,
|
58 |
+
clean: str,
|
59 |
+
embeddings_state: np.ndarray,
|
60 |
+
embeddings_model_name:str,
|
61 |
+
embeddings_model_loc:str,
|
62 |
return_intermediate_files: str = "No",
|
63 |
+
embeddings_compress: str = "No",
|
|
|
64 |
progress: gr.Progress = gr.Progress(track_tqdm=True)
|
65 |
) -> tuple:
|
66 |
"""
|
|
|
69 |
Parameters:
|
70 |
- docs_out (list): List of documents to be embedded.
|
71 |
- in_file (list): List of input files.
|
|
|
72 |
- output_file_state (str): State of the output file.
|
73 |
- clean (str): Indicates if the data should be cleaned.
|
74 |
+
- embeddings_state (np.ndarray): Current state of embeddings.
|
75 |
+
- embeddings_model_name (str): The Huggingface repo name of the embeddings model.
|
76 |
+
- embeddings_model_loc (str): Embeddings model save location.
|
77 |
- return_intermediate_files (str, optional): Whether to return intermediate files. Default is "No".
|
78 |
+
- embeddings_compress (str, optional): Whether to compress the embeddings to int8 precision. Default is "No".
|
|
|
79 |
- progress (gr.Progress, optional): Progress tracker for the function. Default is gr.Progress(track_tqdm=True).
|
80 |
|
81 |
Returns:
|
82 |
- tuple: A tuple containing the output message, embeddings, and output file state.
|
83 |
"""
|
84 |
|
85 |
+
embeddings_model, torch_device = load_embedding_model(embeddings_model_name, embeddings_model_loc)
|
86 |
|
87 |
ensure_output_folder_exists(output_folder)
|
88 |
|
|
|
107 |
|
108 |
out_message = "Document processing complete. Ready to search."
|
109 |
|
|
|
110 |
if embeddings_state.size == 0:
|
111 |
tic = time.perf_counter()
|
112 |
print("Starting to embed documents.")
|
113 |
|
114 |
+
# Encode embeddings. If in normal mode, float32, if in 'super compress' mode, int8
|
115 |
+
batch_size = 32
|
116 |
+
|
117 |
+
if "bge" in embeddings_model_name:
|
118 |
+
print("Embedding with BGE model")
|
119 |
+
if embeddings_compress == "No":
|
120 |
+
print("Embedding with full fp32 precision")
|
121 |
+
embeddings_out = embeddings_model.encode(sentences=page_contents, show_progress_bar = True, batch_size = batch_size, normalize_embeddings=True)
|
122 |
+
else:
|
123 |
+
print("Embedding with int8 precision")
|
124 |
+
embeddings_out = embeddings_model.encode(sentences=page_contents, show_progress_bar = True, batch_size = batch_size, normalize_embeddings=True, precision="int8")
|
125 |
+
else:
|
126 |
+
print("Embedding with MiniLM-L6-v2 model")
|
127 |
+
if embeddings_compress == "No":
|
128 |
+
print("Embedding with full fp32 precision")
|
129 |
+
embeddings_out = embeddings_model.encode(sentences=page_contents, show_progress_bar = True, batch_size = batch_size)
|
130 |
+
else:
|
131 |
+
print("Embedding with int8 precision")
|
132 |
+
embeddings_out = embeddings_model.encode(sentences=page_contents, show_progress_bar = True, batch_size = batch_size, precision="int8")
|
133 |
|
134 |
toc = time.perf_counter()
|
135 |
time_out = f"The embedding took {toc - tic:0.1f} seconds"
|
|
|
141 |
else: data_file_name_no_ext = data_file_name_no_ext
|
142 |
|
143 |
progress(0.9, desc = "Saving embeddings to file")
|
144 |
+
if embeddings_compress == "No":
|
145 |
semantic_search_file_name = output_folder + data_file_name_no_ext + '_bge_embeddings.npz'
|
|
|
146 |
else:
|
147 |
semantic_search_file_name = output_folder + data_file_name_no_ext + '_bge_embedding_compress.npz'
|
148 |
+
|
149 |
+
np.savez_compressed(semantic_search_file_name, embeddings_out)
|
|
|
150 |
|
151 |
output_file_state.append(semantic_search_file_name)
|
152 |
|
153 |
+
return out_message, embeddings_out, output_file_state, output_file_state, embeddings_model
|
154 |
|
155 |
+
return out_message, embeddings_out, output_file_state, output_file_state, embeddings_model
|
156 |
else:
|
157 |
# Just return existing embeddings if already exist
|
158 |
embeddings_out = embeddings_state
|
159 |
|
160 |
print(out_message)
|
161 |
|
162 |
+
return out_message, embeddings_out, output_file_state, output_file_state, embeddings_model
|
163 |
|
164 |
def process_data_from_scores_df(
|
165 |
df_docs: pd.DataFrame,
|
|
|
246 |
embeddings: np.ndarray,
|
247 |
documents: list,
|
248 |
k_val: int,
|
249 |
+
vec_score_cut_off: float,
|
250 |
+
embeddings_model,
|
251 |
+
embeddings_model_name: str,
|
252 |
+
embeddings_compress:str,
|
253 |
in_join_file: pd.DataFrame,
|
254 |
in_join_column: str = None,
|
255 |
+
search_df_join_column: str = None,
|
|
|
|
|
256 |
progress: gr.Progress = gr.Progress(track_tqdm=True)
|
257 |
+
) -> tuple:
|
258 |
"""
|
259 |
Perform a semantic search using the BGE model.
|
260 |
|
|
|
264 |
- documents (list): The list of documents to search.
|
265 |
- k_val (int): The number of top results to return.
|
266 |
- vec_score_cut_off (float): The score cutoff for filtering results.
|
267 |
+
- embeddings_model (SentenceTransformer, optional): The embeddings model to use.
|
268 |
+
- embeddings_model_name (str): The Huggingface repo name of the embeddings model.
|
269 |
+
- embeddings_compress (str): Whether the embeddings have been compressed to int8 precision
|
270 |
- in_join_file (pd.DataFrame): The DataFrame to join with the search results.
|
271 |
- in_join_column (str, optional): The column name in the join DataFrame to join on. Default is None.
|
272 |
+
- search_df_join_column (str, optional): The column name in the search DataFrame to join on. Default is None.
|
|
|
|
|
273 |
- progress (gr.Progress, optional): Progress tracker for the function. Default is gr.Progress(track_tqdm=True).
|
274 |
|
275 |
Returns:
|
276 |
+
- tuple: The DataFrame containing the search results.
|
277 |
"""
|
278 |
|
279 |
progress(0, desc = "Conducting semantic search")
|
280 |
|
281 |
+
output_files = []
|
282 |
+
|
283 |
ensure_output_folder_exists(output_folder)
|
284 |
|
285 |
print("Searching")
|
286 |
|
287 |
+
from sentence_transformers import quantize_embeddings
|
|
|
288 |
|
289 |
# Encode the query using the sentence transformer and convert to a PyTorch tensor
|
290 |
+
if "bge" in embeddings_model_name:
|
291 |
+
if embeddings_compress == "Yes":
|
292 |
+
query_fp32 = embeddings_model.encode(query_str, normalize_embeddings=True)
|
293 |
+
|
294 |
+
#query = query_fp32
|
295 |
+
query = quantize_embeddings(
|
296 |
+
query_fp32,
|
297 |
+
precision="int8",
|
298 |
+
calibration_embeddings=embeddings)
|
299 |
|
300 |
+
else:
|
301 |
+
query = embeddings_model.encode(query_str, normalize_embeddings=True)
|
302 |
|
303 |
+
# Get cosine similarities
|
304 |
+
cosine_similarities = query @ embeddings.T
|
305 |
+
|
306 |
+
# Sentence transformers method, not used:
|
307 |
+
#cosine_similarities = query @ embeddings.T
|
308 |
+
|
309 |
+
#cosine_similarities = embeddings_model.similarity(query, embeddings)
|
310 |
+
# Flatten the tensor to a 1D array
|
311 |
+
#cosine_similarities = cosine_similarities.flatten()
|
312 |
+
else:
|
313 |
+
print("Comparing similarity using Minilm-L6-v2")
|
314 |
+
|
315 |
+
if embeddings_compress == "Yes":
|
316 |
+
query_fp32 = embeddings_model.encode(query_str, normalize_embeddings=True)
|
317 |
+
|
318 |
+
#query = query_fp32
|
319 |
+
query = quantize_embeddings(
|
320 |
+
query_fp32,
|
321 |
+
precision="int8",
|
322 |
+
calibration_embeddings=embeddings)
|
323 |
+
else:
|
324 |
+
query = embeddings_model.encode(query_str, normalize_embeddings=True)
|
325 |
+
|
326 |
+
#cosine_similarities = embeddings_model.cosine_similarity(query, embeddings)
|
327 |
+
|
328 |
+
print("query:", query_fp32)
|
329 |
+
print("embeddings:", embeddings)
|
330 |
+
|
331 |
+
embeddings_norm = np.linalg.norm(embeddings, axis=1)
|
332 |
+
|
333 |
+
embeddings_norm = np.linalg.norm(embeddings, axis=1, keepdims=True) # Keep dims to allow broadcasting
|
334 |
+
normalized_embeddings = embeddings / embeddings_norm
|
335 |
+
|
336 |
+
print("normalized_embeddings:", normalized_embeddings)
|
337 |
+
|
338 |
+
expanded_query_fp32 = np.expand_dims(query_fp32, axis=0)
|
339 |
+
cosine_similarities = (expanded_query_fp32 @ normalized_embeddings.T)
|
340 |
+
|
341 |
+
print("Initial cosine similarities:", cosine_similarities)
|
342 |
+
|
343 |
+
# Flatten the tensor to a 1D array
|
344 |
cosine_similarities = cosine_similarities.flatten()
|
345 |
|
346 |
# Create a Pandas Series
|
|
|
380 |
#results_df_out.to_excel(results_df_name, index= None)
|
381 |
results_first_text = results_df_out.iloc[0, 1]
|
382 |
|
383 |
+
output_files.append(results_df_name)
|
384 |
+
|
385 |
+
csv_output_file = output_folder + "semantic_search_result_" + today_rev + "_" + query_str_file + ".csv"
|
386 |
+
results_df_out.to_csv(csv_output_file, index=None)
|
387 |
+
|
388 |
+
output_files.append(csv_output_file)
|
389 |
+
|
390 |
print("Returning results")
|
391 |
|
392 |
return results_first_text, results_df_name
|
search_funcs/semantic_ingest_functions.py
CHANGED
@@ -1,18 +1,15 @@
|
|
1 |
-
# Install/ import packages
|
2 |
import time
|
3 |
-
import re
|
4 |
import ast
|
5 |
import gzip
|
6 |
import pandas as pd
|
7 |
import gradio as gr
|
8 |
import pickle
|
9 |
from typing import Type, List, Literal
|
10 |
-
#from langchain.text_splitter import RecursiveCharacterTextSplitter
|
11 |
-
|
12 |
from pydantic import BaseModel, Field
|
13 |
|
14 |
# Creating an alias for pandas DataFrame using Type
|
15 |
PandasDataFrame = Type[pd.DataFrame]
|
|
|
16 |
|
17 |
class Document(BaseModel):
|
18 |
"""Class for storing a piece of text and associated metadata. Implementation adapted from Langchain code: https://github.com/langchain-ai/langchain/blob/master/libs/core/langchain_core/documents/base.py"""
|
@@ -25,114 +22,21 @@ class Document(BaseModel):
|
|
25 |
"""
|
26 |
type: Literal["Document"] = "Document"
|
27 |
|
28 |
-
|
29 |
-
split_strat = ["\n\n", "\n", ". ", "! ", "? "]
|
30 |
-
chunk_size = 512
|
31 |
-
chunk_overlap = 0
|
32 |
-
start_index = True
|
33 |
-
|
34 |
-
from search_funcs.helper_functions import get_file_path_end_with_ext, detect_file_type, get_file_path_end, ensure_output_folder_exists
|
35 |
from search_funcs.bm25_functions import save_prepared_bm25_data, output_folder
|
36 |
from search_funcs.clean_funcs import initial_clean
|
37 |
|
38 |
-
def
|
39 |
-
|
40 |
-
|
41 |
-
and passes it to the relevant parsing function.
|
42 |
-
|
43 |
-
Parameters:
|
44 |
-
file_paths (list): List of file paths.
|
45 |
-
text_column (str): Name of the column in CSV/Excel files that contains the text content.
|
46 |
-
|
47 |
-
Returns:
|
48 |
-
dict: A dictionary with file paths as keys and their parsed content (or error message) as values.
|
49 |
-
"""
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
if not isinstance(file_paths, list):
|
54 |
-
raise ValueError("Expected a list of file paths.")
|
55 |
-
|
56 |
-
extension_to_parser = {
|
57 |
-
# '.pdf': parse_pdf,
|
58 |
-
# '.docx': parse_docx,
|
59 |
-
# '.txt': parse_txt,
|
60 |
-
# '.html': parse_html,
|
61 |
-
# '.htm': parse_html, # Considering both .html and .htm for HTML files
|
62 |
-
'.csv': lambda file_path: parse_csv_or_excel(file_path, text_column),
|
63 |
-
'.xlsx': lambda file_path: parse_csv_or_excel(file_path, text_column),
|
64 |
-
'.parquet': lambda file_path: parse_csv_or_excel(file_path, text_column)
|
65 |
-
}
|
66 |
-
|
67 |
-
parsed_contents = {}
|
68 |
-
file_names = []
|
69 |
-
|
70 |
-
for file_path in file_paths:
|
71 |
-
|
72 |
-
file_extension = detect_file_type(file_path.name)
|
73 |
-
if file_extension in extension_to_parser:
|
74 |
-
parsed_contents[file_path.name] = extension_to_parser[file_extension](file_path.name)
|
75 |
-
else:
|
76 |
-
parsed_contents[file_path.name] = f"Unsupported file type: {file_extension}"
|
77 |
-
|
78 |
-
filename_end = get_file_path_end_with_ext(file_path.name)
|
79 |
-
|
80 |
-
file_names.append(filename_end)
|
81 |
-
|
82 |
-
return parsed_contents, file_names
|
83 |
-
|
84 |
-
def text_regex_clean(text):
|
85 |
-
# Merge hyphenated words
|
86 |
-
text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)
|
87 |
-
# If a double newline ends in a letter, add a full stop.
|
88 |
-
text = re.sub(r'(?<=[a-zA-Z])\n\n', '.\n\n', text)
|
89 |
-
# Fix newlines in the middle of sentences
|
90 |
-
text = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", text.strip())
|
91 |
-
# Remove multiple newlines
|
92 |
-
text = re.sub(r"\n\s*\n", "\n\n", text)
|
93 |
-
text = re.sub(r" ", " ", text)
|
94 |
-
# Add full stops and new lines between words with no space between where the second one has a capital letter
|
95 |
-
text = re.sub(r'(?<=[a-z])(?=[A-Z])', '. \n\n', text)
|
96 |
-
|
97 |
-
return text
|
98 |
-
|
99 |
-
def parse_csv_or_excel(file_path, data_state, text_column = "text"):
|
100 |
-
"""
|
101 |
-
Read in a CSV or Excel file.
|
102 |
-
|
103 |
-
Parameters:
|
104 |
-
file_path (str): Path to the CSV file.
|
105 |
-
text_column (str): Name of the column in the CSV file that contains the text content.
|
106 |
-
|
107 |
-
Returns:
|
108 |
-
Pandas DataFrame: Dataframe output from file read
|
109 |
-
"""
|
110 |
-
|
111 |
-
file_list = [string.name for string in file_path]
|
112 |
-
|
113 |
-
#print(file_list)
|
114 |
-
|
115 |
-
data_file_names = [string for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower()]# and "gz" not in string.lower()]
|
116 |
-
|
117 |
-
data_file_name = data_file_names[0]
|
118 |
-
|
119 |
-
#for file_path in file_paths:
|
120 |
-
file_name = get_file_path_end_with_ext(data_file_name)
|
121 |
-
|
122 |
-
message = "Loaded in file. Now converting to document format."
|
123 |
-
print(message)
|
124 |
-
|
125 |
-
return data_state, file_name, message
|
126 |
-
|
127 |
-
def write_out_metadata_as_string(metadata_in):
|
128 |
-
# If metadata_in is a single dictionary, wrap it in a list
|
129 |
-
if isinstance(metadata_in, dict):
|
130 |
-
metadata_in = [metadata_in]
|
131 |
|
132 |
-
|
133 |
-
|
|
|
134 |
|
135 |
-
|
|
|
|
|
136 |
|
137 |
df['metadata'] = '{'
|
138 |
df['blank_column'] = ''
|
@@ -147,32 +51,14 @@ def combine_metadata_columns(df, cols):
|
|
147 |
|
148 |
return df['metadata']
|
149 |
|
150 |
-
def
|
151 |
-
|
152 |
-
if not input_string or not split_symbols:
|
153 |
-
return [input_string]
|
154 |
-
|
155 |
-
chunks = []
|
156 |
-
current_chunk = ""
|
157 |
-
|
158 |
-
for char in input_string:
|
159 |
-
current_chunk += char
|
160 |
-
if len(current_chunk) >= max_length or char in split_symbols:
|
161 |
-
# Add the current chunk to the chunks list
|
162 |
-
chunks.append(current_chunk)
|
163 |
-
current_chunk = ""
|
164 |
-
|
165 |
-
# Adding any remaining part of the string
|
166 |
-
if current_chunk:
|
167 |
-
chunks.append(current_chunk)
|
168 |
-
|
169 |
-
return chunks
|
170 |
-
|
171 |
-
def clean_line_breaks(text):
|
172 |
-
# Replace \n and \r\n with a space
|
173 |
return text.replace('\n', ' ').replace('\r', ' ').replace('\r\n', ' ')
|
174 |
|
175 |
def parse_metadata(row):
|
|
|
|
|
|
|
176 |
try:
|
177 |
# Ensure the 'title' field is a string and clean line breaks
|
178 |
#if 'TITLE' in row:
|
@@ -193,8 +79,20 @@ def parse_metadata(row):
|
|
193 |
# Handle the error or log it
|
194 |
return None # or some default value
|
195 |
|
196 |
-
def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_intermediate_files = "No",
|
197 |
-
"""Converts a DataFrame's content to a list of dictionaries in the 'Document' format, containing page_content and associated metadata.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
198 |
|
199 |
ensure_output_folder_exists(output_folder)
|
200 |
output_list = []
|
@@ -212,7 +110,7 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
|
|
212 |
return doc_sections, "Please load in at least one csv/Excel/parquet data file.", output_list
|
213 |
|
214 |
if not text_column:
|
215 |
-
return None, "Please enter a column name to search"
|
216 |
|
217 |
data_file_name = data_file_names[0]
|
218 |
|
@@ -246,7 +144,6 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
|
|
246 |
|
247 |
df[text_column] = df_list
|
248 |
|
249 |
-
|
250 |
clean_toc = time.perf_counter()
|
251 |
clean_time_out = f"Cleaning the text took {clean_toc - clean_tic:0.1f} seconds."
|
252 |
print(clean_time_out)
|
@@ -285,26 +182,4 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
|
|
285 |
output_list.append(out_doc_file_name)
|
286 |
print("Documents saved to file.")
|
287 |
|
288 |
-
return doc_sections, "Finished preparing documents.", output_list
|
289 |
-
|
290 |
-
def document_to_dataframe(documents):
|
291 |
-
'''
|
292 |
-
Convert an object in document format to pandas dataframe
|
293 |
-
'''
|
294 |
-
rows = []
|
295 |
-
|
296 |
-
for doc in documents:
|
297 |
-
# Convert Document to dictionary and extract metadata
|
298 |
-
doc_dict = doc.dict()
|
299 |
-
metadata = doc_dict.pop('metadata')
|
300 |
-
|
301 |
-
# Add the page_content and type to the metadata
|
302 |
-
metadata['page_content'] = doc_dict['page_content']
|
303 |
-
metadata['type'] = doc_dict['type']
|
304 |
-
|
305 |
-
# Add to the list of rows
|
306 |
-
rows.append(metadata)
|
307 |
-
|
308 |
-
# Create a DataFrame from the list of rows
|
309 |
-
df = pd.DataFrame(rows)
|
310 |
-
return df
|
|
|
|
|
1 |
import time
|
|
|
2 |
import ast
|
3 |
import gzip
|
4 |
import pandas as pd
|
5 |
import gradio as gr
|
6 |
import pickle
|
7 |
from typing import Type, List, Literal
|
|
|
|
|
8 |
from pydantic import BaseModel, Field
|
9 |
|
10 |
# Creating an alias for pandas DataFrame using Type
|
11 |
PandasDataFrame = Type[pd.DataFrame]
|
12 |
+
PandasSeries = Type[pd.Series]
|
13 |
|
14 |
class Document(BaseModel):
|
15 |
"""Class for storing a piece of text and associated metadata. Implementation adapted from Langchain code: https://github.com/langchain-ai/langchain/blob/master/libs/core/langchain_core/documents/base.py"""
|
|
|
22 |
"""
|
23 |
type: Literal["Document"] = "Document"
|
24 |
|
25 |
+
from search_funcs.helper_functions import get_file_path_end, ensure_output_folder_exists
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
from search_funcs.bm25_functions import save_prepared_bm25_data, output_folder
|
27 |
from search_funcs.clean_funcs import initial_clean
|
28 |
|
29 |
+
def combine_metadata_columns(df:PandasDataFrame, cols:List[str]) -> PandasSeries:
|
30 |
+
'''
|
31 |
+
Construct a metadata column as a string version of a dictionary for later parsing.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
|
33 |
+
Parameters:
|
34 |
+
- df (PandasDataFrame): Data frame of search data.
|
35 |
+
- cols (List[str]): List of column names that will be included in the output metadata column.
|
36 |
|
37 |
+
Returns:
|
38 |
+
- PandasSeries: A series containing the metadata elements combined into a dictionary format as a string.
|
39 |
+
'''
|
40 |
|
41 |
df['metadata'] = '{'
|
42 |
df['blank_column'] = ''
|
|
|
51 |
|
52 |
return df['metadata']
|
53 |
|
54 |
+
def clean_line_breaks(text:str) -> str:
    r'''
    Replace every line break in a string (\r\n, \n or \r) with a single space.

    Parameters:
    - text (str): The text to clean.

    Returns:
    - str: The text with each line break replaced by one space.
    '''
    # \r\n must be handled first. Once \n and \r have been replaced individually the
    # pair can no longer match, and a Windows line break would become two spaces.
    return text.replace('\r\n', ' ').replace('\n', ' ').replace('\r', ' ')
|
57 |
|
58 |
def parse_metadata(row):
|
59 |
+
'''
|
60 |
+
Parse a string instance of a dictionary into a Python object.
|
61 |
+
'''
|
62 |
try:
|
63 |
# Ensure the 'title' field is a string and clean line breaks
|
64 |
#if 'TITLE' in row:
|
|
|
79 |
# Handle the error or log it
|
80 |
return None # or some default value
|
81 |
|
82 |
+
def csv_excel_text_to_docs(df:PandasDataFrame, in_file:List[str], text_column:str, clean:str = "No", return_intermediate_files:str = "No", progress=gr.Progress(track_tqdm=True)) -> tuple:
|
83 |
+
"""Converts a DataFrame's content to a list of dictionaries in the 'Document' format, containing page_content and associated metadata.
|
84 |
+
|
85 |
+
Parameters:
|
86 |
+
- df (PandasDataFrame): Data frame of search data.
|
87 |
+
- in_file (List[str]): List of input file names.
|
88 |
+
- text_column (str): The text column that will be searched.
|
89 |
+
- clean (str): Whether the text is cleaned before searching.
|
90 |
+
- return_intermediate_files (str): Whether intermediate processing files are saved to file.
|
91 |
+
- progress (gr.Progress, optional): The progress tracker for the operation.
|
92 |
+
|
93 |
+
Returns:
|
94 |
+
- tuple: A tuple containing data outputs in a Document class format, an output message, and a list of output file paths.
|
95 |
+
"""
|
96 |
|
97 |
ensure_output_folder_exists(output_folder)
|
98 |
output_list = []
|
|
|
110 |
return doc_sections, "Please load in at least one csv/Excel/parquet data file.", output_list
|
111 |
|
112 |
if not text_column:
|
113 |
+
return None, "Please enter a column name to search", output_list
|
114 |
|
115 |
data_file_name = data_file_names[0]
|
116 |
|
|
|
144 |
|
145 |
df[text_column] = df_list
|
146 |
|
|
|
147 |
clean_toc = time.perf_counter()
|
148 |
clean_time_out = f"Cleaning the text took {clean_toc - clean_tic:0.1f} seconds."
|
149 |
print(clean_time_out)
|
|
|
182 |
output_list.append(out_doc_file_name)
|
183 |
print("Documents saved to file.")
|
184 |
|
185 |
+
return doc_sections, "Finished preparing documents.", output_list
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
search_funcs/spacy_search_funcs.py
CHANGED
@@ -7,30 +7,19 @@ import gradio as gr
|
|
7 |
import pandas as pd
|
8 |
from typing import List, Type
|
9 |
from datetime import datetime
|
10 |
-
from search_funcs.helper_functions import create_highlighted_excel_wb, output_folder
|
11 |
|
12 |
PandasDataFrame = Type[pd.DataFrame]
|
13 |
|
14 |
today_rev = datetime.now().strftime("%Y%m%d")
|
15 |
|
16 |
-
# Load the SpaCy model
|
17 |
|
18 |
-
#os.system("python -m spacy download en_core_web_sm")
|
19 |
-
try:
|
20 |
-
import en_core_web_sm
|
21 |
-
nlp = en_core_web_sm.load()
|
22 |
-
print("Successfully imported spaCy model")
|
23 |
-
#nlp = spacy.load("en_core_web_sm")
|
24 |
-
#print(nlp._path)
|
25 |
-
except:
|
26 |
-
download("en_core_web_sm")
|
27 |
-
nlp = spacy.load("en_core_web_sm")
|
28 |
-
print("Successfully imported spaCy model")
|
29 |
|
30 |
def spacy_fuzzy_search(string_query:str, tokenised_data: List[List[str]], original_data: PandasDataFrame, text_column:str, in_join_file: PandasDataFrame, search_df_join_column:str, in_join_column:str, no_spelling_mistakes:int = 1, progress=gr.Progress(track_tqdm=True)):
|
31 |
''' Conduct fuzzy match on a list of data.'''
|
32 |
|
33 |
-
#
|
|
|
34 |
|
35 |
# Convert tokenised data back into a list of strings
|
36 |
df_list = list(map(" ".join, tokenised_data))
|
|
|
7 |
import pandas as pd
|
8 |
from typing import List, Type
|
9 |
from datetime import datetime
|
10 |
+
from search_funcs.helper_functions import create_highlighted_excel_wb, output_folder, load_spacy_model
|
11 |
|
12 |
PandasDataFrame = Type[pd.DataFrame]
|
13 |
|
14 |
today_rev = datetime.now().strftime("%Y%m%d")
|
15 |
|
|
|
16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
def spacy_fuzzy_search(string_query:str, tokenised_data: List[List[str]], original_data: PandasDataFrame, text_column:str, in_join_file: PandasDataFrame, search_df_join_column:str, in_join_column:str, no_spelling_mistakes:int = 1, progress=gr.Progress(track_tqdm=True)):
|
19 |
''' Conduct fuzzy match on a list of data.'''
|
20 |
|
21 |
+
# Load spaCy model
|
22 |
+
nlp = load_spacy_model()
|
23 |
|
24 |
# Convert tokenised data back into a list of strings
|
25 |
df_list = list(map(" ".join, tokenised_data))
|