seanpedrickcase committed
Commit • 8466e45
1 Parent(s): 7bdc986

Fixed cleaning for semantic search. Handles text containing backslashes (if cleaned). Updated packages. Added a requirements file for keyword-only search.
Browse files
- Dockerfile +16 -7
- README.md +1 -1
- app.py +1 -1
- requirements.txt +4 -6
- requirements_no_semantic.txt +9 -0
- search_funcs/aws_functions.py +2 -2
- search_funcs/bm25_functions.py +5 -5
- search_funcs/clean_funcs.py +6 -2
- search_funcs/semantic_functions.py +2 -2
- search_funcs/semantic_ingest_functions.py +2 -2
Dockerfile CHANGED
@@ -1,5 +1,5 @@
  # First stage: build dependencies
- FROM public.ecr.aws/docker/library/python:3.
+ FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm

  # Install wget
  RUN apt-get update && apt-get install -y wget
@@ -11,9 +11,12 @@ WORKDIR /src

  COPY requirements.txt .

- RUN pip install -r requirements.txt
+ RUN pip install --no-cache-dir -r requirements.txt

- #
+ # Gradio needs to be installed after due to conflict with spacy in requirements
+ RUN pip install --no-cache-dir gradio==4.31.0
+
+ # Download the BGE embedding model during the build process
  RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash
  RUN apt-get install git-lfs -y
  RUN git lfs install
@@ -21,10 +24,16 @@ RUN git clone https://huggingface.co/BAAI/bge-small-en-v1.5 /model/bge
  RUN rm -rf /model/bge/.git

  # Set up a new user named "user" with user ID 1000
-
+ RUN useradd -m -u 1000 user
+
+ # Change ownership of /home/user directory
+ RUN chown -R user:user /home/user
+
+ # Create the output files directory and set its permissions
+ RUN mkdir -p /home/user/output && chown -R user:user /home/user/output

  # Switch to the "user" user
-
+ USER user

  # Set home to the user's home directory
  ENV HOME=/home/user \
@@ -43,7 +52,7 @@ ENV HOME=/home/user \
  WORKDIR $HOME/app

  # Copy the current directory contents into the container at $HOME/app setting the owner to the user
-
- COPY . $HOME/app
+ COPY --chown=user . $HOME/app
+ #COPY . $HOME/app

  CMD ["python", "app.py"]
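Several of the code changes below write results to a relative output/ folder, while the Dockerfile now pre-creates /home/user/output for the non-root user. A minimal sketch of a defensive pattern that keeps the two in step by creating the relative output folder at startup if it does not already exist; the ensure_output_dir helper name is hypothetical and not part of this commit.

    import os

    def ensure_output_dir(path: str = "output") -> str:
        # Hypothetical helper: create the relative output folder (e.g. under
        # WORKDIR $HOME/app in the container) before any search results are written.
        os.makedirs(path, exist_ok=True)
        return path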
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🔍
  colorFrom: purple
  colorTo: green
  sdk: gradio
- sdk_version: 4.
+ sdk_version: 4.31.0
  app_file: app.py
  pinned: false
  license: apache-2.0
app.py CHANGED
@@ -129,7 +129,7 @@ depends on factors such as the type of documents or queries. Information taken f
  with gr.Accordion(label="Data load / save options", open = True):
  with gr.Row():
  in_clean_data = gr.Dropdown(label = "Clean text during load (remove html tags). For large files this may take some time!", value="No", choices=["Yes", "No"])
- return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="
+ return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="False", choices=["Yes", "No"])
  embedding_super_compress = gr.Dropdown(label = "Round embeddings to three dp for smaller files with less accuracy.", value="Yes", choices=["Yes", "No"])
  #save_clean_data_button = gr.Button(value = "Save loaded data to file", scale = 1)
  with gr.Accordion(label="Keyword search options", open = False):
requirements.txt CHANGED
@@ -1,13 +1,11 @@
- pandas==2.2.
+ pandas==2.2.2
  polars==0.20.3
  pyarrow==14.0.2
  openpyxl==3.1.2
- #transformers==4.37.2
- #accelerate==0.26.0
  torch==2.1.2
- spacy
+ spacy
  en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
- gradio
+ gradio
  sentence_transformers==2.3.1
  lxml==5.1.0
- boto3==1.34.
+ boto3==1.34.103
requirements_no_semantic.txt ADDED
@@ -0,0 +1,9 @@
+ pandas==2.2.2
+ polars==0.20.3
+ pyarrow==14.0.2
+ openpyxl==3.1.2
+ spacy
+ en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
+ gradio
+ lxml==5.1.0
+ boto3==1.34.103
search_funcs/aws_functions.py CHANGED
@@ -6,11 +6,11 @@ import os

  PandasDataFrame = Type[pd.DataFrame]

- bucket_name = os.environ['DATA_TEXT_SEARCH_BUCKET']
-
  try:
+     bucket_name = os.environ['DATA_TEXT_SEARCH_BUCKET']
      session = boto3.Session(profile_name="default")
  except Exception as e:
+     bucket_name = ''
      print(e)

  # sts = session.client("sts")
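Moving the bucket_name lookup inside the try block means a missing DATA_TEXT_SEARCH_BUCKET environment variable no longer fails at import time; the except branch now falls back to an empty string. As a point of comparison only (this is not what the module does), the same fallback can be expressed with os.environ.get:

    import os

    # Alternative sketch, not part of this commit: default to '' when the
    # variable is unset, without relying on the surrounding try/except.
    bucket_name = os.environ.get("DATA_TEXT_SEARCH_BUCKET", "")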
search_funcs/bm25_functions.py CHANGED
@@ -325,9 +325,9 @@ def prepare_bm25_input_data(in_file, text_column, data_state, tokenised_state, c

  if return_intermediate_files == "Yes":
      if clean == "Yes":
-         tokenised_data_file_name = data_file_out_name_no_ext + "_cleaned_tokenised.parquet"
+         tokenised_data_file_name = "output/" + data_file_out_name_no_ext + "_cleaned_tokenised.parquet"
      else:
-         tokenised_data_file_name = data_file_out_name_no_ext + "_tokenised.parquet"
+         tokenised_data_file_name = "output/" + data_file_out_name_no_ext + "_tokenised.parquet"

      pd.DataFrame(data={"Corpus":corpus}).to_parquet(tokenised_data_file_name)

@@ -354,9 +354,9 @@ def save_prepared_bm25_data(in_file_name, prepared_text_list, in_df, in_bm25_col
  prepared_df = pd.concat([in_df, prepared_text_df], axis = 1)

  if file_end == ".csv":
-     prepared_df.to_csv(file_name)
+     prepared_df.to_csv("output/" + file_name)
  elif file_end == ".parquet":
-     prepared_df.to_parquet(file_name)
+     prepared_df.to_parquet("output/" + file_name)
  else: file_name = None

  return file_name, new_text_column, prepared_df
@@ -544,7 +544,7 @@ def bm25_search(free_text_query, in_no_search_results, original_data, searched_d

  # Out file
  query_str_file = ("_").join(token_query)
- results_df_name = "keyword_search_result_" + today_rev + "_" + query_str_file + ".xlsx"
+ results_df_name = "output/keyword_search_result_" + today_rev + "_" + query_str_file + ".xlsx"

  print("Saving search file output")
  progress(0.7, desc = "Saving search output to file")
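The tokenised corpus is now written under output/ so it can be supplied on a later run to skip re-tokenising. A minimal sketch of reloading one of these intermediate files; the file name here is hypothetical, since real names follow the input file name plus "_tokenised.parquet" or "_cleaned_tokenised.parquet".

    import pandas as pd

    # Hypothetical example path for an intermediate tokenised corpus file.
    corpus = pd.read_parquet("output/example_data_tokenised.parquet")["Corpus"].tolist()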
search_funcs/clean_funcs.py CHANGED
@@ -9,6 +9,8 @@ import calendar
  #from tqdm import tqdm
  import gradio as gr

+ from typing import List
+
  # Adding custom words to the stopwords
  custom_words = []
  my_stop_words = custom_words
@@ -24,6 +26,7 @@ custom_words.extend(cal_month)


  # #### Some of my cleaning functions
+ replace_backslash = r'\\'
  email_start_pattern_regex = r'.*importance:|.*subject:'
  email_end_pattern_regex = r'kind regards.*|many thanks.*|sincerely.*'
  html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0| '
@@ -45,10 +48,11 @@ multiple_spaces_regex = r'\s{2,}'
  # nbsp_pattern = re.compile(nbsp_pattern_regex)


- def initial_clean(texts , progress=gr.Progress()):
+ def initial_clean(texts:List[str] , progress=gr.Progress()):
      texts = pl.Series(texts)#[]

-     text = texts.str.replace_all(
+     text = texts.str.replace_all(replace_backslash, '/')
+     text = text.str.replace_all(html_pattern_regex, '')
      text = text.str.replace_all(email_start_pattern_regex, '')
      text = text.str.replace_all(email_end_pattern_regex, '')
      text = text.str.replace_all(email_pattern_regex, '')
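The new replace_backslash pattern runs before the other regexes, so any literal backslash in the input is turned into a forward slash (the regex r'\\' matches a single backslash character). A small sketch of that behaviour with polars, using made-up sample text:

    import polars as pl

    texts = pl.Series([r"C:\docs\report.txt has details"])
    cleaned = texts.str.replace_all(r"\\", "/")
    print(cleaned[0])  # "C:/docs/report.txt has details"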
search_funcs/semantic_functions.py CHANGED
@@ -292,7 +292,7 @@ def bge_simple_retrieval(query_str:str, vectorstore, docs, orig_df_col:str, k_va

  query_str_file = query_str.replace(" ", "_")

- results_df_name = "semantic_search_result_" + today_rev + "_" + query_str_file + ".xlsx"
+ results_df_name = "output/semantic_search_result_" + today_rev + "_" + query_str_file + ".xlsx"

  print("Saving search output to file")
  progress(0.7, desc = "Saving search output to file")
@@ -589,7 +589,7 @@ def chroma_retrieval_deprecated(query_str:str, vectorstore, docs, orig_df_col:st

  results_df_out = process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_cut_off, vec_weight, orig_df_col, in_join_column, search_df_join_column)

- results_df_name = "semantic_search_result.csv"
+ results_df_name = "output/semantic_search_result.csv"
  results_df_out.to_csv(results_df_name, index= None)
  results_first_text = results_df_out[orig_df_col].iloc[0]

search_funcs/semantic_ingest_functions.py CHANGED
@@ -304,7 +304,7 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm

  if clean == "No":
      #pd.DataFrame(data = {"Documents":page_content_series_string}).to_parquet(file_name + "_prepared_docs.parquet")
-     out_doc_file_name = file_name + "_prepared_docs.pkl.gz"
+     out_doc_file_name = "output/" + file_name + "_prepared_docs.pkl.gz"
      with gzip.open(out_doc_file_name, 'wb') as file:
          pickle.dump(doc_sections, file)

@@ -312,7 +312,7 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
  elif clean == "Yes":
      #pd.DataFrame(data = {"Documents":page_content_series_string}).to_parquet(file_name + "_prepared_docs_clean.parquet")

-     out_doc_file_name = file_name + "_cleaned_prepared_docs.pkl.gz"
+     out_doc_file_name = "output/" + file_name + "_cleaned_prepared_docs.pkl.gz"
      with gzip.open(out_doc_file_name, 'wb') as file:
          pickle.dump(doc_sections, file)

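The prepared document sections are gzip-pickled under output/ so they can be reused without re-ingesting the source file. A short sketch of loading one back; the file name below is hypothetical, since real names end in "_prepared_docs.pkl.gz" or "_cleaned_prepared_docs.pkl.gz".

    import gzip
    import pickle

    # Hypothetical example path for a previously prepared docs file.
    with gzip.open("output/example_data_prepared_docs.pkl.gz", "rb") as f:
        doc_sections = pickle.load(f)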