seanpedrickcase committed
Commit • 8466e45
1 Parent(s): 7bdc986

Fixed cleaning for semantic search. Handles text containing backslashes (if cleaned). Updated packages. Added a requirements file for keyword-only search.
Browse files
- Dockerfile +16 -7
- README.md +1 -1
- app.py +1 -1
- requirements.txt +4 -6
- requirements_no_semantic.txt +9 -0
- search_funcs/aws_functions.py +2 -2
- search_funcs/bm25_functions.py +5 -5
- search_funcs/clean_funcs.py +6 -2
- search_funcs/semantic_functions.py +2 -2
- search_funcs/semantic_ingest_functions.py +2 -2
Dockerfile CHANGED
@@ -1,5 +1,5 @@
  # First stage: build dependencies
- FROM public.ecr.aws/docker/library/python:3.
+ FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm

  # Install wget
  RUN apt-get update && apt-get install -y wget
@@ -11,9 +11,12 @@ WORKDIR /src

  COPY requirements.txt .

- RUN pip install -r requirements.txt
+ RUN pip install --no-cache-dir -r requirements.txt

- #
+ # Gradio needs to be installed after due to conflict with spacy in requirements
+ RUN pip install --no-cache-dir gradio==4.31.0
+
+ # Download the BGE embedding model during the build process
  RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash
  RUN apt-get install git-lfs -y
  RUN git lfs install
@@ -21,10 +24,16 @@ RUN git clone https://huggingface.co/BAAI/bge-small-en-v1.5 /model/bge
  RUN rm -rf /model/bge/.git

  # Set up a new user named "user" with user ID 1000
-
+ RUN useradd -m -u 1000 user
+
+ # Change ownership of /home/user directory
+ RUN chown -R user:user /home/user
+
+ # Create the output files directory and set its permissions
+ RUN mkdir -p /home/user/output && chown -R user:user /home/user/output

  # Switch to the "user" user
-
+ USER user

  # Set home to the user's home directory
  ENV HOME=/home/user \
@@ -43,7 +52,7 @@ ENV HOME=/home/user \
  WORKDIR $HOME/app

  # Copy the current directory contents into the container at $HOME/app setting the owner to the user
-
- COPY . $HOME/app
+ COPY --chown=user . $HOME/app
+ #COPY . $HOME/app

  CMD ["python", "app.py"]
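Several of the code changes below write results to a relative output/ folder, while the Dockerfile now pre-creates /home/user/output for the non-root user. A minimal sketch of a defensive pattern that keeps the two in step by creating the relative output folder at startup if it does not already exist; the ensure_output_dir helper name is hypothetical and not part of this commit.

    import os

    def ensure_output_dir(path: str = "output") -> str:
        # Hypothetical helper: create the relative output folder (e.g. under
        # WORKDIR $HOME/app in the container) before any search results are written.
        os.makedirs(path, exist_ok=True)
        return path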
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🔍
  colorFrom: purple
  colorTo: green
  sdk: gradio
- sdk_version: 4.
+ sdk_version: 4.31.0
  app_file: app.py
  pinned: false
  license: apache-2.0
app.py CHANGED
@@ -129,7 +129,7 @@ depends on factors such as the type of documents or queries. Information taken f
  with gr.Accordion(label="Data load / save options", open = True):
  with gr.Row():
  in_clean_data = gr.Dropdown(label = "Clean text during load (remove html tags). For large files this may take some time!", value="No", choices=["Yes", "No"])
- return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="
+ return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="False", choices=["Yes", "No"])
  embedding_super_compress = gr.Dropdown(label = "Round embeddings to three dp for smaller files with less accuracy.", value="Yes", choices=["Yes", "No"])
  #save_clean_data_button = gr.Button(value = "Save loaded data to file", scale = 1)
  with gr.Accordion(label="Keyword search options", open = False):
requirements.txt CHANGED
@@ -1,13 +1,11 @@
- pandas==2.2.
+ pandas==2.2.2
  polars==0.20.3
  pyarrow==14.0.2
  openpyxl==3.1.2
- #transformers==4.37.2
- #accelerate==0.26.0
  torch==2.1.2
- spacy
+ spacy
  en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
- gradio
+ gradio
  sentence_transformers==2.3.1
  lxml==5.1.0
- boto3==1.34.
+ boto3==1.34.103
requirements_no_semantic.txt ADDED
@@ -0,0 +1,9 @@
+ pandas==2.2.2
+ polars==0.20.3
+ pyarrow==14.0.2
+ openpyxl==3.1.2
+ spacy
+ en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
+ gradio
+ lxml==5.1.0
+ boto3==1.34.103
search_funcs/aws_functions.py CHANGED
@@ -6,11 +6,11 @@ import os

  PandasDataFrame = Type[pd.DataFrame]

- bucket_name = os.environ['DATA_TEXT_SEARCH_BUCKET']
-
  try:
+     bucket_name = os.environ['DATA_TEXT_SEARCH_BUCKET']
      session = boto3.Session(profile_name="default")
  except Exception as e:
+     bucket_name = ''
      print(e)

  # sts = session.client("sts")
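Moving the bucket_name lookup inside the try block means a missing DATA_TEXT_SEARCH_BUCKET environment variable no longer fails at import time; the except branch now falls back to an empty string. As a point of comparison only (this is not what the module does), the same fallback can be expressed with os.environ.get:

    import os

    # Alternative sketch, not part of this commit: default to '' when the
    # variable is unset, without relying on the surrounding try/except.
    bucket_name = os.environ.get("DATA_TEXT_SEARCH_BUCKET", "")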
search_funcs/bm25_functions.py CHANGED
@@ -325,9 +325,9 @@ def prepare_bm25_input_data(in_file, text_column, data_state, tokenised_state, c

  if return_intermediate_files == "Yes":
      if clean == "Yes":
-         tokenised_data_file_name = data_file_out_name_no_ext + "_cleaned_tokenised.parquet"
+         tokenised_data_file_name = "output/" + data_file_out_name_no_ext + "_cleaned_tokenised.parquet"
      else:
-         tokenised_data_file_name = data_file_out_name_no_ext + "_tokenised.parquet"
+         tokenised_data_file_name = "output/" + data_file_out_name_no_ext + "_tokenised.parquet"

      pd.DataFrame(data={"Corpus":corpus}).to_parquet(tokenised_data_file_name)

@@ -354,9 +354,9 @@ def save_prepared_bm25_data(in_file_name, prepared_text_list, in_df, in_bm25_col
  prepared_df = pd.concat([in_df, prepared_text_df], axis = 1)

  if file_end == ".csv":
-     prepared_df.to_csv(file_name)
+     prepared_df.to_csv("output/" + file_name)
  elif file_end == ".parquet":
-     prepared_df.to_parquet(file_name)
+     prepared_df.to_parquet("output/" + file_name)
  else: file_name = None

  return file_name, new_text_column, prepared_df
@@ -544,7 +544,7 @@ def bm25_search(free_text_query, in_no_search_results, original_data, searched_d

  # Out file
  query_str_file = ("_").join(token_query)
- results_df_name = "keyword_search_result_" + today_rev + "_" + query_str_file + ".xlsx"
+ results_df_name = "output/keyword_search_result_" + today_rev + "_" + query_str_file + ".xlsx"

  print("Saving search file output")
  progress(0.7, desc = "Saving search output to file")
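The tokenised corpus is now written under output/ so it can be supplied on a later run to skip re-tokenising. A minimal sketch of reloading one of these intermediate files; the file name here is hypothetical, since real names follow the input file name plus "_tokenised.parquet" or "_cleaned_tokenised.parquet".

    import pandas as pd

    # Hypothetical example path for an intermediate tokenised corpus file.
    corpus = pd.read_parquet("output/example_data_tokenised.parquet")["Corpus"].tolist()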
search_funcs/clean_funcs.py CHANGED
@@ -9,6 +9,8 @@ import calendar
  #from tqdm import tqdm
  import gradio as gr

+ from typing import List
+
  # Adding custom words to the stopwords
  custom_words = []
  my_stop_words = custom_words
@@ -24,6 +26,7 @@ custom_words.extend(cal_month)


  # #### Some of my cleaning functions
+ replace_backslash = r'\\'
  email_start_pattern_regex = r'.*importance:|.*subject:'
  email_end_pattern_regex = r'kind regards.*|many thanks.*|sincerely.*'
  html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0| '
@@ -45,10 +48,11 @@ multiple_spaces_regex = r'\s{2,}'
  # nbsp_pattern = re.compile(nbsp_pattern_regex)


- def initial_clean(texts , progress=gr.Progress()):
+ def initial_clean(texts:List[str] , progress=gr.Progress()):
      texts = pl.Series(texts)#[]

-     text = texts.str.replace_all(
+     text = texts.str.replace_all(replace_backslash, '/')
+     text = text.str.replace_all(html_pattern_regex, '')
      text = text.str.replace_all(email_start_pattern_regex, '')
      text = text.str.replace_all(email_end_pattern_regex, '')
      text = text.str.replace_all(email_pattern_regex, '')
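The new replace_backslash pattern runs before the other regexes, so any literal backslash in the input is turned into a forward slash (the regex r'\\' matches a single backslash character). A small sketch of that behaviour with polars, using made-up sample text:

    import polars as pl

    texts = pl.Series([r"C:\docs\report.txt has details"])
    cleaned = texts.str.replace_all(r"\\", "/")
    print(cleaned[0])  # "C:/docs/report.txt has details"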
search_funcs/semantic_functions.py CHANGED
@@ -292,7 +292,7 @@ def bge_simple_retrieval(query_str:str, vectorstore, docs, orig_df_col:str, k_va

  query_str_file = query_str.replace(" ", "_")

- results_df_name = "semantic_search_result_" + today_rev + "_" + query_str_file + ".xlsx"
+ results_df_name = "output/semantic_search_result_" + today_rev + "_" + query_str_file + ".xlsx"

  print("Saving search output to file")
  progress(0.7, desc = "Saving search output to file")
@@ -589,7 +589,7 @@ def chroma_retrieval_deprecated(query_str:str, vectorstore, docs, orig_df_col:st

  results_df_out = process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_cut_off, vec_weight, orig_df_col, in_join_column, search_df_join_column)

- results_df_name = "semantic_search_result.csv"
+ results_df_name = "output/semantic_search_result.csv"
  results_df_out.to_csv(results_df_name, index= None)
  results_first_text = results_df_out[orig_df_col].iloc[0]

search_funcs/semantic_ingest_functions.py CHANGED
@@ -304,7 +304,7 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm

  if clean == "No":
      #pd.DataFrame(data = {"Documents":page_content_series_string}).to_parquet(file_name + "_prepared_docs.parquet")
-     out_doc_file_name = file_name + "_prepared_docs.pkl.gz"
+     out_doc_file_name = "output/" + file_name + "_prepared_docs.pkl.gz"
      with gzip.open(out_doc_file_name, 'wb') as file:
          pickle.dump(doc_sections, file)

@@ -312,7 +312,7 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
  elif clean == "Yes":
      #pd.DataFrame(data = {"Documents":page_content_series_string}).to_parquet(file_name + "_prepared_docs_clean.parquet")

-     out_doc_file_name = file_name + "_cleaned_prepared_docs.pkl.gz"
+     out_doc_file_name = "output/" + file_name + "_cleaned_prepared_docs.pkl.gz"
      with gzip.open(out_doc_file_name, 'wb') as file:
          pickle.dump(doc_sections, file)

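The prepared document sections are gzip-pickled under output/ so they can be reused without re-ingesting the source file. A short sketch of loading one back; the file name below is hypothetical, since real names end in "_prepared_docs.pkl.gz" or "_cleaned_prepared_docs.pkl.gz".

    import gzip
    import pickle

    # Hypothetical example path for a previously prepared docs file.
    with gzip.open("output/example_data_prepared_docs.pkl.gz", "rb") as f:
        doc_sections = pickle.load(f)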