seanpedrickcase committed
Commit d3ff2e2 • 1 Parent(s): 3b77fe5

Allowed for custom output folder, returned Dockerfile to work under user account and port 7860

Files changed:
- Dockerfile +7 -13
- app.py +4 -2
- search_funcs/bm25_functions.py +8 -8
- search_funcs/helper_functions.py +23 -5
- search_funcs/semantic_functions.py +5 -5
- search_funcs/semantic_ingest_functions.py +4 -4
Dockerfile
CHANGED
@@ -26,21 +26,15 @@ RUN git lfs install
 RUN git clone https://huggingface.co/BAAI/bge-small-en-v1.5 /model/bge
 RUN rm -rf /model/bge/.git

-# Expose port 8080
-EXPOSE 8080
-
 # Set up a new user named "user" with user ID 1000
-
+RUN useradd -m -u 1000 user

 # Change ownership of /home/user directory
-
+RUN chown -R user:user /home/user

 # Make output folder
-
-
-
-RUN mkdir -p /home/user/app/output
-RUN mkdir -p /home/user/.cache/huggingface/hub
+RUN mkdir -p /home/user/app/output && chown -R user:user /home/user/app/output
+RUN mkdir -p /home/user/.cache/huggingface/hub && chown -R user:user /home/user/.cache/huggingface/hub

 # Switch to the "user" user
 USER user
@@ -53,7 +47,7 @@ ENV HOME=/home/user \
     GRADIO_ALLOW_FLAGGING=never \
     GRADIO_NUM_PORTS=1 \
     GRADIO_SERVER_NAME=0.0.0.0 \
-    GRADIO_SERVER_PORT=
+    GRADIO_SERVER_PORT=7860 \
     GRADIO_THEME=huggingface \
     AWS_STS_REGIONAL_ENDPOINT=regional \
    #GRADIO_ROOT_PATH=/data-text-search \
@@ -63,8 +57,8 @@ ENV HOME=/home/user \
 WORKDIR $HOME/app

 # Copy the current directory contents into the container at $HOME/app setting the owner to the user
-
-COPY . $HOME/app
+COPY --chown=user . $HOME/app
+#COPY . $HOME/app


 CMD ["python", "app.py"]
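For context (not part of this commit): Gradio falls back to the GRADIO_SERVER_NAME and GRADIO_SERVER_PORT environment variables when launch() is not given explicit values, which is why the ENV block above is enough to move the container to 0.0.0.0:7860 under the non-root user. A minimal sketch with a hypothetical one-function demo app:

# Minimal sketch (hypothetical demo, not this repo's app.py): launch()
# picks up GRADIO_SERVER_NAME / GRADIO_SERVER_PORT from the environment
# when no explicit server_name/server_port arguments are passed.
import os
import gradio as gr

os.environ.setdefault("GRADIO_SERVER_NAME", "0.0.0.0")  # mirror the Dockerfile ENV
os.environ.setdefault("GRADIO_SERVER_PORT", "7860")

demo = gr.Interface(fn=lambda s: s.upper(), inputs="text", outputs="text")
demo.launch()  # serves on 0.0.0.0:7860, matching the container configuration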
app.py
CHANGED
@@ -8,14 +8,16 @@ PandasDataFrame = Type[pd.DataFrame]
 from search_funcs.bm25_functions import prepare_bm25_input_data, prepare_bm25, bm25_search
 from search_funcs.semantic_ingest_functions import csv_excel_text_to_docs
 from search_funcs.semantic_functions import docs_to_bge_embed_np_array, bge_simple_retrieval
-from search_funcs.helper_functions import display_info, initial_data_load, put_columns_in_join_df, get_temp_folder_path, empty_folder
+from search_funcs.helper_functions import display_info, initial_data_load, put_columns_in_join_df, get_temp_folder_path, empty_folder, output_folder
 from search_funcs.spacy_search_funcs import spacy_fuzzy_search
 from search_funcs.aws_functions import load_data_from_aws

 #from fastapi import FastAPI
 #app = FastAPI()

-
+
+
+# Attempt to delete temporary files generated by previous use of the app (as the files can be very big!). Only setup to work for local runs in Windows
 temp_folder_path = get_temp_folder_path()
 empty_folder(temp_folder_path)

search_funcs/bm25_functions.py
CHANGED
@@ -14,7 +14,7 @@ from datetime import datetime
 today_rev = datetime.now().strftime("%Y%m%d")

 from search_funcs.clean_funcs import initial_clean # get_lemma_tokens, stem_sentence
-from search_funcs.helper_functions import get_file_path_end_with_ext, get_file_path_end, create_highlighted_excel_wb, ensure_output_folder_exists
+from search_funcs.helper_functions import get_file_path_end_with_ext, get_file_path_end, create_highlighted_excel_wb, ensure_output_folder_exists, output_folder

 # Load the SpaCy model
 from spacy.cli.download import download
@@ -232,7 +232,7 @@ class BM25:

 def prepare_bm25_input_data(in_file, text_column, data_state, tokenised_state, clean="No", return_intermediate_files = "No", progress=gr.Progress(track_tqdm=True)):
     #print(in_file)
-    ensure_output_folder_exists()
+    ensure_output_folder_exists(output_folder)

     if not in_file:
         print("No input file found. Please load in at least one file.")
@@ -327,9 +327,9 @@ def prepare_bm25_input_data(in_file, text_column, data_state, tokenised_state, c
     if return_intermediate_files == "Yes":

         if clean == "Yes":
-            tokenised_data_file_name =
+            tokenised_data_file_name = output_folder + data_file_out_name_no_ext + "_cleaned_tokenised.parquet"
         else:
-            tokenised_data_file_name =
+            tokenised_data_file_name = output_folder + data_file_out_name_no_ext + "_tokenised.parquet"

         pd.DataFrame(data={"Corpus":corpus}).to_parquet(tokenised_data_file_name)

@@ -339,7 +339,7 @@ def prepare_bm25_input_data(in_file, text_column, data_state, tokenised_state, c

 def save_prepared_bm25_data(in_file_name, prepared_text_list, in_df, in_bm25_column, progress=gr.Progress(track_tqdm=True)):

-    ensure_output_folder_exists()
+    ensure_output_folder_exists(output_folder)

     # Check if the list and the dataframe have the same length
     if len(prepared_text_list) != len(in_df):
@@ -347,7 +347,7 @@ def save_prepared_bm25_data(in_file_name, prepared_text_list, in_df, in_bm25_col

     file_end = ".parquet"

-    file_name =
+    file_name = output_folder + get_file_path_end(in_file_name) + "_cleaned" + file_end

     new_text_column = in_bm25_column + "_cleaned"
     prepared_text_df = pd.DataFrame(data={new_text_column:prepared_text_list})
@@ -547,10 +547,10 @@ def bm25_search(free_text_query, in_no_search_results, original_data, searched_d
     results_df_out = results_df_out.sort_values(['search_score_abs', "search_text"], ascending=False)

     # Out file
-    ensure_output_folder_exists()
+    ensure_output_folder_exists(output_folder)

     query_str_file = ("_").join(token_query)
-    results_df_name = "
+    results_df_name = output_folder + "keyword_search_result_" + today_rev + "_" + query_str_file + ".xlsx"

     print("Saving search file output")
     progress(0.7, desc = "Saving search output to file")
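Taken together, every artifact name above is now built as output_folder plus a basename, so redirecting all saved files only requires changing that one value. A quick illustration of a resolved path, using hypothetical values and the default 'output/' folder:

# Hypothetical illustration of the naming scheme in the diff above;
# output_folder comes from helper_functions.py (default 'output/').
from datetime import datetime

output_folder = 'output/'
today_rev = datetime.now().strftime("%Y%m%d")   # e.g. '20240210'
query_str_file = "_".join(["example", "query"])

results_df_name = output_folder + "keyword_search_result_" + today_rev + "_" + query_str_file + ".xlsx"
print(results_df_name)  # output/keyword_search_result_20240210_example_query.xlsx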
search_funcs/helper_functions.py
CHANGED
@@ -19,6 +19,24 @@ megabyte = 1024 * 1024 # Bytes in a megabyte
 file_size_mb = 500 # Size in megabytes
 file_size_bytes_500mb = megabyte * file_size_mb

+def get_or_create_env_var(var_name, default_value):
+    # Get the environment variable if it exists
+    value = os.environ.get(var_name)
+
+    # If it doesn't exist, set it to the default value
+    if value is None:
+        os.environ[var_name] = default_value
+        value = default_value
+
+    return value
+
+# Retrieving or setting output folder
+env_var_name = 'GRADIO_OUTPUT_FOLDER'
+default_value = 'output/'
+
+output_folder = get_or_create_env_var(env_var_name, default_value)
+print(f'The value of {env_var_name} is {output_folder}')
+
 # Attempt to delete content of gradio temp folder
 def get_temp_folder_path():
     username = getpass.getuser()
@@ -58,17 +76,17 @@ def get_file_path_end_with_ext(file_path):

     return filename_end

-def ensure_output_folder_exists():
-    """Checks if the
+def ensure_output_folder_exists(output_folder):
+    """Checks if the output folder exists, creates it if not."""

-    folder_name =
+    folder_name = output_folder

     if not os.path.exists(folder_name):
         # Create the folder if it doesn't exist
         os.makedirs(folder_name)
-        print(f"Created the
+        print(f"Created the output folder:", folder_name)
     else:
-        print(f"The
+        print(f"The output folder already exists:", folder_name)

 def detect_file_type(filename):
     """Detect the file type based on its extension."""
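Because get_or_create_env_var only falls back to the default when the variable is unset, the output location can now be customised per deployment. A minimal sketch (a hypothetical standalone script, assuming the search_funcs package is importable), setting the variable before the helpers are imported:

# Minimal sketch: set GRADIO_OUTPUT_FOLDER before importing the helpers,
# so get_or_create_env_var returns the custom value instead of 'output/'.
import os

os.environ['GRADIO_OUTPUT_FOLDER'] = '/home/user/app/output/'

from search_funcs.helper_functions import output_folder, ensure_output_folder_exists

ensure_output_folder_exists(output_folder)  # creates the folder if it is missing
print(output_folder)                        # /home/user/app/output/

In a container, an ENV line in the Dockerfile or a -e flag on docker run would achieve the same override.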
search_funcs/semantic_functions.py
CHANGED
@@ -25,7 +25,7 @@ else:

 print("Device used is: ", torch_device)

-from search_funcs.helper_functions import create_highlighted_excel_wb, ensure_output_folder_exists
+from search_funcs.helper_functions import create_highlighted_excel_wb, ensure_output_folder_exists, output_folder

 PandasDataFrame = Type[pd.DataFrame]

@@ -70,7 +70,7 @@ def docs_to_bge_embed_np_array(docs_out, in_file, embeddings_state, output_file_
     Takes a Langchain document class and saves it into a Numpy array.
     '''

-    ensure_output_folder_exists()
+    ensure_output_folder_exists(output_folder)

     if not in_file:
         out_message = "No input file found. Please load in at least one file."
@@ -232,7 +232,7 @@ def bge_simple_retrieval(query_str:str, vectorstore, docs, orig_df_col:str, k_va
     # print("vectorstore loaded: ", vectorstore)
     progress(0, desc = "Conducting semantic search")

-    ensure_output_folder_exists()
+    ensure_output_folder_exists(output_folder)

     print("Searching")

@@ -297,7 +297,7 @@ def bge_simple_retrieval(query_str:str, vectorstore, docs, orig_df_col:str, k_va

     query_str_file = query_str.replace(" ", "_")

-    results_df_name = "
+    results_df_name = output_folder + "semantic_search_result_" + today_rev + "_" + query_str_file + ".xlsx"

     print("Saving search output to file")
     progress(0.7, desc = "Saving search output to file")
@@ -594,7 +594,7 @@ def chroma_retrieval_deprecated(query_str:str, vectorstore, docs, orig_df_col:st

     results_df_out = process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_cut_off, vec_weight, orig_df_col, in_join_column, search_df_join_column)

-    results_df_name = "
+    results_df_name = output_folder + "semantic_search_result.csv"
     results_df_out.to_csv(results_df_name, index= None)
     results_first_text = results_df_out[orig_df_col].iloc[0]

search_funcs/semantic_ingest_functions.py
CHANGED
@@ -32,7 +32,7 @@ chunk_overlap = 0
 start_index = True

 from search_funcs.helper_functions import get_file_path_end_with_ext, detect_file_type, get_file_path_end, ensure_output_folder_exists
-from search_funcs.bm25_functions import save_prepared_bm25_data
+from search_funcs.bm25_functions import save_prepared_bm25_data, output_folder
 from search_funcs.clean_funcs import initial_clean

 def parse_file_not_used(file_paths, text_column='text'):
@@ -198,7 +198,7 @@ def parse_metadata(row):
 def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_intermediate_files = "No", chunk_size=None, progress=gr.Progress(track_tqdm=True)) -> List[Document]:
     """Converts a DataFrame's content to a list of dictionaries in the 'Document' format, containing page_content and associated metadata."""

-    ensure_output_folder_exists()
+    ensure_output_folder_exists(output_folder)
     output_list = []

     if not in_file:
@@ -305,7 +305,7 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm

     if clean == "No":
         #pd.DataFrame(data = {"Documents":page_content_series_string}).to_parquet(file_name + "_prepared_docs.parquet")
-        out_doc_file_name =
+        out_doc_file_name = output_folder + file_name + "_prepared_docs.pkl.gz"
         with gzip.open(out_doc_file_name, 'wb') as file:
             pickle.dump(doc_sections, file)

@@ -313,7 +313,7 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm

     elif clean == "Yes":
         #pd.DataFrame(data = {"Documents":page_content_series_string}).to_parquet(file_name + "_prepared_docs_clean.parquet")

-        out_doc_file_name =
+        out_doc_file_name = output_folder + file_name + "_cleaned_prepared_docs.pkl.gz"
         with gzip.open(out_doc_file_name, 'wb') as file:
             pickle.dump(doc_sections, file)
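The prepared document sections are serialised with gzip plus pickle, as the last two hunks show. A minimal round-trip sketch, using hypothetical file and variable names:

# Minimal round-trip sketch (hypothetical names): document sections are
# written to a gzip-compressed pickle under output_folder and can be
# reloaded the same way.
import gzip
import os
import pickle

output_folder = 'output/'
os.makedirs(output_folder, exist_ok=True)
doc_sections = [{"page_content": "example text", "metadata": {"row": 0}}]

out_doc_file_name = output_folder + "example_prepared_docs.pkl.gz"
with gzip.open(out_doc_file_name, 'wb') as file:
    pickle.dump(doc_sections, file)

with gzip.open(out_doc_file_name, 'rb') as file:
    reloaded = pickle.load(file)

print(reloaded[0]["page_content"])  # example text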