seanpedrickcase committed
Commit d3ff2e2
1 Parent(s): 3b77fe5

Allowed for custom output folder, returned Dockerfile to work under user account and port 7860

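Note: with this change the app's output location is read from the GRADIO_OUTPUT_FOLDER environment variable (defaulting to output/). A minimal sketch of overriding it, assuming the variable is set before search_funcs.helper_functions is first imported (the path below is illustrative, not part of the commit):

    import os

    # output_folder is resolved at import time, so set the variable first.
    os.environ['GRADIO_OUTPUT_FOLDER'] = '/data/search_outputs/'

    from search_funcs.helper_functions import output_folder
    print(output_folder)  # -> /data/search_outputs/

Note the trailing slash: downstream code builds paths by plain string concatenation (output_folder + file_name), so a value without a trailing separator would silently mangle file names.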
Dockerfile CHANGED
@@ -26,21 +26,15 @@ RUN git lfs install
 RUN git clone https://huggingface.co/BAAI/bge-small-en-v1.5 /model/bge
 RUN rm -rf /model/bge/.git
 
-# Expose port 8080
-EXPOSE 8080
-
 # Set up a new user named "user" with user ID 1000
-#RUN useradd -m -u 1000 user
+RUN useradd -m -u 1000 user
 
 # Change ownership of /home/user directory
-#RUN chown -R user:user /home/user
+RUN chown -R user:user /home/user
 
 # Make output folder
-#RUN mkdir -p /home/user/app/output && chown -R user:user /home/user/app/output
-#RUN mkdir -p /home/user/.cache/huggingface/hub && chown -R user:user /home/user/.cache/huggingface/hub
-
-RUN mkdir -p /home/user/app/output
-RUN mkdir -p /home/user/.cache/huggingface/hub
+RUN mkdir -p /home/user/app/output && chown -R user:user /home/user/app/output
+RUN mkdir -p /home/user/.cache/huggingface/hub && chown -R user:user /home/user/.cache/huggingface/hub
 
 # Switch to the "user" user
 USER user
@@ -53,7 +47,7 @@ ENV HOME=/home/user \
     GRADIO_ALLOW_FLAGGING=never \
     GRADIO_NUM_PORTS=1 \
     GRADIO_SERVER_NAME=0.0.0.0 \
-    GRADIO_SERVER_PORT=8080 \
+    GRADIO_SERVER_PORT=7860 \
     GRADIO_THEME=huggingface \
    AWS_STS_REGIONAL_ENDPOINT=regional \
    #GRADIO_ROOT_PATH=/data-text-search \
@@ -63,8 +57,8 @@ ENV HOME=/home/user \
 WORKDIR $HOME/app
 
 # Copy the current directory contents into the container at $HOME/app setting the owner to the user
-#COPY --chown=user . $HOME/app
-COPY . $HOME/app
+COPY --chown=user . $HOME/app
+#COPY . $HOME/app
 
 
 CMD ["python", "app.py"]
app.py CHANGED
@@ -8,14 +8,16 @@ PandasDataFrame = Type[pd.DataFrame]
 from search_funcs.bm25_functions import prepare_bm25_input_data, prepare_bm25, bm25_search
 from search_funcs.semantic_ingest_functions import csv_excel_text_to_docs
 from search_funcs.semantic_functions import docs_to_bge_embed_np_array, bge_simple_retrieval
-from search_funcs.helper_functions import display_info, initial_data_load, put_columns_in_join_df, get_temp_folder_path, empty_folder
+from search_funcs.helper_functions import display_info, initial_data_load, put_columns_in_join_df, get_temp_folder_path, empty_folder, output_folder
 from search_funcs.spacy_search_funcs import spacy_fuzzy_search
 from search_funcs.aws_functions import load_data_from_aws
 
 #from fastapi import FastAPI
 #app = FastAPI()
 
-# Attempt to delete temporary files generated by previous use of the app (as the files can be very big!)
+
+
+# Attempt to delete temporary files generated by previous use of the app (as the files can be very big!). Only setup to work for local runs in Windows
 temp_folder_path = get_temp_folder_path()
 empty_folder(temp_folder_path)
 
search_funcs/bm25_functions.py CHANGED
@@ -14,7 +14,7 @@ from datetime import datetime
 today_rev = datetime.now().strftime("%Y%m%d")
 
 from search_funcs.clean_funcs import initial_clean # get_lemma_tokens, stem_sentence
-from search_funcs.helper_functions import get_file_path_end_with_ext, get_file_path_end, create_highlighted_excel_wb, ensure_output_folder_exists
+from search_funcs.helper_functions import get_file_path_end_with_ext, get_file_path_end, create_highlighted_excel_wb, ensure_output_folder_exists, output_folder
 
 # Load the SpaCy model
 from spacy.cli.download import download
@@ -232,7 +232,7 @@ class BM25:
 
 def prepare_bm25_input_data(in_file, text_column, data_state, tokenised_state, clean="No", return_intermediate_files = "No", progress=gr.Progress(track_tqdm=True)):
     #print(in_file)
-    ensure_output_folder_exists()
+    ensure_output_folder_exists(output_folder)
 
     if not in_file:
         print("No input file found. Please load in at least one file.")
@@ -327,9 +327,9 @@ def prepare_bm25_input_data(in_file, text_column, data_state, tokenised_state, c
     if return_intermediate_files == "Yes":
 
         if clean == "Yes":
-            tokenised_data_file_name = "output/" + data_file_out_name_no_ext + "_cleaned_tokenised.parquet"
+            tokenised_data_file_name = output_folder + data_file_out_name_no_ext + "_cleaned_tokenised.parquet"
         else:
-            tokenised_data_file_name = "output/" + data_file_out_name_no_ext + "_tokenised.parquet"
+            tokenised_data_file_name = output_folder + data_file_out_name_no_ext + "_tokenised.parquet"
 
         pd.DataFrame(data={"Corpus":corpus}).to_parquet(tokenised_data_file_name)
 
@@ -339,7 +339,7 @@ def prepare_bm25_input_data(in_file, text_column, data_state, tokenised_state, c
 
 def save_prepared_bm25_data(in_file_name, prepared_text_list, in_df, in_bm25_column, progress=gr.Progress(track_tqdm=True)):
 
-    ensure_output_folder_exists()
+    ensure_output_folder_exists(output_folder)
 
     # Check if the list and the dataframe have the same length
     if len(prepared_text_list) != len(in_df):
@@ -347,7 +347,7 @@ def save_prepared_bm25_data(in_file_name, prepared_text_list, in_df, in_bm25_col
 
     file_end = ".parquet"
 
-    file_name = "output/" + get_file_path_end(in_file_name) + "_cleaned" + file_end
+    file_name = output_folder + get_file_path_end(in_file_name) + "_cleaned" + file_end
 
     new_text_column = in_bm25_column + "_cleaned"
     prepared_text_df = pd.DataFrame(data={new_text_column:prepared_text_list})
@@ -547,10 +547,10 @@ def bm25_search(free_text_query, in_no_search_results, original_data, searched_d
     results_df_out = results_df_out.sort_values(['search_score_abs', "search_text"], ascending=False)
 
     # Out file
-    ensure_output_folder_exists()
+    ensure_output_folder_exists(output_folder)
 
     query_str_file = ("_").join(token_query)
-    results_df_name = "output/keyword_search_result_" + today_rev + "_" + query_str_file + ".xlsx"
+    results_df_name = output_folder + "keyword_search_result_" + today_rev + "_" + query_str_file + ".xlsx"
 
     print("Saving search file output")
     progress(0.7, desc = "Saving search output to file")
search_funcs/helper_functions.py CHANGED
@@ -19,6 +19,24 @@ megabyte = 1024 * 1024 # Bytes in a megabyte
 file_size_mb = 500 # Size in megabytes
 file_size_bytes_500mb = megabyte * file_size_mb
 
+def get_or_create_env_var(var_name, default_value):
+    # Get the environment variable if it exists
+    value = os.environ.get(var_name)
+
+    # If it doesn't exist, set it to the default value
+    if value is None:
+        os.environ[var_name] = default_value
+        value = default_value
+
+    return value
+
+# Retrieving or setting output folder
+env_var_name = 'GRADIO_OUTPUT_FOLDER'
+default_value = 'output/'
+
+output_folder = get_or_create_env_var(env_var_name, default_value)
+print(f'The value of {env_var_name} is {output_folder}')
+
 # Attempt to delete content of gradio temp folder
 def get_temp_folder_path():
     username = getpass.getuser()
@@ -58,17 +76,17 @@ def get_file_path_end_with_ext(file_path):
 
     return filename_end
 
-def ensure_output_folder_exists():
-    """Checks if the 'output/' folder exists, creates it if not."""
+def ensure_output_folder_exists(output_folder):
+    """Checks if the output folder exists, creates it if not."""
 
-    folder_name = "output/"
+    folder_name = output_folder
 
     if not os.path.exists(folder_name):
         # Create the folder if it doesn't exist
         os.makedirs(folder_name)
-        print(f"Created the 'output/' folder.")
+        print(f"Created the output folder:", folder_name)
     else:
-        print(f"The 'output/' folder already exists.")
+        print(f"The output folder already exists:", folder_name)
 
 def detect_file_type(filename):
     """Detect the file type based on its extension."""
search_funcs/semantic_functions.py CHANGED
@@ -25,7 +25,7 @@ else:
 
 print("Device used is: ", torch_device)
 
-from search_funcs.helper_functions import create_highlighted_excel_wb, ensure_output_folder_exists
+from search_funcs.helper_functions import create_highlighted_excel_wb, ensure_output_folder_exists, output_folder
 
 PandasDataFrame = Type[pd.DataFrame]
 
@@ -70,7 +70,7 @@ def docs_to_bge_embed_np_array(docs_out, in_file, embeddings_state, output_file_
     Takes a Langchain document class and saves it into a Numpy array.
     '''
 
-    ensure_output_folder_exists()
+    ensure_output_folder_exists(output_folder)
 
     if not in_file:
         out_message = "No input file found. Please load in at least one file."
@@ -232,7 +232,7 @@ def bge_simple_retrieval(query_str:str, vectorstore, docs, orig_df_col:str, k_va
     # print("vectorstore loaded: ", vectorstore)
     progress(0, desc = "Conducting semantic search")
 
-    ensure_output_folder_exists()
+    ensure_output_folder_exists(output_folder)
 
     print("Searching")
 
@@ -297,7 +297,7 @@ def bge_simple_retrieval(query_str:str, vectorstore, docs, orig_df_col:str, k_va
 
     query_str_file = query_str.replace(" ", "_")
 
-    results_df_name = "output/semantic_search_result_" + today_rev + "_" + query_str_file + ".xlsx"
+    results_df_name = output_folder + "semantic_search_result_" + today_rev + "_" + query_str_file + ".xlsx"
 
     print("Saving search output to file")
     progress(0.7, desc = "Saving search output to file")
@@ -594,7 +594,7 @@ def chroma_retrieval_deprecated(query_str:str, vectorstore, docs, orig_df_col:st
 
     results_df_out = process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_cut_off, vec_weight, orig_df_col, in_join_column, search_df_join_column)
 
-    results_df_name = "output/semantic_search_result.csv"
+    results_df_name = output_folder + "semantic_search_result.csv"
     results_df_out.to_csv(results_df_name, index= None)
     results_first_text = results_df_out[orig_df_col].iloc[0]
 
search_funcs/semantic_ingest_functions.py CHANGED
@@ -32,7 +32,7 @@ chunk_overlap = 0
 start_index = True
 
 from search_funcs.helper_functions import get_file_path_end_with_ext, detect_file_type, get_file_path_end, ensure_output_folder_exists
-from search_funcs.bm25_functions import save_prepared_bm25_data
+from search_funcs.bm25_functions import save_prepared_bm25_data, output_folder
 from search_funcs.clean_funcs import initial_clean
 
 def parse_file_not_used(file_paths, text_column='text'):
@@ -198,7 +198,7 @@ def parse_metadata(row):
 def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_intermediate_files = "No", chunk_size=None, progress=gr.Progress(track_tqdm=True)) -> List[Document]:
     """Converts a DataFrame's content to a list of dictionaries in the 'Document' format, containing page_content and associated metadata."""
 
-    ensure_output_folder_exists()
+    ensure_output_folder_exists(output_folder)
     output_list = []
 
     if not in_file:
@@ -305,7 +305,7 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
 
     if clean == "No":
         #pd.DataFrame(data = {"Documents":page_content_series_string}).to_parquet(file_name + "_prepared_docs.parquet")
-        out_doc_file_name = "output/" + file_name + "_prepared_docs.pkl.gz"
+        out_doc_file_name = output_folder + file_name + "_prepared_docs.pkl.gz"
         with gzip.open(out_doc_file_name, 'wb') as file:
             pickle.dump(doc_sections, file)
 
@@ -313,7 +313,7 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
 
     elif clean == "Yes":
         #pd.DataFrame(data = {"Documents":page_content_series_string}).to_parquet(file_name + "_prepared_docs_clean.parquet")
-        out_doc_file_name = "output/" + file_name + "_cleaned_prepared_docs.pkl.gz"
+        out_doc_file_name = output_folder + file_name + "_cleaned_prepared_docs.pkl.gz"
         with gzip.open(out_doc_file_name, 'wb') as file:
             pickle.dump(doc_sections, file)
 
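One structural detail: semantic_ingest_functions.py takes output_folder from search_funcs.bm25_functions rather than from helper_functions directly. Because bm25_functions simply re-exports the name it imported, both routes resolve to the same object; a quick check, assuming both modules import cleanly:

    from search_funcs.helper_functions import output_folder as direct
    from search_funcs.bm25_functions import output_folder as indirect

    assert direct is indirect  # same string either way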