seanpedrickcase
committed on
Commit
•
2754a2b
1
Parent(s):
1dc162b
Some package updates and minor changes
Browse files- Dockerfile +1 -1
- README.md +1 -1
- app.py +6 -6
- how_to_create_exe_dist.txt +2 -2
- requirements.txt +4 -3
- requirements_gpu.txt +11 -0
- search_funcs/helper_functions.py +12 -0
- search_funcs/semantic_functions.py +1 -0
Dockerfile
CHANGED
@@ -17,7 +17,7 @@ COPY requirements.txt .
|
|
17 |
RUN pip install --no-cache-dir -r requirements.txt
|
18 |
|
19 |
# Gradio needs to be installed after due to conflict with spacy in requirements
|
20 |
-
RUN pip install --no-cache-dir gradio==4.
|
21 |
|
22 |
# Download the BGE embedding model during the build process
|
23 |
RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash
|
|
|
17 |
RUN pip install --no-cache-dir -r requirements.txt
|
18 |
|
19 |
# Gradio needs to be installed after due to conflict with spacy in requirements
|
20 |
+
RUN pip install --no-cache-dir gradio==4.36.1
|
21 |
|
22 |
# Download the BGE embedding model during the build process
|
23 |
RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash
|
README.md
CHANGED
@@ -4,7 +4,7 @@ emoji: π
|
|
4 |
colorFrom: purple
|
5 |
colorTo: green
|
6 |
sdk: gradio
|
7 |
-
sdk_version: 4.
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: apache-2.0
|
|
|
4 |
colorFrom: purple
|
5 |
colorTo: green
|
6 |
sdk: gradio
|
7 |
+
sdk_version: 4.36.0
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: apache-2.0
|
app.py
CHANGED
@@ -183,12 +183,12 @@ depends on factors such as the type of documents or queries. Information taken f
|
|
183 |
in_join_file.change(put_columns_in_join_df, inputs=[in_join_file], outputs=[in_join_column, join_data_state, in_join_message])
|
184 |
|
185 |
# Load in BM25 data
|
186 |
-
load_bm25_data_button.click(fn=prepare_bm25_input_data, inputs=[in_bm25_file, in_bm25_column, keyword_data_state, tokenised_state, in_clean_data, return_intermediate_files], outputs=[corpus_state, load_finished_message, keyword_data_state, output_file, output_file, keyword_data_list_state, in_bm25_column]).\
|
187 |
-
then(fn=prepare_bm25, inputs=[corpus_state, in_bm25_file, in_bm25_column, search_index_state, in_clean_data, return_intermediate_files, in_k1, in_b, in_alpha], outputs=[load_finished_message, output_file])#.\
|
188 |
|
189 |
|
190 |
# BM25 search functions on click or enter
|
191 |
-
keyword_search_button.click(fn=bm25_search, inputs=[keyword_query, in_no_search_results, orig_keyword_data_state, keyword_data_state, in_bm25_column, join_data_state, in_clean_data, in_join_column, search_df_join_column], outputs=[output_single_text, output_file], api_name="
|
192 |
keyword_query.submit(fn=bm25_search, inputs=[keyword_query, in_no_search_results, orig_keyword_data_state, keyword_data_state, in_bm25_column, join_data_state, in_clean_data, in_join_column, search_df_join_column], outputs=[output_single_text, output_file])
|
193 |
|
194 |
# Fuzzy search functions on click
|
@@ -209,15 +209,15 @@ depends on factors such as the type of documents or queries. Information taken f
|
|
209 |
# Simple run for HF spaces or local on your computer
|
210 |
#block.queue().launch(debug=True)
|
211 |
|
212 |
-
#def get_params(request: gr.Request):
|
213 |
# if request:
|
214 |
# print("Request headers dictionary:", request.headers)
|
215 |
# print("IP address:", request.client.host)
|
216 |
# print("Query parameters:", dict(request.query_params))
|
217 |
# return request.query_params
|
218 |
|
219 |
-
#request_params = get_params()
|
220 |
-
#print(request_params)
|
221 |
|
222 |
# Running on server (e.g. AWS) without specifying port
|
223 |
block.queue().launch(ssl_verify=False) # root_path="/data-text-search" # server_name="0.0.0.0",
|
|
|
183 |
in_join_file.change(put_columns_in_join_df, inputs=[in_join_file], outputs=[in_join_column, join_data_state, in_join_message])
|
184 |
|
185 |
# Load in BM25 data
|
186 |
+
load_bm25_data_button.click(fn=prepare_bm25_input_data, inputs=[in_bm25_file, in_bm25_column, keyword_data_state, tokenised_state, in_clean_data, return_intermediate_files], outputs=[corpus_state, load_finished_message, keyword_data_state, output_file, output_file, keyword_data_list_state, in_bm25_column], api_name="load_keyword").\
|
187 |
+
then(fn=prepare_bm25, inputs=[corpus_state, in_bm25_file, in_bm25_column, search_index_state, in_clean_data, return_intermediate_files, in_k1, in_b, in_alpha], outputs=[load_finished_message, output_file], api_name="prepare_keyword")#.\
|
188 |
|
189 |
|
190 |
# BM25 search functions on click or enter
|
191 |
+
keyword_search_button.click(fn=bm25_search, inputs=[keyword_query, in_no_search_results, orig_keyword_data_state, keyword_data_state, in_bm25_column, join_data_state, in_clean_data, in_join_column, search_df_join_column], outputs=[output_single_text, output_file], api_name="keyword_search")
|
192 |
keyword_query.submit(fn=bm25_search, inputs=[keyword_query, in_no_search_results, orig_keyword_data_state, keyword_data_state, in_bm25_column, join_data_state, in_clean_data, in_join_column, search_df_join_column], outputs=[output_single_text, output_file])
|
193 |
|
194 |
# Fuzzy search functions on click
|
|
|
209 |
# Simple run for HF spaces or local on your computer
|
210 |
#block.queue().launch(debug=True)
|
211 |
|
212 |
+
# def get_params(request: gr.Request):
|
213 |
# if request:
|
214 |
# print("Request headers dictionary:", request.headers)
|
215 |
# print("IP address:", request.client.host)
|
216 |
# print("Query parameters:", dict(request.query_params))
|
217 |
# return request.query_params
|
218 |
|
219 |
+
# request_params = get_params()
|
220 |
+
# print(request_params)
|
221 |
|
222 |
# Running on server (e.g. AWS) without specifying port
|
223 |
block.queue().launch(ssl_verify=False) # root_path="/data-text-search" # server_name="0.0.0.0",
|
how_to_create_exe_dist.txt
CHANGED
@@ -14,7 +14,7 @@ NOTE: for ensuring that spaCy models are loaded into the program correctly in re
|
|
14 |
|
15 |
9.Run the following, assuming you want to make one single .exe file (This helped me: https://github.com/pyinstaller/pyinstaller/issues/8108):
|
16 |
|
17 |
-
a) In command line: pyi-makespec --additional-hooks-dir="build_deps\\" --collect-data=gradio_client --collect-data=gradio --hidden-import pyarrow.vendored.version --name DataSearchApp_0.
|
18 |
|
19 |
# Add --onefile to the above if you would like everything packaged as a single exe, although this will need to be extracted upon starting the app, slowing down initialisation time significantly.
|
20 |
|
@@ -28,7 +28,7 @@ a = Analysis(
|
|
28 |
}
|
29 |
)
|
30 |
|
31 |
-
c) Back in command line, run this: pyinstaller --clean --noconfirm DataSearchApp_0.
|
32 |
|
33 |
|
34 |
9. A 'dist' folder will be created with the executable inside along with all dependencies('dist\data_text_search').
|
|
|
14 |
|
15 |
9.Run the following, assuming you want to make one single .exe file (This helped me: https://github.com/pyinstaller/pyinstaller/issues/8108):
|
16 |
|
17 |
+
a) In command line: pyi-makespec --additional-hooks-dir="build_deps\\" --collect-data=gradio_client --collect-data=gradio --hidden-import pyarrow.vendored.version --name DataSearchApp_0.5 app.py
|
18 |
|
19 |
# Add --onefile to the above if you would like everything packaged as a single exe, although this will need to be extracted upon starting the app, slowing down initialisation time significantly.
|
20 |
|
|
|
28 |
}
|
29 |
)
|
30 |
|
31 |
+
c) Back in command line, run this: pyinstaller --clean --noconfirm DataSearchApp_0.5.spec
|
32 |
|
33 |
|
34 |
9. A 'dist' folder will be created with the executable inside along with all dependencies('dist\data_text_search').
|
requirements.txt
CHANGED
@@ -2,10 +2,11 @@ pandas==2.2.2
|
|
2 |
polars==0.20.3
|
3 |
pyarrow==14.0.2
|
4 |
openpyxl==3.1.2
|
5 |
-
torch==2.1
|
|
|
6 |
spacy
|
7 |
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
|
8 |
gradio
|
9 |
-
sentence_transformers==
|
10 |
lxml==5.1.0
|
11 |
-
boto3==1.34.103
|
|
|
2 |
polars==0.20.3
|
3 |
pyarrow==14.0.2
|
4 |
openpyxl==3.1.2
|
5 |
+
torch==2.3.1
|
6 |
+
transformers==4.41.2
|
7 |
spacy
|
8 |
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
|
9 |
gradio
|
10 |
+
sentence_transformers==3.0.1
|
11 |
lxml==5.1.0
|
12 |
+
boto3==1.34.103
|
requirements_gpu.txt
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
pandas==2.2.2
|
2 |
+
polars==0.20.3
|
3 |
+
pyarrow==14.0.2
|
4 |
+
openpyxl==3.1.2
|
5 |
+
torch==2.3.1 --index-url https://download.pytorch.org/whl/cu121
|
6 |
+
spacy
|
7 |
+
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
|
8 |
+
gradio
|
9 |
+
sentence_transformers==2.3.1
|
10 |
+
lxml==5.1.0
|
11 |
+
boto3==1.34.103
|
search_funcs/helper_functions.py
CHANGED
@@ -37,6 +37,18 @@ default_value = 'output/'
|
|
37 |
output_folder = get_or_create_env_var(env_var_name, default_value)
|
38 |
print(f'The value of {env_var_name} is {output_folder}')
|
39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
# Attempt to delete content of gradio temp folder
|
41 |
def get_temp_folder_path():
|
42 |
username = getpass.getuser()
|
|
|
37 |
output_folder = get_or_create_env_var(env_var_name, default_value)
|
38 |
print(f'The value of {env_var_name} is {output_folder}')
|
39 |
|
40 |
+
def ensure_output_folder_exists(output_folder):
|
41 |
+
"""Checks if the output folder exists, creates it if not."""
|
42 |
+
|
43 |
+
folder_name = output_folder
|
44 |
+
|
45 |
+
if not os.path.exists(folder_name):
|
46 |
+
# Create the folder if it doesn't exist
|
47 |
+
os.makedirs(folder_name)
|
48 |
+
print(f"Created the output folder:", folder_name)
|
49 |
+
else:
|
50 |
+
print(f"The output folder already exists:", folder_name)
|
51 |
+
|
52 |
# Attempt to delete content of gradio temp folder
|
53 |
def get_temp_folder_path():
|
54 |
username = getpass.getuser()
|
search_funcs/semantic_functions.py
CHANGED
@@ -206,6 +206,7 @@ def process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_c
|
|
206 |
|
207 |
results_df_out = results_df_out.drop(["page_section", "row", "source", "id"], axis=1, errors="ignore")
|
208 |
results_df_out['distances'] = round(results_df_out['distances'].astype(float), 3)
|
|
|
209 |
|
210 |
# Join back to original df
|
211 |
# results_df_out = orig_df.merge(length_more_limit[['ids', 'distances']], left_index = True, right_on = "ids", how="inner").sort_values("distances")
|
|
|
206 |
|
207 |
results_df_out = results_df_out.drop(["page_section", "row", "source", "id"], axis=1, errors="ignore")
|
208 |
results_df_out['distances'] = round(results_df_out['distances'].astype(float), 3)
|
209 |
+
|
210 |
|
211 |
# Join back to original df
|
212 |
# results_df_out = orig_df.merge(length_more_limit[['ids', 'distances']], left_index = True, right_on = "ids", how="inner").sort_values("distances")
|