seanpedrickcase
committed on
Commit
•
2754a2b
1
Parent(s):
1dc162b
Some package updates and minor changes
Browse files- Dockerfile +1 -1
- README.md +1 -1
- app.py +6 -6
- how_to_create_exe_dist.txt +2 -2
- requirements.txt +4 -3
- requirements_gpu.txt +11 -0
- search_funcs/helper_functions.py +12 -0
- search_funcs/semantic_functions.py +1 -0
Dockerfile
CHANGED
@@ -17,7 +17,7 @@ COPY requirements.txt .
|
|
17 |
RUN pip install --no-cache-dir -r requirements.txt
|
18 |
|
19 |
# Gradio needs to be installed after due to conflict with spacy in requirements
|
20 |
-
RUN pip install --no-cache-dir gradio==4.
|
21 |
|
22 |
# Download the BGE embedding model during the build process
|
23 |
RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash
|
|
|
17 |
RUN pip install --no-cache-dir -r requirements.txt
|
18 |
|
19 |
# Gradio needs to be installed after due to conflict with spacy in requirements
|
20 |
+
RUN pip install --no-cache-dir gradio==4.36.1
|
21 |
|
22 |
# Download the BGE embedding model during the build process
|
23 |
RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash
|
README.md
CHANGED
@@ -4,7 +4,7 @@ emoji: π
|
|
4 |
colorFrom: purple
|
5 |
colorTo: green
|
6 |
sdk: gradio
|
7 |
-
sdk_version: 4.
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: apache-2.0
|
|
|
4 |
colorFrom: purple
|
5 |
colorTo: green
|
6 |
sdk: gradio
|
7 |
+
sdk_version: 4.36.0
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: apache-2.0
|
app.py
CHANGED
@@ -183,12 +183,12 @@ depends on factors such as the type of documents or queries. Information taken f
|
|
183 |
in_join_file.change(put_columns_in_join_df, inputs=[in_join_file], outputs=[in_join_column, join_data_state, in_join_message])
|
184 |
|
185 |
# Load in BM25 data
|
186 |
-
load_bm25_data_button.click(fn=prepare_bm25_input_data, inputs=[in_bm25_file, in_bm25_column, keyword_data_state, tokenised_state, in_clean_data, return_intermediate_files], outputs=[corpus_state, load_finished_message, keyword_data_state, output_file, output_file, keyword_data_list_state, in_bm25_column]).\
|
187 |
-
then(fn=prepare_bm25, inputs=[corpus_state, in_bm25_file, in_bm25_column, search_index_state, in_clean_data, return_intermediate_files, in_k1, in_b, in_alpha], outputs=[load_finished_message, output_file])#.\
|
188 |
|
189 |
|
190 |
# BM25 search functions on click or enter
|
191 |
-
keyword_search_button.click(fn=bm25_search, inputs=[keyword_query, in_no_search_results, orig_keyword_data_state, keyword_data_state, in_bm25_column, join_data_state, in_clean_data, in_join_column, search_df_join_column], outputs=[output_single_text, output_file], api_name="
|
192 |
keyword_query.submit(fn=bm25_search, inputs=[keyword_query, in_no_search_results, orig_keyword_data_state, keyword_data_state, in_bm25_column, join_data_state, in_clean_data, in_join_column, search_df_join_column], outputs=[output_single_text, output_file])
|
193 |
|
194 |
# Fuzzy search functions on click
|
@@ -209,15 +209,15 @@ depends on factors such as the type of documents or queries. Information taken f
|
|
209 |
# Simple run for HF spaces or local on your computer
|
210 |
#block.queue().launch(debug=True)
|
211 |
|
212 |
-
#def get_params(request: gr.Request):
|
213 |
# if request:
|
214 |
# print("Request headers dictionary:", request.headers)
|
215 |
# print("IP address:", request.client.host)
|
216 |
# print("Query parameters:", dict(request.query_params))
|
217 |
# return request.query_params
|
218 |
|
219 |
-
#request_params = get_params()
|
220 |
-
#print(request_params)
|
221 |
|
222 |
# Running on server (e.g. AWS) without specifying port
|
223 |
block.queue().launch(ssl_verify=False) # root_path="/data-text-search" # server_name="0.0.0.0",
|
|
|
183 |
in_join_file.change(put_columns_in_join_df, inputs=[in_join_file], outputs=[in_join_column, join_data_state, in_join_message])
|
184 |
|
185 |
# Load in BM25 data
|
186 |
+
load_bm25_data_button.click(fn=prepare_bm25_input_data, inputs=[in_bm25_file, in_bm25_column, keyword_data_state, tokenised_state, in_clean_data, return_intermediate_files], outputs=[corpus_state, load_finished_message, keyword_data_state, output_file, output_file, keyword_data_list_state, in_bm25_column], api_name="load_keyword").\
|
187 |
+
then(fn=prepare_bm25, inputs=[corpus_state, in_bm25_file, in_bm25_column, search_index_state, in_clean_data, return_intermediate_files, in_k1, in_b, in_alpha], outputs=[load_finished_message, output_file], api_name="prepare_keyword")#.\
|
188 |
|
189 |
|
190 |
# BM25 search functions on click or enter
|
191 |
+
keyword_search_button.click(fn=bm25_search, inputs=[keyword_query, in_no_search_results, orig_keyword_data_state, keyword_data_state, in_bm25_column, join_data_state, in_clean_data, in_join_column, search_df_join_column], outputs=[output_single_text, output_file], api_name="keyword_search")
|
192 |
keyword_query.submit(fn=bm25_search, inputs=[keyword_query, in_no_search_results, orig_keyword_data_state, keyword_data_state, in_bm25_column, join_data_state, in_clean_data, in_join_column, search_df_join_column], outputs=[output_single_text, output_file])
|
193 |
|
194 |
# Fuzzy search functions on click
|
|
|
209 |
# Simple run for HF spaces or local on your computer
|
210 |
#block.queue().launch(debug=True)
|
211 |
|
212 |
+
# def get_params(request: gr.Request):
|
213 |
# if request:
|
214 |
# print("Request headers dictionary:", request.headers)
|
215 |
# print("IP address:", request.client.host)
|
216 |
# print("Query parameters:", dict(request.query_params))
|
217 |
# return request.query_params
|
218 |
|
219 |
+
# request_params = get_params()
|
220 |
+
# print(request_params)
|
221 |
|
222 |
# Running on server (e.g. AWS) without specifying port
|
223 |
block.queue().launch(ssl_verify=False) # root_path="/data-text-search" # server_name="0.0.0.0",
|
how_to_create_exe_dist.txt
CHANGED
@@ -14,7 +14,7 @@ NOTE: for ensuring that spaCy models are loaded into the program correctly in re
|
|
14 |
|
15 |
9.Run the following, assuming you want to make one single .exe file (This helped me: https://github.com/pyinstaller/pyinstaller/issues/8108):
|
16 |
|
17 |
-
a) In command line: pyi-makespec --additional-hooks-dir="build_deps\\" --collect-data=gradio_client --collect-data=gradio --hidden-import pyarrow.vendored.version --name DataSearchApp_0.
|
18 |
|
19 |
# Add --onefile to the above if you would like everything packaged as a single exe, although this will need to be extracted upon starting the app, slowing down initialisation time significantly.
|
20 |
|
@@ -28,7 +28,7 @@ a = Analysis(
|
|
28 |
}
|
29 |
)
|
30 |
|
31 |
-
c) Back in command line, run this: pyinstaller --clean --noconfirm DataSearchApp_0.
|
32 |
|
33 |
|
34 |
9. A 'dist' folder will be created with the executable inside along with all dependencies('dist\data_text_search').
|
|
|
14 |
|
15 |
9.Run the following, assuming you want to make one single .exe file (This helped me: https://github.com/pyinstaller/pyinstaller/issues/8108):
|
16 |
|
17 |
+
a) In command line: pyi-makespec --additional-hooks-dir="build_deps\\" --collect-data=gradio_client --collect-data=gradio --hidden-import pyarrow.vendored.version --name DataSearchApp_0.5 app.py
|
18 |
|
19 |
# Add --onefile to the above if you would like everything packaged as a single exe, although this will need to be extracted upon starting the app, slowing down initialisation time significantly.
|
20 |
|
|
|
28 |
}
|
29 |
)
|
30 |
|
31 |
+
c) Back in command line, run this: pyinstaller --clean --noconfirm DataSearchApp_0.5.spec
|
32 |
|
33 |
|
34 |
9. A 'dist' folder will be created with the executable inside along with all dependencies('dist\data_text_search').
|
requirements.txt
CHANGED
@@ -2,10 +2,11 @@ pandas==2.2.2
|
|
2 |
polars==0.20.3
|
3 |
pyarrow==14.0.2
|
4 |
openpyxl==3.1.2
|
5 |
-
torch==2.1
|
|
|
6 |
spacy
|
7 |
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
|
8 |
gradio
|
9 |
-
sentence_transformers==
|
10 |
lxml==5.1.0
|
11 |
-
boto3==1.34.103
|
|
|
2 |
polars==0.20.3
|
3 |
pyarrow==14.0.2
|
4 |
openpyxl==3.1.2
|
5 |
+
torch==2.3.1
|
6 |
+
transformers==4.41.2
|
7 |
spacy
|
8 |
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
|
9 |
gradio
|
10 |
+
sentence_transformers==3.0.1
|
11 |
lxml==5.1.0
|
12 |
+
boto3==1.34.103
|
requirements_gpu.txt
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
pandas==2.2.2
|
2 |
+
polars==0.20.3
|
3 |
+
pyarrow==14.0.2
|
4 |
+
openpyxl==3.1.2
|
5 |
+
torch==2.3.1 --index-url https://download.pytorch.org/whl/cu121
|
6 |
+
spacy
|
7 |
+
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
|
8 |
+
gradio
|
9 |
+
sentence_transformers==2.3.1
|
10 |
+
lxml==5.1.0
|
11 |
+
boto3==1.34.103
|
search_funcs/helper_functions.py
CHANGED
@@ -37,6 +37,18 @@ default_value = 'output/'
|
|
37 |
output_folder = get_or_create_env_var(env_var_name, default_value)
|
38 |
print(f'The value of {env_var_name} is {output_folder}')
|
39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
# Attempt to delete content of gradio temp folder
|
41 |
def get_temp_folder_path():
|
42 |
username = getpass.getuser()
|
|
|
37 |
output_folder = get_or_create_env_var(env_var_name, default_value)
|
38 |
print(f'The value of {env_var_name} is {output_folder}')
|
39 |
|
40 |
+
def ensure_output_folder_exists(output_folder):
|
41 |
+
"""Checks if the output folder exists, creates it if not."""
|
42 |
+
|
43 |
+
folder_name = output_folder
|
44 |
+
|
45 |
+
if not os.path.exists(folder_name):
|
46 |
+
# Create the folder if it doesn't exist
|
47 |
+
os.makedirs(folder_name)
|
48 |
+
print(f"Created the output folder:", folder_name)
|
49 |
+
else:
|
50 |
+
print(f"The output folder already exists:", folder_name)
|
51 |
+
|
52 |
# Attempt to delete content of gradio temp folder
|
53 |
def get_temp_folder_path():
|
54 |
username = getpass.getuser()
|
search_funcs/semantic_functions.py
CHANGED
@@ -206,6 +206,7 @@ def process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_c
|
|
206 |
|
207 |
results_df_out = results_df_out.drop(["page_section", "row", "source", "id"], axis=1, errors="ignore")
|
208 |
results_df_out['distances'] = round(results_df_out['distances'].astype(float), 3)
|
|
|
209 |
|
210 |
# Join back to original df
|
211 |
# results_df_out = orig_df.merge(length_more_limit[['ids', 'distances']], left_index = True, right_on = "ids", how="inner").sort_values("distances")
|
|
|
206 |
|
207 |
results_df_out = results_df_out.drop(["page_section", "row", "source", "id"], axis=1, errors="ignore")
|
208 |
results_df_out['distances'] = round(results_df_out['distances'].astype(float), 3)
|
209 |
+
|
210 |
|
211 |
# Join back to original df
|
212 |
# results_df_out = orig_df.merge(length_more_limit[['ids', 'distances']], left_index = True, right_on = "ids", how="inner").sort_values("distances")
|