seanpedrickcase
committed on
Commit
•
651ef78
1
Parent(s):
30b5dc1
Added additional password auth for AWS-based files. Changed 'Clean' default to no
Browse files
- .gitignore +3 -1
- app.py +3 -2
- search_funcs/aws_functions.py +36 -33
.gitignore
CHANGED
@@ -17,6 +17,7 @@
|
|
17 |
*.pkl.gz
|
18 |
*.pem
|
19 |
*.json.out
|
|
|
20 |
docs/*
|
21 |
build/*
|
22 |
dist/*
|
@@ -25,4 +26,5 @@ db/*
|
|
25 |
experiments/*
|
26 |
model/*
|
27 |
build_deps/*
|
28 |
-
build_deps_old/*
|
|
|
|
17 |
*.pkl.gz
|
18 |
*.pem
|
19 |
*.json.out
|
20 |
+
*.env
|
21 |
docs/*
|
22 |
build/*
|
23 |
dist/*
|
|
|
26 |
experiments/*
|
27 |
model/*
|
28 |
build_deps/*
|
29 |
+
build_deps_old/*
|
30 |
+
AWS errors 19-03.txt
|
app.py
CHANGED
@@ -157,6 +157,7 @@ depends on factors such as the type of documents or queries. Information taken f
|
|
157 |
in_join_column = gr.Dropdown(label="Column to join in new data frame")
|
158 |
search_df_join_column = gr.Dropdown(label="Column to join in search data frame")
|
159 |
with gr.Accordion(label = "AWS data access", open = False):
|
|
|
160 |
with gr.Row():
|
161 |
in_aws_keyword_file = gr.Dropdown(label="Choose keyword file to load from AWS (only valid for API Gateway app)", choices=["None", "Bioasq - Biomedical example data - keyword search"])
|
162 |
load_aws_keyword_data_button = gr.Button(value="Load keyword data from AWS", variant="secondary")
|
@@ -176,8 +177,8 @@ depends on factors such as the type of documents or queries. Information taken f
|
|
176 |
in_no_search_results_button.click(display_info, inputs=in_no_search_info)
|
177 |
|
178 |
### Loading AWS data ###
|
179 |
-
load_aws_keyword_data_button.click(fn=load_data_from_aws, inputs=[in_aws_keyword_file], outputs=[in_bm25_file, out_aws_data_message])
|
180 |
-
load_aws_semantic_data_button.click(fn=load_data_from_aws, inputs=[in_aws_semantic_file], outputs=[in_semantic_file, out_aws_data_message])
|
181 |
|
182 |
|
183 |
### BM25 SEARCH ###
|
|
|
157 |
in_join_column = gr.Dropdown(label="Column to join in new data frame")
|
158 |
search_df_join_column = gr.Dropdown(label="Column to join in search data frame")
|
159 |
with gr.Accordion(label = "AWS data access", open = False):
|
160 |
+
aws_password_box = gr.Textbox(label="Password for AWS data access (ask Data team if you don't have this)")
|
161 |
with gr.Row():
|
162 |
in_aws_keyword_file = gr.Dropdown(label="Choose keyword file to load from AWS (only valid for API Gateway app)", choices=["None", "Bioasq - Biomedical example data - keyword search"])
|
163 |
load_aws_keyword_data_button = gr.Button(value="Load keyword data from AWS", variant="secondary")
|
|
|
177 |
in_no_search_results_button.click(display_info, inputs=in_no_search_info)
|
178 |
|
179 |
### Loading AWS data ###
|
180 |
+
load_aws_keyword_data_button.click(fn=load_data_from_aws, inputs=[in_aws_keyword_file, aws_password_box], outputs=[in_bm25_file, out_aws_data_message])
|
181 |
+
load_aws_semantic_data_button.click(fn=load_data_from_aws, inputs=[in_aws_semantic_file, aws_password_box], outputs=[in_semantic_file, out_aws_data_message])
|
182 |
|
183 |
|
184 |
### BM25 SEARCH ###
|
search_funcs/aws_functions.py
CHANGED
@@ -106,60 +106,63 @@ def download_files_from_s3(bucket_name, s3_folder, local_folder, filenames):
|
|
106 |
|
107 |
|
108 |
|
109 |
-
def load_data_from_aws(in_aws_keyword_file, bucket_name=bucket_name):
|
110 |
|
111 |
temp_dir = tempfile.mkdtemp()
|
112 |
local_keyword_stub = temp_dir + '/keyword/'
|
113 |
local_semantic_stub = temp_dir + '/semantic/'
|
114 |
|
115 |
files = []
|
|
|
|
|
116 |
|
117 |
-
|
118 |
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
s3_folder_stub = s3_folder_stub + 'keyword/'
|
123 |
-
local_folder_path = local_keyword_stub
|
124 |
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
|
134 |
-
|
135 |
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
|
148 |
-
|
149 |
|
150 |
-
|
151 |
-
|
152 |
|
153 |
-
|
154 |
-
|
155 |
|
156 |
-
|
157 |
|
158 |
-
|
159 |
-
|
160 |
|
|
|
|
|
|
|
161 |
else:
|
162 |
-
out_message = "
|
163 |
print(out_message)
|
164 |
|
165 |
return files, out_message
|
|
|
106 |
|
107 |
|
108 |
|
109 |
+
def load_data_from_aws(in_aws_keyword_file, aws_password="", bucket_name=bucket_name):
|
110 |
|
111 |
temp_dir = tempfile.mkdtemp()
|
112 |
local_keyword_stub = temp_dir + '/keyword/'
|
113 |
local_semantic_stub = temp_dir + '/semantic/'
|
114 |
|
115 |
files = []
|
116 |
+
if aws_password:
|
117 |
+
if "Bioasq - Biomedical example data" in in_aws_keyword_file and aws_password == os.environ['BIOASQ_PASSWORD']:
|
118 |
|
119 |
+
s3_folder_stub = 'example_data/bioasq/latest/'
|
120 |
|
121 |
+
if 'keyword' in in_aws_keyword_file:
|
122 |
+
s3_folder_stub = s3_folder_stub + 'keyword/'
|
123 |
+
local_folder_path = local_keyword_stub
|
|
|
|
|
124 |
|
125 |
+
if 'semantic' in in_aws_keyword_file:
|
126 |
+
s3_folder_stub = s3_folder_stub + 'semantic/'
|
127 |
+
local_folder_path = local_semantic_stub
|
128 |
+
|
129 |
|
130 |
+
# Check if folder exists
|
131 |
+
if not os.path.exists(local_folder_path):
|
132 |
+
print(f"Folder {local_folder_path} does not exist! Making folder.")
|
133 |
|
134 |
+
os.mkdir(local_folder_path)
|
135 |
|
136 |
+
# Check if folder is empty
|
137 |
+
if len(os.listdir(local_folder_path)) == 0:
|
138 |
+
print(f"Folder {local_folder_path} is empty")
|
139 |
|
140 |
+
if 'keyword' in in_aws_keyword_file:
|
141 |
+
# Download keyword folder
|
142 |
+
download_files_from_s3(bucket_name, s3_folder_stub, local_folder_path, filenames='*')
|
143 |
|
144 |
+
if 'semantic' in in_aws_keyword_file:
|
145 |
+
# Download keyword folder
|
146 |
+
download_files_from_s3(bucket_name, s3_folder_stub, local_folder_path, filenames=['mini-bioasq-0000_cleaned_bge_embedding_compress.npz', 'mini-bioasq-0000_cleaned_prepared_docs.pkl.gz'])
|
147 |
|
148 |
+
print("AWS data downloaded")
|
149 |
|
150 |
+
else:
|
151 |
+
print(f"Folder {local_folder_path} is not empty")
|
152 |
|
153 |
+
#files = os.listdir(local_folder_stub)
|
154 |
+
#print(files)
|
155 |
|
156 |
+
files = [os.path.join(local_folder_path, f) for f in os.listdir(local_folder_path) if os.path.isfile(os.path.join(local_folder_path, f))]
|
157 |
|
158 |
+
out_message = "Data successfully loaded from AWS"
|
159 |
+
print(out_message)
|
160 |
|
161 |
+
else:
|
162 |
+
out_message = "Data not loaded from AWS"
|
163 |
+
print(out_message)
|
164 |
else:
|
165 |
+
out_message = "No password provided. Please ask the data team for access if you need this."
|
166 |
print(out_message)
|
167 |
|
168 |
return files, out_message
|