Update mtdna_backend.py
mtdna_backend.py  CHANGED  (+25, -8)
@@ -22,10 +22,10 @@ import threading
 # def classify_sample_location_cached(accession):
 # return classify_sample_location(accession)
 
-
-def pipeline_classify_sample_location_cached(accession,stop_flag=None):
+#@lru_cache(maxsize=3600)
+def pipeline_classify_sample_location_cached(accession,stop_flag=None, save_df=None):
     print("inside pipeline_classify_sample_location_cached, and [accession] is ", [accession])
-    return pipeline.pipeline_with_gemini([accession],stop_flag=stop_flag)
+    return pipeline.pipeline_with_gemini([accession],stop_flag=stop_flag, save_df=save_df)
 
 # Count and suggest final location
 # def compute_final_suggested_location(rows):
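A note on the still-commented #@lru_cache(maxsize=3600): functools.lru_cache hashes every argument, and the new save_df (a pandas DataFrame) and a shared stop_flag object are not hashable, so enabling the decorator as-is would raise TypeError as soon as a DataFrame is passed. A minimal sketch of a workaround that caches on the accession string alone (the helper and cache names here are hypothetical):

import pipeline  # the Space's own pipeline module, as imported in this file

_pipeline_cache = {}  # hypothetical module-level cache keyed by accession

def pipeline_cached(accession, stop_flag=None, save_df=None):
    # Only the hashable accession string is used as the cache key; the
    # unhashable stop_flag and save_df are passed through on a cache miss.
    if accession not in _pipeline_cache:
        _pipeline_cache[accession] = pipeline.pipeline_with_gemini(
            [accession], stop_flag=stop_flag, save_df=save_df)
    return _pipeline_cache[accession]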
@@ -155,7 +155,22 @@ def summarize_results(accession, stop_flag=None):
     # only run when nothing in the cache
     try:
         print("try gemini pipeline: ",accession)
-        outputs = pipeline_classify_sample_location_cached(accession, stop_flag)
+        # ✅ Load credentials from Hugging Face secret
+        creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
+        scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
+        creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
+        client = gspread.authorize(creds)
+
+        spreadsheet = client.open("known_samples")
+        sheet = spreadsheet.sheet1
+
+        data = sheet.get_all_values()
+        if not data:
+            print("⚠️ Google Sheet 'known_samples' is empty.")
+            return None
+
+        save_df = pd.DataFrame(data[1:], columns=data[0])
+        outputs = pipeline_classify_sample_location_cached(accession, stop_flag, save_df)
         if stop_flag is not None and stop_flag.value:
             print(f"🛑 Skipped {accession} mid-pipeline.")
             return []
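Context on the block added above: it re-authorizes against Google and re-downloads the entire known_samples sheet on every call to summarize_results. A sketch of hoisting that work into a lazily initialized helper so repeated accessions reuse one client and one DataFrame (the helper name is hypothetical; the gspread/oauth2client calls are the ones the diff already uses):

import json, os
import gspread
import pandas as pd
from oauth2client.service_account import ServiceAccountCredentials

_known_samples_df = None  # hypothetical module-level cache

def load_known_samples():
    global _known_samples_df
    if _known_samples_df is None:
        creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
        scope = ['https://spreadsheets.google.com/feeds',
                 'https://www.googleapis.com/auth/drive']
        creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
        client = gspread.authorize(creds)
        data = client.open("known_samples").sheet1.get_all_values()
        if data:
            _known_samples_df = pd.DataFrame(data[1:], columns=data[0])
    return _known_samples_df  # stays None when the sheet is empty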
@@ -220,7 +235,9 @@ def summarize_results(accession, stop_flag=None):
             "Sample Type Explanation":sample_explanation or "unknown",
             "Sources": "\n".join(outputs[key]["source"]) or "No Links",
             "Query_cost": outputs[key]["query_cost"],
-            "Time cost": outputs[key]["time_cost"]
+            "Time cost": outputs[key]["time_cost"],
+            "file_chunk":outputs[key]["file_chunk"],
+            "file_all_output":outputs[key]["file_all_output"]
         }
         #row_score.append(row)
         save_rows.append(list(save_row.values()))
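The two lines added above index outputs[key] directly, so a pipeline result that lacks file_chunk or file_all_output (for example, an older cached entry) would raise KeyError. A defensive sketch using dict.get, assuming the same dict-shaped outputs as in the diff:

entry = {"time_cost": "12s"}  # example entry missing the two new keys
save_row_tail = {
    "Time cost": entry.get("time_cost", "unknown"),
    "file_chunk": entry.get("file_chunk", ""),         # default when absent
    "file_all_output": entry.get("file_all_output", ""),
}
print(save_row_tail)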
@@ -280,7 +297,7 @@ def summarize_results(accession, stop_flag=None):
     df_new = pd.DataFrame(save_rows, columns=[
         "Sample ID", "Predicted Country", "Country Explanation",
         "Predicted Sample Type", "Sample Type Explanation",
-        "Sources", "Query_cost", "Time cost"
+        "Sources", "Query_cost", "Time cost", "file_chunk", "file_all_output"
     ])
 
     # ✅ Setup Google Sheets
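Because save_row is flattened with list(save_row.values()) and then re-labelled by the column list above, the dict's insertion order must stay in lockstep with these columns; the new keys were appended last in both places, which preserves that. A sketch of a drift-proof alternative that lets pandas align values by key instead (example data only):

import pandas as pd

save_rows = [
    {"Sample ID": "KU131308", "Time cost": "12s", "file_chunk": "chunk_0.txt"},
]
# Building from dicts keeps each value under its own key, so reordering a
# separate column list can no longer silently shift values between columns.
df_new = pd.DataFrame(save_rows)
print(df_new)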
@@ -299,7 +316,7 @@ def summarize_results(accession, stop_flag=None):
         df_old = pd.DataFrame(columns=[
             "Sample ID", "Actual_country", "Actual_sample_type", "Country Explanation",
             "Match_country", "Match_sample_type", "Predicted Country", "Predicted Sample Type",
-            "Query_cost", "Sample Type Explanation", "Sources", "Time cost"
+            "Query_cost", "Sample Type Explanation", "Sources", "Time cost", "file_chunk", "file_all_output"
         ])
 
     # ✅ Index by Sample ID
@@ -309,7 +326,7 @@ def summarize_results(accession, stop_flag=None):
     # ✅ Update only matching fields
     update_columns = [
         "Predicted Country", "Predicted Sample Type", "Country Explanation",
-        "Sample Type Explanation", "Sources", "Query_cost", "Time cost"
+        "Sample Type Explanation", "Sources", "Query_cost", "Time cost", "file_chunk", "file_all_output"
     ]
     for idx, row in df_new.iterrows():
         if idx not in df_old.index:
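The diff ends inside the iterrows loop. Since df_old orders the shared columns differently from df_new, assigning by name through update_columns is what keeps this safe; a plausible completion of the pattern (a sketch with example data, not the file's actual code):

import pandas as pd

update_columns = ["Predicted Country", "Time cost"]  # trimmed for the sketch

df_old = pd.DataFrame({"Time cost": ["old"], "Predicted Country": ["unknown"]},
                      index=pd.Index(["KU131308"], name="Sample ID"))
df_new = pd.DataFrame({"Predicted Country": ["Vietnam", "Philippines"],
                       "Time cost": ["12s", "9s"]},
                      index=pd.Index(["KU131308", "MN027286"], name="Sample ID"))

for idx, row in df_new.iterrows():
    # .loc creates the row when idx is new and otherwise overwrites only the
    # named columns, so curated fields such as Actual_country survive updates.
    df_old.loc[idx, update_columns] = row[update_columns].values

print(df_old)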