VyLala committed
Commit 909f1fb · verified · 1 Parent(s): a63f173

Update mtdna_backend.py

Files changed (1): mtdna_backend.py (+25 -8)
mtdna_backend.py CHANGED
@@ -22,10 +22,10 @@ import threading
 # def classify_sample_location_cached(accession):
 #     return classify_sample_location(accession)
 
-@lru_cache(maxsize=3600)
-def pipeline_classify_sample_location_cached(accession,stop_flag=None):
+#@lru_cache(maxsize=3600)
+def pipeline_classify_sample_location_cached(accession,stop_flag=None, save_df=None):
     print("inside pipeline_classify_sample_location_cached, and [accession] is ", [accession])
-    return pipeline.pipeline_with_gemini([accession],stop_flag=stop_flag)
+    return pipeline.pipeline_with_gemini([accession],stop_flag=stop_flag, save_df=save_df)
 
 # Count and suggest final location
 # def compute_final_suggested_location(rows):
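
The @lru_cache decorator is commented out rather than deleted, which is consistent with the new signature: functools.lru_cache builds its cache key by hashing every argument, and the new save_df parameter (a pandas DataFrame) is unhashable, so a decorated call would raise TypeError. A minimal sketch of one way to keep memoization under the new signature, assuming results depend only on the accession; the dict cache and this variant of the function are hypothetical, not part of this commit:

# Hypothetical manual cache keyed on the accession string alone;
# stop_flag and save_df deliberately do not enter the cache key.
# Assumes the module-level `pipeline` import already in mtdna_backend.py.
_pipeline_cache = {}

def pipeline_classify_sample_location_cached(accession, stop_flag=None, save_df=None):
    if accession not in _pipeline_cache:
        _pipeline_cache[accession] = pipeline.pipeline_with_gemini(
            [accession], stop_flag=stop_flag, save_df=save_df
        )
    return _pipeline_cache[accession]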
@@ -155,7 +155,22 @@ def summarize_results(accession, stop_flag=None):
     # only run when nothing in the cache
     try:
         print("try gemini pipeline: ",accession)
-        outputs = pipeline_classify_sample_location_cached(accession, stop_flag)
+        # ✅ Load credentials from Hugging Face secret
+        creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
+        scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
+        creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
+        client = gspread.authorize(creds)
+
+        spreadsheet = client.open("known_samples")
+        sheet = spreadsheet.sheet1
+
+        data = sheet.get_all_values()
+        if not data:
+            print("⚠️ Google Sheet 'known_samples' is empty.")
+            return None
+
+        save_df = pd.DataFrame(data[1:], columns=data[0])
+        outputs = pipeline_classify_sample_location_cached(accession, stop_flag, save_df)
         if stop_flag is not None and stop_flag.value:
             print(f"🛑 Skipped {accession} mid-pipeline.")
             return []
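
The replacement block authorizes gspread with a service-account key stored in the GCP_CREDS_JSON secret and feeds the known_samples sheet into the pipeline as a DataFrame; it assumes json, os, gspread, pandas (as pd), and oauth2client's ServiceAccountCredentials are already imported at the top of mtdna_backend.py. A self-contained sketch of the same flow, factored into a hypothetical helper:

import json
import os

import gspread
import pandas as pd
from oauth2client.service_account import ServiceAccountCredentials

def load_known_samples(sheet_name="known_samples"):
    # The service-account JSON key is stored in the GCP_CREDS_JSON secret.
    creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
    scope = ['https://spreadsheets.google.com/feeds',
             'https://www.googleapis.com/auth/drive']
    creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
    client = gspread.authorize(creds)

    data = client.open(sheet_name).sheet1.get_all_values()
    if not data:
        return None  # empty sheet: nothing to seed the pipeline with
    # First row is the header, remaining rows are records.
    return pd.DataFrame(data[1:], columns=data[0])

One design note: as committed, the sheet is re-read and re-authorized on every summarize_results call; hoisting the logic into a helper like the one above would also make it easy to cache the DataFrame.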
@@ -220,7 +235,9 @@ def summarize_results(accession, stop_flag=None):
             "Sample Type Explanation":sample_explanation or "unknown",
             "Sources": "\n".join(outputs[key]["source"]) or "No Links",
             "Query_cost": outputs[key]["query_cost"],
-            "Time cost": outputs[key]["time_cost"]
+            "Time cost": outputs[key]["time_cost"],
+            "file_chunk":outputs[key]["file_chunk"],
+            "file_all_output":outputs[key]["file_all_output"]
         }
         #row_score.append(row)
         save_rows.append(list(save_row.values()))
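
Because the row is persisted positionally via save_rows.append(list(save_row.values())), the insertion order of save_row has to match the df_new column list built later. A hypothetical guard, not in the commit, that would catch drift between the two:

# Hypothetical sanity check: save_row is consumed positionally, so its
# key order must line up with the df_new columns defined below.
expected_cols = [
    "Sample ID", "Predicted Country", "Country Explanation",
    "Predicted Sample Type", "Sample Type Explanation",
    "Sources", "Query_cost", "Time cost", "file_chunk", "file_all_output",
]
assert list(save_row.keys()) == expected_cols, "save_row/df_new column drift"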
@@ -280,7 +297,7 @@ def summarize_results(accession, stop_flag=None):
     df_new = pd.DataFrame(save_rows, columns=[
         "Sample ID", "Predicted Country", "Country Explanation",
         "Predicted Sample Type", "Sample Type Explanation",
-        "Sources", "Query_cost", "Time cost"
+        "Sources", "Query_cost", "Time cost", "file_chunk", "file_all_output"
     ])
 
     # ✅ Setup Google Sheets
@@ -299,7 +316,7 @@ def summarize_results(accession, stop_flag=None):
         df_old = pd.DataFrame(columns=[
             "Sample ID", "Actual_country", "Actual_sample_type", "Country Explanation",
             "Match_country", "Match_sample_type", "Predicted Country", "Predicted Sample Type",
-            "Query_cost", "Sample Type Explanation", "Sources", "Time cost"
+            "Query_cost", "Sample Type Explanation", "Sources", "Time cost", "file_chunk", "file_all_output"
         ])
 
     # ✅ Index by Sample ID
@@ -309,7 +326,7 @@ def summarize_results(accession, stop_flag=None):
     # ✅ Update only matching fields
     update_columns = [
         "Predicted Country", "Predicted Sample Type", "Country Explanation",
-        "Sample Type Explanation", "Sources", "Query_cost", "Time cost"
+        "Sample Type Explanation", "Sources", "Query_cost", "Time cost", "file_chunk", "file_all_output"
     ]
     for idx, row in df_new.iterrows():
         if idx not in df_old.index:
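
The last three hunks append the same two columns, file_chunk and file_all_output, to three hand-maintained lists (the df_new columns, the df_old fallback columns, and update_columns). A hypothetical refactor that would keep future additions in sync is to declare the shared names once at module level:

# Hypothetical module-level constants so the three column lists cannot drift.
EXTRA_COLS = ["file_chunk", "file_all_output"]

NEW_ROW_COLS = [
    "Sample ID", "Predicted Country", "Country Explanation",
    "Predicted Sample Type", "Sample Type Explanation",
    "Sources", "Query_cost", "Time cost",
] + EXTRA_COLS

UPDATE_COLS = [
    "Predicted Country", "Predicted Sample Type", "Country Explanation",
    "Sample Type Explanation", "Sources", "Query_cost", "Time cost",
] + EXTRA_COLS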
 