VyLala committed
Commit 909f1fb · verified · 1 Parent(s): a63f173

Update mtdna_backend.py

Files changed (1): mtdna_backend.py (+25 -8)
mtdna_backend.py CHANGED
@@ -22,10 +22,10 @@ import threading
 # def classify_sample_location_cached(accession):
 #     return classify_sample_location(accession)
 
-@lru_cache(maxsize=3600)
-def pipeline_classify_sample_location_cached(accession,stop_flag=None):
+#@lru_cache(maxsize=3600)
+def pipeline_classify_sample_location_cached(accession,stop_flag=None, save_df=None):
     print("inside pipeline_classify_sample_location_cached, and [accession] is ", [accession])
-    return pipeline.pipeline_with_gemini([accession],stop_flag=stop_flag)
+    return pipeline.pipeline_with_gemini([accession],stop_flag=stop_flag, save_df=save_df)
 
 # Count and suggest final location
 # def compute_final_suggested_location(rows):
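
The @lru_cache decorator is commented out rather than deleted, which is consistent with the new signature: functools.lru_cache builds its cache key by hashing every argument, and the new save_df parameter (a pandas DataFrame) is unhashable, so a decorated call would raise TypeError. A minimal sketch of one way to keep memoization under the new signature, assuming results depend only on the accession; the dict cache and this variant of the function are hypothetical, not part of this commit:

# Hypothetical manual cache keyed on the accession string alone;
# stop_flag and save_df deliberately do not enter the cache key.
# Assumes the module-level `pipeline` import already in mtdna_backend.py.
_pipeline_cache = {}

def pipeline_classify_sample_location_cached(accession, stop_flag=None, save_df=None):
    if accession not in _pipeline_cache:
        _pipeline_cache[accession] = pipeline.pipeline_with_gemini(
            [accession], stop_flag=stop_flag, save_df=save_df
        )
    return _pipeline_cache[accession]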
@@ -155,7 +155,22 @@ def summarize_results(accession, stop_flag=None):
     # only run when nothing in the cache
     try:
         print("try gemini pipeline: ",accession)
-        outputs = pipeline_classify_sample_location_cached(accession, stop_flag)
+        # ✅ Load credentials from Hugging Face secret
+        creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
+        scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
+        creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
+        client = gspread.authorize(creds)
+
+        spreadsheet = client.open("known_samples")
+        sheet = spreadsheet.sheet1
+
+        data = sheet.get_all_values()
+        if not data:
+            print("⚠️ Google Sheet 'known_samples' is empty.")
+            return None
+
+        save_df = pd.DataFrame(data[1:], columns=data[0])
+        outputs = pipeline_classify_sample_location_cached(accession, stop_flag, save_df)
         if stop_flag is not None and stop_flag.value:
             print(f"🛑 Skipped {accession} mid-pipeline.")
             return []
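
The replacement block authorizes gspread with a service-account key stored in the GCP_CREDS_JSON secret and feeds the known_samples sheet into the pipeline as a DataFrame; it assumes json, os, gspread, pandas (as pd), and oauth2client's ServiceAccountCredentials are already imported at the top of mtdna_backend.py. A self-contained sketch of the same flow, factored into a hypothetical helper:

import json
import os

import gspread
import pandas as pd
from oauth2client.service_account import ServiceAccountCredentials

def load_known_samples(sheet_name="known_samples"):
    # The service-account JSON key is stored in the GCP_CREDS_JSON secret.
    creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
    scope = ['https://spreadsheets.google.com/feeds',
             'https://www.googleapis.com/auth/drive']
    creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
    client = gspread.authorize(creds)

    data = client.open(sheet_name).sheet1.get_all_values()
    if not data:
        return None  # empty sheet: nothing to seed the pipeline with
    # First row is the header, remaining rows are records.
    return pd.DataFrame(data[1:], columns=data[0])

One design note: as committed, the sheet is re-read and re-authorized on every summarize_results call; hoisting the logic into a helper like the one above would also make it easy to cache the DataFrame.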
@@ -220,7 +235,9 @@ def summarize_results(accession, stop_flag=None):
             "Sample Type Explanation":sample_explanation or "unknown",
             "Sources": "\n".join(outputs[key]["source"]) or "No Links",
             "Query_cost": outputs[key]["query_cost"],
-            "Time cost": outputs[key]["time_cost"]
+            "Time cost": outputs[key]["time_cost"],
+            "file_chunk":outputs[key]["file_chunk"],
+            "file_all_output":outputs[key]["file_all_output"]
         }
         #row_score.append(row)
         save_rows.append(list(save_row.values()))
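
Because the row is persisted positionally via save_rows.append(list(save_row.values())), the insertion order of save_row has to match the df_new column list built later. A hypothetical guard, not in the commit, that would catch drift between the two:

# Hypothetical sanity check: save_row is consumed positionally, so its
# key order must line up with the df_new columns defined below.
expected_cols = [
    "Sample ID", "Predicted Country", "Country Explanation",
    "Predicted Sample Type", "Sample Type Explanation",
    "Sources", "Query_cost", "Time cost", "file_chunk", "file_all_output",
]
assert list(save_row.keys()) == expected_cols, "save_row/df_new column drift"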
@@ -280,7 +297,7 @@ def summarize_results(accession, stop_flag=None):
     df_new = pd.DataFrame(save_rows, columns=[
         "Sample ID", "Predicted Country", "Country Explanation",
         "Predicted Sample Type", "Sample Type Explanation",
-        "Sources", "Query_cost", "Time cost"
+        "Sources", "Query_cost", "Time cost", "file_chunk", "file_all_output"
     ])
 
     # ✅ Setup Google Sheets
@@ -299,7 +316,7 @@ def summarize_results(accession, stop_flag=None):
         df_old = pd.DataFrame(columns=[
             "Sample ID", "Actual_country", "Actual_sample_type", "Country Explanation",
             "Match_country", "Match_sample_type", "Predicted Country", "Predicted Sample Type",
-            "Query_cost", "Sample Type Explanation", "Sources", "Time cost"
+            "Query_cost", "Sample Type Explanation", "Sources", "Time cost", "file_chunk", "file_all_output"
         ])
 
     # ✅ Index by Sample ID
@@ -309,7 +326,7 @@ def summarize_results(accession, stop_flag=None):
     # ✅ Update only matching fields
     update_columns = [
         "Predicted Country", "Predicted Sample Type", "Country Explanation",
-        "Sample Type Explanation", "Sources", "Query_cost", "Time cost"
+        "Sample Type Explanation", "Sources", "Query_cost", "Time cost", "file_chunk", "file_all_output"
     ]
     for idx, row in df_new.iterrows():
         if idx not in df_old.index:
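
The last three hunks append the same two columns, file_chunk and file_all_output, to three hand-maintained lists (the df_new columns, the df_old fallback columns, and update_columns). A hypothetical refactor that would keep future additions in sync is to declare the shared names once at module level:

# Hypothetical module-level constants so the three column lists cannot drift.
EXTRA_COLS = ["file_chunk", "file_all_output"]

NEW_ROW_COLS = [
    "Sample ID", "Predicted Country", "Country Explanation",
    "Predicted Sample Type", "Sample Type Explanation",
    "Sources", "Query_cost", "Time cost",
] + EXTRA_COLS

UPDATE_COLS = [
    "Predicted Country", "Predicted Sample Type", "Country Explanation",
    "Sample Type Explanation", "Sources", "Query_cost", "Time cost",
] + EXTRA_COLS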
 