Update pipeline.py
pipeline.py  CHANGED  (+5 -3)
@@ -150,6 +150,7 @@ def pipeline_with_gemini(accessions):
     meta = mtdna_classifier.fetch_ncbi_metadata(acc)
     country, spe_loc, ethnic, sample_type, col_date, iso, title, doi, pudID, features = meta["country"], meta["specific_location"], meta["ethnicity"], meta["sample_type"], meta["collection_date"], meta["isolate"], meta["title"], meta["doi"], meta["pubmed_id"], meta["all_features"]
     acc_score["isolate"] = iso
+    print(meta)
     # set up step: create the folder to save document
     chunk, all_output = "",""
     if pudID:
@@ -173,7 +174,7 @@
     # Define document names
     chunk_filename = f"{saveTitle}_merged_document.docx"
     all_filename = f"{saveTitle}_all_merged_document.docx"
-
+    print(chunk_filename, all_filename)
     # Define local temp paths for reading/writing
     import tempfile
     tmp_dir = tempfile.mkdtemp()
@@ -183,7 +184,7 @@
     # Try to download if already exists on Drive
     chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path)
     all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
-
+    print("chunk exist: ", chunk_exists)
     # first way: ncbi method
     if country.lower() != "unknown":
         stand_country = standardize_location.smart_country_lookup(country.lower())
@@ -227,7 +228,7 @@
     links.append(link)
     if jsonSM:
         links += sum((jsonSM[key] for key in jsonSM),[])
-
+    print("this is links: ",links)
     links = unique_preserve_order(links)
     acc_score["source"] = links
     # chunk_path = "/"+saveTitle+"_merged_document.docx"
@@ -256,6 +257,7 @@
     text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
     all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
     if not chunk and not all_output:
+        print("not chunk and all output")
         # else: check if we can reuse these chunk and all output of existed accession to find another
         if links:
             for link in links:
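Note: the new print("this is links: ",links) lands just before the call to unique_preserve_order(links), which de-duplicates the collected source links while keeping their original order. That helper is defined elsewhere in the repository and is not part of this change; a minimal sketch of what such a function typically looks like, assuming the links are hashable strings, is:

def unique_preserve_order(items):
    # Keep the first occurrence of each item and drop later duplicates,
    # so the priority order of the collected links is preserved.
    seen = set()
    ordered = []
    for item in items:
        if item not in seen:
            seen.add(item)
            ordered.append(item)
    return ordered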