Update pipeline.py
pipeline.py  CHANGED  +47 -10
@@ -187,7 +187,7 @@ def unique_preserve_order(seq):
     seen = set()
     return [x for x in seq if not (x in seen or seen.add(x))]
 # Main execution
-def pipeline_with_gemini(accessions):
+def pipeline_with_gemini(accessions,niche_cases=None):
   # output: country, sample_type, ethnic, location, money_cost, time_cost, explain
   # there can be one accession number in the accessions
   # Prices are per 1,000 tokens
@@ -213,6 +213,10 @@ def pipeline_with_gemini(accessions):
                  "query_cost":total_cost_title,
                  "time_cost":None,
                  "source":links}
+    if niche_cases:
+      for niche in niche_cases:
+        acc_score[niche] = {}
+
     meta = mtdna_classifier.fetch_ncbi_metadata(acc)
     country, spe_loc, ethnic, sample_type, col_date, iso, title, doi, pudID, features = meta["country"], meta["specific_location"], meta["ethnicity"], meta["sample_type"], meta["collection_date"], meta["isolate"], meta["title"], meta["doi"], meta["pubmed_id"], meta["all_features"]
     acc_score["isolate"] = iso
@@ -350,7 +354,15 @@ def pipeline_with_gemini(accessions):
       print("tem link before filtering: ", tem_links)
       # filter the quality link
       print("saveLinkFolder as sample folder id: ", sample_folder_id)
-      links = smart_fallback.filter_links_by_metadata(tem_links, saveLinkFolder=sample_folder_id, accession=acc)
+      print("start the smart filter link")
+      success_process, output_process = run_with_timeout(smart_fallback.filter_links_by_metadata,args=(tem_links,sample_folder_id),kwargs={"accession":acc},timeout=100)
+      if success_process:
+        links = output_process
+        print("yes succeed for smart filter link")
+      else:
+        print("no suceed, fallback to all tem links")
+        links = tem_links
+      #links = smart_fallback.filter_links_by_metadata(tem_links, saveLinkFolder=sample_folder_id, accession=acc)
       print("this is links: ",links)
       links = unique_preserve_order(links)
       acc_score["source"] = links
@@ -419,15 +431,26 @@ def pipeline_with_gemini(accessions):
         final_input_link = data_preprocess.normalize_for_overlap(final_input_link)
         if len(final_input_link) > 1000 *1000:
           final_input_link = final_input_link[:100000]
-        if len(data_preprocess.normalize_for_overlap(all_output)) < int(100000):
-          success, the_output = run_with_timeout(data_preprocess.merge_texts_skipping_overlap,args=(all_output, final_input_link),timeout=30)
+        if len(data_preprocess.normalize_for_overlap(all_output)) < int(100000) and len(final_input_link)<100000:
+          print("Running merge_texts_skipping_overlap with timeout")
+          success, the_output = run_with_timeout(data_preprocess.merge_texts_skipping_overlap,args=(all_output, final_input_link),timeout=30)
+          print("Returned from timeout logic")
           if success:
             all_output = the_output#data_preprocess.merge_texts_skipping_overlap(all_output, final_input_link)
             print("yes succeed")
           else:
+            print("len all output: ", len(all_output))
+            print("len final input link: ", len(final_input_link))
            all_output += final_input_link
            print("len final input: ", len(final_input_link))
            print("basic fall back")
+        else:
+          print("both/either all output or final link too large more than 100000")
+          print("len all output: ", len(all_output))
+          print("len final input link: ", len(final_input_link))
+          all_output += final_input_link
+          print("len final input: ", len(final_input_link))
+          print("basic fall back")
         print("len all output after: ", len(all_output))
         #country_pro, chunk, all_output = data_preprocess.process_inputToken(links, saveLinkFolder, accession=accession, isolate=isolate)
 
@@ -552,12 +575,19 @@ def pipeline_with_gemini(accessions):
                                            model.call_llm_api, chunk=chunk, all_output=all_output)
         print("country using ai: ", country)
         print("sample type using ai: ", sample_type)
+        # if len(country) == 0: country = "unknown"
+        # if len(sample_type) == 0: sample_type = "unknown"
+        # if country_explanation: country_explanation = "-"+country_explanation
+        # else: country_explanation = ""
+        # if sample_type_explanation: sample_type_explanation = "-"+sample_type_explanation
+        # else: sample_type_explanation = ""
         if len(country) == 0: country = "unknown"
         if len(sample_type) == 0: sample_type = "unknown"
-        if country_explanation: country_explanation = "-"+country_explanation
+        if country_explanation and country_explanation!="unknown": country_explanation = "-"+country_explanation
         else: country_explanation = ""
-        if sample_type_explanation: sample_type_explanation = "-"+sample_type_explanation
+        if sample_type_explanation and sample_type_explanation!="unknown": sample_type_explanation = "-"+sample_type_explanation
         else: sample_type_explanation = ""
+
         if method_used == "unknown": method_used = ""
         if country.lower() != "unknown":
           stand_country = standardize_location.smart_country_lookup(country.lower())
@@ -592,8 +622,9 @@ def pipeline_with_gemini(accessions):
        else:
          if len(method_used + sample_type_explanation)> 0:
            acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation]
+      total_cost_title += total_query_cost
      # last resort: combine all information to give all output otherwise unknown
-      if len(acc_score["country"]) == 0 or len(acc_score["sample_type"]) == 0:
+      if len(acc_score["country"]) == 0 or len(acc_score["sample_type"]) == 0 or acc_score["country"] == "unknown" or acc_score["sample_type"] == "unknown":
        text = ""
        for key in meta_expand:
          text += str(key) + ": " + meta_expand[key] + "\n"
@@ -612,10 +643,15 @@ def pipeline_with_gemini(accessions):
         print("sample type: ", sample_type)
         if len(country) == 0: country = "unknown"
         if len(sample_type) == 0: sample_type = "unknown"
-        if country_explanation: country_explanation = "-"+country_explanation
+        # if country_explanation: country_explanation = "-"+country_explanation
+        # else: country_explanation = ""
+        # if sample_type_explanation: sample_type_explanation = "-"+sample_type_explanation
+        # else: sample_type_explanation = ""
+        if country_explanation and country_explanation!="unknown": country_explanation = "-"+country_explanation
         else: country_explanation = ""
-        if sample_type_explanation: sample_type_explanation = "-"+sample_type_explanation
+        if sample_type_explanation and sample_type_explanation!="unknown": sample_type_explanation = "-"+sample_type_explanation
         else: sample_type_explanation = ""
+
         if method_used == "unknown": method_used = ""
         if country.lower() != "unknown":
           stand_country = standardize_location.smart_country_lookup(country.lower())
@@ -640,8 +676,9 @@ def pipeline_with_gemini(accessions):
       else:
         if len(method_used + sample_type_explanation)> 0:
           acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation]
+        total_cost_title += total_query_cost
     end = time.time()
-    total_cost_title += total_query_cost
+    #total_cost_title += total_query_cost
     acc_score["query_cost"] = f"{total_cost_title:.6f}"
     elapsed = end - start
     acc_score["time_cost"] = f"{elapsed:.3f} seconds"
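
Note on run_with_timeout: the new branches above unpack a (success, result) pair from run_with_timeout(func, args=..., kwargs=..., timeout=...), a helper that lives elsewhere in this Space and is not part of this diff. A minimal sketch of a helper with that contract, assuming a thread-based timeout; the names and behaviour here are inferred from the call sites, not taken from the repository:

import concurrent.futures

def run_with_timeout(func, args=(), kwargs=None, timeout=30):
    # Run func(*args, **kwargs) in a worker thread and wait at most `timeout` seconds.
    # Return (True, result) on success and (False, None) on timeout or error, which is
    # the (success, output) shape the call sites above unpack.
    kwargs = kwargs or {}
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
    future = executor.submit(func, *args, **kwargs)
    try:
        return True, future.result(timeout=timeout)
    except Exception:
        # Covers concurrent.futures.TimeoutError as well as exceptions raised by func.
        return False, None
    finally:
        # Do not block on a still-running task; a timed-out call keeps running in its thread.
        executor.shutdown(wait=False)

A call mirroring the diff would be run_with_timeout(data_preprocess.merge_texts_skipping_overlap, args=(all_output, final_input_link), timeout=30).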
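
The other interface change is the optional niche_cases argument on pipeline_with_gemini: when it is passed, an empty dict is seeded in acc_score for each requested niche field before the NCBI metadata lookup. A hypothetical call, where the accession and niche names are placeholders rather than values from this commit:

pipeline_with_gemini(["AB123456"], niche_cases=["age", "haplogroup"])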