Spaces:

VyLala
/

mtDNALocation

Running

App Files Files Community

VyLala committed 8 days ago

Commit

dd66f82

verified ·

1 Parent(s): d3e0e88

Update pipeline.py

Browse files

Files changed (1) hide show

pipeline.py +203 -100

pipeline.py CHANGED Viewed

@@ -234,18 +234,12 @@ def time_it(func, *args, **kwargs):
     print(f"⏱️ '{func.__name__}' took {elapsed:.3f} seconds")
     return result, elapsed
 # --- Define Pricing Constants (for Gemini 1.5 Flash & text-embedding-004) ---
-def track_gemini_cost():
-  # Prices are per 1,000 tokens
-  PRICE_PER_1K_INPUT_LLM = 0.000075  # $0.075 per 1M tokens
-  PRICE_PER_1K_OUTPUT_LLM = 0.0003   # $0.30 per 1M tokens
-  PRICE_PER_1K_EMBEDDING_INPUT = 0.000025 # $0.025 per 1M tokens
-  return True
 def unique_preserve_order(seq):
     """Return the items of ``seq`` as a list with duplicates removed.

     The first occurrence of every item is kept and the original relative
     order is preserved. Items must be hashable.
     """
     # dict keys are unique and remember insertion order, so building a dict
     # from the items de-duplicates them while keeping first-seen ordering.
     return list(dict.fromkeys(seq))
 # Main execution
-def pipeline_with_gemini(accessions,stop_flag=None, niche_cases=None):
   # output: country, sample_type, ethnic, location, money_cost, time_cost, explain
   # there can be one accession number in the accessions
   # Prices are per 1,000 tokens
@@ -253,15 +247,22 @@ def pipeline_with_gemini(accessions,stop_flag=None, niche_cases=None):
   if stop_flag is not None and stop_flag.value:
     print(f"🛑 Stop detected before starting {accession}, aborting early...")
     return {}
-  PRICE_PER_1K_INPUT_LLM = 0.000075  # $0.075 per 1M tokens
-  PRICE_PER_1K_OUTPUT_LLM = 0.0003   # $0.30 per 1M tokens
-  PRICE_PER_1K_EMBEDDING_INPUT = 0.000025 # $0.025 per 1M tokens
   if not accessions:
     print("no input")
     return None
   else:
     accs_output = {}
-    genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
     for acc in accessions:
       print("start gemini: ", acc)
       start = time.time()
@@ -274,7 +275,9 @@ def pipeline_with_gemini(accessions,stop_flag=None, niche_cases=None):
                    #"ethnicity":{},
                    "query_cost":total_cost_title,
                    "time_cost":None,
-                   "source":links}
       if niche_cases:
         for niche in niche_cases:
           acc_score[niche] = {}
@@ -345,8 +348,11 @@ def pipeline_with_gemini(accessions,stop_flag=None, niche_cases=None):
         print("✅ Files already exist in Google Drive. Downloading them...")
         chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path)
         all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
         print("chunk_id and all_id: ")
         print(chunk_id, all_id)
         file = drive_service.files().get(fileId="1LUJRTrq8yt4S4lLwCvTmlxaKqpr0nvEn", fields="id, name, parents, webViewLink").execute()
         print("📄 Name:", file["name"])
         print("📁 Parent folder ID:", file["parents"][0])
@@ -397,46 +403,129 @@ def pipeline_with_gemini(accessions,stop_flag=None, niche_cases=None):
         print(f"🛑 Stop processing {accession}, aborting early...")
         return {}
       # check doi first
-      if doi != "unknown":
-        link = 'https://doi.org/' + doi
-        # get the file to create listOfFile for each id
-        print("link of doi: ", link)
-        html = extractHTML.HTML("",link)
-        jsonSM = html.getSupMaterial()
-        article_text = html.getListSection()
-        if article_text:
-          if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower():
-            links.append(link)
-        if jsonSM:
-          links += sum((jsonSM[key] for key in jsonSM),[])
-      # no doi then google custom search api
-      if doi=="unknown" or len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower():
-        # might find the article
-        print("no article text, start tem link")
-        #tem_links = mtdna_classifier.search_google_custom(title, 2)
-        tem_links = smart_fallback.smart_google_search(meta_expand)
-        print("tem links: ", tem_links)
-        tem_link_acc = smart_fallback.google_accession_search(acc)
-        tem_links += tem_link_acc
-        tem_links = unique_preserve_order(tem_links)
-        print("tem link before filtering: ", tem_links)
-        # filter the quality link
-        print("saveLinkFolder as sample folder id: ", sample_folder_id)
-        print("start the smart filter link")
-        if stop_flag is not None and stop_flag.value:
-            print(f"🛑 Stop processing {accession}, aborting early...")
-            return {}
-        # success_process, output_process = run_with_timeout(smart_fallback.filter_links_by_metadata,args=(tem_links,sample_folder_id),kwargs={"accession":acc})
-        # if success_process:
-        #   links = output_process
-        #   print("yes succeed for smart filter link")
-        # else:
-        #   print("no suceed, fallback to all tem links")
-        #   links = tem_links
-        links = smart_fallback.filter_links_by_metadata(tem_links, saveLinkFolder=sample_folder_id, accession=acc, stop_flag=stop_flag)
-      print("this is links: ",links)
-      links = unique_preserve_order(links)
-      acc_score["source"] = links
       # chunk_path = "/"+saveTitle+"_merged_document.docx"
       # all_path = "/"+saveTitle+"_all_merged_document.docx"
       # # if chunk and all output not exist yet
@@ -469,6 +558,12 @@ def pipeline_with_gemini(accessions,stop_flag=None, niche_cases=None):
       if not chunk and not all_output:
         print("not chunk and all output")
         # else: check if we can reuse these chunk and all output of existed accession to find another
         if links:
           for link in links:
               print(link)
@@ -620,55 +715,63 @@ def pipeline_with_gemini(accessions,stop_flag=None, niche_cases=None):
         download_file_from_drive(faiss_filename, sample_folder_id, faiss_index_path)
         download_file_from_drive(chunks_filename, sample_folder_id, document_chunks_path)
         download_file_from_drive(lookup_filename, sample_folder_id, structured_lookup_path)
-      print("move to load rag")
-      master_structured_lookup, faiss_index, document_chunks = model.load_rag_assets(
-          faiss_index_path, document_chunks_path, structured_lookup_path
-      )
-      global_llm_model_for_counting_tokens = genai.GenerativeModel('gemini-1.5-flash-latest')
-      if not all_output:
-        if chunk: all_output = chunk
-        else: all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
-      if faiss_index is None:
-          print("\nBuilding RAG assets (structured lookup, FAISS index, chunks)...")
-          total_doc_embedding_tokens = global_llm_model_for_counting_tokens.count_tokens(
-              all_output
-          ).total_tokens
-          initial_embedding_cost = (total_doc_embedding_tokens / 1000) * PRICE_PER_1K_EMBEDDING_INPUT
-          total_cost_title += initial_embedding_cost
-          print(f"Initial one-time embedding cost for '{file_all_path}' ({total_doc_embedding_tokens} tokens): ${initial_embedding_cost:.6f}")
-          master_structured_lookup, faiss_index, document_chunks, plain_text_content = model.build_vector_index_and_data(
-              file_all_path, faiss_index_path, document_chunks_path, structured_lookup_path
           )
-      else:
-          print("\nRAG assets loaded from file. No re-embedding of entire document will occur.")
-          plain_text_content_all, table_strings_all, document_title_all = model.read_docx_text(file_all_path)
-          master_structured_lookup['document_title'] = master_structured_lookup.get('document_title', document_title_all)
-      if stop_flag is not None and stop_flag.value:
-        print(f"🛑 Stop processing {accession}, aborting early...")
-        return {}
-      primary_word = iso
-      alternative_word = acc
-      print(f"\n--- General Query: Primary='{primary_word}' (Alternative='{alternative_word}') ---")
-      if features.lower() not in all_output.lower():
-        all_output += ". NCBI Features: " + features
-      # country, sample_type, method_used, ethnic, spe_loc, total_query_cost =  model.query_document_info(
-      #     primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
-      #     model.call_llm_api, chunk=chunk, all_output=all_output)
-      print("this is chunk for the model")
-      print(chunk)
-      print("this is all output for the model")
-      print(all_output)
-      if stop_flag is not None and stop_flag.value:
-        print(f"🛑 Stop processing {accession}, aborting early...")
-        return {}
-      country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost =  model.query_document_info(
-          primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
-          model.call_llm_api, chunk=chunk, all_output=all_output)
       print("country using ai: ", country)
       print("sample type using ai: ", sample_type)
       # if len(country) == 0: country = "unknown"

     print(f"⏱️ '{func.__name__}' took {elapsed:.3f} seconds")
     return result, elapsed
 # --- Pricing constants (Gemini 2.5 Flash-Lite & Embedding-001) are defined inside pipeline_with_gemini ---
 def unique_preserve_order(seq):
     """Return the items of ``seq`` as a list with duplicates removed.

     The first occurrence of every item is kept and the original relative
     order is preserved. Items must be hashable.
     """
     # dict keys are unique and remember insertion order, so building a dict
     # from the items de-duplicates them while keeping first-seen ordering.
     return list(dict.fromkeys(seq))
 # Main execution
+def pipeline_with_gemini(accessions,stop_flag=None, niche_cases=None, save_df=None):
   # output: country, sample_type, ethnic, location, money_cost, time_cost, explain
   # there can be one accession number in the accessions
   # Prices are per 1,000 tokens
   if stop_flag is not None and stop_flag.value:
     print(f"🛑 Stop detected before starting {accession}, aborting early...")
     return {}
+  # PRICE_PER_1K_INPUT_LLM = 0.000075  # $0.075 per 1M tokens
+  # PRICE_PER_1K_OUTPUT_LLM = 0.0003   # $0.30 per 1M tokens
+  # PRICE_PER_1K_EMBEDDING_INPUT = 0.000025 # $0.025 per 1M tokens
+  # Gemini 2.5 Flash-Lite pricing per 1,000 tokens
+  PRICE_PER_1K_INPUT_LLM = 0.00010      # $0.10 per 1M input tokens
+  PRICE_PER_1K_OUTPUT_LLM = 0.00040     # $0.40 per 1M output tokens
+  # Embedding-001 pricing per 1,000 input tokens
+  PRICE_PER_1K_EMBEDDING_INPUT = 0.00015  # $0.15 per 1M input tokens
   if not accessions:
     print("no input")
     return None
   else:
     accs_output = {}
+    #genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
+    genai.configure(api_key=os.getenv("GOOGLE_API_KEY_BACKUP"))
     for acc in accessions:
       print("start gemini: ", acc)
       start = time.time()
                    #"ethnicity":{},
                    "query_cost":total_cost_title,
                    "time_cost":None,
+                   "source":links,
+                    "file_chunk":"",
+                   "file_all_output":""}
       if niche_cases:
         for niche in niche_cases:
           acc_score[niche] = {}
         print("✅ Files already exist in Google Drive. Downloading them...")
         chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path)
         all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
+        acc_score["file_chunk"] = str(chunk_filename)
+        acc_score["file_all_output"] = str(all_filename)
         print("chunk_id and all_id: ")
         print(chunk_id, all_id)
+        print("file chunk and all output saved in acc score: ", acc_score["file_chunk"], acc_score["file_all_output"])
         file = drive_service.files().get(fileId="1LUJRTrq8yt4S4lLwCvTmlxaKqpr0nvEn", fields="id, name, parents, webViewLink").execute()
         print("📄 Name:", file["name"])
         print("📁 Parent folder ID:", file["parents"][0])
         print(f"🛑 Stop processing {accession}, aborting early...")
         return {}
       # check doi first
+      if len(acc_score["file_all_output"]) == 0 and len(acc_score["file_chunk"]) == 0:
+          if doi != "unknown":
+            link = 'https://doi.org/' + doi
+            # get the file to create listOfFile for each id
+            print("link of doi: ", link)
+            html = extractHTML.HTML("",link)
+            jsonSM = html.getSupMaterial()
+            article_text = html.getListSection()
+            if article_text:
+              if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower():
+                links.append(link)
+            if jsonSM:
+              links += sum((jsonSM[key] for key in jsonSM),[])
+          # no doi then google custom search api
+          if doi=="unknown" or len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower():
+            # might find the article
+            print("no article text, start tem link")
+            #tem_links = mtdna_classifier.search_google_custom(title, 2)
+            tem_links = smart_fallback.smart_google_search(meta_expand)
+            print("tem links: ", tem_links)
+            tem_link_acc = smart_fallback.google_accession_search(acc)
+            tem_links += tem_link_acc
+            tem_links = unique_preserve_order(tem_links)
+            print("tem link before filtering: ", tem_links)
+            # filter the quality link
+            print("saveLinkFolder as sample folder id: ", sample_folder_id)
+            print("start the smart filter link")
+            if stop_flag is not None and stop_flag.value:
+                print(f"🛑 Stop processing {accession}, aborting early...")
+                return {}
+            # success_process, output_process = run_with_timeout(smart_fallback.filter_links_by_metadata,args=(tem_links,sample_folder_id),kwargs={"accession":acc})
+            # if success_process:
+            #   links = output_process
+            #   print("yes succeed for smart filter link")
+            # else:
+            #   print("no suceed, fallback to all tem links")
+            #   links = tem_links
+            links = smart_fallback.filter_links_by_metadata(tem_links, saveLinkFolder=sample_folder_id, accession=acc, stop_flag=stop_flag)
+          print("this is links: ",links)
+          links = unique_preserve_order(links)
+          acc_score["source"] = links
+      else:
+          try:
+              temp_source = False
+              if save_df is not None and not save_df.empty:
+                print("save df not none")
+                print(str(chunks_filename))
+                print(str(all_filename))
+                if str(chunks_filename) != "":
+                  link = save_df.loc[save_df["file_chunk"]==str(chunks_filename),"Sources"].iloc[0]
+                  #link = row["Sources"].iloc[0]
+                  if "http" in link:
+                    print("yeah http in save df source")
+                    acc_score["source"] = [x for x in link.split("\n") if x.strip()]#row["Sources"].tolist()
+                  else:  # temporary
+                    print("tempo source")
+                    #acc_score["source"] = [str(all_filename), str(chunks_filename)]
+                    temp_source = True
+                elif str(all_filename) != "":
+                  link = save_df.loc[save_df["file_all_output"]==str(all_filename),"Sources"].iloc[0]
+                  #link = row["Sources"].iloc[0]
+                  print(link)
+                  print("list of link")
+                  print([x for x in link.split("\n") if x.strip()])
+                  if "http" in link:
+                    print("yeah http in save df source")
+                    acc_score["source"] = [x for x in link.split("\n") if x.strip()]#row["Sources"].tolist()
+                  else:  # temporary
+                    print("tempo source")
+                    #acc_score["source"] = [str(all_filename), str(chunks_filename)]
+                    temp_source = True
+                else:  # temporary
+                  print("tempo source")
+                  #acc_score["source"] = [str(file_all_path), str(file_chunk_path)]
+                  temp_source = True
+              else:  # temporary
+                  print("tempo source")
+                  #acc_score["source"] = [str(file_all_path), str(file_chunk_path)]
+                  temp_source = True
+              if temp_source:
+                if doi != "unknown":
+                    link = 'https://doi.org/' + doi
+                    # get the file to create listOfFile for each id
+                    print("link of doi: ", link)
+                    html = extractHTML.HTML("",link)
+                    jsonSM = html.getSupMaterial()
+                    article_text = html.getListSection()
+                    if article_text:
+                      if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower():
+                        links.append(link)
+                    if jsonSM:
+                      links += sum((jsonSM[key] for key in jsonSM),[])
+                  # no doi then google custom search api
+                if doi=="unknown" or len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower():
+                    # might find the article
+                    print("no article text, start tem link")
+                    #tem_links = mtdna_classifier.search_google_custom(title, 2)
+                    tem_links = smart_fallback.smart_google_search(meta_expand)
+                    print("tem links: ", tem_links)
+                    tem_link_acc = smart_fallback.google_accession_search(acc)
+                    tem_links += tem_link_acc
+                    tem_links = unique_preserve_order(tem_links)
+                    print("tem link before filtering: ", tem_links)
+                    # filter the quality link
+                    print("saveLinkFolder as sample folder id: ", sample_folder_id)
+                    print("start the smart filter link")
+                    if stop_flag is not None and stop_flag.value:
+                        print(f"🛑 Stop processing {accession}, aborting early...")
+                        return {}
+                    # success_process, output_process = run_with_timeout(smart_fallback.filter_links_by_metadata,args=(tem_links,sample_folder_id),kwargs={"accession":acc})
+                    # if success_process:
+                    #   links = output_process
+                    #   print("yes succeed for smart filter link")
+                    # else:
+                    #   print("no suceed, fallback to all tem links")
+                    #   links = tem_links
+                    links = smart_fallback.filter_links_by_metadata(tem_links, saveLinkFolder=sample_folder_id, accession=acc, stop_flag=stop_flag)
+                print("this is links: ",links)
+                links = unique_preserve_order(links)
+                acc_score["source"] = links
+        except:
+              print("except for source")
+              acc_score["source"] = []
       # chunk_path = "/"+saveTitle+"_merged_document.docx"
       # all_path = "/"+saveTitle+"_all_merged_document.docx"
       # # if chunk and all output not exist yet
       if not chunk and not all_output:
         print("not chunk and all output")
         # else: check if we can reuse these chunk and all output of existed accession to find another
+        if str(chunks_filename) != "":
+          print("first time have chunk path: ", str(chunks_filename))
+          acc_score["file_chunk"] = str(chunks_filename)
+        if str(all_filename) != "":
+          print("first time have all path: ", str(all_filename))
+          acc_score["file_all_output"] = str(all_filename)
         if links:
           for link in links:
               print(link)
         download_file_from_drive(faiss_filename, sample_folder_id, faiss_index_path)
         download_file_from_drive(chunks_filename, sample_folder_id, document_chunks_path)
         download_file_from_drive(lookup_filename, sample_folder_id, structured_lookup_path)
+      try:
+          print("try gemini 2.5")
+          print("move to load rag")
+          master_structured_lookup, faiss_index, document_chunks = model.load_rag_assets(
+              faiss_index_path, document_chunks_path, structured_lookup_path
           )
+          global_llm_model_for_counting_tokens = genai.GenerativeModel('gemini-1.5-flash-latest')
+          if not all_output:
+            if chunk: all_output = chunk
+            else: all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
+          if faiss_index is None:
+              print("\nBuilding RAG assets (structured lookup, FAISS index, chunks)...")
+              total_doc_embedding_tokens = global_llm_model_for_counting_tokens.count_tokens(
+                  all_output
+              ).total_tokens
+              initial_embedding_cost = (total_doc_embedding_tokens / 1000) * PRICE_PER_1K_EMBEDDING_INPUT
+              total_cost_title += initial_embedding_cost
+              print(f"Initial one-time embedding cost for '{file_all_path}' ({total_doc_embedding_tokens} tokens): ${initial_embedding_cost:.6f}")
+              master_structured_lookup, faiss_index, document_chunks, plain_text_content = model.build_vector_index_and_data(
+                  file_all_path, faiss_index_path, document_chunks_path, structured_lookup_path
+              )
+          else:
+              print("\nRAG assets loaded from file. No re-embedding of entire document will occur.")
+              plain_text_content_all, table_strings_all, document_title_all = model.read_docx_text(file_all_path)
+              master_structured_lookup['document_title'] = master_structured_lookup.get('document_title', document_title_all)
+          if stop_flag is not None and stop_flag.value:
+            print(f"🛑 Stop processing {accession}, aborting early...")
+            return {}
+          primary_word = iso
+          alternative_word = acc
+          print(f"\n--- General Query: Primary='{primary_word}' (Alternative='{alternative_word}') ---")
+          if features.lower() not in all_output.lower():
+            all_output += ". NCBI Features: " + features
+          # country, sample_type, method_used, ethnic, spe_loc, total_query_cost =  model.query_document_info(
+          #     primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
+          #     model.call_llm_api, chunk=chunk, all_output=all_output)
+          print("this is chunk for the model")
+          print(chunk)
+          print("this is all output for the model")
+          print(all_output)
+          if stop_flag is not None and stop_flag.value:
+            print(f"🛑 Stop processing {accession}, aborting early...")
+            return {}
+          country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost =  model.query_document_info(
+              primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
+              model.call_llm_api, chunk=chunk, all_output=all_output)
+          print("pass query of 2.5")
+      except:
+          print("try gemini 1.5")
+          country, sample_type, ethnic, spe_loc, method_used, country_explanation, sample_type_explanation, ethnicity_explanation, specific_loc_explanation, total_query_cost = model.query_document_info(
+            primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
+            model.call_llm_api, chunk=chunk, all_output=all_output, model_ai="gemini-1.5-flash-latest")
+          print("yeah pass the query of 1.5")
       print("country using ai: ", country)
       print("sample type using ai: ", sample_type)
       # if len(country) == 0: country = "unknown"