Spaces:
Running
Running
Update pipeline.py
Browse files- pipeline.py +203 -100
pipeline.py
CHANGED
@@ -234,18 +234,12 @@ def time_it(func, *args, **kwargs):
|
|
234 |
print(f"⏱️ '{func.__name__}' took {elapsed:.3f} seconds")
|
235 |
return result, elapsed
|
236 |
# --- Define Pricing Constants (for Gemini 1.5 Flash & text-embedding-004) ---
|
237 |
-
def track_gemini_cost():
|
238 |
-
# Prices are per 1,000 tokens
|
239 |
-
PRICE_PER_1K_INPUT_LLM = 0.000075 # $0.075 per 1M tokens
|
240 |
-
PRICE_PER_1K_OUTPUT_LLM = 0.0003 # $0.30 per 1M tokens
|
241 |
-
PRICE_PER_1K_EMBEDDING_INPUT = 0.000025 # $0.025 per 1M tokens
|
242 |
-
return True
|
243 |
|
244 |
def unique_preserve_order(seq):
|
245 |
seen = set()
|
246 |
return [x for x in seq if not (x in seen or seen.add(x))]
|
247 |
# Main execution
|
248 |
-
def pipeline_with_gemini(accessions,stop_flag=None, niche_cases=None):
|
249 |
# output: country, sample_type, ethnic, location, money_cost, time_cost, explain
|
250 |
# there can be one accession number in the accessions
|
251 |
# Prices are per 1,000 tokens
|
@@ -253,15 +247,22 @@ def pipeline_with_gemini(accessions,stop_flag=None, niche_cases=None):
|
|
253 |
if stop_flag is not None and stop_flag.value:
|
254 |
print(f"π Stop detected before starting {accession}, aborting early...")
|
255 |
return {}
|
256 |
-
PRICE_PER_1K_INPUT_LLM = 0.000075 # $0.075 per 1M tokens
|
257 |
-
PRICE_PER_1K_OUTPUT_LLM = 0.0003 # $0.30 per 1M tokens
|
258 |
-
PRICE_PER_1K_EMBEDDING_INPUT = 0.000025 # $0.025 per 1M tokens
|
|
|
|
|
|
|
|
|
|
|
|
|
259 |
if not accessions:
|
260 |
print("no input")
|
261 |
return None
|
262 |
else:
|
263 |
accs_output = {}
|
264 |
-
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
|
|
|
265 |
for acc in accessions:
|
266 |
print("start gemini: ", acc)
|
267 |
start = time.time()
|
@@ -274,7 +275,9 @@ def pipeline_with_gemini(accessions,stop_flag=None, niche_cases=None):
|
|
274 |
#"ethnicity":{},
|
275 |
"query_cost":total_cost_title,
|
276 |
"time_cost":None,
|
277 |
-
"source":links
|
|
|
|
|
278 |
if niche_cases:
|
279 |
for niche in niche_cases:
|
280 |
acc_score[niche] = {}
|
@@ -345,8 +348,11 @@ def pipeline_with_gemini(accessions,stop_flag=None, niche_cases=None):
|
|
345 |
print("✅ Files already exist in Google Drive. Downloading them...")
|
346 |
chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path)
|
347 |
all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
|
|
|
|
|
348 |
print("chunk_id and all_id: ")
|
349 |
print(chunk_id, all_id)
|
|
|
350 |
file = drive_service.files().get(fileId="1LUJRTrq8yt4S4lLwCvTmlxaKqpr0nvEn", fields="id, name, parents, webViewLink").execute()
|
351 |
print("π Name:", file["name"])
|
352 |
print("π Parent folder ID:", file["parents"][0])
|
@@ -397,46 +403,129 @@ def pipeline_with_gemini(accessions,stop_flag=None, niche_cases=None):
|
|
397 |
print(f"π Stop processing {accession}, aborting early...")
|
398 |
return {}
|
399 |
# check doi first
|
400 |
-
if
|
401 |
-
|
402 |
-
|
403 |
-
|
404 |
-
|
405 |
-
|
406 |
-
|
407 |
-
|
408 |
-
|
409 |
-
|
410 |
-
|
411 |
-
|
412 |
-
|
413 |
-
|
414 |
-
|
415 |
-
|
416 |
-
|
417 |
-
|
418 |
-
|
419 |
-
|
420 |
-
|
421 |
-
|
422 |
-
|
423 |
-
|
424 |
-
|
425 |
-
|
426 |
-
|
427 |
-
|
428 |
-
|
429 |
-
|
430 |
-
|
431 |
-
|
432 |
-
|
433 |
-
|
434 |
-
|
435 |
-
|
436 |
-
|
437 |
-
|
438 |
-
|
439 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
440 |
# chunk_path = "/"+saveTitle+"_merged_document.docx"
|
441 |
# all_path = "/"+saveTitle+"_all_merged_document.docx"
|
442 |
# # if chunk and all output not exist yet
|
@@ -469,6 +558,12 @@ def pipeline_with_gemini(accessions,stop_flag=None, niche_cases=None):
|
|
469 |
if not chunk and not all_output:
|
470 |
print("not chunk and all output")
|
471 |
# else: check if we can reuse these chunk and all output of existed accession to find another
|
|
|
|
|
|
|
|
|
|
|
|
|
472 |
if links:
|
473 |
for link in links:
|
474 |
print(link)
|
@@ -620,55 +715,63 @@ def pipeline_with_gemini(accessions,stop_flag=None, niche_cases=None):
|
|
620 |
download_file_from_drive(faiss_filename, sample_folder_id, faiss_index_path)
|
621 |
download_file_from_drive(chunks_filename, sample_folder_id, document_chunks_path)
|
622 |
download_file_from_drive(lookup_filename, sample_folder_id, structured_lookup_path)
|
623 |
-
|
624 |
-
|
625 |
-
|
626 |
-
|
627 |
-
|
628 |
-
|
629 |
-
global_llm_model_for_counting_tokens = genai.GenerativeModel('gemini-1.5-flash-latest')
|
630 |
-
if not all_output:
|
631 |
-
if chunk: all_output = chunk
|
632 |
-
else: all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
|
633 |
-
if faiss_index is None:
|
634 |
-
print("\nBuilding RAG assets (structured lookup, FAISS index, chunks)...")
|
635 |
-
total_doc_embedding_tokens = global_llm_model_for_counting_tokens.count_tokens(
|
636 |
-
all_output
|
637 |
-
).total_tokens
|
638 |
-
|
639 |
-
initial_embedding_cost = (total_doc_embedding_tokens / 1000) * PRICE_PER_1K_EMBEDDING_INPUT
|
640 |
-
total_cost_title += initial_embedding_cost
|
641 |
-
print(f"Initial one-time embedding cost for '{file_all_path}' ({total_doc_embedding_tokens} tokens): ${initial_embedding_cost:.6f}")
|
642 |
-
|
643 |
-
|
644 |
-
master_structured_lookup, faiss_index, document_chunks, plain_text_content = model.build_vector_index_and_data(
|
645 |
-
file_all_path, faiss_index_path, document_chunks_path, structured_lookup_path
|
646 |
)
|
647 |
-
|
648 |
-
|
649 |
-
|
650 |
-
|
651 |
-
|
652 |
-
|
653 |
-
|
654 |
-
|
655 |
-
|
656 |
-
|
657 |
-
|
658 |
-
|
659 |
-
|
660 |
-
|
661 |
-
|
662 |
-
|
663 |
-
|
664 |
-
|
665 |
-
|
666 |
-
|
667 |
-
|
668 |
-
|
669 |
-
|
670 |
-
|
671 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
672 |
print("country using ai: ", country)
|
673 |
print("sample type using ai: ", sample_type)
|
674 |
# if len(country) == 0: country = "unknown"
|
|
|
234 |
print(f"⏱️ '{func.__name__}' took {elapsed:.3f} seconds")
|
235 |
return result, elapsed
|
236 |
# --- Define Pricing Constants (for Gemini 1.5 Flash & text-embedding-004) ---
|
|
|
|
|
|
|
|
|
|
|
|
|
237 |
|
238 |
def unique_preserve_order(seq):
|
239 |
seen = set()
|
240 |
return [x for x in seq if not (x in seen or seen.add(x))]
|
241 |
# Main execution
|
242 |
+
def pipeline_with_gemini(accessions,stop_flag=None, niche_cases=None, save_df=None):
|
243 |
# output: country, sample_type, ethnic, location, money_cost, time_cost, explain
|
244 |
# there can be one accession number in the accessions
|
245 |
# Prices are per 1,000 tokens
|
|
|
247 |
if stop_flag is not None and stop_flag.value:
|
248 |
print(f"π Stop detected before starting {accession}, aborting early...")
|
249 |
return {}
|
250 |
+
# PRICE_PER_1K_INPUT_LLM = 0.000075 # $0.075 per 1M tokens
|
251 |
+
# PRICE_PER_1K_OUTPUT_LLM = 0.0003 # $0.30 per 1M tokens
|
252 |
+
# PRICE_PER_1K_EMBEDDING_INPUT = 0.000025 # $0.025 per 1M tokens
|
253 |
+
# Gemini 2.5 Flash-Lite pricing per 1,000 tokens
|
254 |
+
PRICE_PER_1K_INPUT_LLM = 0.00010 # $0.10 per 1M input tokens
|
255 |
+
PRICE_PER_1K_OUTPUT_LLM = 0.00040 # $0.40 per 1M output tokens
|
256 |
+
|
257 |
+
# Embedding-001 pricing per 1,000 input tokens
|
258 |
+
PRICE_PER_1K_EMBEDDING_INPUT = 0.00015 # $0.15 per 1M input tokens
|
259 |
if not accessions:
|
260 |
print("no input")
|
261 |
return None
|
262 |
else:
|
263 |
accs_output = {}
|
264 |
+
#genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
|
265 |
+
genai.configure(api_key=os.getenv("GOOGLE_API_KEY_BACKUP"))
|
266 |
for acc in accessions:
|
267 |
print("start gemini: ", acc)
|
268 |
start = time.time()
|
|
|
275 |
#"ethnicity":{},
|
276 |
"query_cost":total_cost_title,
|
277 |
"time_cost":None,
|
278 |
+
"source":links,
|
279 |
+
"file_chunk":"",
|
280 |
+
"file_all_output":""}
|
281 |
if niche_cases:
|
282 |
for niche in niche_cases:
|
283 |
acc_score[niche] = {}
|
|
|
348 |
print("✅ Files already exist in Google Drive. Downloading them...")
|
349 |
chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path)
|
350 |
all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
|
351 |
+
acc_score["file_chunk"] = str(chunk_filename)
|
352 |
+
acc_score["file_all_output"] = str(all_filename)
|
353 |
print("chunk_id and all_id: ")
|
354 |
print(chunk_id, all_id)
|
355 |
+
print("file chunk and all output saved in acc score: ", acc_score["file_chunk"], acc_score["file_all_output"])
|
356 |
file = drive_service.files().get(fileId="1LUJRTrq8yt4S4lLwCvTmlxaKqpr0nvEn", fields="id, name, parents, webViewLink").execute()
|
357 |
print("π Name:", file["name"])
|
358 |
print("π Parent folder ID:", file["parents"][0])
|
|
|
403 |
print(f"π Stop processing {accession}, aborting early...")
|
404 |
return {}
|
405 |
# check doi first
|
406 |
+
if len(acc_score["file_all_output"]) == 0 and len(acc_score["file_chunk"]) == 0:
|
407 |
+
if doi != "unknown":
|
408 |
+
link = 'https://doi.org/' + doi
|
409 |
+
# get the file to create listOfFile for each id
|
410 |
+
print("link of doi: ", link)
|
411 |
+
html = extractHTML.HTML("",link)
|
412 |
+
jsonSM = html.getSupMaterial()
|
413 |
+
article_text = html.getListSection()
|
414 |
+
if article_text:
|
415 |
+
if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower():
|
416 |
+
links.append(link)
|
417 |
+
if jsonSM:
|
418 |
+
links += sum((jsonSM[key] for key in jsonSM),[])
|
419 |
+
# no doi then google custom search api
|
420 |
+
if doi=="unknown" or len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower():
|
421 |
+
# might find the article
|
422 |
+
print("no article text, start tem link")
|
423 |
+
#tem_links = mtdna_classifier.search_google_custom(title, 2)
|
424 |
+
tem_links = smart_fallback.smart_google_search(meta_expand)
|
425 |
+
print("tem links: ", tem_links)
|
426 |
+
tem_link_acc = smart_fallback.google_accession_search(acc)
|
427 |
+
tem_links += tem_link_acc
|
428 |
+
tem_links = unique_preserve_order(tem_links)
|
429 |
+
print("tem link before filtering: ", tem_links)
|
430 |
+
# filter the quality link
|
431 |
+
print("saveLinkFolder as sample folder id: ", sample_folder_id)
|
432 |
+
print("start the smart filter link")
|
433 |
+
if stop_flag is not None and stop_flag.value:
|
434 |
+
print(f"π Stop processing {accession}, aborting early...")
|
435 |
+
return {}
|
436 |
+
# success_process, output_process = run_with_timeout(smart_fallback.filter_links_by_metadata,args=(tem_links,sample_folder_id),kwargs={"accession":acc})
|
437 |
+
# if success_process:
|
438 |
+
# links = output_process
|
439 |
+
# print("yes succeed for smart filter link")
|
440 |
+
# else:
|
441 |
+
# print("no suceed, fallback to all tem links")
|
442 |
+
# links = tem_links
|
443 |
+
links = smart_fallback.filter_links_by_metadata(tem_links, saveLinkFolder=sample_folder_id, accession=acc, stop_flag=stop_flag)
|
444 |
+
print("this is links: ",links)
|
445 |
+
links = unique_preserve_order(links)
|
446 |
+
acc_score["source"] = links
|
447 |
+
else:
|
448 |
+
try:
|
449 |
+
temp_source = False
|
450 |
+
if save_df is not None and not save_df.empty:
|
451 |
+
print("save df not none")
|
452 |
+
print(str(chunks_filename))
|
453 |
+
print(str(all_filename))
|
454 |
+
if str(chunks_filename) != "":
|
455 |
+
link = save_df.loc[save_df["file_chunk"]==str(chunks_filename),"Sources"].iloc[0]
|
456 |
+
#link = row["Sources"].iloc[0]
|
457 |
+
if "http" in link:
|
458 |
+
print("yeah http in save df source")
|
459 |
+
acc_score["source"] = [x for x in link.split("\n") if x.strip()]#row["Sources"].tolist()
|
460 |
+
else: # temporary
|
461 |
+
print("tempo source")
|
462 |
+
#acc_score["source"] = [str(all_filename), str(chunks_filename)]
|
463 |
+
temp_source = True
|
464 |
+
elif str(all_filename) != "":
|
465 |
+
link = save_df.loc[save_df["file_all_output"]==str(all_filename),"Sources"].iloc[0]
|
466 |
+
#link = row["Sources"].iloc[0]
|
467 |
+
print(link)
|
468 |
+
print("list of link")
|
469 |
+
print([x for x in link.split("\n") if x.strip()])
|
470 |
+
if "http" in link:
|
471 |
+
print("yeah http in save df source")
|
472 |
+
acc_score["source"] = [x for x in link.split("\n") if x.strip()]#row["Sources"].tolist()
|
473 |
+
else: # temporary
|
474 |
+
print("tempo source")
|
475 |
+
#acc_score["source"] = [str(all_filename), str(chunks_filename)]
|
476 |
+
temp_source = True
|
477 |
+
else: # temporary
|
478 |
+
print("tempo source")
|
479 |
+
#acc_score["source"] = [str(file_all_path), str(file_chunk_path)]
|
480 |
+
temp_source = True
|
481 |
+
else: # temporary
|
482 |
+
print("tempo source")
|
483 |
+
#acc_score["source"] = [str(file_all_path), str(file_chunk_path)]
|
484 |
+
temp_source = True
|
485 |
+
if temp_source:
|
486 |
+
if doi != "unknown":
|
487 |
+
link = 'https://doi.org/' + doi
|
488 |
+
# get the file to create listOfFile for each id
|
489 |
+
print("link of doi: ", link)
|
490 |
+
html = extractHTML.HTML("",link)
|
491 |
+
jsonSM = html.getSupMaterial()
|
492 |
+
article_text = html.getListSection()
|
493 |
+
if article_text:
|
494 |
+
if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower():
|
495 |
+
links.append(link)
|
496 |
+
if jsonSM:
|
497 |
+
links += sum((jsonSM[key] for key in jsonSM),[])
|
498 |
+
# no doi then google custom search api
|
499 |
+
if doi=="unknown" or len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower():
|
500 |
+
# might find the article
|
501 |
+
print("no article text, start tem link")
|
502 |
+
#tem_links = mtdna_classifier.search_google_custom(title, 2)
|
503 |
+
tem_links = smart_fallback.smart_google_search(meta_expand)
|
504 |
+
print("tem links: ", tem_links)
|
505 |
+
tem_link_acc = smart_fallback.google_accession_search(acc)
|
506 |
+
tem_links += tem_link_acc
|
507 |
+
tem_links = unique_preserve_order(tem_links)
|
508 |
+
print("tem link before filtering: ", tem_links)
|
509 |
+
# filter the quality link
|
510 |
+
print("saveLinkFolder as sample folder id: ", sample_folder_id)
|
511 |
+
print("start the smart filter link")
|
512 |
+
if stop_flag is not None and stop_flag.value:
|
513 |
+
print(f"π Stop processing {accession}, aborting early...")
|
514 |
+
return {}
|
515 |
+
# success_process, output_process = run_with_timeout(smart_fallback.filter_links_by_metadata,args=(tem_links,sample_folder_id),kwargs={"accession":acc})
|
516 |
+
# if success_process:
|
517 |
+
# links = output_process
|
518 |
+
# print("yes succeed for smart filter link")
|
519 |
+
# else:
|
520 |
+
# print("no suceed, fallback to all tem links")
|
521 |
+
# links = tem_links
|
522 |
+
links = smart_fallback.filter_links_by_metadata(tem_links, saveLinkFolder=sample_folder_id, accession=acc, stop_flag=stop_flag)
|
523 |
+
print("this is links: ",links)
|
524 |
+
links = unique_preserve_order(links)
|
525 |
+
acc_score["source"] = links
|
526 |
+
except:
|
527 |
+
print("except for source")
|
528 |
+
acc_score["source"] = []
|
529 |
# chunk_path = "/"+saveTitle+"_merged_document.docx"
|
530 |
# all_path = "/"+saveTitle+"_all_merged_document.docx"
|
531 |
# # if chunk and all output not exist yet
|
|
|
558 |
if not chunk and not all_output:
|
559 |
print("not chunk and all output")
|
560 |
# else: check if we can reuse these chunk and all output of existed accession to find another
|
561 |
+
if str(chunks_filename) != "":
|
562 |
+
print("first time have chunk path: ", str(chunks_filename))
|
563 |
+
acc_score["file_chunk"] = str(chunks_filename)
|
564 |
+
if str(all_filename) != "":
|
565 |
+
print("first time have all path: ", str(all_filename))
|
566 |
+
acc_score["file_all_output"] = str(all_filename)
|
567 |
if links:
|
568 |
for link in links:
|
569 |
print(link)
|
|
|
715 |
download_file_from_drive(faiss_filename, sample_folder_id, faiss_index_path)
|
716 |
download_file_from_drive(chunks_filename, sample_folder_id, document_chunks_path)
|
717 |
download_file_from_drive(lookup_filename, sample_folder_id, structured_lookup_path)
|
718 |
+
try:
|
719 |
+
print("try gemini 2.5")
|
720 |
+
print("move to load rag")
|
721 |
+
master_structured_lookup, faiss_index, document_chunks = model.load_rag_assets(
|
722 |
+
faiss_index_path, document_chunks_path, structured_lookup_path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
723 |
)
|
724 |
+
|
725 |
+
global_llm_model_for_counting_tokens = genai.GenerativeModel('gemini-1.5-flash-latest')
|
726 |
+
if not all_output:
|
727 |
+
if chunk: all_output = chunk
|
728 |
+
else: all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
|
729 |
+
if faiss_index is None:
|
730 |
+
print("\nBuilding RAG assets (structured lookup, FAISS index, chunks)...")
|
731 |
+
total_doc_embedding_tokens = global_llm_model_for_counting_tokens.count_tokens(
|
732 |
+
all_output
|
733 |
+
).total_tokens
|
734 |
+
|
735 |
+
initial_embedding_cost = (total_doc_embedding_tokens / 1000) * PRICE_PER_1K_EMBEDDING_INPUT
|
736 |
+
total_cost_title += initial_embedding_cost
|
737 |
+
print(f"Initial one-time embedding cost for '{file_all_path}' ({total_doc_embedding_tokens} tokens): ${initial_embedding_cost:.6f}")
|
738 |
+
|
739 |
+
|
740 |
+
master_structured_lookup, faiss_index, document_chunks, plain_text_content = model.build_vector_index_and_data(
|
741 |
+
file_all_path, faiss_index_path, document_chunks_path, structured_lookup_path
|
742 |
+
)
|
743 |
+
else:
|
744 |
+
print("\nRAG assets loaded from file. No re-embedding of entire document will occur.")
|
745 |
+
plain_text_content_all, table_strings_all, document_title_all = model.read_docx_text(file_all_path)
|
746 |
+
master_structured_lookup['document_title'] = master_structured_lookup.get('document_title', document_title_all)
|
747 |
+
if stop_flag is not None and stop_flag.value:
|
748 |
+
print(f"π Stop processing {accession}, aborting early...")
|
749 |
+
return {}
|
750 |
+
primary_word = iso
|
751 |
+
alternative_word = acc
|
752 |
+
print(f"\n--- General Query: Primary='{primary_word}' (Alternative='{alternative_word}') ---")
|
753 |
+
if features.lower() not in all_output.lower():
|
754 |
+
all_output += ". NCBI Features: " + features
|
755 |
+
# country, sample_type, method_used, ethnic, spe_loc, total_query_cost = model.query_document_info(
|
756 |
+
# primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
|
757 |
+
# model.call_llm_api, chunk=chunk, all_output=all_output)
|
758 |
+
print("this is chunk for the model")
|
759 |
+
print(chunk)
|
760 |
+
print("this is all output for the model")
|
761 |
+
print(all_output)
|
762 |
+
if stop_flag is not None and stop_flag.value:
|
763 |
+
print(f"π Stop processing {accession}, aborting early...")
|
764 |
+
return {}
|
765 |
+
country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost = model.query_document_info(
|
766 |
+
primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
|
767 |
+
model.call_llm_api, chunk=chunk, all_output=all_output)
|
768 |
+
print("pass query of 2.5")
|
769 |
+
except:
|
770 |
+
print("try gemini 1.5")
|
771 |
+
country, sample_type, ethnic, spe_loc, method_used, country_explanation, sample_type_explanation, ethnicity_explanation, specific_loc_explanation, total_query_cost = model.query_document_info(
|
772 |
+
primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
|
773 |
+
model.call_llm_api, chunk=chunk, all_output=all_output, model_ai="gemini-1.5-flash-latest")
|
774 |
+
print("yeah pass the query of 1.5")
|
775 |
print("country using ai: ", country)
|
776 |
print("sample type using ai: ", sample_type)
|
777 |
# if len(country) == 0: country = "unknown"
|