VyLala committed · verified
Commit dd66f82 · 1 Parent(s): d3e0e88

Update pipeline.py

Files changed (1):
  1. pipeline.py +203 -100
pipeline.py CHANGED
@@ -234,18 +234,12 @@ def time_it(func, *args, **kwargs):
     print(f"⏱️ '{func.__name__}' took {elapsed:.3f} seconds")
     return result, elapsed
 # --- Define Pricing Constants (for Gemini 1.5 Flash & text-embedding-004) ---
-def track_gemini_cost():
-    # Prices are per 1,000 tokens
-    PRICE_PER_1K_INPUT_LLM = 0.000075  # $0.075 per 1M tokens
-    PRICE_PER_1K_OUTPUT_LLM = 0.0003  # $0.30 per 1M tokens
-    PRICE_PER_1K_EMBEDDING_INPUT = 0.000025  # $0.025 per 1M tokens
-    return True
 
 def unique_preserve_order(seq):
     seen = set()
     return [x for x in seq if not (x in seen or seen.add(x))]
 # Main execution
-def pipeline_with_gemini(accessions, stop_flag=None, niche_cases=None):
+def pipeline_with_gemini(accessions, stop_flag=None, niche_cases=None, save_df=None):
     # output: country, sample_type, ethnic, location, money_cost, time_cost, explain
     # there can be one accession number in the accessions
     # Prices are per 1,000 tokens
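
The `unique_preserve_order` helper kept by this hunk leans on the fact that `set.add` returns `None`, so the `or` expression is falsy exactly once per distinct element. A quick standalone illustration (sample values are made up):

```python
def unique_preserve_order(seq):
    seen = set()
    # "x in seen" short-circuits for repeats; otherwise seen.add(x)
    # runs for its side effect and returns None, so x is kept once.
    return [x for x in seq if not (x in seen or seen.add(x))]

print(unique_preserve_order(["a", "b", "a", "c", "b"]))
# ['a', 'b', 'c']
```
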
@@ -253,15 +247,22 @@ def pipeline_with_gemini(accessions, stop_flag=None, niche_cases=None):
     if stop_flag is not None and stop_flag.value:
         print(f"🛑 Stop detected before starting {accession}, aborting early...")
         return {}
-    PRICE_PER_1K_INPUT_LLM = 0.000075  # $0.075 per 1M tokens
-    PRICE_PER_1K_OUTPUT_LLM = 0.0003  # $0.30 per 1M tokens
-    PRICE_PER_1K_EMBEDDING_INPUT = 0.000025  # $0.025 per 1M tokens
+    # PRICE_PER_1K_INPUT_LLM = 0.000075  # $0.075 per 1M tokens
+    # PRICE_PER_1K_OUTPUT_LLM = 0.0003  # $0.30 per 1M tokens
+    # PRICE_PER_1K_EMBEDDING_INPUT = 0.000025  # $0.025 per 1M tokens
+    # Gemini 2.5 Flash-Lite pricing per 1,000 tokens
+    PRICE_PER_1K_INPUT_LLM = 0.00010  # $0.10 per 1M input tokens
+    PRICE_PER_1K_OUTPUT_LLM = 0.00040  # $0.40 per 1M output tokens
+
+    # Embedding-001 pricing per 1,000 input tokens
+    PRICE_PER_1K_EMBEDDING_INPUT = 0.00015  # $0.15 per 1M input tokens
     if not accessions:
         print("no input")
         return None
     else:
         accs_output = {}
-        genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
+        #genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
+        genai.configure(api_key=os.getenv("GOOGLE_API_KEY_BACKUP"))
         for acc in accessions:
             print("start gemini: ", acc)
             start = time.time()
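
The rewritten constants keep the same per-1K convention, so downstream lines such as `(tokens / 1000) * PRICE_PER_1K_EMBEDDING_INPUT` are unchanged. A minimal sketch of the arithmetic with the new rates (the token counts are hypothetical):

```python
PRICE_PER_1K_INPUT_LLM = 0.00010        # $0.10 per 1M input tokens
PRICE_PER_1K_OUTPUT_LLM = 0.00040       # $0.40 per 1M output tokens
PRICE_PER_1K_EMBEDDING_INPUT = 0.00015  # $0.15 per 1M input tokens

# Hypothetical usage for one accession's query
input_tokens, output_tokens, embedding_tokens = 12_000, 800, 40_000

total_cost = (
    (input_tokens / 1000) * PRICE_PER_1K_INPUT_LLM
    + (output_tokens / 1000) * PRICE_PER_1K_OUTPUT_LLM
    + (embedding_tokens / 1000) * PRICE_PER_1K_EMBEDDING_INPUT
)
print(f"${total_cost:.6f}")  # $0.007520
```
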
@@ -274,7 +275,9 @@ def pipeline_with_gemini(accessions, stop_flag=None, niche_cases=None):
                 #"ethnicity":{},
                 "query_cost": total_cost_title,
                 "time_cost": None,
-                "source": links}
+                "source": links,
+                "file_chunk": "",
+                "file_all_output": ""}
             if niche_cases:
                 for niche in niche_cases:
                     acc_score[niche] = {}
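
With the two new keys, each `acc_score` entry doubles as a cache record. Roughly, the shape after this hunk (values illustrative; keys defined earlier in the function omitted):

```python
acc_score = {
    "query_cost": 0.0,      # running total_cost_title for this accession
    "time_cost": None,      # filled in once the timer stops
    "source": [],           # links that backed the answer
    "file_chunk": "",       # cached chunk .docx filename, if already on Drive
    "file_all_output": "",  # cached all-output .docx filename, if already on Drive
}
```
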
@@ -345,8 +348,11 @@ def pipeline_with_gemini(accessions, stop_flag=None, niche_cases=None):
                 print("✅ Files already exist in Google Drive. Downloading them...")
                 chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path)
                 all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
+                acc_score["file_chunk"] = str(chunk_filename)
+                acc_score["file_all_output"] = str(all_filename)
                 print("chunk_id and all_id: ")
                 print(chunk_id, all_id)
+                print("file chunk and all output saved in acc score: ", acc_score["file_chunk"], acc_score["file_all_output"])
                 file = drive_service.files().get(fileId="1LUJRTrq8yt4S4lLwCvTmlxaKqpr0nvEn", fields="id, name, parents, webViewLink").execute()
                 print("📄 Name:", file["name"])
                 print("📁 Parent folder ID:", file["parents"][0])
@@ -397,46 +403,129 @@ def pipeline_with_gemini(accessions, stop_flag=None, niche_cases=None):
                 print(f"🛑 Stop processing {accession}, aborting early...")
                 return {}
             # check doi first
-            if doi != "unknown":
-                link = 'https://doi.org/' + doi
-                # get the file to create listOfFile for each id
-                print("link of doi: ", link)
-                html = extractHTML.HTML("", link)
-                jsonSM = html.getSupMaterial()
-                article_text = html.getListSection()
-                if article_text:
-                    if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower():
-                        links.append(link)
-                if jsonSM:
-                    links += sum((jsonSM[key] for key in jsonSM), [])
-            # no doi then google custom search api
-            if doi == "unknown" or len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower():
-                # might find the article
-                print("no article text, start tem link")
-                #tem_links = mtdna_classifier.search_google_custom(title, 2)
-                tem_links = smart_fallback.smart_google_search(meta_expand)
-                print("tem links: ", tem_links)
-                tem_link_acc = smart_fallback.google_accession_search(acc)
-                tem_links += tem_link_acc
-                tem_links = unique_preserve_order(tem_links)
-                print("tem link before filtering: ", tem_links)
-                # filter the quality link
-                print("saveLinkFolder as sample folder id: ", sample_folder_id)
-                print("start the smart filter link")
-                if stop_flag is not None and stop_flag.value:
-                    print(f"🛑 Stop processing {accession}, aborting early...")
-                    return {}
-                # success_process, output_process = run_with_timeout(smart_fallback.filter_links_by_metadata, args=(tem_links, sample_folder_id), kwargs={"accession": acc})
-                # if success_process:
-                #     links = output_process
-                #     print("yes succeed for smart filter link")
-                # else:
-                #     print("no succeed, fallback to all tem links")
-                #     links = tem_links
-                links = smart_fallback.filter_links_by_metadata(tem_links, saveLinkFolder=sample_folder_id, accession=acc, stop_flag=stop_flag)
-                print("this is links: ", links)
-                links = unique_preserve_order(links)
-                acc_score["source"] = links
+            if len(acc_score["file_all_output"]) == 0 and len(acc_score["file_chunk"]) == 0:
+                if doi != "unknown":
+                    link = 'https://doi.org/' + doi
+                    # get the file to create listOfFile for each id
+                    print("link of doi: ", link)
+                    html = extractHTML.HTML("", link)
+                    jsonSM = html.getSupMaterial()
+                    article_text = html.getListSection()
+                    if article_text:
+                        if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower():
+                            links.append(link)
+                    if jsonSM:
+                        links += sum((jsonSM[key] for key in jsonSM), [])
+                # no doi then google custom search api
+                if doi == "unknown" or len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower():
+                    # might find the article
+                    print("no article text, start tem link")
+                    #tem_links = mtdna_classifier.search_google_custom(title, 2)
+                    tem_links = smart_fallback.smart_google_search(meta_expand)
+                    print("tem links: ", tem_links)
+                    tem_link_acc = smart_fallback.google_accession_search(acc)
+                    tem_links += tem_link_acc
+                    tem_links = unique_preserve_order(tem_links)
+                    print("tem link before filtering: ", tem_links)
+                    # filter the quality link
+                    print("saveLinkFolder as sample folder id: ", sample_folder_id)
+                    print("start the smart filter link")
+                    if stop_flag is not None and stop_flag.value:
+                        print(f"🛑 Stop processing {accession}, aborting early...")
+                        return {}
+                    # success_process, output_process = run_with_timeout(smart_fallback.filter_links_by_metadata, args=(tem_links, sample_folder_id), kwargs={"accession": acc})
+                    # if success_process:
+                    #     links = output_process
+                    #     print("yes succeed for smart filter link")
+                    # else:
+                    #     print("no succeed, fallback to all tem links")
+                    #     links = tem_links
+                    links = smart_fallback.filter_links_by_metadata(tem_links, saveLinkFolder=sample_folder_id, accession=acc, stop_flag=stop_flag)
+                    print("this is links: ", links)
+                    links = unique_preserve_order(links)
+                    acc_score["source"] = links
+            else:
+                try:
+                    temp_source = False
+                    if save_df is not None and not save_df.empty:
+                        print("save df not none")
+                        print(str(chunks_filename))
+                        print(str(all_filename))
+                        if str(chunks_filename) != "":
+                            link = save_df.loc[save_df["file_chunk"] == str(chunks_filename), "Sources"].iloc[0]
+                            #link = row["Sources"].iloc[0]
+                            if "http" in link:
+                                print("yeah http in save df source")
+                                acc_score["source"] = [x for x in link.split("\n") if x.strip()]  #row["Sources"].tolist()
+                            else:  # temporary
+                                print("tempo source")
+                                #acc_score["source"] = [str(all_filename), str(chunks_filename)]
+                                temp_source = True
+                        elif str(all_filename) != "":
+                            link = save_df.loc[save_df["file_all_output"] == str(all_filename), "Sources"].iloc[0]
+                            #link = row["Sources"].iloc[0]
+                            print(link)
+                            print("list of link")
+                            print([x for x in link.split("\n") if x.strip()])
+                            if "http" in link:
+                                print("yeah http in save df source")
+                                acc_score["source"] = [x for x in link.split("\n") if x.strip()]  #row["Sources"].tolist()
+                            else:  # temporary
+                                print("tempo source")
+                                #acc_score["source"] = [str(all_filename), str(chunks_filename)]
+                                temp_source = True
+                        else:  # temporary
+                            print("tempo source")
+                            #acc_score["source"] = [str(file_all_path), str(file_chunk_path)]
+                            temp_source = True
+                    else:  # temporary
+                        print("tempo source")
+                        #acc_score["source"] = [str(file_all_path), str(file_chunk_path)]
+                        temp_source = True
+                    if temp_source:
+                        if doi != "unknown":
+                            link = 'https://doi.org/' + doi
+                            # get the file to create listOfFile for each id
+                            print("link of doi: ", link)
+                            html = extractHTML.HTML("", link)
+                            jsonSM = html.getSupMaterial()
+                            article_text = html.getListSection()
+                            if article_text:
+                                if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower():
+                                    links.append(link)
+                            if jsonSM:
+                                links += sum((jsonSM[key] for key in jsonSM), [])
+                        # no doi then google custom search api
+                        if doi == "unknown" or len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower():
+                            # might find the article
+                            print("no article text, start tem link")
+                            #tem_links = mtdna_classifier.search_google_custom(title, 2)
+                            tem_links = smart_fallback.smart_google_search(meta_expand)
+                            print("tem links: ", tem_links)
+                            tem_link_acc = smart_fallback.google_accession_search(acc)
+                            tem_links += tem_link_acc
+                            tem_links = unique_preserve_order(tem_links)
+                            print("tem link before filtering: ", tem_links)
+                            # filter the quality link
+                            print("saveLinkFolder as sample folder id: ", sample_folder_id)
+                            print("start the smart filter link")
+                            if stop_flag is not None and stop_flag.value:
+                                print(f"🛑 Stop processing {accession}, aborting early...")
+                                return {}
+                            # success_process, output_process = run_with_timeout(smart_fallback.filter_links_by_metadata, args=(tem_links, sample_folder_id), kwargs={"accession": acc})
+                            # if success_process:
+                            #     links = output_process
+                            #     print("yes succeed for smart filter link")
+                            # else:
+                            #     print("no succeed, fallback to all tem links")
+                            #     links = tem_links
+                            links = smart_fallback.filter_links_by_metadata(tem_links, saveLinkFolder=sample_folder_id, accession=acc, stop_flag=stop_flag)
+                            print("this is links: ", links)
+                            links = unique_preserve_order(links)
+                            acc_score["source"] = links
+                except:
+                    print("except for source")
+                    acc_score["source"] = []
             # chunk_path = "/"+saveTitle+"_merged_document.docx"
             # all_path = "/"+saveTitle+"_all_merged_document.docx"
             # # if chunk and all output not exist yet
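
The new reuse branch above keys `save_df` rows on the cached filenames and splits the newline-joined `Sources` cell back into a list, falling back to a fresh search when the cell holds no URL. The core lookup in isolation (column names are taken from the diff; the data is made up):

```python
import pandas as pd

save_df = pd.DataFrame({
    "file_chunk": ["KU131308_chunk.docx"],  # hypothetical cached filename
    "file_all_output": ["KU131308_all.docx"],
    "Sources": ["https://doi.org/10.1000/xyz\nhttps://example.org/supplement"],
})

chunks_filename = "KU131308_chunk.docx"
link = save_df.loc[save_df["file_chunk"] == str(chunks_filename), "Sources"].iloc[0]
if "http" in link:
    # one URL per line, exactly as the pipeline re-splits it
    sources = [x for x in link.split("\n") if x.strip()]
    print(sources)
    # ['https://doi.org/10.1000/xyz', 'https://example.org/supplement']
```
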
@@ -469,6 +558,12 @@ def pipeline_with_gemini(accessions, stop_flag=None, niche_cases=None):
             if not chunk and not all_output:
                 print("not chunk and all output")
             # else: check if we can reuse these chunk and all output of existed accession to find another
+            if str(chunks_filename) != "":
+                print("first time have chunk path: ", str(chunks_filename))
+                acc_score["file_chunk"] = str(chunks_filename)
+            if str(all_filename) != "":
+                print("first time have all path: ", str(all_filename))
+                acc_score["file_all_output"] = str(all_filename)
             if links:
                 for link in links:
                     print(link)
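
The `stop_flag.value` checks sprinkled through the function imply a flag object shared with whoever launched the pipeline; the diff never shows its construction, but a `multiprocessing.Value` would fit the usage. A sketch under that assumption:

```python
import multiprocessing as mp

stop_flag = mp.Value("b", False)  # shared boolean; parent sets stop_flag.value = True

# Inside the pipeline, every checkpoint bails out cooperatively:
if stop_flag is not None and stop_flag.value:
    print("🛑 Stop detected, aborting early...")
```
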
@@ -620,55 +715,63 @@ def pipeline_with_gemini(accessions, stop_flag=None, niche_cases=None):
                 download_file_from_drive(faiss_filename, sample_folder_id, faiss_index_path)
                 download_file_from_drive(chunks_filename, sample_folder_id, document_chunks_path)
                 download_file_from_drive(lookup_filename, sample_folder_id, structured_lookup_path)
-
-                print("move to load rag")
-                master_structured_lookup, faiss_index, document_chunks = model.load_rag_assets(
-                    faiss_index_path, document_chunks_path, structured_lookup_path
-                )
-
-                global_llm_model_for_counting_tokens = genai.GenerativeModel('gemini-1.5-flash-latest')
-                if not all_output:
-                    if chunk: all_output = chunk
-                    else: all_output = "Collection_date: " + col_date + ". Isolate: " + iso + ". Title: " + title + ". Features: " + features
-                if faiss_index is None:
-                    print("\nBuilding RAG assets (structured lookup, FAISS index, chunks)...")
-                    total_doc_embedding_tokens = global_llm_model_for_counting_tokens.count_tokens(
-                        all_output
-                    ).total_tokens
-
-                    initial_embedding_cost = (total_doc_embedding_tokens / 1000) * PRICE_PER_1K_EMBEDDING_INPUT
-                    total_cost_title += initial_embedding_cost
-                    print(f"Initial one-time embedding cost for '{file_all_path}' ({total_doc_embedding_tokens} tokens): ${initial_embedding_cost:.6f}")
-
-
-                    master_structured_lookup, faiss_index, document_chunks, plain_text_content = model.build_vector_index_and_data(
-                        file_all_path, faiss_index_path, document_chunks_path, structured_lookup_path
+                try:
+                    print("try gemini 2.5")
+                    print("move to load rag")
+                    master_structured_lookup, faiss_index, document_chunks = model.load_rag_assets(
+                        faiss_index_path, document_chunks_path, structured_lookup_path
                     )
-                else:
-                    print("\nRAG assets loaded from file. No re-embedding of entire document will occur.")
-                    plain_text_content_all, table_strings_all, document_title_all = model.read_docx_text(file_all_path)
-                    master_structured_lookup['document_title'] = master_structured_lookup.get('document_title', document_title_all)
-                if stop_flag is not None and stop_flag.value:
-                    print(f"🛑 Stop processing {accession}, aborting early...")
-                    return {}
-                primary_word = iso
-                alternative_word = acc
-                print(f"\n--- General Query: Primary='{primary_word}' (Alternative='{alternative_word}') ---")
-                if features.lower() not in all_output.lower():
-                    all_output += ". NCBI Features: " + features
-                # country, sample_type, method_used, ethnic, spe_loc, total_query_cost = model.query_document_info(
-                #     primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
-                #     model.call_llm_api, chunk=chunk, all_output=all_output)
-                print("this is chunk for the model")
-                print(chunk)
-                print("this is all output for the model")
-                print(all_output)
-                if stop_flag is not None and stop_flag.value:
-                    print(f"🛑 Stop processing {accession}, aborting early...")
-                    return {}
-                country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost = model.query_document_info(
-                    primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
-                    model.call_llm_api, chunk=chunk, all_output=all_output)
+
+                    global_llm_model_for_counting_tokens = genai.GenerativeModel('gemini-1.5-flash-latest')
+                    if not all_output:
+                        if chunk: all_output = chunk
+                        else: all_output = "Collection_date: " + col_date + ". Isolate: " + iso + ". Title: " + title + ". Features: " + features
+                    if faiss_index is None:
+                        print("\nBuilding RAG assets (structured lookup, FAISS index, chunks)...")
+                        total_doc_embedding_tokens = global_llm_model_for_counting_tokens.count_tokens(
+                            all_output
+                        ).total_tokens
+
+                        initial_embedding_cost = (total_doc_embedding_tokens / 1000) * PRICE_PER_1K_EMBEDDING_INPUT
+                        total_cost_title += initial_embedding_cost
+                        print(f"Initial one-time embedding cost for '{file_all_path}' ({total_doc_embedding_tokens} tokens): ${initial_embedding_cost:.6f}")
+
+
+                        master_structured_lookup, faiss_index, document_chunks, plain_text_content = model.build_vector_index_and_data(
+                            file_all_path, faiss_index_path, document_chunks_path, structured_lookup_path
+                        )
+                    else:
+                        print("\nRAG assets loaded from file. No re-embedding of entire document will occur.")
+                        plain_text_content_all, table_strings_all, document_title_all = model.read_docx_text(file_all_path)
+                        master_structured_lookup['document_title'] = master_structured_lookup.get('document_title', document_title_all)
+                    if stop_flag is not None and stop_flag.value:
+                        print(f"🛑 Stop processing {accession}, aborting early...")
+                        return {}
+                    primary_word = iso
+                    alternative_word = acc
+                    print(f"\n--- General Query: Primary='{primary_word}' (Alternative='{alternative_word}') ---")
+                    if features.lower() not in all_output.lower():
+                        all_output += ". NCBI Features: " + features
+                    # country, sample_type, method_used, ethnic, spe_loc, total_query_cost = model.query_document_info(
+                    #     primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
+                    #     model.call_llm_api, chunk=chunk, all_output=all_output)
+                    print("this is chunk for the model")
+                    print(chunk)
+                    print("this is all output for the model")
+                    print(all_output)
+                    if stop_flag is not None and stop_flag.value:
+                        print(f"🛑 Stop processing {accession}, aborting early...")
+                        return {}
+                    country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost = model.query_document_info(
+                        primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
+                        model.call_llm_api, chunk=chunk, all_output=all_output)
+                    print("pass query of 2.5")
+                except:
+                    print("try gemini 1.5")
+                    country, sample_type, ethnic, spe_loc, method_used, country_explanation, sample_type_explanation, ethnicity_explanation, specific_loc_explanation, total_query_cost = model.query_document_info(
+                        primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
+                        model.call_llm_api, chunk=chunk, all_output=all_output, model_ai="gemini-1.5-flash-latest")
+                    print("yeah pass the query of 1.5")
                 print("country using ai: ", country)
                 print("sample type using ai: ", sample_type)
                 # if len(country) == 0: country = "unknown"
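
The closing hunk wraps the whole RAG/query path in a try/except: the default (2.5-era) path runs first, and any exception retries `model.query_document_info` pinned to `gemini-1.5-flash-latest` via the new `model_ai` argument. Note that the two branches unpack tuples of different lengths (6 vs. 10 values). Schematically, with the project's own functions assumed:

```python
try:
    # default model path; model.query_document_info picks the newer model
    (country, sample_type, method_used,
     country_explanation, sample_type_explanation,
     total_query_cost) = model.query_document_info(
        primary_word, alternative_word, meta, master_structured_lookup,
        faiss_index, document_chunks, model.call_llm_api,
        chunk=chunk, all_output=all_output)
except Exception:
    # retry once, pinned to the older model; returns a wider tuple
    (country, sample_type, ethnic, spe_loc, method_used,
     country_explanation, sample_type_explanation,
     ethnicity_explanation, specific_loc_explanation,
     total_query_cost) = model.query_document_info(
        primary_word, alternative_word, meta, master_structured_lookup,
        faiss_index, document_chunks, model.call_llm_api,
        chunk=chunk, all_output=all_output,
        model_ai="gemini-1.5-flash-latest")
```
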
 