diff --git "a/pipeline.py" "b/pipeline.py" --- "a/pipeline.py" +++ "b/pipeline.py" @@ -1,920 +1,1088 @@ -# test1: MJ17 direct -# test2: "A1YU101" thailand cross-ref -# test3: "EBK109" thailand cross-ref -# test4: "OQ731952"/"BST115" for search query title: "South Asian maternal and paternal lineages in southern Thailand and" -import data_preprocess -import model -import mtdna_classifier -#import app -import smart_fallback -import pandas as pd -from pathlib import Path -import subprocess -from NER.html import extractHTML -import os -import google.generativeai as genai -import re -import standardize_location -# Helper functions in for this pipeline -# Track time -import time -import multiprocessing -import gspread -from googleapiclient.discovery import build -from googleapiclient.http import MediaFileUpload, MediaIoBaseDownload -from google.oauth2.service_account import Credentials -from oauth2client.service_account import ServiceAccountCredentials -import io -import json -#––– Authentication setup ––– -GDRIVE_PARENT_FOLDER_NAME = "mtDNA-Location-Classifier" -GDRIVE_DATA_FOLDER_NAME = os.environ["GDRIVE_DATA_FOLDER_NAME"] -GCP_CREDS_DICT = json.loads(os.environ["GCP_CREDS_JSON"]) # from HF secrets -GDRIVE_CREDS = Credentials.from_service_account_info(GCP_CREDS_DICT, scopes=["https://www.googleapis.com/auth/drive"]) -drive_service = build("drive", "v3", credentials=GDRIVE_CREDS) - -def get_or_create_drive_folder(name, parent_id=None): - query = f"name='{name}' and mimeType='application/vnd.google-apps.folder'" - if parent_id: - query += f" and '{parent_id}' in parents" - results = drive_service.files().list(q=query, spaces='drive', fields="files(id, name)").execute() - items = results.get("files", []) - if items: - return items[0]["id"] - file_metadata = { - "name": name, - "mimeType": "application/vnd.google-apps.folder" - } - if parent_id: - file_metadata["parents"] = [parent_id] - file = drive_service.files().create(body=file_metadata, fields="id").execute() - return file["id"] -# def find_drive_file(filename, parent_id): -# """ -# Checks if a file with the given name exists inside the specified Google Drive folder. -# Returns the file ID if found, else None. -# """ -# query = f"'{parent_id}' in parents and name = '{filename}' and trashed = false" -# results = drive_service.files().list(q=query, spaces='drive', fields='files(id, name)', pageSize=1).execute() -# files = results.get('files', []) -# if files: -# return files[0]["id"] -# return None - -def find_drive_file(filename, parent_id): - """ - Checks if a file with the given name exists inside the specified Google Drive folder. - Returns the file ID if found, else None. 
- """ - try: - print(f"🔍 Searching for '{filename}' in folder: {parent_id}") - query = f"'{parent_id}' in parents and name = '{filename}' and trashed = false" - results = drive_service.files().list( - q=query, - spaces='drive', - fields='files(id, name)', - pageSize=1 - ).execute() - files = results.get('files', []) - if files: - print(f"✅ Found file: {files[0]['name']} with ID: {files[0]['id']}") - return files[0]["id"] - else: - print("⚠️ File not found.") - return None - except Exception as e: - print(f"❌ Error during find_drive_file: {e}") - return None - - - -# def upload_file_to_drive(local_path, remote_name, folder_id): -# file_metadata = {"name": remote_name, "parents": [folder_id]} -# media = MediaFileUpload(local_path, resumable=True) -# existing = drive_service.files().list(q=f"name='{remote_name}' and '{folder_id}' in parents", fields="files(id)").execute().get("files", []) -# if existing: -# drive_service.files().delete(fileId=existing[0]["id"]).execute() -# file = drive_service.files().create(body=file_metadata, media_body=media, fields="id").execute() -# result = drive_service.files().list(q=f"name='{remote_name}' and '{folder_id}' in parents", fields="files(id)").execute() -# if not result.get("files"): -# print(f"❌ Upload failed: File '{remote_name}' not found in folder after upload.") -# else: -# print(f"✅ Verified upload: {remote_name}") -# return file["id"] -def upload_file_to_drive(local_path, remote_name, folder_id): - try: - if not os.path.exists(local_path): - raise FileNotFoundError(f"❌ Local file does not exist: {local_path}") - - # Delete existing file on Drive if present - existing = drive_service.files().list( - q=f"name='{remote_name}' and '{folder_id}' in parents and trashed = false", - fields="files(id)" - ).execute().get("files", []) - - if existing: - drive_service.files().delete(fileId=existing[0]["id"]).execute() - print(f"🗑️ Deleted existing '{remote_name}' in Drive folder {folder_id}") - - file_metadata = {"name": remote_name, "parents": [folder_id]} - media = MediaFileUpload(local_path, resumable=True) - file = drive_service.files().create( - body=file_metadata, - media_body=media, - fields="id" - ).execute() - - print(f"✅ Uploaded '{remote_name}' to Google Drive folder ID: {folder_id}") - return file["id"] - - except Exception as e: - print(f"❌ Error during upload: {e}") - return None - - -def download_file_from_drive(remote_name, folder_id, local_path): - results = drive_service.files().list(q=f"name='{remote_name}' and '{folder_id}' in parents", fields="files(id)").execute() - files = results.get("files", []) - if not files: - return False - file_id = files[0]["id"] - request = drive_service.files().get_media(fileId=file_id) - fh = io.FileIO(local_path, 'wb') - downloader = MediaIoBaseDownload(fh, request) - done = False - while not done: - _, done = downloader.next_chunk() - return True -def download_drive_file_content(file_id): - request = drive_service.files().get_media(fileId=file_id) - fh = io.BytesIO() - downloader = MediaIoBaseDownload(fh, request) - done = False - while not done: - _, done = downloader.next_chunk() - fh.seek(0) - return fh.read().decode("utf-8") - -# def run_with_timeout(func, args=(), kwargs={}, timeout=20): -# """ -# Runs `func` with timeout in seconds. Kills if it exceeds. 
-# Returns: (success, result or None) -# """ -# def wrapper(q, *args, **kwargs): -# try: -# q.put(func(*args, **kwargs)) -# except Exception as e: -# q.put(e) - -# q = multiprocessing.Queue() -# p = multiprocessing.Process(target=wrapper, args=(q, *args), kwargs=kwargs) -# p.start() -# p.join(timeout) - -# if p.is_alive(): -# p.terminate() -# p.join() -# print(f"⏱️ Timeout exceeded ({timeout} sec) — function killed.") -# return False, None -# else: -# result = q.get() -# if isinstance(result, Exception): -# raise result -# return True, result -# def run_with_timeout(func, args=(), kwargs={}, timeout=30): -# import concurrent.futures -# with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor: -# future = executor.submit(func, *args, **kwargs) -# try: -# return True, future.result(timeout=timeout) -# except concurrent.futures.TimeoutError: -# print(f"⏱️ Timeout exceeded ({timeout} sec) — function killed.") -# return False, None - -import multiprocessing - -def run_with_timeout(func, args=(), kwargs={}, timeout=30): - def wrapper(q, *args, **kwargs): - try: - result = func(*args, **kwargs) - q.put((True, result)) - except Exception as e: - q.put((False, e)) - - q = multiprocessing.Queue() - p = multiprocessing.Process(target=wrapper, args=(q, *args), kwargs=kwargs) - p.start() - p.join(timeout) - - if p.is_alive(): - p.terminate() - p.join() - print(f"⏱️ Timeout exceeded ({timeout} sec) — function killed.") - return False, None - - if not q.empty(): - success, result = q.get() - if success: - return True, result - else: - raise result # re-raise exception if needed - - return False, None - - - -def time_it(func, *args, **kwargs): - """ - Measure how long a function takes to run and return its result + time. - """ - start = time.time() - result = func(*args, **kwargs) - end = time.time() - elapsed = end - start - print(f"⏱️ '{func.__name__}' took {elapsed:.3f} seconds") - return result, elapsed -# --- Define Pricing Constants (for Gemini 1.5 Flash & text-embedding-004) --- - -def unique_preserve_order(seq): - seen = set() - return [x for x in seq if not (x in seen or seen.add(x))] -# Main execution -def pipeline_with_gemini(accessions,stop_flag=None, niche_cases=None, save_df=None): - # output: country, sample_type, ethnic, location, money_cost, time_cost, explain - # there can be one accession number in the accessions - # Prices are per 1,000 tokens - # Before each big step: - if stop_flag is not None and stop_flag.value: - print(f"🛑 Stop detected before starting {accession}, aborting early...") - return {} - # PRICE_PER_1K_INPUT_LLM = 0.000075 # $0.075 per 1M tokens - # PRICE_PER_1K_OUTPUT_LLM = 0.0003 # $0.30 per 1M tokens - # PRICE_PER_1K_EMBEDDING_INPUT = 0.000025 # $0.025 per 1M tokens - # Gemini 2.5 Flash-Lite pricing per 1,000 tokens - PRICE_PER_1K_INPUT_LLM = 0.00010 # $0.10 per 1M input tokens - PRICE_PER_1K_OUTPUT_LLM = 0.00040 # $0.40 per 1M output tokens - - # Embedding-001 pricing per 1,000 input tokens - PRICE_PER_1K_EMBEDDING_INPUT = 0.00015 # $0.15 per 1M input tokens - if not accessions: - print("no input") - return None - else: - accs_output = {} - #genai.configure(api_key=os.getenv("GOOGLE_API_KEY")) - genai.configure(api_key=os.getenv("GOOGLE_API_KEY_BACKUP")) - for acc in accessions: - print("start gemini: ", acc) - start = time.time() - total_cost_title = 0 - jsonSM, links, article_text = {},[], "" - acc_score = { "isolate": "", - "country":{}, - "sample_type":{}, - #"specific_location":{}, - #"ethnicity":{}, - "query_cost":total_cost_title, - 
"time_cost":None, - "source":links, - "file_chunk":"", - "file_all_output":""} - if niche_cases: - for niche in niche_cases: - acc_score[niche] = {} - - meta = mtdna_classifier.fetch_ncbi_metadata(acc) - country, spe_loc, ethnic, sample_type, col_date, iso, title, doi, pudID, features = meta["country"], meta["specific_location"], meta["ethnicity"], meta["sample_type"], meta["collection_date"], meta["isolate"], meta["title"], meta["doi"], meta["pubmed_id"], meta["all_features"] - acc_score["isolate"] = iso - print("meta: ",meta) - meta_expand = smart_fallback.fetch_ncbi(acc) - print("meta expand: ", meta_expand) - # set up step: create the folder to save document - chunk, all_output = "","" - if pudID: - id = str(pudID) - saveTitle = title - else: - try: - author_name = meta_expand["authors"].split(',')[0] # Use last name only - except: - author_name = meta_expand["authors"] - saveTitle = title + "_" + col_date + "_" + author_name - if title.lower() == "unknown" and col_date.lower()=="unknown" and author_name.lower() == "unknown": - saveTitle += "_" + acc - id = "DirectSubmission" - # folder_path = Path("/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id)) - # if not folder_path.exists(): - # cmd = f'mkdir /content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/{id}' - # result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) - # print("data/"+str(id) +" created.") - # else: - # print("data/"+str(id) +" already exists.") - # saveLinkFolder = "/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id) - # parent_folder_id = get_or_create_drive_folder(GDRIVE_PARENT_FOLDER_NAME) - # data_folder_id = get_or_create_drive_folder(GDRIVE_DATA_FOLDER_NAME, parent_id=parent_folder_id) - # sample_folder_id = get_or_create_drive_folder(str(id), parent_id=data_folder_id) - data_folder_id = GDRIVE_DATA_FOLDER_NAME # Use the shared folder directly - sample_folder_id = get_or_create_drive_folder(str(id), parent_id=data_folder_id) - print("sample folder id: ", sample_folder_id) - - # Define document names - if len(saveTitle) > 50: - saveName = saveTitle[:50] - saveName = saveName.replace(" ", "_") - chunk_filename = f"{saveName}_merged_document.docx" - all_filename = f"{saveName}_all_merged_document.docx" - else: - saveName = saveTitle.replace(" ", "_") - chunk_filename = f"{saveName}_merged_document.docx" - all_filename = f"{saveName}_all_merged_document.docx" - print("chunk file name and all filename: ", chunk_filename, all_filename) - # Define local temp paths for reading/writing - # import tempfile - # tmp_dir = tempfile.mkdtemp() - LOCAL_TEMP_DIR = "/mnt/data/generated_docs" - os.makedirs(LOCAL_TEMP_DIR, exist_ok=True) - file_chunk_path = os.path.join(LOCAL_TEMP_DIR, chunk_filename) - file_all_path = os.path.join(LOCAL_TEMP_DIR, all_filename) - # file_chunk_path = os.path.join(tempfile.gettempdir(), chunk_filename) - # file_all_path = os.path.join(tempfile.gettempdir(), all_filename) - if stop_flag is not None and stop_flag.value: - print(f"🛑 Stop processing {accession}, aborting early...") - return {} - print("this is file chunk path: ", file_chunk_path) - chunk_id = find_drive_file(chunk_filename, sample_folder_id) - all_id = find_drive_file(all_filename, sample_folder_id) - - if chunk_id and all_id: - print("✅ Files already exist in Google Drive. 
Downloading them...") - chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path) - all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path) - acc_score["file_chunk"] = str(chunk_filename) - acc_score["file_all_output"] = str(all_filename) - print("chunk_id and all_id: ") - print(chunk_id, all_id) - print("file chunk and all output saved in acc score: ", acc_score["file_chunk"], acc_score["file_all_output"]) - file = drive_service.files().get(fileId="1LUJRTrq8yt4S4lLwCvTmlxaKqpr0nvEn", fields="id, name, parents, webViewLink").execute() - print("📄 Name:", file["name"]) - print("📁 Parent folder ID:", file["parents"][0]) - print("🔗 View link:", file["webViewLink"]) - - - # Read and parse these into `chunk` and `all_output` - else: - # 🔥 Remove any stale local copies - if os.path.exists(file_chunk_path): - os.remove(file_chunk_path) - print(f"🗑️ Removed stale: {file_chunk_path}") - if os.path.exists(file_all_path): - os.remove(file_all_path) - print(f"🗑️ Removed stale: {file_all_path}") - # 🔥 Remove the local file first if it exists - # if os.path.exists(file_chunk_path): - # os.remove(file_chunk_path) - # print("remove chunk path") - # if os.path.exists(file_all_path): - # os.remove(file_all_path) - # print("remove all path") - # Try to download if already exists on Drive - chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path) - all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path) - print("chunk exist: ", chunk_exists) - # first way: ncbi method - print("country.lower: ",country.lower()) - if country.lower() != "unknown": - stand_country = standardize_location.smart_country_lookup(country.lower()) - print("stand_country: ", stand_country) - if stand_country.lower() != "not found": - acc_score["country"][stand_country.lower()] = ["ncbi"] - else: acc_score["country"][country.lower()] = ["ncbi"] - # if spe_loc.lower() != "unknown": - # acc_score["specific_location"][spe_loc.lower()] = ["ncbi"] - # if ethnic.lower() != "unknown": - # acc_score["ethnicity"][ethnic.lower()] = ["ncbi"] - if sample_type.lower() != "unknown": - acc_score["sample_type"][sample_type.lower()] = ["ncbi"] - # second way: LLM model - # Preprocess the input token - print(acc_score) - accession, isolate = None, None - if acc != "unknown": accession = acc - if iso != "unknown": isolate = iso - if stop_flag is not None and stop_flag.value: - print(f"🛑 Stop processing {accession}, aborting early...") - return {} - # check doi first - print("chunk filename: ", chunk_filename) - if chunk_exists: - print("File chunk exists!") - if not chunk: - print("start to get chunk") - text, table, document_title = model.read_docx_text(file_chunk_path) - chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table)) - if str(chunk_filename) != "": - print("first time have chunk path at chunk exist: ", str(chunk_filename)) - acc_score["file_chunk"] = str(chunk_filename) - if all_exists: - print("File all output exists!") - if not all_output: - text_all, table_all, document_title_all = model.read_docx_text(file_all_path) - all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". 
".join(table_all)) - if str(all_filename) != "": - print("first time have all path at all exist: ", str(all_filename)) - acc_score["file_all_output"] = str(all_filename) - print("acc sscore for file all output and chunk: ", acc_score["file_all_output"], acc_score["file_chunk"]) - if len(acc_score["file_all_output"]) == 0 and len(acc_score["file_chunk"]) == 0: - if doi != "unknown": - link = 'https://doi.org/' + doi - # get the file to create listOfFile for each id - print("link of doi: ", link) - html = extractHTML.HTML("",link) - jsonSM = html.getSupMaterial() - article_text = html.getListSection() - if article_text: - if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower(): - links.append(link) - if jsonSM: - links += sum((jsonSM[key] for key in jsonSM),[]) - # no doi then google custom search api - if doi=="unknown" or len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower(): - # might find the article - print("no article text, start tem link") - #tem_links = mtdna_classifier.search_google_custom(title, 2) - tem_links = smart_fallback.smart_google_search(meta_expand) - print("tem links: ", tem_links) - tem_link_acc = smart_fallback.google_accession_search(acc) - tem_links += tem_link_acc - tem_links = unique_preserve_order(tem_links) - print("tem link before filtering: ", tem_links) - # filter the quality link - print("saveLinkFolder as sample folder id: ", sample_folder_id) - print("start the smart filter link") - if stop_flag is not None and stop_flag.value: - print(f"🛑 Stop processing {accession}, aborting early...") - return {} - # success_process, output_process = run_with_timeout(smart_fallback.filter_links_by_metadata,args=(tem_links,sample_folder_id),kwargs={"accession":acc}) - # if success_process: - # links = output_process - # print("yes succeed for smart filter link") - # else: - # print("no suceed, fallback to all tem links") - # links = tem_links - links = smart_fallback.filter_links_by_metadata(tem_links, saveLinkFolder=sample_folder_id, accession=acc, stop_flag=stop_flag) - print("this is links: ",links) - links = unique_preserve_order(links) - acc_score["source"] = links - else: - print("inside the try of reusing chunk or all output") - #print("chunk filename: ", str(chunks_filename)) - - try: - temp_source = False - if save_df is not None and not save_df.empty: - print("save df not none") - print("chunk file name: ",str(chunk_filename)) - print("all filename: ",str(all_filename)) - if acc_score["file_chunk"]: - link = save_df.loc[save_df["file_chunk"]==acc_score["file_chunk"],"Sources"].iloc[0] - #link = row["Sources"].iloc[0] - if "http" in link: - print("yeah http in save df source") - acc_score["source"] = [x for x in link.split("\n") if x.strip()]#row["Sources"].tolist() - else: # temporary - print("tempo source") - #acc_score["source"] = [str(all_filename), str(chunks_filename)] - temp_source = True - elif acc_score["file_all_output"]: - link = save_df.loc[save_df["file_all_output"]==acc_score["file_all_output"],"Sources"].iloc[0] - #link = row["Sources"].iloc[0] - print(link) - print("list of link") - print([x for x in link.split("\n") if x.strip()]) - if "http" in link: - print("yeah http in save df source") - acc_score["source"] = [x for x in link.split("\n") if x.strip()]#row["Sources"].tolist() - else: # temporary - print("tempo source") 
- #acc_score["source"] = [str(all_filename), str(chunks_filename)] - temp_source = True - else: # temporary - print("tempo source") - #acc_score["source"] = [str(file_all_path), str(file_chunk_path)] - temp_source = True - else: # temporary - print("tempo source") - #acc_score["source"] = [str(file_all_path), str(file_chunk_path)] - temp_source = True - if temp_source: - print("temp source is true so have to try again search link") - if doi != "unknown": - link = 'https://doi.org/' + doi - # get the file to create listOfFile for each id - print("link of doi: ", link) - html = extractHTML.HTML("",link) - jsonSM = html.getSupMaterial() - article_text = html.getListSection() - if article_text: - if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower(): - links.append(link) - if jsonSM: - links += sum((jsonSM[key] for key in jsonSM),[]) - # no doi then google custom search api - if doi=="unknown" or len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower(): - # might find the article - print("no article text, start tem link") - #tem_links = mtdna_classifier.search_google_custom(title, 2) - tem_links = smart_fallback.smart_google_search(meta_expand) - print("tem links: ", tem_links) - tem_link_acc = smart_fallback.google_accession_search(acc) - tem_links += tem_link_acc - tem_links = unique_preserve_order(tem_links) - print("tem link before filtering: ", tem_links) - # filter the quality link - print("saveLinkFolder as sample folder id: ", sample_folder_id) - print("start the smart filter link") - if stop_flag is not None and stop_flag.value: - print(f"🛑 Stop processing {accession}, aborting early...") - return {} - # success_process, output_process = run_with_timeout(smart_fallback.filter_links_by_metadata,args=(tem_links,sample_folder_id),kwargs={"accession":acc}) - # if success_process: - # links = output_process - # print("yes succeed for smart filter link") - # else: - # print("no suceed, fallback to all tem links") - # links = tem_links - links = smart_fallback.filter_links_by_metadata(tem_links, saveLinkFolder=sample_folder_id, accession=acc, stop_flag=stop_flag) - print("this is links: ",links) - links = unique_preserve_order(links) - acc_score["source"] = links - except: - print("except for source") - acc_score["source"] = [] - # chunk_path = "/"+saveTitle+"_merged_document.docx" - # all_path = "/"+saveTitle+"_all_merged_document.docx" - # # if chunk and all output not exist yet - # file_chunk_path = saveLinkFolder + chunk_path - # file_all_path = saveLinkFolder + all_path - # if os.path.exists(file_chunk_path): - # print("File chunk exists!") - # if not chunk: - # text, table, document_title = model.read_docx_text(file_chunk_path) - # chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table)) - # if os.path.exists(file_all_path): - # print("File all output exists!") - # if not all_output: - # text_all, table_all, document_title_all = model.read_docx_text(file_all_path) - # all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". 
".join(table_all)) - if stop_flag is not None and stop_flag.value: - print(f"🛑 Stop processing {accession}, aborting early...") - return {} - # print("chunk filename: ", chunk_filename) - # if chunk_exists: - # print("File chunk exists!") - # if not chunk: - # print("start to get chunk") - # text, table, document_title = model.read_docx_text(file_chunk_path) - # chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table)) - # if str(chunk_filename) != "": - # print("first time have chunk path at chunk exist: ", str(chunk_filename)) - # acc_score["file_chunk"] = str(chunk_filename) - # if all_exists: - # print("File all output exists!") - # if not all_output: - # text_all, table_all, document_title_all = model.read_docx_text(file_all_path) - # all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all)) - # if str(all_filename) != "": - # print("first time have all path at all exist: ", str(all_filename)) - # acc_score["file_all_output"] = str(all_filename) - if not chunk and not all_output: - print("not chunk and all output") - # else: check if we can reuse these chunk and all output of existed accession to find another - if str(chunk_filename) != "": - print("first time have chunk path: ", str(chunk_filename)) - acc_score["file_chunk"] = str(chunk_filename) - if str(all_filename) != "": - print("first time have all path: ", str(all_filename)) - acc_score["file_all_output"] = str(all_filename) - if links: - for link in links: - print(link) - # if len(all_output) > 1000*1000: - # all_output = data_preprocess.normalize_for_overlap(all_output) - # print("after normalizing all output: ", len(all_output)) - if len(data_preprocess.normalize_for_overlap(all_output)) > 600000: - print("break here") - break - if iso != "unknown": query_kw = iso - else: query_kw = acc - #text_link, tables_link, final_input_link = data_preprocess.preprocess_document(link,saveLinkFolder, isolate=query_kw) - success_process, output_process = run_with_timeout(data_preprocess.preprocess_document,args=(link,sample_folder_id),kwargs={"isolate":query_kw,"accession":acc},timeout=100) - if stop_flag is not None and stop_flag.value: - print(f"🛑 Stop processing {accession}, aborting early...") - return {} - if success_process: - text_link, tables_link, final_input_link = output_process[0], output_process[1], output_process[2] - print("yes succeed for process document") - else: text_link, tables_link, final_input_link = "", "", "" - context = data_preprocess.extract_context(final_input_link, query_kw) - if context != "Sample ID not found.": - if len(data_preprocess.normalize_for_overlap(chunk)) < 1000*1000: - success_chunk, the_output_chunk = run_with_timeout(data_preprocess.merge_texts_skipping_overlap,args=(chunk, context)) - if stop_flag is not None and stop_flag.value: - print(f"🛑 Stop processing {accession}, aborting early...") - return {} - if success_chunk: - chunk = the_output_chunk#data_preprocess.merge_texts_skipping_overlap(all_output, final_input_link) - print("yes succeed for chunk") - else: - chunk += context - print("len context: ", len(context)) - print("basic fall back") - print("len chunk after: ", len(chunk)) - if len(final_input_link) > 1000*1000: - if context != "Sample ID not found.": - final_input_link = context - else: - final_input_link = data_preprocess.normalize_for_overlap(final_input_link) - if len(final_input_link) > 1000 *1000: - final_input_link = final_input_link[:100000] - 
if len(data_preprocess.normalize_for_overlap(all_output)) < int(100000) and len(final_input_link)<100000: - print("Running merge_texts_skipping_overlap with timeout") - success, the_output = run_with_timeout(data_preprocess.merge_texts_skipping_overlap,args=(all_output, final_input_link),timeout=30) - if stop_flag is not None and stop_flag.value: - print(f"🛑 Stop processing {accession}, aborting early...") - return {} - print("Returned from timeout logic") - if success: - all_output = the_output#data_preprocess.merge_texts_skipping_overlap(all_output, final_input_link) - print("yes succeed") - else: - print("len all output: ", len(all_output)) - print("len final input link: ", len(final_input_link)) - all_output += final_input_link - print("len final input: ", len(final_input_link)) - print("basic fall back") - else: - print("both/either all output or final link too large more than 100000") - print("len all output: ", len(all_output)) - print("len final input link: ", len(final_input_link)) - all_output += final_input_link - print("len final input: ", len(final_input_link)) - print("basic fall back") - print("len all output after: ", len(all_output)) - #country_pro, chunk, all_output = data_preprocess.process_inputToken(links, saveLinkFolder, accession=accession, isolate=isolate) - if stop_flag is not None and stop_flag.value: - print(f"🛑 Stop processing {accession}, aborting early...") - return {} - else: - chunk = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features - all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features - if not chunk: chunk = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features - if not all_output: all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". 
Features: " + features - if len(all_output) > 1*1024*1024: - all_output = data_preprocess.normalize_for_overlap(all_output) - if len(all_output) > 1*1024*1024: - all_output = all_output[:1*1024*1024] - print("chunk len: ", len(chunk)) - print("all output len: ", len(all_output)) - data_preprocess.save_text_to_docx(chunk, file_chunk_path) - data_preprocess.save_text_to_docx(all_output, file_all_path) - # Later when saving new files - # data_preprocess.save_text_to_docx(chunk, chunk_filename, sample_folder_id) - # data_preprocess.save_text_to_docx(all_output, all_filename, sample_folder_id) - - # Upload to Drive - result_chunk_upload = upload_file_to_drive(file_chunk_path, chunk_filename, sample_folder_id) - result_all_upload = upload_file_to_drive(file_all_path, all_filename, sample_folder_id) - print("UPLOAD RESULT FOR CHUNK: ", result_chunk_upload) - print(f"🔗 Uploaded file: https://drive.google.com/file/d/{result_chunk_upload}/view") - print("here 1") - - # else: - # final_input = "" - # if all_output: - # final_input = all_output - # else: - # if chunk: final_input = chunk - # #data_preprocess.merge_texts_skipping_overlap(final_input, all_output) - # if final_input: - # keywords = [] - # if iso != "unknown": keywords.append(iso) - # if acc != "unknown": keywords.append(acc) - # for keyword in keywords: - # chunkBFS = data_preprocess.get_contextual_sentences_BFS(final_input, keyword) - # countryDFS, chunkDFS = data_preprocess.get_contextual_sentences_DFS(final_input, keyword) - # chunk = data_preprocess.merge_texts_skipping_overlap(chunk, chunkDFS) - # chunk = data_preprocess.merge_texts_skipping_overlap(chunk, chunkBFS) - - # Define paths for cached RAG assets - # faiss_index_path = saveLinkFolder+"/faiss_index.bin" - # document_chunks_path = saveLinkFolder+"/document_chunks.json" - # structured_lookup_path = saveLinkFolder+"/structured_lookup.json" - print("here 2") - faiss_filename = "faiss_index.bin" - chunks_filename = "document_chunks.json" - lookup_filename = "structured_lookup.json" - print("name of faiss: ", faiss_filename) - - faiss_index_path = os.path.join(LOCAL_TEMP_DIR, faiss_filename) - document_chunks_path = os.path.join(LOCAL_TEMP_DIR, chunks_filename) - structured_lookup_path = os.path.join(LOCAL_TEMP_DIR, lookup_filename) - print("name if faiss path: ", faiss_index_path) - # 🔥 Remove the local file first if it exists - print("start faiss id and also the sample folder id is: ", sample_folder_id) - faiss_id = find_drive_file(faiss_filename, sample_folder_id) - print("done faiss id") - document_id = find_drive_file(chunks_filename, sample_folder_id) - structure_id = find_drive_file(lookup_filename, sample_folder_id) - if faiss_id and document_id and structure_id: - print("✅ 3 Files already exist in Google Drive. 
Downloading them...") - download_file_from_drive(faiss_filename, sample_folder_id, faiss_index_path) - download_file_from_drive(chunks_filename, sample_folder_id, document_chunks_path) - download_file_from_drive(lookup_filename, sample_folder_id, structured_lookup_path) - # Read and parse these into `chunk` and `all_output` - else: - "one of id not exist" - if os.path.exists(faiss_index_path): - print("faiss index exist and start to remove: ", faiss_index_path) - os.remove(faiss_index_path) - if os.path.exists(document_chunks_path): - os.remove(document_chunks_path) - if os.path.exists(structured_lookup_path): - os.remove(structured_lookup_path) - print("start to download the faiss, chunk, lookup") - - download_file_from_drive(faiss_filename, sample_folder_id, faiss_index_path) - download_file_from_drive(chunks_filename, sample_folder_id, document_chunks_path) - download_file_from_drive(lookup_filename, sample_folder_id, structured_lookup_path) - try: - print("try gemini 2.5") - print("move to load rag") - master_structured_lookup, faiss_index, document_chunks = model.load_rag_assets( - faiss_index_path, document_chunks_path, structured_lookup_path - ) - - global_llm_model_for_counting_tokens = genai.GenerativeModel('gemini-1.5-flash-latest') - if not all_output: - if chunk: all_output = chunk - else: all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features - if faiss_index is None: - print("\nBuilding RAG assets (structured lookup, FAISS index, chunks)...") - total_doc_embedding_tokens = global_llm_model_for_counting_tokens.count_tokens( - all_output - ).total_tokens - - initial_embedding_cost = (total_doc_embedding_tokens / 1000) * PRICE_PER_1K_EMBEDDING_INPUT - total_cost_title += initial_embedding_cost - print(f"Initial one-time embedding cost for '{file_all_path}' ({total_doc_embedding_tokens} tokens): ${initial_embedding_cost:.6f}") - - - master_structured_lookup, faiss_index, document_chunks, plain_text_content = model.build_vector_index_and_data( - file_all_path, faiss_index_path, document_chunks_path, structured_lookup_path - ) - else: - print("\nRAG assets loaded from file. No re-embedding of entire document will occur.") - plain_text_content_all, table_strings_all, document_title_all = model.read_docx_text(file_all_path) - master_structured_lookup['document_title'] = master_structured_lookup.get('document_title', document_title_all) - if stop_flag is not None and stop_flag.value: - print(f"🛑 Stop processing {accession}, aborting early...") - return {} - primary_word = iso - alternative_word = acc - print(f"\n--- General Query: Primary='{primary_word}' (Alternative='{alternative_word}') ---") - if features.lower() not in all_output.lower(): - all_output += ". 
NCBI Features: " + features - # country, sample_type, method_used, ethnic, spe_loc, total_query_cost = model.query_document_info( - # primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks, - # model.call_llm_api, chunk=chunk, all_output=all_output) - print("this is chunk for the model") - print(chunk) - print("this is all output for the model") - print(all_output) - if stop_flag is not None and stop_flag.value: - print(f"🛑 Stop processing {accession}, aborting early...") - return {} - country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost = model.query_document_info( - primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks, - model.call_llm_api, chunk=chunk, all_output=all_output) - print("pass query of 2.5") - except: - print("try gemini 1.5") - country, sample_type, ethnic, spe_loc, method_used, country_explanation, sample_type_explanation, ethnicity_explanation, specific_loc_explanation, total_query_cost = model.query_document_info( - primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks, - model.call_llm_api, chunk=chunk, all_output=all_output, model_ai="gemini-1.5-flash-latest") - print("yeah pass the query of 1.5") - print("country using ai: ", country) - print("sample type using ai: ", sample_type) - # if len(country) == 0: country = "unknown" - # if len(sample_type) == 0: sample_type = "unknown" - # if country_explanation: country_explanation = "-"+country_explanation - # else: country_explanation = "" - # if sample_type_explanation: sample_type_explanation = "-"+sample_type_explanation - # else: sample_type_explanation = "" - if len(country) == 0: country = "unknown" - if len(sample_type) == 0: sample_type = "unknown" - if country_explanation and country_explanation!="unknown": country_explanation = "-"+country_explanation - else: country_explanation = "" - if sample_type_explanation and sample_type_explanation!="unknown": sample_type_explanation = "-"+sample_type_explanation - else: sample_type_explanation = "" - - if method_used == "unknown": method_used = "" - if country.lower() != "unknown": - stand_country = standardize_location.smart_country_lookup(country.lower()) - if stand_country.lower() != "not found": - if stand_country.lower() in acc_score["country"]: - if country_explanation: - acc_score["country"][stand_country.lower()].append(method_used + country_explanation) - else: - acc_score["country"][stand_country.lower()] = [method_used + country_explanation] - else: - if country.lower() in acc_score["country"]: - if country_explanation: - if len(method_used + country_explanation) > 0: - acc_score["country"][country.lower()].append(method_used + country_explanation) - else: - if len(method_used + country_explanation) > 0: - acc_score["country"][country.lower()] = [method_used + country_explanation] - # if spe_loc.lower() != "unknown": - # if spe_loc.lower() in acc_score["specific_location"]: - # acc_score["specific_location"][spe_loc.lower()].append(method_used) - # else: - # acc_score["specific_location"][spe_loc.lower()] = [method_used] - # if ethnic.lower() != "unknown": - # if ethnic.lower() in acc_score["ethnicity"]: - # acc_score["ethnicity"][ethnic.lower()].append(method_used) - # else: - # acc_score["ethnicity"][ethnic.lower()] = [method_used] - if sample_type.lower() != "unknown": - if sample_type.lower() in acc_score["sample_type"]: - if len(method_used + sample_type_explanation) > 0: - 
acc_score["sample_type"][sample_type.lower()].append(method_used + sample_type_explanation) - else: - if len(method_used + sample_type_explanation)> 0: - acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation] - total_cost_title += total_query_cost - if stop_flag is not None and stop_flag.value: - print(f"🛑 Stop processing {accession}, aborting early...") - return {} - # last resort: combine all information to give all output otherwise unknown - if len(acc_score["country"]) == 0 or len(acc_score["sample_type"]) == 0 or acc_score["country"] == "unknown" or acc_score["sample_type"] == "unknown": - text = "" - for key in meta_expand: - text += str(key) + ": " + meta_expand[key] + "\n" - if len(data_preprocess.normalize_for_overlap(all_output)) > 0: - text += data_preprocess.normalize_for_overlap(all_output) - if len(data_preprocess.normalize_for_overlap(chunk)) > 0: - text += data_preprocess.normalize_for_overlap(chunk) - text += ". NCBI Features: " + features - print("this is text for the last resort model") - print(text) - country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost = model.query_document_info( - primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks, - model.call_llm_api, chunk=text, all_output=text) - print("this is last resort results: ") - print("country: ", country) - print("sample type: ", sample_type) - if len(country) == 0: country = "unknown" - if len(sample_type) == 0: sample_type = "unknown" - # if country_explanation: country_explanation = "-"+country_explanation - # else: country_explanation = "" - # if sample_type_explanation: sample_type_explanation = "-"+sample_type_explanation - # else: sample_type_explanation = "" - if country_explanation and country_explanation!="unknown": country_explanation = "-"+country_explanation - else: country_explanation = "" - if sample_type_explanation and sample_type_explanation!="unknown": sample_type_explanation = "-"+sample_type_explanation - else: sample_type_explanation = "" - - if method_used == "unknown": method_used = "" - if country.lower() != "unknown": - stand_country = standardize_location.smart_country_lookup(country.lower()) - if stand_country.lower() != "not found": - if stand_country.lower() in acc_score["country"]: - if country_explanation: - acc_score["country"][stand_country.lower()].append(method_used + country_explanation) - else: - acc_score["country"][stand_country.lower()] = [method_used + country_explanation] - else: - if country.lower() in acc_score["country"]: - if country_explanation: - if len(method_used + country_explanation) > 0: - acc_score["country"][country.lower()].append(method_used + country_explanation) - else: - if len(method_used + country_explanation) > 0: - acc_score["country"][country.lower()] = [method_used + country_explanation] - if sample_type.lower() != "unknown": - if sample_type.lower() in acc_score["sample_type"]: - if len(method_used + sample_type_explanation) > 0: - acc_score["sample_type"][sample_type.lower()].append(method_used + sample_type_explanation) - else: - if len(method_used + sample_type_explanation)> 0: - acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation] - total_cost_title += total_query_cost - end = time.time() - #total_cost_title += total_query_cost - acc_score["query_cost"] = f"{total_cost_title:.6f}" - elapsed = end - start - acc_score["time_cost"] = f"{elapsed:.3f} seconds" - accs_output[acc] = acc_score - 
print(accs_output[acc]) - +# test1: MJ17 direct +# test2: "A1YU101" thailand cross-ref +# test3: "EBK109" thailand cross-ref +# test4: "OQ731952"/"BST115" for search query title: "South Asian maternal and paternal lineages in southern Thailand and" +import data_preprocess +import model +import mtdna_classifier +#import app +import smart_fallback +import pandas as pd +from pathlib import Path +import subprocess +from NER.html import extractHTML +import os +import google.generativeai as genai +import re +import standardize_location +# Helper functions in for this pipeline +# Track time +import time +import multiprocessing +import gspread +from googleapiclient.discovery import build +from googleapiclient.http import MediaFileUpload, MediaIoBaseDownload +from google.oauth2.service_account import Credentials +from oauth2client.service_account import ServiceAccountCredentials +import io +import json +#––– Authentication setup ––– +GDRIVE_PARENT_FOLDER_NAME = "mtDNA-Location-Classifier" +GDRIVE_DATA_FOLDER_NAME = os.environ["GDRIVE_DATA_FOLDER_NAME"] +GCP_CREDS_DICT = json.loads(os.environ["GCP_CREDS_JSON"]) # from HF secrets +GDRIVE_CREDS = Credentials.from_service_account_info(GCP_CREDS_DICT, scopes=["https://www.googleapis.com/auth/drive"]) +drive_service = build("drive", "v3", credentials=GDRIVE_CREDS) + +def get_or_create_drive_folder(name, parent_id=None): + query = f"name='{name}' and mimeType='application/vnd.google-apps.folder'" + if parent_id: + query += f" and '{parent_id}' in parents" + results = drive_service.files().list(q=query, spaces='drive', fields="files(id, name)").execute() + items = results.get("files", []) + if items: + return items[0]["id"] + file_metadata = { + "name": name, + "mimeType": "application/vnd.google-apps.folder" + } + if parent_id: + file_metadata["parents"] = [parent_id] + file = drive_service.files().create(body=file_metadata, fields="id").execute() + return file["id"] +# def find_drive_file(filename, parent_id): +# """ +# Checks if a file with the given name exists inside the specified Google Drive folder. +# Returns the file ID if found, else None. +# """ +# query = f"'{parent_id}' in parents and name = '{filename}' and trashed = false" +# results = drive_service.files().list(q=query, spaces='drive', fields='files(id, name)', pageSize=1).execute() +# files = results.get('files', []) +# if files: +# return files[0]["id"] +# return None + +def find_drive_file(filename, parent_id): + """ + Checks if a file with the given name exists inside the specified Google Drive folder. + Returns the file ID if found, else None. 
+ """ + try: + print(f"🔍 Searching for '{filename}' in folder: {parent_id}") + query = f"'{parent_id}' in parents and name = '{filename}' and trashed = false" + results = drive_service.files().list( + q=query, + spaces='drive', + fields='files(id, name)', + pageSize=1 + ).execute() + files = results.get('files', []) + if files: + print(f"✅ Found file: {files[0]['name']} with ID: {files[0]['id']}") + return files[0]["id"] + else: + print("⚠️ File not found.") + return None + except Exception as e: + print(f"❌ Error during find_drive_file: {e}") + return None + + + +# def upload_file_to_drive(local_path, remote_name, folder_id): +# file_metadata = {"name": remote_name, "parents": [folder_id]} +# media = MediaFileUpload(local_path, resumable=True) +# existing = drive_service.files().list(q=f"name='{remote_name}' and '{folder_id}' in parents", fields="files(id)").execute().get("files", []) +# if existing: +# drive_service.files().delete(fileId=existing[0]["id"]).execute() +# file = drive_service.files().create(body=file_metadata, media_body=media, fields="id").execute() +# result = drive_service.files().list(q=f"name='{remote_name}' and '{folder_id}' in parents", fields="files(id)").execute() +# if not result.get("files"): +# print(f"❌ Upload failed: File '{remote_name}' not found in folder after upload.") +# else: +# print(f"✅ Verified upload: {remote_name}") +# return file["id"] +def upload_file_to_drive(local_path, remote_name, folder_id): + try: + if not os.path.exists(local_path): + raise FileNotFoundError(f"❌ Local file does not exist: {local_path}") + + # Delete existing file on Drive if present + existing = drive_service.files().list( + q=f"name='{remote_name}' and '{folder_id}' in parents and trashed = false", + fields="files(id)" + ).execute().get("files", []) + + if existing: + drive_service.files().delete(fileId=existing[0]["id"]).execute() + print(f"🗑️ Deleted existing '{remote_name}' in Drive folder {folder_id}") + + file_metadata = {"name": remote_name, "parents": [folder_id]} + media = MediaFileUpload(local_path, resumable=True) + file = drive_service.files().create( + body=file_metadata, + media_body=media, + fields="id" + ).execute() + + print(f"✅ Uploaded '{remote_name}' to Google Drive folder ID: {folder_id}") + return file["id"] + + except Exception as e: + print(f"❌ Error during upload: {e}") + return None + + +def download_file_from_drive(remote_name, folder_id, local_path): + results = drive_service.files().list(q=f"name='{remote_name}' and '{folder_id}' in parents", fields="files(id)").execute() + files = results.get("files", []) + if not files: + return False + file_id = files[0]["id"] + request = drive_service.files().get_media(fileId=file_id) + fh = io.FileIO(local_path, 'wb') + downloader = MediaIoBaseDownload(fh, request) + done = False + while not done: + _, done = downloader.next_chunk() + return True +def download_drive_file_content(file_id): + request = drive_service.files().get_media(fileId=file_id) + fh = io.BytesIO() + downloader = MediaIoBaseDownload(fh, request) + done = False + while not done: + _, done = downloader.next_chunk() + fh.seek(0) + return fh.read().decode("utf-8") + +# def run_with_timeout(func, args=(), kwargs={}, timeout=20): +# """ +# Runs `func` with timeout in seconds. Kills if it exceeds. 
+# Returns: (success, result or None) +# """ +# def wrapper(q, *args, **kwargs): +# try: +# q.put(func(*args, **kwargs)) +# except Exception as e: +# q.put(e) + +# q = multiprocessing.Queue() +# p = multiprocessing.Process(target=wrapper, args=(q, *args), kwargs=kwargs) +# p.start() +# p.join(timeout) + +# if p.is_alive(): +# p.terminate() +# p.join() +# print(f"⏱️ Timeout exceeded ({timeout} sec) — function killed.") +# return False, None +# else: +# result = q.get() +# if isinstance(result, Exception): +# raise result +# return True, result +# def run_with_timeout(func, args=(), kwargs={}, timeout=30): +# import concurrent.futures +# with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor: +# future = executor.submit(func, *args, **kwargs) +# try: +# return True, future.result(timeout=timeout) +# except concurrent.futures.TimeoutError: +# print(f"⏱️ Timeout exceeded ({timeout} sec) — function killed.") +# return False, None + +import multiprocessing + +def run_with_timeout(func, args=(), kwargs={}, timeout=30): + def wrapper(q, *args, **kwargs): + try: + result = func(*args, **kwargs) + q.put((True, result)) + except Exception as e: + q.put((False, e)) + + q = multiprocessing.Queue() + p = multiprocessing.Process(target=wrapper, args=(q, *args), kwargs=kwargs) + p.start() + p.join(timeout) + + if p.is_alive(): + p.terminate() + p.join() + print(f"⏱️ Timeout exceeded ({timeout} sec) — function killed.") + return False, None + + if not q.empty(): + success, result = q.get() + if success: + return True, result + else: + raise result # re-raise exception if needed + + return False, None + + + +def time_it(func, *args, **kwargs): + """ + Measure how long a function takes to run and return its result + time. + """ + start = time.time() + result = func(*args, **kwargs) + end = time.time() + elapsed = end - start + print(f"⏱️ '{func.__name__}' took {elapsed:.3f} seconds") + return result, elapsed +# --- Define Pricing Constants (for Gemini 1.5 Flash & text-embedding-004) --- + +def unique_preserve_order(seq): + seen = set() + return [x for x in seq if not (x in seen or seen.add(x))] + +def sanitize_filename(filename, max_length=100): + # Remove characters that are not letters, numbers, spaces, underscores, or hyphens + filename = re.sub(r'[<>:"/\\|?*\n\r\t]', '', filename) + # Replace spaces with underscores + filename = filename.replace(" ", "_") + # Limit length + return filename[:max_length] +# Main execution +def pipeline_with_gemini(accessions,stop_flag=None, niche_cases=None, save_df=None): + # output: country, sample_type, ethnic, location, money_cost, time_cost, explain + # there can be one accession number in the accessions + # Prices are per 1,000 tokens + # Before each big step: + if stop_flag is not None and stop_flag.value: + print(f"🛑 Stop detected before starting {accessions}, aborting early...") + return {} + # PRICE_PER_1K_INPUT_LLM = 0.000075 # $0.075 per 1M tokens + # PRICE_PER_1K_OUTPUT_LLM = 0.0003 # $0.30 per 1M tokens + # PRICE_PER_1K_EMBEDDING_INPUT = 0.000025 # $0.025 per 1M tokens + # Gemini 2.5 Flash-Lite pricing per 1,000 tokens + PRICE_PER_1K_INPUT_LLM = 0.00010 # $0.10 per 1M input tokens + PRICE_PER_1K_OUTPUT_LLM = 0.00040 # $0.40 per 1M output tokens + + # Embedding-001 pricing per 1,000 input tokens + PRICE_PER_1K_EMBEDDING_INPUT = 0.00015 # $0.15 per 1M input tokens + if not accessions: + print("no input") + return None + else: + accs_output = {} + #genai.configure(api_key=os.getenv("GOOGLE_API_KEY")) + 
genai.configure(api_key=os.getenv("GOOGLE_API_KEY_BACKUP")) + for acc in accessions: + print("start gemini: ", acc) + start = time.time() + total_cost_title = 0 + jsonSM, links, article_text = {},[], "" + acc_score = { "isolate": "", + "country":{}, + "sample_type":{}, + #"specific_location":{}, + #"ethnicity":{}, + "query_cost":total_cost_title, + "time_cost":None, + "source":links, + "file_chunk":"", + "file_all_output":""} + if niche_cases: + for niche in niche_cases: + acc_score[niche] = {} + + meta = mtdna_classifier.fetch_ncbi_metadata(acc) + country, spe_loc, ethnic, sample_type, col_date, iso, title, doi, pudID, features = meta["country"], meta["specific_location"], meta["ethnicity"], meta["sample_type"], meta["collection_date"], meta["isolate"], meta["title"], meta["doi"], meta["pubmed_id"], meta["all_features"] + acc_score["isolate"] = iso + print("meta: ",meta) + meta_expand = smart_fallback.fetch_ncbi(acc) + print("meta expand: ", meta_expand) + # set up step: create the folder to save document + chunk, all_output, out_links = "","", {} + if pudID: + id = str(pudID) + saveTitle = title + else: + try: + author_name = meta_expand["authors"].split(',')[0] # Use last name only + except: + author_name = meta_expand["authors"] + saveTitle = title + "_" + col_date + "_" + author_name + if title.lower() == "unknown" and col_date.lower()=="unknown" and author_name.lower() == "unknown": + saveTitle += "_" + acc + id = "DirectSubmission" + # folder_path = Path("/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id)) + # if not folder_path.exists(): + # cmd = f'mkdir /content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/{id}' + # result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + # print("data/"+str(id) +" created.") + # else: + # print("data/"+str(id) +" already exists.") + # saveLinkFolder = "/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id) + # parent_folder_id = get_or_create_drive_folder(GDRIVE_PARENT_FOLDER_NAME) + # data_folder_id = get_or_create_drive_folder(GDRIVE_DATA_FOLDER_NAME, parent_id=parent_folder_id) + # sample_folder_id = get_or_create_drive_folder(str(id), parent_id=data_folder_id) + data_folder_id = GDRIVE_DATA_FOLDER_NAME # Use the shared folder directly + sample_folder_id = get_or_create_drive_folder(str(id), parent_id=data_folder_id) + print("sample folder id: ", sample_folder_id) + + # Define document names + # if len(saveTitle) > 50: + # saveName = saveTitle[:50] + # saveName = saveName.replace(" ", "_") + # chunk_filename = f"{saveName}_merged_document.docx" + # all_filename = f"{saveName}_all_merged_document.docx" + # else: + # saveName = saveTitle.replace(" ", "_") + # chunk_filename = f"{saveName}_merged_document.docx" + # all_filename = f"{saveName}_all_merged_document.docx" + safe_title = sanitize_filename(saveTitle, 50) + chunk_filename = f"{safe_title}_merged_document.docx" + all_filename = f"{safe_title}_all_merged_document.docx" + print("chunk file name and all filename: ", chunk_filename, all_filename) + # Define local temp paths for reading/writing + # import tempfile + # tmp_dir = tempfile.mkdtemp() + LOCAL_TEMP_DIR = "/mnt/data/generated_docs" + os.makedirs(LOCAL_TEMP_DIR, exist_ok=True) + file_chunk_path = os.path.join(LOCAL_TEMP_DIR, chunk_filename) + file_all_path = os.path.join(LOCAL_TEMP_DIR, all_filename) + # file_chunk_path = os.path.join(tempfile.gettempdir(), chunk_filename) + # file_all_path = 
os.path.join(tempfile.gettempdir(), all_filename) + if stop_flag is not None and stop_flag.value: + print(f"🛑 Stop processing {accession}, aborting early...") + return {} + print("this is file chunk path: ", file_chunk_path) + chunk_id = find_drive_file(chunk_filename, sample_folder_id) + all_id = find_drive_file(all_filename, sample_folder_id) + + if chunk_id and all_id: + print("✅ Files already exist in Google Drive. Downloading them...") + chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path) + all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path) + acc_score["file_chunk"] = str(chunk_filename) + acc_score["file_all_output"] = str(all_filename) + print("chunk_id and all_id: ") + print(chunk_id, all_id) + print("file chunk and all output saved in acc score: ", acc_score["file_chunk"], acc_score["file_all_output"]) + file = drive_service.files().get(fileId="1LUJRTrq8yt4S4lLwCvTmlxaKqpr0nvEn", fields="id, name, parents, webViewLink").execute() + print("📄 Name:", file["name"]) + print("📁 Parent folder ID:", file["parents"][0]) + print("🔗 View link:", file["webViewLink"]) + + + # Read and parse these into `chunk` and `all_output` + else: + # 🔥 Remove any stale local copies + if os.path.exists(file_chunk_path): + os.remove(file_chunk_path) + print(f"🗑️ Removed stale: {file_chunk_path}") + if os.path.exists(file_all_path): + os.remove(file_all_path) + print(f"🗑️ Removed stale: {file_all_path}") + # 🔥 Remove the local file first if it exists + # if os.path.exists(file_chunk_path): + # os.remove(file_chunk_path) + # print("remove chunk path") + # if os.path.exists(file_all_path): + # os.remove(file_all_path) + # print("remove all path") + # Try to download if already exists on Drive + chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path) + all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path) + print("chunk exist: ", chunk_exists) + # first way: ncbi method + print("country.lower: ",country.lower()) + if country.lower() != "unknown": + stand_country = standardize_location.smart_country_lookup(country.lower()) + print("stand_country: ", stand_country) + if stand_country.lower() != "not found": + acc_score["country"][stand_country.lower()] = ["ncbi"] + else: acc_score["country"][country.lower()] = ["ncbi"] + # if spe_loc.lower() != "unknown": + # acc_score["specific_location"][spe_loc.lower()] = ["ncbi"] + # if ethnic.lower() != "unknown": + # acc_score["ethnicity"][ethnic.lower()] = ["ncbi"] + if sample_type.lower() != "unknown": + acc_score["sample_type"][sample_type.lower()] = ["ncbi"] + # second way: LLM model + # Preprocess the input token + print(acc_score) + accession, isolate = None, None + if acc != "unknown": accession = acc + if iso != "unknown": isolate = iso + if stop_flag is not None and stop_flag.value: + print(f"🛑 Stop processing {accession}, aborting early...") + return {} + # check doi first + print("chunk filename: ", chunk_filename) + if chunk_exists: + print("File chunk exists!") + if not chunk: + print("start to get chunk") + text, table, document_title = model.read_docx_text(file_chunk_path) + chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". 
".join(table)) + if str(chunk_filename) != "": + print("first time have chunk path at chunk exist: ", str(chunk_filename)) + acc_score["file_chunk"] = str(chunk_filename) + if all_exists: + print("File all output exists!") + if not all_output: + text_all, table_all, document_title_all = model.read_docx_text(file_all_path) + all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all)) + if str(all_filename) != "": + print("first time have all path at all exist: ", str(all_filename)) + acc_score["file_all_output"] = str(all_filename) + print("acc sscore for file all output and chunk: ", acc_score["file_all_output"], acc_score["file_chunk"]) + if len(acc_score["file_all_output"]) == 0 and len(acc_score["file_chunk"]) == 0: + if doi != "unknown": + link = 'https://doi.org/' + doi + # get the file to create listOfFile for each id + print("link of doi: ", link) + html = extractHTML.HTML("",link) + jsonSM = html.getSupMaterial() + article_text = html.getListSection() + if article_text: + if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower(): + out_links[link] = article_text + links.append(link) + if jsonSM: + links += sum((jsonSM[key] for key in jsonSM),[]) + if links: + for l in links: + out_links[l] = "" + # no doi then google custom search api + if doi=="unknown" or len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower(): + # might find the article + print("no article text, start tem link") + #tem_links = mtdna_classifier.search_google_custom(title, 2) + tem_links = smart_fallback.smart_google_search(meta_expand) + print("tem links: ", tem_links) + tem_link_acc = smart_fallback.google_accession_search(acc) + tem_links += tem_link_acc + tem_links = unique_preserve_order(tem_links) + print("tem link before filtering: ", tem_links) + # filter the quality link + print("saveLinkFolder as sample folder id: ", sample_folder_id) + print("start the smart filter link") + if stop_flag is not None and stop_flag.value: + print(f"🛑 Stop processing {accession}, aborting early...") + return {} + # success_process, output_process = run_with_timeout(smart_fallback.filter_links_by_metadata,args=(tem_links,sample_folder_id),kwargs={"accession":acc}) + # if success_process: + # links = output_process + # print("yes succeed for smart filter link") + # else: + # print("no suceed, fallback to all tem links") + # links = tem_links + #links = smart_fallback.filter_links_by_metadata(tem_links, saveLinkFolder=sample_folder_id, accession=acc, stop_flag=stop_flag) + success_process, output_process = run_with_timeout(smart_fallback.filter_links_by_metadata,args=(tem_links,sample_folder_id),kwargs={"accession":acc},timeout=90) + if success_process: + out_links.update(output_process) + print("yeah we have out_link and len: ", len(out_links)) + print("yes succeed for smart filter link") + links += list(out_links.keys()) + print("link keys: ", links) + else: + print("no suceed, fallback to all tem links") + links += tem_links + print("this is links: ",links) + links = unique_preserve_order(links) + acc_score["source"] = links + else: + print("inside the try of reusing chunk or all output") + #print("chunk filename: ", str(chunks_filename)) + + try: + temp_source = False + if save_df is not None and not save_df.empty: + print("save df 
not none") + print("chunk file name: ",str(chunk_filename)) + print("all filename: ",str(all_filename)) + print("acc score for file chunk: ", acc_score["file_chunk"]) + print("acc score for file all output: ", acc_score["file_all_output"]) + if acc_score["file_chunk"]: + link = save_df.loc[save_df["file_chunk"]==acc_score["file_chunk"],"Sources"].iloc[0] + #link = row["Sources"].iloc[0] + if "http" in link: + print("yeah http in save df source") + acc_score["source"] = [x for x in link.split("\n") if x.strip()]#row["Sources"].tolist() + else: # temporary + print("tempo source") + #acc_score["source"] = [str(all_filename), str(chunks_filename)] + temp_source = True + elif acc_score["file_all_output"]: + link = save_df.loc[save_df["file_all_output"]==acc_score["file_all_output"],"Sources"].iloc[0] + #link = row["Sources"].iloc[0] + print(link) + print("list of link") + print([x for x in link.split("\n") if x.strip()]) + if "http" in link: + print("yeah http in save df source") + acc_score["source"] = [x for x in link.split("\n") if x.strip()]#row["Sources"].tolist() + else: # temporary + print("tempo source") + #acc_score["source"] = [str(all_filename), str(chunks_filename)] + temp_source = True + else: # temporary + print("tempo source") + #acc_score["source"] = [str(file_all_path), str(file_chunk_path)] + temp_source = True + else: # temporary + print("tempo source") + #acc_score["source"] = [str(file_all_path), str(file_chunk_path)] + temp_source = True + if temp_source: + print("temp source is true so have to try again search link") + if doi != "unknown": + link = 'https://doi.org/' + doi + # get the file to create listOfFile for each id + print("link of doi: ", link) + html = extractHTML.HTML("",link) + jsonSM = html.getSupMaterial() + article_text = html.getListSection() + if article_text: + if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower(): + out_links[link] = article_text + links.append(link) + if jsonSM: + links += sum((jsonSM[key] for key in jsonSM),[]) + if links: + for l in links: + out_links[l] = "" + # no doi then google custom search api + if doi=="unknown" or len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower(): + # might find the article + print("no article text, start tem link") + #tem_links = mtdna_classifier.search_google_custom(title, 2) + tem_links = smart_fallback.smart_google_search(meta_expand) + print("tem links: ", tem_links) + tem_link_acc = smart_fallback.google_accession_search(acc) + tem_links += tem_link_acc + tem_links = unique_preserve_order(tem_links) + print("tem link before filtering: ", tem_links) + # filter the quality link + print("saveLinkFolder as sample folder id: ", sample_folder_id) + print("start the smart filter link") + if stop_flag is not None and stop_flag.value: + print(f"🛑 Stop processing {accession}, aborting early...") + return {} + # success_process, output_process = run_with_timeout(smart_fallback.filter_links_by_metadata,args=(tem_links,sample_folder_id),kwargs={"accession":acc}) + # if success_process: + # links = output_process + # print("yes succeed for smart filter link") + # else: + # print("no suceed, fallback to all tem links") + # links = tem_links + #links = smart_fallback.filter_links_by_metadata(tem_links, saveLinkFolder=sample_folder_id, accession=acc, stop_flag=stop_flag) + success_process, 
output_process = run_with_timeout(smart_fallback.filter_links_by_metadata,args=(tem_links,sample_folder_id),kwargs={"accession":acc},timeout=90) + if success_process: + out_links.update(output_process) + print("yeah we have out_link and len: ", len(out_links)) + print("yes succeed for smart filter link") + links += list(out_links.keys()) + print("link keys: ", links) + else: + print("no suceed, fallback to all tem links") + links += tem_links + print("this is links: ",links) + links = unique_preserve_order(links) + acc_score["source"] = links + except: + try: + print("in the exception and start to get link") + if doi != "unknown": + link = 'https://doi.org/' + doi + # get the file to create listOfFile for each id + print("link of doi: ", link) + html = extractHTML.HTML("",link) + jsonSM = html.getSupMaterial() + article_text = html.getListSection() + if article_text: + if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower(): + out_links[link] = article_text + links.append(link) + if jsonSM: + links += sum((jsonSM[key] for key in jsonSM),[]) + if links: + for l in links: + out_links[l] = "" + # no doi then google custom search api + if doi=="unknown" or len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower(): + # might find the article + print("no article text, start tem link") + #tem_links = mtdna_classifier.search_google_custom(title, 2) + tem_links = smart_fallback.smart_google_search(meta_expand) + print("tem links: ", tem_links) + tem_link_acc = smart_fallback.google_accession_search(acc) + tem_links += tem_link_acc + tem_links = unique_preserve_order(tem_links) + print("tem link before filtering: ", tem_links) + # filter the quality link + print("saveLinkFolder as sample folder id: ", sample_folder_id) + print("start the smart filter link") + if stop_flag is not None and stop_flag.value: + print(f"🛑 Stop processing {accession}, aborting early...") + return {} + # success_process, output_process = run_with_timeout(smart_fallback.filter_links_by_metadata,args=(tem_links,sample_folder_id),kwargs={"accession":acc}) + # if success_process: + # links = output_process + # print("yes succeed for smart filter link") + # else: + # print("no suceed, fallback to all tem links") + # links = tem_links + #links = smart_fallback.filter_links_by_metadata(tem_links, saveLinkFolder=sample_folder_id, accession=acc, stop_flag=stop_flag) + success_process, output_process = run_with_timeout(smart_fallback.filter_links_by_metadata,args=(tem_links,sample_folder_id),kwargs={"accession":acc},timeout=90) + if success_process: + out_links.update(output_process) + print("yeah we have out_link and len: ", len(out_links)) + print("yes succeed for smart filter link") + links += list(out_links.keys()) + print("link keys: ", links) + else: + print("no suceed, fallback to all tem links") + links += tem_links + print("this is links: ",links) + links = unique_preserve_order(links) + acc_score["source"] = links + except: + print("except of except for source") + acc_score["source"] = [] + # chunk_path = "/"+saveTitle+"_merged_document.docx" + # all_path = "/"+saveTitle+"_all_merged_document.docx" + # # if chunk and all output not exist yet + # file_chunk_path = saveLinkFolder + chunk_path + # file_all_path = saveLinkFolder + all_path + # if os.path.exists(file_chunk_path): + # print("File chunk 
exists!") + # if not chunk: + # text, table, document_title = model.read_docx_text(file_chunk_path) + # chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table)) + # if os.path.exists(file_all_path): + # print("File all output exists!") + # if not all_output: + # text_all, table_all, document_title_all = model.read_docx_text(file_all_path) + # all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all)) + if stop_flag is not None and stop_flag.value: + print(f"🛑 Stop processing {accession}, aborting early...") + return {} + # print("chunk filename: ", chunk_filename) + # if chunk_exists: + # print("File chunk exists!") + # if not chunk: + # print("start to get chunk") + # text, table, document_title = model.read_docx_text(file_chunk_path) + # chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table)) + # if str(chunk_filename) != "": + # print("first time have chunk path at chunk exist: ", str(chunk_filename)) + # acc_score["file_chunk"] = str(chunk_filename) + # if all_exists: + # print("File all output exists!") + # if not all_output: + # text_all, table_all, document_title_all = model.read_docx_text(file_all_path) + # all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all)) + # if str(all_filename) != "": + # print("first time have all path at all exist: ", str(all_filename)) + # acc_score["file_all_output"] = str(all_filename) + if not chunk and not all_output: + print("not chunk and all output") + # else: check if we can reuse these chunk and all output of existed accession to find another + if str(chunk_filename) != "": + print("first time have chunk path: ", str(chunk_filename)) + acc_score["file_chunk"] = str(chunk_filename) + if str(all_filename) != "": + print("first time have all path: ", str(all_filename)) + acc_score["file_all_output"] = str(all_filename) + if links: + for link in links: + print(link) + # if len(all_output) > 1000*1000: + # all_output = data_preprocess.normalize_for_overlap(all_output) + # print("after normalizing all output: ", len(all_output)) + if len(data_preprocess.normalize_for_overlap(all_output)) > 600000: + print("break here") + break + if iso != "unknown": query_kw = iso + else: query_kw = acc + #text_link, tables_link, final_input_link = data_preprocess.preprocess_document(link,saveLinkFolder, isolate=query_kw) + + # success_process, output_process = run_with_timeout(data_preprocess.preprocess_document,args=(link,sample_folder_id),kwargs={"isolate":query_kw,"accession":acc},timeout=100) + # if stop_flag is not None and stop_flag.value: + # print(f"🛑 Stop processing {accession}, aborting early...") + # return {} + # if success_process: + # text_link, tables_link, final_input_link = output_process[0], output_process[1], output_process[2] + # print("yes succeed for process document") + # else: text_link, tables_link, final_input_link = "", "", "" + if out_links: + if link in out_links: + print("yeah art_text available") + art_text = out_links[link] + else: + art_text = None + else: + art_text = None + if art_text: + print("article text already available") + text_link = art_text + else: + try: + print("start preprocess and extract text") + text_link = data_preprocess.extract_text(link, sample_folder_id) + except: text_link = "" + try: + print("extract table start") + success, the_output = 
run_with_timeout(data_preprocess.extract_table,args=(link,sample_folder_id),timeout=10) + print("Returned from timeout logic") + if success: + tables_link = the_output#data_preprocess.merge_texts_skipping_overlap(all_output, final_input_link) + print("yes succeed for extract table") + else: + print("not suceed etxract table") + tables_link = [] + #tables = extract_table(link, saveFolder) + except: tables_link = [] + try: + # print("merge text and table start") + # success, the_output = pipeline.run_with_timeout(merge_text_and_tables,kwargs={"text":text,"tables":tables,"accession_id":accession, "isolate":isolate},timeout=30) + # print("Returned from timeout logic") + # if success: + # final_input = the_output#data_preprocess.merge_texts_skipping_overlap(all_output, final_input_link) + # print("yes succeed") + # else: + # print("not suceed") + print("just merge text and tables") + final_input_link = text_link + ", ".join(tables_link) + #final_input = pipeline.timeout(merge_text_and_tables(text, tables, max_tokens=12000, accession_id=accession, isolate=isolate) + except: + print("no succeed here in preprocess docu") + final_input_link = "" + + context = data_preprocess.extract_context(final_input_link, query_kw) + chunk += context + # if context != "Sample ID not found.": + # if len(data_preprocess.normalize_for_overlap(chunk)) < 1000*1000: + # success_chunk, the_output_chunk = run_with_timeout(data_preprocess.merge_texts_skipping_overlap,args=(chunk, context)) + # if stop_flag is not None and stop_flag.value: + # print(f"🛑 Stop processing {accession}, aborting early...") + # return {} + # if success_chunk: + # chunk = the_output_chunk#data_preprocess.merge_texts_skipping_overlap(all_output, final_input_link) + # print("yes succeed for chunk") + # else: + # chunk += context + # print("len context: ", len(context)) + # print("basic fall back") + # print("len chunk after: ", len(chunk)) + if len(final_input_link) > 1000*1000: + # if context != "Sample ID not found.": + # final_input_link = context + # else: + final_input_link = data_preprocess.normalize_for_overlap(final_input_link) + if len(final_input_link) > 1000 *1000: + final_input_link = final_input_link[:100000] + print("len normalized all output: ", len(data_preprocess.normalize_for_overlap(all_output))) + # if len(data_preprocess.normalize_for_overlap(all_output)) < int(100000) and len(final_input_link)<100000: + # print("Running merge_texts_skipping_overlap with timeout") + # success, the_output = run_with_timeout(data_preprocess.merge_texts_skipping_overlap,args=(all_output, final_input_link),timeout=30) + # if stop_flag is not None and stop_flag.value: + # print(f"🛑 Stop processing {accession}, aborting early...") + # return {} + # print("Returned from timeout logic") + # if success: + # all_output = the_output#data_preprocess.merge_texts_skipping_overlap(all_output, final_input_link) + # print("yes succeed") + # else: + # print("len all output: ", len(all_output)) + # print("len final input link: ", len(final_input_link)) + # all_output += final_input_link + # print("len final input: ", len(final_input_link)) + # print("basic fall back") + # else: + # print("both/either all output or final link too large more than 100000") + # print("len all output: ", len(all_output)) + # print("len final input link: ", len(final_input_link)) + # all_output += final_input_link + # print("len final input: ", len(final_input_link)) + # print("basic fall back") + + print("len all output: ", len(all_output)) + print("len final input link: ", 
len(final_input_link)) + all_output = data_preprocess.normalize_for_overlap(all_output) + final_input_link + print("len final input: ", len(final_input_link)) + print("basic fall back") + print("len all output after: ", len(all_output)) + #country_pro, chunk, all_output = data_preprocess.process_inputToken(links, saveLinkFolder, accession=accession, isolate=isolate) + else: + chunk = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features + all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features + if not chunk: chunk = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features + if not all_output: all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features + if len(all_output) > 1*1000*1000: + all_output = data_preprocess.normalize_for_overlap(all_output) + if len(all_output) > 1*1000*1000: + all_output = all_output[:1000000] + if len(chunk) > 1*1000*1000: + chunk = data_preprocess.normalize_for_overlap(chunk) + if len(chunk) > 1*1000*1000: + chunk = chunk[:1*1000*1000] + print("chunk len: ", len(chunk)) + print("all output len: ", len(all_output)) + data_preprocess.save_text_to_docx(chunk, file_chunk_path) + data_preprocess.save_text_to_docx(all_output, file_all_path) + # Later when saving new files + # data_preprocess.save_text_to_docx(chunk, chunk_filename, sample_folder_id) + # data_preprocess.save_text_to_docx(all_output, all_filename, sample_folder_id) + + # Upload to Drive + result_chunk_upload = upload_file_to_drive(file_chunk_path, chunk_filename, sample_folder_id) + result_all_upload = upload_file_to_drive(file_all_path, all_filename, sample_folder_id) + print("UPLOAD RESULT FOR CHUNK: ", result_chunk_upload) + print(f"🔗 Uploaded file: https://drive.google.com/file/d/{result_chunk_upload}/view") + print("here 1") + + # else: + # final_input = "" + # if all_output: + # final_input = all_output + # else: + # if chunk: final_input = chunk + # #data_preprocess.merge_texts_skipping_overlap(final_input, all_output) + # if final_input: + # keywords = [] + # if iso != "unknown": keywords.append(iso) + # if acc != "unknown": keywords.append(acc) + # for keyword in keywords: + # chunkBFS = data_preprocess.get_contextual_sentences_BFS(final_input, keyword) + # countryDFS, chunkDFS = data_preprocess.get_contextual_sentences_DFS(final_input, keyword) + # chunk = data_preprocess.merge_texts_skipping_overlap(chunk, chunkDFS) + # chunk = data_preprocess.merge_texts_skipping_overlap(chunk, chunkBFS) + + # Define paths for cached RAG assets + # faiss_index_path = saveLinkFolder+"/faiss_index.bin" + # document_chunks_path = saveLinkFolder+"/document_chunks.json" + # structured_lookup_path = saveLinkFolder+"/structured_lookup.json" + print("here 2") + + # faiss_filename = "faiss_index.bin" + # chunks_filename = "document_chunks.json" + # lookup_filename = "structured_lookup.json" + # print("name of faiss: ", faiss_filename) + + # faiss_index_path = os.path.join(LOCAL_TEMP_DIR, faiss_filename) + # document_chunks_path = os.path.join(LOCAL_TEMP_DIR, chunks_filename) + # structured_lookup_path = os.path.join(LOCAL_TEMP_DIR, lookup_filename) + # print("name if faiss path: ", faiss_index_path) + # # 🔥 Remove the local file first if it exists + # print("start faiss id and also the sample folder id is: ", sample_folder_id) + # faiss_id = find_drive_file(faiss_filename, sample_folder_id) + # print("done 
faiss id") + # document_id = find_drive_file(chunks_filename, sample_folder_id) + # structure_id = find_drive_file(lookup_filename, sample_folder_id) + # if faiss_id and document_id and structure_id: + # print("✅ 3 Files already exist in Google Drive. Downloading them...") + # download_file_from_drive(faiss_filename, sample_folder_id, faiss_index_path) + # download_file_from_drive(chunks_filename, sample_folder_id, document_chunks_path) + # download_file_from_drive(lookup_filename, sample_folder_id, structured_lookup_path) + # # Read and parse these into `chunk` and `all_output` + # else: + # "one of id not exist" + # if os.path.exists(faiss_index_path): + # print("faiss index exist and start to remove: ", faiss_index_path) + # os.remove(faiss_index_path) + # if os.path.exists(document_chunks_path): + # os.remove(document_chunks_path) + # if os.path.exists(structured_lookup_path): + # os.remove(structured_lookup_path) + # print("start to download the faiss, chunk, lookup") + + # download_file_from_drive(faiss_filename, sample_folder_id, faiss_index_path) + # download_file_from_drive(chunks_filename, sample_folder_id, document_chunks_path) + # download_file_from_drive(lookup_filename, sample_folder_id, structured_lookup_path) + try: + print("try gemini 2.5") + # print("move to load rag") + # master_structured_lookup, faiss_index, document_chunks = model.load_rag_assets( + # faiss_index_path, document_chunks_path, structured_lookup_path + # ) + + global_llm_model_for_counting_tokens = genai.GenerativeModel("gemini-2.5-flash-lite")#('gemini-1.5-flash-latest') + if not all_output: + if chunk: all_output = chunk + else: all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features + # if faiss_index is None: + # print("\nBuilding RAG assets (structured lookup, FAISS index, chunks)...") + # total_doc_embedding_tokens = global_llm_model_for_counting_tokens.count_tokens( + # all_output + # ).total_tokens + + # initial_embedding_cost = (total_doc_embedding_tokens / 1000) * PRICE_PER_1K_EMBEDDING_INPUT + # total_cost_title += initial_embedding_cost + # print(f"Initial one-time embedding cost for '{file_all_path}' ({total_doc_embedding_tokens} tokens): ${initial_embedding_cost:.6f}") + + + # master_structured_lookup, faiss_index, document_chunks, plain_text_content = model.build_vector_index_and_data( + # file_all_path, faiss_index_path, document_chunks_path, structured_lookup_path + # ) + # else: + # print("\nRAG assets loaded from file. No re-embedding of entire document will occur.") + # plain_text_content_all, table_strings_all, document_title_all = model.read_docx_text(file_all_path) + # master_structured_lookup['document_title'] = master_structured_lookup.get('document_title', document_title_all) + if stop_flag is not None and stop_flag.value: + print(f"🛑 Stop processing {accession}, aborting early...") + return {} + primary_word = iso + alternative_word = acc + print(f"\n--- General Query: Primary='{primary_word}' (Alternative='{alternative_word}') ---") + if features.lower() not in all_output.lower(): + all_output += ". 
NCBI Features: " + features + # country, sample_type, method_used, ethnic, spe_loc, total_query_cost = model.query_document_info( + # primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks, + # model.call_llm_api, chunk=chunk, all_output=all_output) + print("this is chunk for the model") + print(chunk) + print("this is all output for the model") + print(all_output) + if stop_flag is not None and stop_flag.value: + print(f"🛑 Stop processing {accession}, aborting early...") + return {} + # country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost = model.query_document_info( + # primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks, + # model.call_llm_api, chunk=chunk, all_output=all_output) + country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost = model.query_document_info( + query_word=primary_word, alternative_query_word=alternative_word, + metadata=meta, + master_structured_lookup=None, faiss_index=None, document_chunks=None, + llm_api_function=model.call_llm_api, chunk=chunk, all_output=all_output) + print("pass query of 2.5") + except: + print("try gemini 1.5") + # country, sample_type, ethnic, spe_loc, method_used, country_explanation, sample_type_explanation, ethnicity_explanation, specific_loc_explanation, total_query_cost = model.query_document_info( + # primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks, + # model.call_llm_api, chunk=chunk, all_output=all_output, model_ai="gemini-1.5-flash-latest") + country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost = model.query_document_info( + query_word=primary_word, alternative_query_word=alternative_word, + metadata=meta, + master_structured_lookup=None, faiss_index=None, document_chunks=None, + llm_api_function=model.call_llm_api, chunk=chunk, all_output=all_output, model_ai="gemini-1.5-flash-latest") + print("yeah pass the query of 1.5") + print("country using ai: ", country) + print("sample type using ai: ", sample_type) + # if len(country) == 0: country = "unknown" + # if len(sample_type) == 0: sample_type = "unknown" + # if country_explanation: country_explanation = "-"+country_explanation + # else: country_explanation = "" + # if sample_type_explanation: sample_type_explanation = "-"+sample_type_explanation + # else: sample_type_explanation = "" + if len(country) == 0: country = "unknown" + if len(sample_type) == 0: sample_type = "unknown" + if country_explanation and country_explanation!="unknown": country_explanation = "-"+country_explanation + else: country_explanation = "" + if sample_type_explanation and sample_type_explanation!="unknown": sample_type_explanation = "-"+sample_type_explanation + else: sample_type_explanation = "" + + if method_used == "unknown": method_used = "" + if country.lower() != "unknown": + stand_country = standardize_location.smart_country_lookup(country.lower()) + if stand_country.lower() != "not found": + if stand_country.lower() in acc_score["country"]: + if country_explanation: + acc_score["country"][stand_country.lower()].append(method_used + country_explanation) + else: + acc_score["country"][stand_country.lower()] = [method_used + country_explanation] + else: + if country.lower() in acc_score["country"]: + if country_explanation: + if len(method_used + country_explanation) > 0: + acc_score["country"][country.lower()].append(method_used + 
country_explanation) + else: + if len(method_used + country_explanation) > 0: + acc_score["country"][country.lower()] = [method_used + country_explanation] + # if spe_loc.lower() != "unknown": + # if spe_loc.lower() in acc_score["specific_location"]: + # acc_score["specific_location"][spe_loc.lower()].append(method_used) + # else: + # acc_score["specific_location"][spe_loc.lower()] = [method_used] + # if ethnic.lower() != "unknown": + # if ethnic.lower() in acc_score["ethnicity"]: + # acc_score["ethnicity"][ethnic.lower()].append(method_used) + # else: + # acc_score["ethnicity"][ethnic.lower()] = [method_used] + if sample_type.lower() != "unknown": + if sample_type.lower() in acc_score["sample_type"]: + if len(method_used + sample_type_explanation) > 0: + acc_score["sample_type"][sample_type.lower()].append(method_used + sample_type_explanation) + else: + if len(method_used + sample_type_explanation)> 0: + acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation] + total_cost_title += total_query_cost + if stop_flag is not None and stop_flag.value: + print(f"🛑 Stop processing {accession}, aborting early...") + return {} + # last resort: combine all information to give all output otherwise unknown + if len(acc_score["country"]) == 0 or len(acc_score["sample_type"]) == 0 or acc_score["country"] == "unknown" or acc_score["sample_type"] == "unknown": + text = "" + for key in meta_expand: + text += str(key) + ": " + meta_expand[key] + "\n" + if len(data_preprocess.normalize_for_overlap(all_output)) > 0: + text += data_preprocess.normalize_for_overlap(all_output) + if len(data_preprocess.normalize_for_overlap(chunk)) > 0: + text += data_preprocess.normalize_for_overlap(chunk) + text += ". NCBI Features: " + features + print("this is text for the last resort model") + print(text) + # country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost = model.query_document_info( + # primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks, + # model.call_llm_api, chunk=text, all_output=text) + country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost = model.query_document_info( + query_word=primary_word, alternative_query_word=alternative_word, + metadata=meta, + master_structured_lookup=None, faiss_index=None, document_chunks=None, + llm_api_function=model.call_llm_api, chunk=text, all_output=text) + print("this is last resort results: ") + print("country: ", country) + print("sample type: ", sample_type) + if len(country) == 0: country = "unknown" + if len(sample_type) == 0: sample_type = "unknown" + # if country_explanation: country_explanation = "-"+country_explanation + # else: country_explanation = "" + # if sample_type_explanation: sample_type_explanation = "-"+sample_type_explanation + # else: sample_type_explanation = "" + if country_explanation and country_explanation!="unknown": country_explanation = "-"+country_explanation + else: country_explanation = "" + if sample_type_explanation and sample_type_explanation!="unknown": sample_type_explanation = "-"+sample_type_explanation + else: sample_type_explanation = "" + + if method_used == "unknown": method_used = "" + if country.lower() != "unknown": + stand_country = standardize_location.smart_country_lookup(country.lower()) + if stand_country.lower() != "not found": + if stand_country.lower() in acc_score["country"]: + if country_explanation: + 
acc_score["country"][stand_country.lower()].append(method_used + country_explanation) + else: + acc_score["country"][stand_country.lower()] = [method_used + country_explanation] + else: + if country.lower() in acc_score["country"]: + if country_explanation: + if len(method_used + country_explanation) > 0: + acc_score["country"][country.lower()].append(method_used + country_explanation) + else: + if len(method_used + country_explanation) > 0: + acc_score["country"][country.lower()] = [method_used + country_explanation] + if sample_type.lower() != "unknown": + if sample_type.lower() in acc_score["sample_type"]: + if len(method_used + sample_type_explanation) > 0: + acc_score["sample_type"][sample_type.lower()].append(method_used + sample_type_explanation) + else: + if len(method_used + sample_type_explanation)> 0: + acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation] + total_cost_title += total_query_cost + end = time.time() + #total_cost_title += total_query_cost + acc_score["query_cost"] = f"{total_cost_title:.6f}" + elapsed = end - start + acc_score["time_cost"] = f"{elapsed:.3f} seconds" + accs_output[acc] = acc_score + print(accs_output[acc]) + return accs_output \ No newline at end of file