mtDNALocation / pipeline.py
VyLala's picture
Update pipeline.py
7a21c19 verified
raw
history blame
48.3 kB
# test1: MJ17 direct
# test2: "A1YU101" thailand cross-ref
# test3: "EBK109" thailand cross-ref
# test4: "OQ731952"/"BST115" for search query title: "South Asian maternal and paternal lineages in southern Thailand and"
import data_preprocess
import model
import mtdna_classifier
#import app
import smart_fallback
import pandas as pd
from pathlib import Path
import subprocess
from NER.html import extractHTML
import os
import google.generativeai as genai
import re
import standardize_location
# Helper functions for this pipeline
# Track time
import time
import multiprocessing
import gspread
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload, MediaIoBaseDownload
from google.oauth2.service_account import Credentials
from oauth2client.service_account import ServiceAccountCredentials
import io
import json
#––– Authentication setup –––
GDRIVE_PARENT_FOLDER_NAME = "mtDNA-Location-Classifier"
GDRIVE_DATA_FOLDER_NAME = os.environ["GDRIVE_DATA_FOLDER_NAME"]
GCP_CREDS_DICT = json.loads(os.environ["GCP_CREDS_JSON"]) # from HF secrets
GDRIVE_CREDS = Credentials.from_service_account_info(GCP_CREDS_DICT, scopes=["https://www.googleapis.com/auth/drive"])
drive_service = build("drive", "v3", credentials=GDRIVE_CREDS)
def get_or_create_drive_folder(name, parent_id=None):
    """
    Return the ID of the Google Drive folder called `name`, creating it if absent.

    Args:
        name: Folder name to look up (and to create when no match is found).
        parent_id: Optional ID of the parent folder; when given, the lookup is
            restricted to that parent and a newly created folder is placed in it.

    Returns:
        The Drive ID of the first matching folder, or of the folder just created.
    """
    # Drive query literals are single-quoted, so backslashes and apostrophes
    # in the name must be escaped or the query becomes invalid.
    safe_name = str(name).replace("\\", "\\\\").replace("'", "\\'")
    # `trashed = false` keeps a folder sitting in the trash from being reused.
    query = (
        f"name='{safe_name}' and mimeType='application/vnd.google-apps.folder'"
        " and trashed = false"
    )
    if parent_id:
        query += f" and '{parent_id}' in parents"
    results = drive_service.files().list(
        q=query, spaces='drive', fields="files(id, name)"
    ).execute()
    items = results.get("files", [])
    if items:
        return items[0]["id"]

    # No live folder with that name: create one.
    file_metadata = {
        "name": name,
        "mimeType": "application/vnd.google-apps.folder"
    }
    if parent_id:
        file_metadata["parents"] = [parent_id]
    file = drive_service.files().create(body=file_metadata, fields="id").execute()
    return file["id"]
# def find_drive_file(filename, parent_id):
# """
# Checks if a file with the given name exists inside the specified Google Drive folder.
# Returns the file ID if found, else None.
# """
# query = f"'{parent_id}' in parents and name = '{filename}' and trashed = false"
# results = drive_service.files().list(q=query, spaces='drive', fields='files(id, name)', pageSize=1).execute()
# files = results.get('files', [])
# if files:
# return files[0]["id"]
# return None
def find_drive_file(filename, parent_id):
    """
    Checks if a file with the given name exists inside the specified Google Drive folder.

    Args:
        filename: Exact file name to search for.
        parent_id: ID of the Drive folder to search in.

    Returns:
        The file ID of the first non-trashed match, or None when nothing is
        found or when the Drive request raises (the error is printed, not
        propagated).
    """
    try:
        print(f"πŸ” Searching for '{filename}' in folder: {parent_id}")
        # Escape backslashes/apostrophes: an unescaped apostrophe in the file
        # name would make the query invalid and the file would look missing.
        safe_name = str(filename).replace("\\", "\\\\").replace("'", "\\'")
        query = f"'{parent_id}' in parents and name = '{safe_name}' and trashed = false"
        results = drive_service.files().list(
            q=query,
            spaces='drive',
            fields='files(id, name)',
            pageSize=1
        ).execute()
        files = results.get('files', [])
        if files:
            print(f"βœ… Found file: {files[0]['name']} with ID: {files[0]['id']}")
            return files[0]["id"]
        print("⚠️ File not found.")
        return None
    except Exception as e:
        print(f"❌ Error during find_drive_file: {e}")
        return None
# def upload_file_to_drive(local_path, remote_name, folder_id):
# file_metadata = {"name": remote_name, "parents": [folder_id]}
# media = MediaFileUpload(local_path, resumable=True)
# existing = drive_service.files().list(q=f"name='{remote_name}' and '{folder_id}' in parents", fields="files(id)").execute().get("files", [])
# if existing:
# drive_service.files().delete(fileId=existing[0]["id"]).execute()
# file = drive_service.files().create(body=file_metadata, media_body=media, fields="id").execute()
# result = drive_service.files().list(q=f"name='{remote_name}' and '{folder_id}' in parents", fields="files(id)").execute()
# if not result.get("files"):
# print(f"❌ Upload failed: File '{remote_name}' not found in folder after upload.")
# else:
# print(f"βœ… Verified upload: {remote_name}")
# return file["id"]
def upload_file_to_drive(local_path, remote_name, folder_id):
    """
    Upload a local file to a Drive folder, replacing same-named files there.

    Args:
        local_path: Path of the local file to upload.
        remote_name: Name the file gets on Drive.
        folder_id: ID of the destination Drive folder.

    Returns:
        The ID of the newly created Drive file, or None if anything fails
        (a missing local file included); the error is printed, not raised.
    """
    try:
        if not os.path.exists(local_path):
            raise FileNotFoundError(f"❌ Local file does not exist: {local_path}")
        # Escape backslashes/apostrophes so the name is a valid query literal.
        safe_name = str(remote_name).replace("\\", "\\\\").replace("'", "\\'")
        # Delete existing file(s) on Drive if present
        existing = drive_service.files().list(
            q=f"name='{safe_name}' and '{folder_id}' in parents and trashed = false",
            fields="files(id)"
        ).execute().get("files", [])
        # Remove every same-named file, not only the first, so no stale
        # duplicate is left behind next to the fresh upload.
        for stale in existing:
            drive_service.files().delete(fileId=stale["id"]).execute()
            print(f"πŸ—‘οΈ Deleted existing '{remote_name}' in Drive folder {folder_id}")
        file_metadata = {"name": remote_name, "parents": [folder_id]}
        media = MediaFileUpload(local_path, resumable=True)
        file = drive_service.files().create(
            body=file_metadata,
            media_body=media,
            fields="id"
        ).execute()
        print(f"βœ… Uploaded '{remote_name}' to Google Drive folder ID: {folder_id}")
        return file["id"]
    except Exception as e:
        print(f"❌ Error during upload: {e}")
        return None
def download_file_from_drive(remote_name, folder_id, local_path):
    """
    Download the file called `remote_name` from a Drive folder to `local_path`.

    Args:
        remote_name: Name of the file on Drive.
        folder_id: ID of the Drive folder that should contain the file.
        local_path: Local destination path; it is opened for binary writing.

    Returns:
        True once the first matching file has been fully downloaded, False if
        no non-trashed file with that name exists in the folder.
    """
    # Escape backslashes/apostrophes so the name is a valid query literal, and
    # skip trashed files so a deleted copy is never downloaded.
    safe_name = str(remote_name).replace("\\", "\\\\").replace("'", "\\'")
    results = drive_service.files().list(
        q=f"name='{safe_name}' and '{folder_id}' in parents and trashed = false",
        fields="files(id)"
    ).execute()
    files = results.get("files", [])
    if not files:
        return False
    file_id = files[0]["id"]
    request = drive_service.files().get_media(fileId=file_id)
    # The context manager closes (and flushes) the local file even when a
    # chunk download raises; the original never closed the handle.
    with io.FileIO(local_path, 'wb') as fh:
        downloader = MediaIoBaseDownload(fh, request)
        done = False
        while not done:
            _, done = downloader.next_chunk()
    return True
def download_drive_file_content(file_id):
    """
    Fetch a Drive file into memory and return its content as text.

    Args:
        file_id: ID of the Drive file to download.

    Returns:
        The downloaded bytes decoded as UTF-8.
    """
    media_request = drive_service.files().get_media(fileId=file_id)
    buffer = io.BytesIO()
    chunk_loader = MediaIoBaseDownload(buffer, media_request)
    # Pull chunks until the downloader reports completion.
    finished = False
    while not finished:
        finished = chunk_loader.next_chunk()[1]
    # getvalue() returns the whole buffer regardless of the stream position.
    return buffer.getvalue().decode("utf-8")
# def run_with_timeout(func, args=(), kwargs={}, timeout=20):
# """
# Runs `func` with timeout in seconds. Kills if it exceeds.
# Returns: (success, result or None)
# """
# def wrapper(q, *args, **kwargs):
# try:
# q.put(func(*args, **kwargs))
# except Exception as e:
# q.put(e)
# q = multiprocessing.Queue()
# p = multiprocessing.Process(target=wrapper, args=(q, *args), kwargs=kwargs)
# p.start()
# p.join(timeout)
# if p.is_alive():
# p.terminate()
# p.join()
# print(f"⏱️ Timeout exceeded ({timeout} sec) β€” function killed.")
# return False, None
# else:
# result = q.get()
# if isinstance(result, Exception):
# raise result
# return True, result
# def run_with_timeout(func, args=(), kwargs={}, timeout=30):
# import concurrent.futures
# with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
# future = executor.submit(func, *args, **kwargs)
# try:
# return True, future.result(timeout=timeout)
# except concurrent.futures.TimeoutError:
# print(f"⏱️ Timeout exceeded ({timeout} sec) β€” function killed.")
# return False, None
import multiprocessing
def _run_with_timeout_worker(q, func, args, kwargs):
    """Child-process entry point: run `func` and put (ok, payload) on `q`."""
    try:
        q.put((True, func(*args, **kwargs)))
    except Exception as e:
        q.put((False, e))


def run_with_timeout(func, args=(), kwargs=None, timeout=30):
    """
    Run `func(*args, **kwargs)` in a separate process with a time limit.

    Args:
        func: Callable to execute in the child process.
        args: Positional arguments for `func`.
        kwargs: Keyword arguments for `func` (None means no keyword arguments).
        timeout: Seconds to wait for a result; None waits indefinitely.

    Returns:
        (True, result) when `func` returns in time; (False, None) when the
        time limit is exceeded (the child is terminated) or the child exits
        without reporting a result.

    Raises:
        Whatever exception `func` raised in the child, re-raised here.
    """
    # Function-scope imports keep this block self-contained.
    import time as _time
    from queue import Empty

    call_kwargs = {} if kwargs is None else dict(kwargs)
    q = multiprocessing.Queue()
    # A module-level worker (instead of a nested closure) stays picklable
    # under the spawn/forkserver start methods.
    p = multiprocessing.Process(
        target=_run_with_timeout_worker,
        args=(q, func, tuple(args), call_kwargs),
    )
    p.start()

    deadline = None if timeout is None else _time.monotonic() + timeout
    outcome = None
    # Read from the queue *before* joining: a child that has put a large
    # result cannot exit until the parent drains the pipe, so joining first
    # would stall until the timeout and discard a successful result.
    while outcome is None:
        if deadline is None:
            wait = 0.1
        else:
            wait = min(0.1, deadline - _time.monotonic())
        if wait <= 0:
            # Time is up; take a result that arrived at the last moment.
            try:
                outcome = q.get_nowait()
            except Empty:
                pass
            break
        try:
            outcome = q.get(timeout=wait)
        except Empty:
            if not p.is_alive():
                # Child is gone; allow one last read, then give up.
                try:
                    outcome = q.get(timeout=0.1)
                except Empty:
                    pass
                break

    if outcome is None:
        if p.is_alive():
            p.terminate()
            p.join()
            print(f"⏱️ Timeout exceeded ({timeout} sec) β€” function killed.")
        else:
            p.join()
        return False, None

    p.join()
    success, result = outcome
    if success:
        return True, result
    raise result  # re-raise the exception captured in the child
def time_it(func, *args, **kwargs):
    """
    Measure how long a function takes to run and return its result + time.

    Args:
        func: Callable to invoke.
        *args: Positional arguments forwarded to `func`.
        **kwargs: Keyword arguments forwarded to `func`.

    Returns:
        A tuple (result, elapsed) where `elapsed` is the duration in seconds.
    """
    # perf_counter is monotonic, so the interval cannot go negative when the
    # system clock is adjusted mid-call.
    start = time.perf_counter()
    result = func(*args, **kwargs)
    elapsed = time.perf_counter() - start
    # Not every callable has __name__ (e.g. functools.partial objects).
    label = getattr(func, "__name__", None) or repr(func)
    print(f"⏱️ '{label}' took {elapsed:.3f} seconds")
    return result, elapsed
# --- Define Pricing Constants (for Gemini 1.5 Flash & text-embedding-004) ---
def unique_preserve_order(seq):
    """Return the items of `seq` as a list with duplicates removed, keeping first occurrences in order."""
    # dict keys are unique and keep insertion order.
    return list(dict.fromkeys(seq))
# Main execution
def pipeline_with_gemini(accessions,stop_flag=None, niche_cases=None, save_df=None):
# output: country, sample_type, ethnic, location, money_cost, time_cost, explain
# there can be one accession number in the accessions
# Prices are per 1,000 tokens
# Before each big step:
if stop_flag is not None and stop_flag.value:
print(f"πŸ›‘ Stop detected before starting {accession}, aborting early...")
return {}
# PRICE_PER_1K_INPUT_LLM = 0.000075 # $0.075 per 1M tokens
# PRICE_PER_1K_OUTPUT_LLM = 0.0003 # $0.30 per 1M tokens
# PRICE_PER_1K_EMBEDDING_INPUT = 0.000025 # $0.025 per 1M tokens
# Gemini 2.5 Flash-Lite pricing per 1,000 tokens
PRICE_PER_1K_INPUT_LLM = 0.00010 # $0.10 per 1M input tokens
PRICE_PER_1K_OUTPUT_LLM = 0.00040 # $0.40 per 1M output tokens
# Embedding-001 pricing per 1,000 input tokens
PRICE_PER_1K_EMBEDDING_INPUT = 0.00015 # $0.15 per 1M input tokens
if not accessions:
print("no input")
return None
else:
accs_output = {}
#genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
genai.configure(api_key=os.getenv("GOOGLE_API_KEY_BACKUP"))
for acc in accessions:
print("start gemini: ", acc)
start = time.time()
total_cost_title = 0
jsonSM, links, article_text = {},[], ""
acc_score = { "isolate": "",
"country":{},
"sample_type":{},
#"specific_location":{},
#"ethnicity":{},
"query_cost":total_cost_title,
"time_cost":None,
"source":links,
"file_chunk":"",
"file_all_output":""}
if niche_cases:
for niche in niche_cases:
acc_score[niche] = {}
meta = mtdna_classifier.fetch_ncbi_metadata(acc)
country, spe_loc, ethnic, sample_type, col_date, iso, title, doi, pudID, features = meta["country"], meta["specific_location"], meta["ethnicity"], meta["sample_type"], meta["collection_date"], meta["isolate"], meta["title"], meta["doi"], meta["pubmed_id"], meta["all_features"]
acc_score["isolate"] = iso
print("meta: ",meta)
meta_expand = smart_fallback.fetch_ncbi(acc)
print("meta expand: ", meta_expand)
# set up step: create the folder to save document
chunk, all_output = "",""
if pudID:
id = str(pudID)
saveTitle = title
else:
try:
author_name = meta_expand["authors"].split(',')[0] # Use last name only
except:
author_name = meta_expand["authors"]
saveTitle = title + "_" + col_date + "_" + author_name
if title.lower() == "unknown" and col_date.lower()=="unknown" and author_name.lower() == "unknown":
saveTitle += "_" + acc
id = "DirectSubmission"
# folder_path = Path("/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id))
# if not folder_path.exists():
# cmd = f'mkdir /content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/{id}'
# result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
# print("data/"+str(id) +" created.")
# else:
# print("data/"+str(id) +" already exists.")
# saveLinkFolder = "/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id)
# parent_folder_id = get_or_create_drive_folder(GDRIVE_PARENT_FOLDER_NAME)
# data_folder_id = get_or_create_drive_folder(GDRIVE_DATA_FOLDER_NAME, parent_id=parent_folder_id)
# sample_folder_id = get_or_create_drive_folder(str(id), parent_id=data_folder_id)
data_folder_id = GDRIVE_DATA_FOLDER_NAME # Use the shared folder directly
sample_folder_id = get_or_create_drive_folder(str(id), parent_id=data_folder_id)
print("sample folder id: ", sample_folder_id)
# Define document names
if len(saveTitle) > 50:
saveName = saveTitle[:50]
saveName = saveName.replace(" ", "_")
chunk_filename = f"{saveName}_merged_document.docx"
all_filename = f"{saveName}_all_merged_document.docx"
else:
saveName = saveTitle.replace(" ", "_")
chunk_filename = f"{saveName}_merged_document.docx"
all_filename = f"{saveName}_all_merged_document.docx"
print("chunk file name and all filename: ", chunk_filename, all_filename)
# Define local temp paths for reading/writing
# import tempfile
# tmp_dir = tempfile.mkdtemp()
LOCAL_TEMP_DIR = "/mnt/data/generated_docs"
os.makedirs(LOCAL_TEMP_DIR, exist_ok=True)
file_chunk_path = os.path.join(LOCAL_TEMP_DIR, chunk_filename)
file_all_path = os.path.join(LOCAL_TEMP_DIR, all_filename)
# file_chunk_path = os.path.join(tempfile.gettempdir(), chunk_filename)
# file_all_path = os.path.join(tempfile.gettempdir(), all_filename)
if stop_flag is not None and stop_flag.value:
print(f"πŸ›‘ Stop processing {accession}, aborting early...")
return {}
print("this is file chunk path: ", file_chunk_path)
chunk_id = find_drive_file(chunk_filename, sample_folder_id)
all_id = find_drive_file(all_filename, sample_folder_id)
if chunk_id and all_id:
print("βœ… Files already exist in Google Drive. Downloading them...")
chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path)
all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
acc_score["file_chunk"] = str(chunk_filename)
acc_score["file_all_output"] = str(all_filename)
print("chunk_id and all_id: ")
print(chunk_id, all_id)
print("file chunk and all output saved in acc score: ", acc_score["file_chunk"], acc_score["file_all_output"])
file = drive_service.files().get(fileId="1LUJRTrq8yt4S4lLwCvTmlxaKqpr0nvEn", fields="id, name, parents, webViewLink").execute()
print("πŸ“„ Name:", file["name"])
print("πŸ“ Parent folder ID:", file["parents"][0])
print("πŸ”— View link:", file["webViewLink"])
# Read and parse these into `chunk` and `all_output`
else:
# πŸ”₯ Remove any stale local copies
if os.path.exists(file_chunk_path):
os.remove(file_chunk_path)
print(f"πŸ—‘οΈ Removed stale: {file_chunk_path}")
if os.path.exists(file_all_path):
os.remove(file_all_path)
print(f"πŸ—‘οΈ Removed stale: {file_all_path}")
# πŸ”₯ Remove the local file first if it exists
# if os.path.exists(file_chunk_path):
# os.remove(file_chunk_path)
# print("remove chunk path")
# if os.path.exists(file_all_path):
# os.remove(file_all_path)
# print("remove all path")
# Try to download if already exists on Drive
chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path)
all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
print("chunk exist: ", chunk_exists)
# first way: ncbi method
print("country.lower: ",country.lower())
if country.lower() != "unknown":
stand_country = standardize_location.smart_country_lookup(country.lower())
print("stand_country: ", stand_country)
if stand_country.lower() != "not found":
acc_score["country"][stand_country.lower()] = ["ncbi"]
else: acc_score["country"][country.lower()] = ["ncbi"]
# if spe_loc.lower() != "unknown":
# acc_score["specific_location"][spe_loc.lower()] = ["ncbi"]
# if ethnic.lower() != "unknown":
# acc_score["ethnicity"][ethnic.lower()] = ["ncbi"]
if sample_type.lower() != "unknown":
acc_score["sample_type"][sample_type.lower()] = ["ncbi"]
# second way: LLM model
# Preprocess the input token
print(acc_score)
accession, isolate = None, None
if acc != "unknown": accession = acc
if iso != "unknown": isolate = iso
if stop_flag is not None and stop_flag.value:
print(f"πŸ›‘ Stop processing {accession}, aborting early...")
return {}
# check doi first
print("chunk filename: ", chunk_filename)
if chunk_exists:
print("File chunk exists!")
if not chunk:
print("start to get chunk")
text, table, document_title = model.read_docx_text(file_chunk_path)
chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table))
if str(chunk_filename) != "":
print("first time have chunk path at chunk exist: ", str(chunk_filename))
acc_score["file_chunk"] = str(chunk_filename)
if all_exists:
print("File all output exists!")
if not all_output:
text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
if str(all_filename) != "":
print("first time have all path at all exist: ", str(all_filename))
acc_score["file_all_output"] = str(all_filename)
print("acc sscore for file all output and chunk: ", acc_score["file_all_output"], acc_score["file_chunk"])
if len(acc_score["file_all_output"]) == 0 and len(acc_score["file_chunk"]) == 0:
if doi != "unknown":
link = 'https://doi.org/' + doi
# get the file to create listOfFile for each id
print("link of doi: ", link)
html = extractHTML.HTML("",link)
jsonSM = html.getSupMaterial()
article_text = html.getListSection()
if article_text:
if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower():
links.append(link)
if jsonSM:
links += sum((jsonSM[key] for key in jsonSM),[])
# no doi then google custom search api
if doi=="unknown" or len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower():
# might find the article
print("no article text, start tem link")
#tem_links = mtdna_classifier.search_google_custom(title, 2)
tem_links = smart_fallback.smart_google_search(meta_expand)
print("tem links: ", tem_links)
tem_link_acc = smart_fallback.google_accession_search(acc)
tem_links += tem_link_acc
tem_links = unique_preserve_order(tem_links)
print("tem link before filtering: ", tem_links)
# filter the quality link
print("saveLinkFolder as sample folder id: ", sample_folder_id)
print("start the smart filter link")
if stop_flag is not None and stop_flag.value:
print(f"πŸ›‘ Stop processing {accession}, aborting early...")
return {}
# success_process, output_process = run_with_timeout(smart_fallback.filter_links_by_metadata,args=(tem_links,sample_folder_id),kwargs={"accession":acc})
# if success_process:
# links = output_process
# print("yes succeed for smart filter link")
# else:
# print("no suceed, fallback to all tem links")
# links = tem_links
links = smart_fallback.filter_links_by_metadata(tem_links, saveLinkFolder=sample_folder_id, accession=acc, stop_flag=stop_flag)
print("this is links: ",links)
links = unique_preserve_order(links)
acc_score["source"] = links
else:
print("inside the try of reusing chunk or all output")
#print("chunk filename: ", str(chunks_filename))
try:
temp_source = False
if save_df is not None and not save_df.empty:
print("save df not none")
print("chunk file name: ",str(chunk_filename))
print("all filename: ",str(all_filename))
if acc_score["file_chunk"]:
link = save_df.loc[save_df["file_chunk"]==acc_score["file_chunk"],"Sources"].iloc[0]
#link = row["Sources"].iloc[0]
if "http" in link:
print("yeah http in save df source")
acc_score["source"] = [x for x in link.split("\n") if x.strip()]#row["Sources"].tolist()
else: # temporary
print("tempo source")
#acc_score["source"] = [str(all_filename), str(chunks_filename)]
temp_source = True
elif acc_score["file_all_output"]:
link = save_df.loc[save_df["file_all_output"]==acc_score["file_all_output"],"Sources"].iloc[0]
#link = row["Sources"].iloc[0]
print(link)
print("list of link")
print([x for x in link.split("\n") if x.strip()])
if "http" in link:
print("yeah http in save df source")
acc_score["source"] = [x for x in link.split("\n") if x.strip()]#row["Sources"].tolist()
else: # temporary
print("tempo source")
#acc_score["source"] = [str(all_filename), str(chunks_filename)]
temp_source = True
else: # temporary
print("tempo source")
#acc_score["source"] = [str(file_all_path), str(file_chunk_path)]
temp_source = True
else: # temporary
print("tempo source")
#acc_score["source"] = [str(file_all_path), str(file_chunk_path)]
temp_source = True
if temp_source:
print("temp source is true so have to try again search link")
if doi != "unknown":
link = 'https://doi.org/' + doi
# get the file to create listOfFile for each id
print("link of doi: ", link)
html = extractHTML.HTML("",link)
jsonSM = html.getSupMaterial()
article_text = html.getListSection()
if article_text:
if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower():
links.append(link)
if jsonSM:
links += sum((jsonSM[key] for key in jsonSM),[])
# no doi then google custom search api
if doi=="unknown" or len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower():
# might find the article
print("no article text, start tem link")
#tem_links = mtdna_classifier.search_google_custom(title, 2)
tem_links = smart_fallback.smart_google_search(meta_expand)
print("tem links: ", tem_links)
tem_link_acc = smart_fallback.google_accession_search(acc)
tem_links += tem_link_acc
tem_links = unique_preserve_order(tem_links)
print("tem link before filtering: ", tem_links)
# filter the quality link
print("saveLinkFolder as sample folder id: ", sample_folder_id)
print("start the smart filter link")
if stop_flag is not None and stop_flag.value:
print(f"πŸ›‘ Stop processing {accession}, aborting early...")
return {}
# success_process, output_process = run_with_timeout(smart_fallback.filter_links_by_metadata,args=(tem_links,sample_folder_id),kwargs={"accession":acc})
# if success_process:
# links = output_process
# print("yes succeed for smart filter link")
# else:
# print("no suceed, fallback to all tem links")
# links = tem_links
links = smart_fallback.filter_links_by_metadata(tem_links, saveLinkFolder=sample_folder_id, accession=acc, stop_flag=stop_flag)
print("this is links: ",links)
links = unique_preserve_order(links)
acc_score["source"] = links
except:
print("except for source")
acc_score["source"] = []
# chunk_path = "/"+saveTitle+"_merged_document.docx"
# all_path = "/"+saveTitle+"_all_merged_document.docx"
# # if chunk and all output not exist yet
# file_chunk_path = saveLinkFolder + chunk_path
# file_all_path = saveLinkFolder + all_path
# if os.path.exists(file_chunk_path):
# print("File chunk exists!")
# if not chunk:
# text, table, document_title = model.read_docx_text(file_chunk_path)
# chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table))
# if os.path.exists(file_all_path):
# print("File all output exists!")
# if not all_output:
# text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
# all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
if stop_flag is not None and stop_flag.value:
print(f"πŸ›‘ Stop processing {accession}, aborting early...")
return {}
# print("chunk filename: ", chunk_filename)
# if chunk_exists:
# print("File chunk exists!")
# if not chunk:
# print("start to get chunk")
# text, table, document_title = model.read_docx_text(file_chunk_path)
# chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table))
# if str(chunk_filename) != "":
# print("first time have chunk path at chunk exist: ", str(chunk_filename))
# acc_score["file_chunk"] = str(chunk_filename)
# if all_exists:
# print("File all output exists!")
# if not all_output:
# text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
# all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
# if str(all_filename) != "":
# print("first time have all path at all exist: ", str(all_filename))
# acc_score["file_all_output"] = str(all_filename)
if not chunk and not all_output:
print("not chunk and all output")
# else: check if we can reuse these chunk and all output of existed accession to find another
if str(chunk_filename) != "":
print("first time have chunk path: ", str(chunk_filename))
acc_score["file_chunk"] = str(chunk_filename)
if str(all_filename) != "":
print("first time have all path: ", str(all_filename))
acc_score["file_all_output"] = str(all_filename)
if links:
for link in links:
print(link)
# if len(all_output) > 1000*1000:
# all_output = data_preprocess.normalize_for_overlap(all_output)
# print("after normalizing all output: ", len(all_output))
if len(data_preprocess.normalize_for_overlap(all_output)) > 600000:
print("break here")
break
if iso != "unknown": query_kw = iso
else: query_kw = acc
#text_link, tables_link, final_input_link = data_preprocess.preprocess_document(link,saveLinkFolder, isolate=query_kw)
success_process, output_process = run_with_timeout(data_preprocess.preprocess_document,args=(link,sample_folder_id),kwargs={"isolate":query_kw,"accession":acc},timeout=100)
if stop_flag is not None and stop_flag.value:
print(f"πŸ›‘ Stop processing {accession}, aborting early...")
return {}
if success_process:
text_link, tables_link, final_input_link = output_process[0], output_process[1], output_process[2]
print("yes succeed for process document")
else: text_link, tables_link, final_input_link = "", "", ""
context = data_preprocess.extract_context(final_input_link, query_kw)
if context != "Sample ID not found.":
if len(data_preprocess.normalize_for_overlap(chunk)) < 1000*1000:
success_chunk, the_output_chunk = run_with_timeout(data_preprocess.merge_texts_skipping_overlap,args=(chunk, context))
if stop_flag is not None and stop_flag.value:
print(f"πŸ›‘ Stop processing {accession}, aborting early...")
return {}
if success_chunk:
chunk = the_output_chunk#data_preprocess.merge_texts_skipping_overlap(all_output, final_input_link)
print("yes succeed for chunk")
else:
chunk += context
print("len context: ", len(context))
print("basic fall back")
print("len chunk after: ", len(chunk))
if len(final_input_link) > 1000*1000:
if context != "Sample ID not found.":
final_input_link = context
else:
final_input_link = data_preprocess.normalize_for_overlap(final_input_link)
if len(final_input_link) > 1000 *1000:
final_input_link = final_input_link[:100000]
if len(data_preprocess.normalize_for_overlap(all_output)) < int(100000) and len(final_input_link)<100000:
print("Running merge_texts_skipping_overlap with timeout")
success, the_output = run_with_timeout(data_preprocess.merge_texts_skipping_overlap,args=(all_output, final_input_link),timeout=30)
if stop_flag is not None and stop_flag.value:
print(f"πŸ›‘ Stop processing {accession}, aborting early...")
return {}
print("Returned from timeout logic")
if success:
all_output = the_output#data_preprocess.merge_texts_skipping_overlap(all_output, final_input_link)
print("yes succeed")
else:
print("len all output: ", len(all_output))
print("len final input link: ", len(final_input_link))
all_output += final_input_link
print("len final input: ", len(final_input_link))
print("basic fall back")
else:
print("both/either all output or final link too large more than 100000")
print("len all output: ", len(all_output))
print("len final input link: ", len(final_input_link))
all_output += final_input_link
print("len final input: ", len(final_input_link))
print("basic fall back")
print("len all output after: ", len(all_output))
#country_pro, chunk, all_output = data_preprocess.process_inputToken(links, saveLinkFolder, accession=accession, isolate=isolate)
if stop_flag is not None and stop_flag.value:
print(f"πŸ›‘ Stop processing {accession}, aborting early...")
return {}
else:
chunk = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
if not chunk: chunk = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
if not all_output: all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
if len(all_output) > 1*1024*1024:
all_output = data_preprocess.normalize_for_overlap(all_output)
if len(all_output) > 1*1024*1024:
all_output = all_output[:1*1024*1024]
print("chunk len: ", len(chunk))
print("all output len: ", len(all_output))
data_preprocess.save_text_to_docx(chunk, file_chunk_path)
data_preprocess.save_text_to_docx(all_output, file_all_path)
# Later when saving new files
# data_preprocess.save_text_to_docx(chunk, chunk_filename, sample_folder_id)
# data_preprocess.save_text_to_docx(all_output, all_filename, sample_folder_id)
# Upload to Drive
result_chunk_upload = upload_file_to_drive(file_chunk_path, chunk_filename, sample_folder_id)
result_all_upload = upload_file_to_drive(file_all_path, all_filename, sample_folder_id)
print("UPLOAD RESULT FOR CHUNK: ", result_chunk_upload)
print(f"πŸ”— Uploaded file: https://drive.google.com/file/d/{result_chunk_upload}/view")
print("here 1")
# else:
# final_input = ""
# if all_output:
# final_input = all_output
# else:
# if chunk: final_input = chunk
# #data_preprocess.merge_texts_skipping_overlap(final_input, all_output)
# if final_input:
# keywords = []
# if iso != "unknown": keywords.append(iso)
# if acc != "unknown": keywords.append(acc)
# for keyword in keywords:
# chunkBFS = data_preprocess.get_contextual_sentences_BFS(final_input, keyword)
# countryDFS, chunkDFS = data_preprocess.get_contextual_sentences_DFS(final_input, keyword)
# chunk = data_preprocess.merge_texts_skipping_overlap(chunk, chunkDFS)
# chunk = data_preprocess.merge_texts_skipping_overlap(chunk, chunkBFS)
# Define paths for cached RAG assets
# faiss_index_path = saveLinkFolder+"/faiss_index.bin"
# document_chunks_path = saveLinkFolder+"/document_chunks.json"
# structured_lookup_path = saveLinkFolder+"/structured_lookup.json"
print("here 2")
faiss_filename = "faiss_index.bin"
chunks_filename = "document_chunks.json"
lookup_filename = "structured_lookup.json"
print("name of faiss: ", faiss_filename)
faiss_index_path = os.path.join(LOCAL_TEMP_DIR, faiss_filename)
document_chunks_path = os.path.join(LOCAL_TEMP_DIR, chunks_filename)
structured_lookup_path = os.path.join(LOCAL_TEMP_DIR, lookup_filename)
print("name if faiss path: ", faiss_index_path)
# Look up the cached RAG assets on Drive; stale local copies are removed further below (if they exist) when any of the three is missing
print("start faiss id and also the sample folder id is: ", sample_folder_id)
faiss_id = find_drive_file(faiss_filename, sample_folder_id)
print("done faiss id")
document_id = find_drive_file(chunks_filename, sample_folder_id)
structure_id = find_drive_file(lookup_filename, sample_folder_id)
if faiss_id and document_id and structure_id:
print("βœ… 3 Files already exist in Google Drive. Downloading them...")
download_file_from_drive(faiss_filename, sample_folder_id, faiss_index_path)
download_file_from_drive(chunks_filename, sample_folder_id, document_chunks_path)
download_file_from_drive(lookup_filename, sample_folder_id, structured_lookup_path)
# The three downloaded files are loaded further below via model.load_rag_assets
else:
"one of id not exist"
if os.path.exists(faiss_index_path):
print("faiss index exist and start to remove: ", faiss_index_path)
os.remove(faiss_index_path)
if os.path.exists(document_chunks_path):
os.remove(document_chunks_path)
if os.path.exists(structured_lookup_path):
os.remove(structured_lookup_path)
print("start to download the faiss, chunk, lookup")
download_file_from_drive(faiss_filename, sample_folder_id, faiss_index_path)
download_file_from_drive(chunks_filename, sample_folder_id, document_chunks_path)
download_file_from_drive(lookup_filename, sample_folder_id, structured_lookup_path)
try:
print("try gemini 2.5")
print("move to load rag")
master_structured_lookup, faiss_index, document_chunks = model.load_rag_assets(
faiss_index_path, document_chunks_path, structured_lookup_path
)
global_llm_model_for_counting_tokens = genai.GenerativeModel('gemini-1.5-flash-latest')
if not all_output:
if chunk: all_output = chunk
else: all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
if faiss_index is None:
print("\nBuilding RAG assets (structured lookup, FAISS index, chunks)...")
total_doc_embedding_tokens = global_llm_model_for_counting_tokens.count_tokens(
all_output
).total_tokens
initial_embedding_cost = (total_doc_embedding_tokens / 1000) * PRICE_PER_1K_EMBEDDING_INPUT
total_cost_title += initial_embedding_cost
print(f"Initial one-time embedding cost for '{file_all_path}' ({total_doc_embedding_tokens} tokens): ${initial_embedding_cost:.6f}")
master_structured_lookup, faiss_index, document_chunks, plain_text_content = model.build_vector_index_and_data(
file_all_path, faiss_index_path, document_chunks_path, structured_lookup_path
)
else:
print("\nRAG assets loaded from file. No re-embedding of entire document will occur.")
plain_text_content_all, table_strings_all, document_title_all = model.read_docx_text(file_all_path)
master_structured_lookup['document_title'] = master_structured_lookup.get('document_title', document_title_all)
if stop_flag is not None and stop_flag.value:
print(f"πŸ›‘ Stop processing {accession}, aborting early...")
return {}
primary_word = iso
alternative_word = acc
print(f"\n--- General Query: Primary='{primary_word}' (Alternative='{alternative_word}') ---")
if features.lower() not in all_output.lower():
all_output += ". NCBI Features: " + features
# country, sample_type, method_used, ethnic, spe_loc, total_query_cost = model.query_document_info(
# primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
# model.call_llm_api, chunk=chunk, all_output=all_output)
print("this is chunk for the model")
print(chunk)
print("this is all output for the model")
print(all_output)
if stop_flag is not None and stop_flag.value:
print(f"πŸ›‘ Stop processing {accession}, aborting early...")
return {}
country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost = model.query_document_info(
primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
model.call_llm_api, chunk=chunk, all_output=all_output)
print("pass query of 2.5")
except:
print("try gemini 1.5")
country, sample_type, ethnic, spe_loc, method_used, country_explanation, sample_type_explanation, ethnicity_explanation, specific_loc_explanation, total_query_cost = model.query_document_info(
primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
model.call_llm_api, chunk=chunk, all_output=all_output, model_ai="gemini-1.5-flash-latest")
print("yeah pass the query of 1.5")
print("country using ai: ", country)
print("sample type using ai: ", sample_type)
# if len(country) == 0: country = "unknown"
# if len(sample_type) == 0: sample_type = "unknown"
# if country_explanation: country_explanation = "-"+country_explanation
# else: country_explanation = ""
# if sample_type_explanation: sample_type_explanation = "-"+sample_type_explanation
# else: sample_type_explanation = ""
if len(country) == 0: country = "unknown"
if len(sample_type) == 0: sample_type = "unknown"
if country_explanation and country_explanation!="unknown": country_explanation = "-"+country_explanation
else: country_explanation = ""
if sample_type_explanation and sample_type_explanation!="unknown": sample_type_explanation = "-"+sample_type_explanation
else: sample_type_explanation = ""
if method_used == "unknown": method_used = ""
if country.lower() != "unknown":
stand_country = standardize_location.smart_country_lookup(country.lower())
if stand_country.lower() != "not found":
if stand_country.lower() in acc_score["country"]:
if country_explanation:
acc_score["country"][stand_country.lower()].append(method_used + country_explanation)
else:
acc_score["country"][stand_country.lower()] = [method_used + country_explanation]
else:
if country.lower() in acc_score["country"]:
if country_explanation:
if len(method_used + country_explanation) > 0:
acc_score["country"][country.lower()].append(method_used + country_explanation)
else:
if len(method_used + country_explanation) > 0:
acc_score["country"][country.lower()] = [method_used + country_explanation]
# if spe_loc.lower() != "unknown":
# if spe_loc.lower() in acc_score["specific_location"]:
# acc_score["specific_location"][spe_loc.lower()].append(method_used)
# else:
# acc_score["specific_location"][spe_loc.lower()] = [method_used]
# if ethnic.lower() != "unknown":
# if ethnic.lower() in acc_score["ethnicity"]:
# acc_score["ethnicity"][ethnic.lower()].append(method_used)
# else:
# acc_score["ethnicity"][ethnic.lower()] = [method_used]
if sample_type.lower() != "unknown":
if sample_type.lower() in acc_score["sample_type"]:
if len(method_used + sample_type_explanation) > 0:
acc_score["sample_type"][sample_type.lower()].append(method_used + sample_type_explanation)
else:
if len(method_used + sample_type_explanation)> 0:
acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation]
total_cost_title += total_query_cost
if stop_flag is not None and stop_flag.value:
print(f"πŸ›‘ Stop processing {accession}, aborting early...")
return {}
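# Last resort: when no country or no sample type has been recorded yet, query the model once more on all available text (expanded metadata + all_output + chunk + NCBI features); anything still "unknown" afterwards is left unrecorded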
if len(acc_score["country"]) == 0 or len(acc_score["sample_type"]) == 0 or acc_score["country"] == "unknown" or acc_score["sample_type"] == "unknown":
text = ""
for key in meta_expand:
text += str(key) + ": " + meta_expand[key] + "\n"
if len(data_preprocess.normalize_for_overlap(all_output)) > 0:
text += data_preprocess.normalize_for_overlap(all_output)
if len(data_preprocess.normalize_for_overlap(chunk)) > 0:
text += data_preprocess.normalize_for_overlap(chunk)
text += ". NCBI Features: " + features
print("this is text for the last resort model")
print(text)
country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost = model.query_document_info(
primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
model.call_llm_api, chunk=text, all_output=text)
print("this is last resort results: ")
print("country: ", country)
print("sample type: ", sample_type)
if len(country) == 0: country = "unknown"
if len(sample_type) == 0: sample_type = "unknown"
# if country_explanation: country_explanation = "-"+country_explanation
# else: country_explanation = ""
# if sample_type_explanation: sample_type_explanation = "-"+sample_type_explanation
# else: sample_type_explanation = ""
if country_explanation and country_explanation!="unknown": country_explanation = "-"+country_explanation
else: country_explanation = ""
if sample_type_explanation and sample_type_explanation!="unknown": sample_type_explanation = "-"+sample_type_explanation
else: sample_type_explanation = ""
if method_used == "unknown": method_used = ""
if country.lower() != "unknown":
stand_country = standardize_location.smart_country_lookup(country.lower())
if stand_country.lower() != "not found":
if stand_country.lower() in acc_score["country"]:
if country_explanation:
acc_score["country"][stand_country.lower()].append(method_used + country_explanation)
else:
acc_score["country"][stand_country.lower()] = [method_used + country_explanation]
else:
if country.lower() in acc_score["country"]:
if country_explanation:
if len(method_used + country_explanation) > 0:
acc_score["country"][country.lower()].append(method_used + country_explanation)
else:
if len(method_used + country_explanation) > 0:
acc_score["country"][country.lower()] = [method_used + country_explanation]
if sample_type.lower() != "unknown":
if sample_type.lower() in acc_score["sample_type"]:
if len(method_used + sample_type_explanation) > 0:
acc_score["sample_type"][sample_type.lower()].append(method_used + sample_type_explanation)
else:
if len(method_used + sample_type_explanation)> 0:
acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation]
total_cost_title += total_query_cost
end = time.time()
#total_cost_title += total_query_cost
acc_score["query_cost"] = f"{total_cost_title:.6f}"
elapsed = end - start
acc_score["time_cost"] = f"{elapsed:.3f} seconds"
accs_output[acc] = acc_score
print(accs_output[acc])
return accs_output