# test1: MJ17 direct
# test2: "A1YU101" Thailand cross-ref
# test3: "EBK109" Thailand cross-ref
# test4: "OQ731952"/"BST115" for search query title: "South Asian maternal and paternal lineages in southern Thailand and"
import data_preprocess | |
import model | |
import mtdna_classifier | |
#import app | |
import smart_fallback | |
import pandas as pd | |
from pathlib import Path | |
import subprocess | |
from NER.html import extractHTML | |
import os | |
import google.generativeai as genai | |
import re | |
import standardize_location | |
# Helper functions for this pipeline
# Track time | |
import time | |
import multiprocessing | |
import gspread | |
from googleapiclient.discovery import build | |
from googleapiclient.http import MediaFileUpload, MediaIoBaseDownload | |
from google.oauth2.service_account import Credentials | |
from oauth2client.service_account import ServiceAccountCredentials | |
import io | |
import json | |
# --- Authentication setup ---
GDRIVE_PARENT_FOLDER_NAME = "mtDNA-Location-Classifier" | |
GDRIVE_DATA_FOLDER_NAME = os.environ["GDRIVE_DATA_FOLDER_NAME"]  # Drive folder ID of the shared data folder (used directly as data_folder_id below)
GCP_CREDS_DICT = json.loads(os.environ["GCP_CREDS_JSON"]) # from HF secrets | |
GDRIVE_CREDS = Credentials.from_service_account_info(GCP_CREDS_DICT, scopes=["https://www.googleapis.com/auth/drive"]) | |
drive_service = build("drive", "v3", credentials=GDRIVE_CREDS) | |
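# Optional sanity check for the credentials above. This is a minimal sketch, not part of
# the original pipeline: it assumes the service account behind GCP_CREDS_JSON has been
# shared on the folder referenced by GDRIVE_DATA_FOLDER_NAME, and only confirms that the
# Drive client authenticates.
def check_drive_auth():
    try:
        about = drive_service.about().get(fields="user").execute()
        print("Authenticated to Google Drive as:", about["user"].get("emailAddress", "unknown"))
        return True
    except Exception as e:
        print(f"Drive authentication check failed: {e}")
        return False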
def get_or_create_drive_folder(name, parent_id=None): | |
query = f"name='{name}' and mimeType='application/vnd.google-apps.folder'" | |
if parent_id: | |
query += f" and '{parent_id}' in parents" | |
results = drive_service.files().list(q=query, spaces='drive', fields="files(id, name)").execute() | |
items = results.get("files", []) | |
if items: | |
return items[0]["id"] | |
file_metadata = { | |
"name": name, | |
"mimeType": "application/vnd.google-apps.folder" | |
} | |
if parent_id: | |
file_metadata["parents"] = [parent_id] | |
file = drive_service.files().create(body=file_metadata, fields="id").execute() | |
return file["id"] | |
def find_drive_file(filename, parent_id): | |
""" | |
Checks if a file with the given name exists inside the specified Google Drive folder. | |
Returns the file ID if found, else None. | |
""" | |
try: | |
print(f"π Searching for '{filename}' in folder: {parent_id}") | |
query = f"'{parent_id}' in parents and name = '{filename}' and trashed = false" | |
results = drive_service.files().list( | |
q=query, | |
spaces='drive', | |
fields='files(id, name)', | |
pageSize=1 | |
).execute() | |
files = results.get('files', []) | |
if files: | |
print(f"β Found file: {files[0]['name']} with ID: {files[0]['id']}") | |
return files[0]["id"] | |
else: | |
print("β οΈ File not found.") | |
return None | |
except Exception as e: | |
print(f"β Error during find_drive_file: {e}") | |
return None | |
def upload_file_to_drive(local_path, remote_name, folder_id): | |
try: | |
if not os.path.exists(local_path): | |
raise FileNotFoundError(f"β Local file does not exist: {local_path}") | |
# Delete existing file on Drive if present | |
existing = drive_service.files().list( | |
q=f"name='{remote_name}' and '{folder_id}' in parents and trashed = false", | |
fields="files(id)" | |
).execute().get("files", []) | |
if existing: | |
drive_service.files().delete(fileId=existing[0]["id"]).execute() | |
print(f"ποΈ Deleted existing '{remote_name}' in Drive folder {folder_id}") | |
file_metadata = {"name": remote_name, "parents": [folder_id]} | |
media = MediaFileUpload(local_path, resumable=True) | |
file = drive_service.files().create( | |
body=file_metadata, | |
media_body=media, | |
fields="id" | |
).execute() | |
print(f"β Uploaded '{remote_name}' to Google Drive folder ID: {folder_id}") | |
return file["id"] | |
except Exception as e: | |
print(f"β Error during upload: {e}") | |
return None | |
def download_file_from_drive(remote_name, folder_id, local_path): | |
results = drive_service.files().list(q=f"name='{remote_name}' and '{folder_id}' in parents", fields="files(id)").execute() | |
files = results.get("files", []) | |
if not files: | |
return False | |
file_id = files[0]["id"] | |
request = drive_service.files().get_media(fileId=file_id) | |
fh = io.FileIO(local_path, 'wb') | |
downloader = MediaIoBaseDownload(fh, request) | |
done = False | |
while not done: | |
_, done = downloader.next_chunk() | |
return True | |
def download_drive_file_content(file_id): | |
request = drive_service.files().get_media(fileId=file_id) | |
fh = io.BytesIO() | |
downloader = MediaIoBaseDownload(fh, request) | |
done = False | |
while not done: | |
_, done = downloader.next_chunk() | |
fh.seek(0) | |
return fh.read().decode("utf-8") | |
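# Illustrative round trip using the Drive helpers above (names and paths are
# hypothetical, for demonstration only):
#   folder_id = get_or_create_drive_folder("scratch", parent_id=GDRIVE_DATA_FOLDER_NAME)
#   upload_file_to_drive("/tmp/example.json", "example.json", folder_id)
#   download_file_from_drive("example.json", folder_id, "/tmp/example_copy.json")
#   text = download_drive_file_content(find_drive_file("example.json", folder_id))
# Note that download_drive_file_content decodes the bytes as UTF-8, so it suits text/JSON
# assets rather than binary .docx files.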
def run_with_timeout(func, args=(), kwargs={}, timeout=30): | |
def wrapper(q, *args, **kwargs): | |
try: | |
result = func(*args, **kwargs) | |
q.put((True, result)) | |
except Exception as e: | |
q.put((False, e)) | |
q = multiprocessing.Queue() | |
p = multiprocessing.Process(target=wrapper, args=(q, *args), kwargs=kwargs) | |
p.start() | |
p.join(timeout) | |
if p.is_alive(): | |
p.terminate() | |
p.join() | |
print(f"β±οΈ Timeout exceeded ({timeout} sec) β function killed.") | |
return False, None | |
if not q.empty(): | |
success, result = q.get() | |
if success: | |
return True, result | |
else: | |
raise result # re-raise exception if needed | |
return False, None | |
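# Example usage of run_with_timeout (illustrative; slow_parse is a made-up function name):
#   ok, value = run_with_timeout(slow_parse, args=("report.docx",), timeout=60)
#   if not ok:
#       value = ""  # fall back to an empty result on timeout
# Because this spawns a multiprocessing.Process, func and its arguments may need to be
# picklable depending on the start method; exceptions raised inside func are re-raised
# in the parent process.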
def time_it(func, *args, **kwargs): | |
""" | |
Measure how long a function takes to run and return its result + time. | |
""" | |
start = time.time() | |
result = func(*args, **kwargs) | |
end = time.time() | |
elapsed = end - start | |
print(f"β±οΈ '{func.__name__}' took {elapsed:.3f} seconds") | |
return result, elapsed | |
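# Example (illustrative, using the test4 accession from the header comments):
#   meta, secs = time_it(mtdna_classifier.fetch_ncbi_metadata, "OQ731952")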
# --- Pricing constants (Gemini 2.5 Flash-Lite & embedding-001) are defined inside pipeline_with_gemini below ---
def unique_preserve_order(seq): | |
seen = set() | |
return [x for x in seq if not (x in seen or seen.add(x))] | |
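# Example: unique_preserve_order(["a", "b", "a", "c", "b"]) -> ["a", "b", "c"]
# (keeps the first occurrence of each item, unlike set(), which loses order).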
# Main execution | |
def pipeline_with_gemini(accessions,stop_flag=None, niche_cases=None, save_df=None): | |
# output: country, sample_type, ethnic, location, money_cost, time_cost, explain | |
# there can be one accession number in the accessions | |
# Prices are per 1,000 tokens | |
# Before each big step: | |
if stop_flag is not None and stop_flag.value: | |
print(f"π Stop detected before starting {accession}, aborting early...") | |
return {} | |
# PRICE_PER_1K_INPUT_LLM = 0.000075 # $0.075 per 1M tokens | |
# PRICE_PER_1K_OUTPUT_LLM = 0.0003 # $0.30 per 1M tokens | |
# PRICE_PER_1K_EMBEDDING_INPUT = 0.000025 # $0.025 per 1M tokens | |
# Gemini 2.5 Flash-Lite pricing per 1,000 tokens | |
PRICE_PER_1K_INPUT_LLM = 0.00010 # $0.10 per 1M input tokens | |
PRICE_PER_1K_OUTPUT_LLM = 0.00040 # $0.40 per 1M output tokens | |
# Embedding-001 pricing per 1,000 input tokens | |
PRICE_PER_1K_EMBEDDING_INPUT = 0.00015 # $0.15 per 1M input tokens | |
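# Worked example of how these rates are applied below: a call with 12,000 input tokens
# and 800 output tokens costs roughly
#   (12000 / 1000) * 0.00010 + (800 / 1000) * 0.00040 = 0.0012 + 0.00032 = $0.00152,
# and embedding a 50,000-token document once costs (50000 / 1000) * 0.00015 = $0.0075.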
if not accessions: | |
print("no input") | |
return None | |
else: | |
accs_output = {} | |
#genai.configure(api_key=os.getenv("GOOGLE_API_KEY")) | |
genai.configure(api_key=os.getenv("GOOGLE_API_KEY_BACKUP")) | |
for acc in accessions: | |
print("start gemini: ", acc) | |
start = time.time() | |
total_cost_title = 0 | |
jsonSM, links, article_text = {},[], "" | |
acc_score = { "isolate": "", | |
"country":{}, | |
"sample_type":{}, | |
#"specific_location":{}, | |
#"ethnicity":{}, | |
"query_cost":total_cost_title, | |
"time_cost":None, | |
"source":links, | |
"file_chunk":"", | |
"file_all_output":""} | |
if niche_cases: | |
for niche in niche_cases: | |
acc_score[niche] = {} | |
meta = mtdna_classifier.fetch_ncbi_metadata(acc) | |
country, spe_loc, ethnic, sample_type, col_date, iso, title, doi, pudID, features = meta["country"], meta["specific_location"], meta["ethnicity"], meta["sample_type"], meta["collection_date"], meta["isolate"], meta["title"], meta["doi"], meta["pubmed_id"], meta["all_features"] | |
acc_score["isolate"] = iso | |
print("meta: ",meta) | |
meta_expand = smart_fallback.fetch_ncbi(acc) | |
print("meta expand: ", meta_expand) | |
# set up step: create the folder to save document | |
chunk, all_output = "","" | |
if pudID: | |
id = str(pudID) | |
saveTitle = title | |
else: | |
try: | |
author_name = meta_expand["authors"].split(',')[0] # Use last name only | |
except: | |
author_name = meta_expand["authors"] | |
saveTitle = title + "_" + col_date + "_" + author_name | |
if title.lower() == "unknown" and col_date.lower()=="unknown" and author_name.lower() == "unknown": | |
saveTitle += "_" + acc | |
id = "DirectSubmission" | |
# folder_path = Path("/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id)) | |
# if not folder_path.exists(): | |
# cmd = f'mkdir /content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/{id}' | |
# result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) | |
# print("data/"+str(id) +" created.") | |
# else: | |
# print("data/"+str(id) +" already exists.") | |
# saveLinkFolder = "/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id) | |
# parent_folder_id = get_or_create_drive_folder(GDRIVE_PARENT_FOLDER_NAME) | |
# data_folder_id = get_or_create_drive_folder(GDRIVE_DATA_FOLDER_NAME, parent_id=parent_folder_id) | |
# sample_folder_id = get_or_create_drive_folder(str(id), parent_id=data_folder_id) | |
data_folder_id = GDRIVE_DATA_FOLDER_NAME # Use the shared folder directly | |
sample_folder_id = get_or_create_drive_folder(str(id), parent_id=data_folder_id) | |
print("sample folder id: ", sample_folder_id) | |
# Define document names | |
saveName = saveTitle[:50] if len(saveTitle) > 50 else saveTitle
saveName = saveName.replace(" ", "_")
chunk_filename = f"{saveName}_merged_document.docx"
all_filename = f"{saveName}_all_merged_document.docx"
print("chunk file name and all filename: ", chunk_filename, all_filename) | |
# Define local temp paths for reading/writing | |
# import tempfile | |
# tmp_dir = tempfile.mkdtemp() | |
LOCAL_TEMP_DIR = "/mnt/data/generated_docs" | |
os.makedirs(LOCAL_TEMP_DIR, exist_ok=True) | |
file_chunk_path = os.path.join(LOCAL_TEMP_DIR, chunk_filename) | |
file_all_path = os.path.join(LOCAL_TEMP_DIR, all_filename) | |
# file_chunk_path = os.path.join(tempfile.gettempdir(), chunk_filename) | |
# file_all_path = os.path.join(tempfile.gettempdir(), all_filename) | |
if stop_flag is not None and stop_flag.value: | |
print(f"π Stop processing {accession}, aborting early...") | |
return {} | |
print("this is file chunk path: ", file_chunk_path) | |
chunk_id = find_drive_file(chunk_filename, sample_folder_id) | |
all_id = find_drive_file(all_filename, sample_folder_id) | |
if chunk_id and all_id: | |
print("β Files already exist in Google Drive. Downloading them...") | |
chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path) | |
all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path) | |
acc_score["file_chunk"] = str(chunk_filename) | |
acc_score["file_all_output"] = str(all_filename) | |
print("chunk_id and all_id: ") | |
print(chunk_id, all_id) | |
print("file chunk and all output saved in acc score: ", acc_score["file_chunk"], acc_score["file_all_output"]) | |
file = drive_service.files().get(fileId="1LUJRTrq8yt4S4lLwCvTmlxaKqpr0nvEn", fields="id, name, parents, webViewLink").execute()
print("Name:", file["name"])
print("Parent folder ID:", file["parents"][0])
print("View link:", file["webViewLink"])
# Read and parse these into `chunk` and `all_output` | |
else: | |
# Remove any stale local copies
if os.path.exists(file_chunk_path): | |
os.remove(file_chunk_path) | |
print(f"ποΈ Removed stale: {file_chunk_path}") | |
if os.path.exists(file_all_path): | |
os.remove(file_all_path) | |
print(f"ποΈ Removed stale: {file_all_path}") | |
# Try to download if already exists on Drive | |
chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path) | |
all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path) | |
print("chunk exist: ", chunk_exists) | |
# first way: ncbi method | |
print("country.lower: ",country.lower()) | |
if country.lower() != "unknown": | |
stand_country = standardize_location.smart_country_lookup(country.lower()) | |
print("stand_country: ", stand_country) | |
if stand_country.lower() != "not found": | |
acc_score["country"][stand_country.lower()] = ["ncbi"] | |
else: acc_score["country"][country.lower()] = ["ncbi"] | |
# if spe_loc.lower() != "unknown": | |
# acc_score["specific_location"][spe_loc.lower()] = ["ncbi"] | |
# if ethnic.lower() != "unknown": | |
# acc_score["ethnicity"][ethnic.lower()] = ["ncbi"] | |
if sample_type.lower() != "unknown": | |
acc_score["sample_type"][sample_type.lower()] = ["ncbi"] | |
# second way: LLM model | |
# Preprocess the input token | |
print(acc_score) | |
accession, isolate = None, None | |
if acc != "unknown": accession = acc | |
if iso != "unknown": isolate = iso | |
if stop_flag is not None and stop_flag.value: | |
print(f"π Stop processing {accession}, aborting early...") | |
return {} | |
# check doi first | |
print("chunk filename: ", chunk_filename) | |
if chunk_exists: | |
print("File chunk exists!") | |
if not chunk: | |
print("start to get chunk") | |
text, table, document_title = model.read_docx_text(file_chunk_path) | |
chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table)) | |
if str(chunk_filename) != "": | |
print("first time have chunk path at chunk exist: ", str(chunk_filename)) | |
acc_score["file_chunk"] = str(chunk_filename) | |
if all_exists: | |
print("File all output exists!") | |
if not all_output: | |
text_all, table_all, document_title_all = model.read_docx_text(file_all_path) | |
all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all)) | |
if str(all_filename) != "": | |
print("first time have all path at all exist: ", str(all_filename)) | |
acc_score["file_all_output"] = str(all_filename) | |
print("acc sscore for file all output and chunk: ", acc_score["file_all_output"], acc_score["file_chunk"]) | |
if len(acc_score["file_all_output"]) == 0 and len(acc_score["file_chunk"]) == 0: | |
if doi != "unknown": | |
link = 'https://doi.org/' + doi | |
# get the file to create listOfFile for each id | |
print("link of doi: ", link) | |
html = extractHTML.HTML("",link) | |
jsonSM = html.getSupMaterial() | |
article_text = html.getListSection() | |
if article_text: | |
if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower(): | |
links.append(link) | |
if jsonSM: | |
links += sum((jsonSM[key] for key in jsonSM),[]) | |
# no doi then google custom search api | |
if doi=="unknown" or len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower(): | |
# might find the article | |
print("no article text, start tem link") | |
#tem_links = mtdna_classifier.search_google_custom(title, 2) | |
tem_links = smart_fallback.smart_google_search(meta_expand) | |
print("tem links: ", tem_links) | |
tem_link_acc = smart_fallback.google_accession_search(acc) | |
tem_links += tem_link_acc | |
tem_links = unique_preserve_order(tem_links) | |
print("tem link before filtering: ", tem_links) | |
# filter the quality link | |
print("saveLinkFolder as sample folder id: ", sample_folder_id) | |
print("start the smart filter link") | |
if stop_flag is not None and stop_flag.value: | |
print(f"π Stop processing {accession}, aborting early...") | |
return {} | |
# success_process, output_process = run_with_timeout(smart_fallback.filter_links_by_metadata,args=(tem_links,sample_folder_id),kwargs={"accession":acc}) | |
# if success_process: | |
# links = output_process | |
# print("yes succeed for smart filter link") | |
# else: | |
# print("no suceed, fallback to all tem links") | |
# links = tem_links | |
links = smart_fallback.filter_links_by_metadata(tem_links, saveLinkFolder=sample_folder_id, accession=acc, stop_flag=stop_flag) | |
print("this is links: ",links) | |
links = unique_preserve_order(links) | |
acc_score["source"] = links | |
else: | |
print("inside the try of reusing chunk or all output") | |
#print("chunk filename: ", str(chunks_filename)) | |
try: | |
temp_source = False | |
if save_df is not None and not save_df.empty: | |
print("save df not none") | |
print("chunk file name: ",str(chunk_filename)) | |
print("all filename: ",str(all_filename)) | |
if acc_score["file_chunk"]: | |
link = save_df.loc[save_df["file_chunk"]==acc_score["file_chunk"],"Sources"].iloc[0] | |
#link = row["Sources"].iloc[0] | |
if "http" in link: | |
print("yeah http in save df source") | |
acc_score["source"] = [x for x in link.split("\n") if x.strip()]#row["Sources"].tolist() | |
else: # temporary | |
print("tempo source") | |
#acc_score["source"] = [str(all_filename), str(chunks_filename)] | |
temp_source = True | |
elif acc_score["file_all_output"]: | |
link = save_df.loc[save_df["file_all_output"]==acc_score["file_all_output"],"Sources"].iloc[0] | |
#link = row["Sources"].iloc[0] | |
print(link) | |
print("list of link") | |
print([x for x in link.split("\n") if x.strip()]) | |
if "http" in link: | |
print("yeah http in save df source") | |
acc_score["source"] = [x for x in link.split("\n") if x.strip()]#row["Sources"].tolist() | |
else: # temporary | |
print("tempo source") | |
#acc_score["source"] = [str(all_filename), str(chunks_filename)] | |
temp_source = True | |
else: # temporary | |
print("tempo source") | |
#acc_score["source"] = [str(file_all_path), str(file_chunk_path)] | |
temp_source = True | |
else: # temporary | |
print("tempo source") | |
#acc_score["source"] = [str(file_all_path), str(file_chunk_path)] | |
temp_source = True | |
if temp_source: | |
print("temp source is true so have to try again search link") | |
if doi != "unknown": | |
link = 'https://doi.org/' + doi | |
# get the file to create listOfFile for each id | |
print("link of doi: ", link) | |
html = extractHTML.HTML("",link) | |
jsonSM = html.getSupMaterial() | |
article_text = html.getListSection() | |
if article_text: | |
if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower(): | |
links.append(link) | |
if jsonSM: | |
links += sum((jsonSM[key] for key in jsonSM),[]) | |
# no doi then google custom search api | |
if doi=="unknown" or len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower(): | |
# might find the article | |
print("no article text, start tem link") | |
#tem_links = mtdna_classifier.search_google_custom(title, 2) | |
tem_links = smart_fallback.smart_google_search(meta_expand) | |
print("tem links: ", tem_links) | |
tem_link_acc = smart_fallback.google_accession_search(acc) | |
tem_links += tem_link_acc | |
tem_links = unique_preserve_order(tem_links) | |
print("tem link before filtering: ", tem_links) | |
# filter the quality link | |
print("saveLinkFolder as sample folder id: ", sample_folder_id) | |
print("start the smart filter link") | |
if stop_flag is not None and stop_flag.value: | |
print(f"π Stop processing {accession}, aborting early...") | |
return {} | |
links = smart_fallback.filter_links_by_metadata(tem_links, saveLinkFolder=sample_folder_id, accession=acc, stop_flag=stop_flag) | |
print("this is links: ",links) | |
links = unique_preserve_order(links) | |
acc_score["source"] = links | |
except Exception as e:
print(f"Error while resolving sources: {e}")
acc_score["source"] = [] | |
# chunk_path = "/"+saveTitle+"_merged_document.docx" | |
# all_path = "/"+saveTitle+"_all_merged_document.docx" | |
# # if chunk and all output not exist yet | |
# file_chunk_path = saveLinkFolder + chunk_path | |
# file_all_path = saveLinkFolder + all_path | |
# if os.path.exists(file_chunk_path): | |
# print("File chunk exists!") | |
# if not chunk: | |
# text, table, document_title = model.read_docx_text(file_chunk_path) | |
# chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table)) | |
# if os.path.exists(file_all_path): | |
# print("File all output exists!") | |
# if not all_output: | |
# text_all, table_all, document_title_all = model.read_docx_text(file_all_path) | |
# all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all)) | |
if stop_flag is not None and stop_flag.value: | |
print(f"π Stop processing {accession}, aborting early...") | |
return {} | |
# print("chunk filename: ", chunk_filename) | |
# if chunk_exists: | |
# print("File chunk exists!") | |
# if not chunk: | |
# print("start to get chunk") | |
# text, table, document_title = model.read_docx_text(file_chunk_path) | |
# chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table)) | |
# if str(chunk_filename) != "": | |
# print("first time have chunk path at chunk exist: ", str(chunk_filename)) | |
# acc_score["file_chunk"] = str(chunk_filename) | |
# if all_exists: | |
# print("File all output exists!") | |
# if not all_output: | |
# text_all, table_all, document_title_all = model.read_docx_text(file_all_path) | |
# all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all)) | |
# if str(all_filename) != "": | |
# print("first time have all path at all exist: ", str(all_filename)) | |
# acc_score["file_all_output"] = str(all_filename) | |
if not chunk and not all_output: | |
print("not chunk and all output") | |
# else: check if we can reuse these chunk and all output of existed accession to find another | |
if str(chunk_filename) != "": | |
print("first time have chunk path: ", str(chunk_filename)) | |
acc_score["file_chunk"] = str(chunk_filename) | |
if str(all_filename) != "": | |
print("first time have all path: ", str(all_filename)) | |
acc_score["file_all_output"] = str(all_filename) | |
if links: | |
for link in links: | |
print(link) | |
# if len(all_output) > 1000*1000: | |
# all_output = data_preprocess.normalize_for_overlap(all_output) | |
# print("after normalizing all output: ", len(all_output)) | |
if len(data_preprocess.normalize_for_overlap(all_output)) > 600000: | |
print("break here") | |
break | |
if iso != "unknown": query_kw = iso | |
else: query_kw = acc | |
#text_link, tables_link, final_input_link = data_preprocess.preprocess_document(link,saveLinkFolder, isolate=query_kw) | |
success_process, output_process = run_with_timeout(data_preprocess.preprocess_document,args=(link,sample_folder_id),kwargs={"isolate":query_kw,"accession":acc},timeout=100) | |
if stop_flag is not None and stop_flag.value: | |
print(f"π Stop processing {accession}, aborting early...") | |
return {} | |
if success_process: | |
text_link, tables_link, final_input_link = output_process[0], output_process[1], output_process[2] | |
print("yes succeed for process document") | |
else: text_link, tables_link, final_input_link = "", "", "" | |
context = data_preprocess.extract_context(final_input_link, query_kw) | |
if context != "Sample ID not found.": | |
if len(data_preprocess.normalize_for_overlap(chunk)) < 1000*1000: | |
success_chunk, the_output_chunk = run_with_timeout(data_preprocess.merge_texts_skipping_overlap,args=(chunk, context)) | |
if stop_flag is not None and stop_flag.value: | |
print(f"π Stop processing {accession}, aborting early...") | |
return {} | |
if success_chunk: | |
chunk = the_output_chunk#data_preprocess.merge_texts_skipping_overlap(all_output, final_input_link) | |
print("yes succeed for chunk") | |
else: | |
chunk += context | |
print("len context: ", len(context)) | |
print("basic fall back") | |
print("len chunk after: ", len(chunk)) | |
if len(final_input_link) > 1000*1000: | |
if context != "Sample ID not found.": | |
final_input_link = context | |
else: | |
final_input_link = data_preprocess.normalize_for_overlap(final_input_link) | |
if len(final_input_link) > 1000 *1000: | |
final_input_link = final_input_link[:100000] | |
if len(data_preprocess.normalize_for_overlap(all_output)) < int(100000) and len(final_input_link)<100000: | |
print("Running merge_texts_skipping_overlap with timeout") | |
success, the_output = run_with_timeout(data_preprocess.merge_texts_skipping_overlap,args=(all_output, final_input_link),timeout=30) | |
if stop_flag is not None and stop_flag.value: | |
print(f"π Stop processing {accession}, aborting early...") | |
return {} | |
print("Returned from timeout logic") | |
if success: | |
all_output = the_output#data_preprocess.merge_texts_skipping_overlap(all_output, final_input_link) | |
print("yes succeed") | |
else: | |
print("len all output: ", len(all_output)) | |
print("len final input link: ", len(final_input_link)) | |
all_output += final_input_link | |
print("len final input: ", len(final_input_link)) | |
print("basic fall back") | |
else: | |
print("both/either all output or final link too large more than 100000") | |
print("len all output: ", len(all_output)) | |
print("len final input link: ", len(final_input_link)) | |
all_output += final_input_link | |
print("len final input: ", len(final_input_link)) | |
print("basic fall back") | |
print("len all output after: ", len(all_output)) | |
#country_pro, chunk, all_output = data_preprocess.process_inputToken(links, saveLinkFolder, accession=accession, isolate=isolate) | |
if stop_flag is not None and stop_flag.value: | |
print(f"π Stop processing {accession}, aborting early...") | |
return {} | |
else: | |
chunk = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features | |
all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features | |
if not chunk: chunk = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features | |
if not all_output: all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features | |
if len(all_output) > 1*1024*1024: | |
all_output = data_preprocess.normalize_for_overlap(all_output) | |
if len(all_output) > 1*1024*1024: | |
all_output = all_output[:1*1024*1024] | |
print("chunk len: ", len(chunk)) | |
print("all output len: ", len(all_output)) | |
data_preprocess.save_text_to_docx(chunk, file_chunk_path) | |
data_preprocess.save_text_to_docx(all_output, file_all_path) | |
# Later when saving new files | |
# data_preprocess.save_text_to_docx(chunk, chunk_filename, sample_folder_id) | |
# data_preprocess.save_text_to_docx(all_output, all_filename, sample_folder_id) | |
# Upload to Drive | |
result_chunk_upload = upload_file_to_drive(file_chunk_path, chunk_filename, sample_folder_id) | |
result_all_upload = upload_file_to_drive(file_all_path, all_filename, sample_folder_id) | |
print("UPLOAD RESULT FOR CHUNK: ", result_chunk_upload) | |
print(f"π Uploaded file: https://drive.google.com/file/d/{result_chunk_upload}/view") | |
print("here 1") | |
# else: | |
# final_input = "" | |
# if all_output: | |
# final_input = all_output | |
# else: | |
# if chunk: final_input = chunk | |
# #data_preprocess.merge_texts_skipping_overlap(final_input, all_output) | |
# if final_input: | |
# keywords = [] | |
# if iso != "unknown": keywords.append(iso) | |
# if acc != "unknown": keywords.append(acc) | |
# for keyword in keywords: | |
# chunkBFS = data_preprocess.get_contextual_sentences_BFS(final_input, keyword) | |
# countryDFS, chunkDFS = data_preprocess.get_contextual_sentences_DFS(final_input, keyword) | |
# chunk = data_preprocess.merge_texts_skipping_overlap(chunk, chunkDFS) | |
# chunk = data_preprocess.merge_texts_skipping_overlap(chunk, chunkBFS) | |
# Define paths for cached RAG assets | |
# faiss_index_path = saveLinkFolder+"/faiss_index.bin" | |
# document_chunks_path = saveLinkFolder+"/document_chunks.json" | |
# structured_lookup_path = saveLinkFolder+"/structured_lookup.json" | |
print("here 2") | |
faiss_filename = "faiss_index.bin" | |
chunks_filename = "document_chunks.json" | |
lookup_filename = "structured_lookup.json" | |
print("name of faiss: ", faiss_filename) | |
faiss_index_path = os.path.join(LOCAL_TEMP_DIR, faiss_filename) | |
document_chunks_path = os.path.join(LOCAL_TEMP_DIR, chunks_filename) | |
structured_lookup_path = os.path.join(LOCAL_TEMP_DIR, lookup_filename) | |
print("name if faiss path: ", faiss_index_path) | |
# π₯ Remove the local file first if it exists | |
print("start faiss id and also the sample folder id is: ", sample_folder_id) | |
faiss_id = find_drive_file(faiss_filename, sample_folder_id) | |
print("done faiss id") | |
document_id = find_drive_file(chunks_filename, sample_folder_id) | |
structure_id = find_drive_file(lookup_filename, sample_folder_id) | |
if faiss_id and document_id and structure_id: | |
print("β 3 Files already exist in Google Drive. Downloading them...") | |
download_file_from_drive(faiss_filename, sample_folder_id, faiss_index_path) | |
download_file_from_drive(chunks_filename, sample_folder_id, document_chunks_path) | |
download_file_from_drive(lookup_filename, sample_folder_id, structured_lookup_path) | |
# Read and parse these into `chunk` and `all_output` | |
else: | |
"one of id not exist" | |
if os.path.exists(faiss_index_path): | |
print("faiss index exist and start to remove: ", faiss_index_path) | |
os.remove(faiss_index_path) | |
if os.path.exists(document_chunks_path): | |
os.remove(document_chunks_path) | |
if os.path.exists(structured_lookup_path): | |
os.remove(structured_lookup_path) | |
print("start to download the faiss, chunk, lookup") | |
download_file_from_drive(faiss_filename, sample_folder_id, faiss_index_path) | |
download_file_from_drive(chunks_filename, sample_folder_id, document_chunks_path) | |
download_file_from_drive(lookup_filename, sample_folder_id, structured_lookup_path) | |
# Define the query keywords before the try block so the gemini-1.5 fallback in the
# except branch can still use them if an exception is raised early.
primary_word, alternative_word = iso, acc
try:
print("try gemini 2.5") | |
print("move to load rag") | |
master_structured_lookup, faiss_index, document_chunks = model.load_rag_assets( | |
faiss_index_path, document_chunks_path, structured_lookup_path | |
) | |
global_llm_model_for_counting_tokens = genai.GenerativeModel('gemini-1.5-flash-latest') | |
if not all_output: | |
if chunk: all_output = chunk | |
else: all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features | |
if faiss_index is None: | |
print("\nBuilding RAG assets (structured lookup, FAISS index, chunks)...") | |
total_doc_embedding_tokens = global_llm_model_for_counting_tokens.count_tokens( | |
all_output | |
).total_tokens | |
initial_embedding_cost = (total_doc_embedding_tokens / 1000) * PRICE_PER_1K_EMBEDDING_INPUT | |
total_cost_title += initial_embedding_cost | |
print(f"Initial one-time embedding cost for '{file_all_path}' ({total_doc_embedding_tokens} tokens): ${initial_embedding_cost:.6f}") | |
master_structured_lookup, faiss_index, document_chunks, plain_text_content = model.build_vector_index_and_data( | |
file_all_path, faiss_index_path, document_chunks_path, structured_lookup_path | |
) | |
else: | |
print("\nRAG assets loaded from file. No re-embedding of entire document will occur.") | |
plain_text_content_all, table_strings_all, document_title_all = model.read_docx_text(file_all_path) | |
master_structured_lookup['document_title'] = master_structured_lookup.get('document_title', document_title_all) | |
if stop_flag is not None and stop_flag.value: | |
print(f"π Stop processing {accession}, aborting early...") | |
return {} | |
primary_word = iso | |
alternative_word = acc | |
print(f"\n--- General Query: Primary='{primary_word}' (Alternative='{alternative_word}') ---") | |
if features.lower() not in all_output.lower(): | |
all_output += ". NCBI Features: " + features | |
# country, sample_type, method_used, ethnic, spe_loc, total_query_cost = model.query_document_info( | |
# primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks, | |
# model.call_llm_api, chunk=chunk, all_output=all_output) | |
print("this is chunk for the model") | |
print(chunk) | |
print("this is all output for the model") | |
print(all_output) | |
if stop_flag is not None and stop_flag.value: | |
print(f"π Stop processing {accession}, aborting early...") | |
return {} | |
country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost = model.query_document_info( | |
primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks, | |
model.call_llm_api, chunk=chunk, all_output=all_output) | |
print("pass query of 2.5") | |
except Exception as e:
print(f"gemini 2.5 attempt failed ({e}); trying gemini 1.5")
country, sample_type, ethnic, spe_loc, method_used, country_explanation, sample_type_explanation, ethnicity_explanation, specific_loc_explanation, total_query_cost = model.query_document_info( | |
primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks, | |
model.call_llm_api, chunk=chunk, all_output=all_output, model_ai="gemini-1.5-flash-latest") | |
print("yeah pass the query of 1.5") | |
print("country using ai: ", country) | |
print("sample type using ai: ", sample_type) | |
if len(country) == 0: country = "unknown" | |
if len(sample_type) == 0: sample_type = "unknown" | |
if country_explanation and country_explanation!="unknown": country_explanation = "-"+country_explanation | |
else: country_explanation = "" | |
if sample_type_explanation and sample_type_explanation!="unknown": sample_type_explanation = "-"+sample_type_explanation | |
else: sample_type_explanation = "" | |
if method_used == "unknown": method_used = "" | |
if country.lower() != "unknown": | |
stand_country = standardize_location.smart_country_lookup(country.lower()) | |
if stand_country.lower() != "not found": | |
if stand_country.lower() in acc_score["country"]: | |
if country_explanation: | |
acc_score["country"][stand_country.lower()].append(method_used + country_explanation) | |
else: | |
acc_score["country"][stand_country.lower()] = [method_used + country_explanation] | |
else: | |
if country.lower() in acc_score["country"]: | |
if country_explanation: | |
if len(method_used + country_explanation) > 0: | |
acc_score["country"][country.lower()].append(method_used + country_explanation) | |
else: | |
if len(method_used + country_explanation) > 0: | |
acc_score["country"][country.lower()] = [method_used + country_explanation] | |
# if spe_loc.lower() != "unknown": | |
# if spe_loc.lower() in acc_score["specific_location"]: | |
# acc_score["specific_location"][spe_loc.lower()].append(method_used) | |
# else: | |
# acc_score["specific_location"][spe_loc.lower()] = [method_used] | |
# if ethnic.lower() != "unknown": | |
# if ethnic.lower() in acc_score["ethnicity"]: | |
# acc_score["ethnicity"][ethnic.lower()].append(method_used) | |
# else: | |
# acc_score["ethnicity"][ethnic.lower()] = [method_used] | |
if sample_type.lower() != "unknown": | |
if sample_type.lower() in acc_score["sample_type"]: | |
if len(method_used + sample_type_explanation) > 0: | |
acc_score["sample_type"][sample_type.lower()].append(method_used + sample_type_explanation) | |
else: | |
if len(method_used + sample_type_explanation)> 0: | |
acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation] | |
total_cost_title += total_query_cost | |
if stop_flag is not None and stop_flag.value: | |
print(f"π Stop processing {accession}, aborting early...") | |
return {} | |
# last resort: combine all information to give all output otherwise unknown | |
if len(acc_score["country"]) == 0 or len(acc_score["sample_type"]) == 0 or acc_score["country"] == "unknown" or acc_score["sample_type"] == "unknown": | |
text = "" | |
for key in meta_expand: | |
text += str(key) + ": " + str(meta_expand[key]) + "\n"
if len(data_preprocess.normalize_for_overlap(all_output)) > 0: | |
text += data_preprocess.normalize_for_overlap(all_output) | |
if len(data_preprocess.normalize_for_overlap(chunk)) > 0: | |
text += data_preprocess.normalize_for_overlap(chunk) | |
text += ". NCBI Features: " + features | |
print("this is text for the last resort model") | |
print(text) | |
country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost = model.query_document_info( | |
primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks, | |
model.call_llm_api, chunk=text, all_output=text) | |
print("this is last resort results: ") | |
print("country: ", country) | |
print("sample type: ", sample_type) | |
if len(country) == 0: country = "unknown" | |
if len(sample_type) == 0: sample_type = "unknown" | |
if country_explanation and country_explanation!="unknown": country_explanation = "-"+country_explanation | |
else: country_explanation = "" | |
if sample_type_explanation and sample_type_explanation!="unknown": sample_type_explanation = "-"+sample_type_explanation | |
else: sample_type_explanation = "" | |
if method_used == "unknown": method_used = "" | |
if country.lower() != "unknown": | |
stand_country = standardize_location.smart_country_lookup(country.lower()) | |
if stand_country.lower() != "not found": | |
if stand_country.lower() in acc_score["country"]: | |
if country_explanation: | |
acc_score["country"][stand_country.lower()].append(method_used + country_explanation) | |
else: | |
acc_score["country"][stand_country.lower()] = [method_used + country_explanation] | |
else: | |
if country.lower() in acc_score["country"]: | |
if country_explanation: | |
if len(method_used + country_explanation) > 0: | |
acc_score["country"][country.lower()].append(method_used + country_explanation) | |
else: | |
if len(method_used + country_explanation) > 0: | |
acc_score["country"][country.lower()] = [method_used + country_explanation] | |
if sample_type.lower() != "unknown": | |
if sample_type.lower() in acc_score["sample_type"]: | |
if len(method_used + sample_type_explanation) > 0: | |
acc_score["sample_type"][sample_type.lower()].append(method_used + sample_type_explanation) | |
else: | |
if len(method_used + sample_type_explanation)> 0: | |
acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation] | |
total_cost_title += total_query_cost | |
end = time.time() | |
#total_cost_title += total_query_cost | |
acc_score["query_cost"] = f"{total_cost_title:.6f}" | |
elapsed = end - start | |
acc_score["time_cost"] = f"{elapsed:.3f} seconds" | |
accs_output[acc] = acc_score | |
print(accs_output[acc]) | |
return accs_output |
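# Minimal local smoke test for the pipeline above. This is a sketch, not part of the
# original flow: it assumes the required secrets (GCP_CREDS_JSON, GDRIVE_DATA_FOLDER_NAME,
# GOOGLE_API_KEY_BACKUP) are set in the environment, and reuses the accession from test4
# in the header comments.
if __name__ == "__main__":
    test_accessions = ["OQ731952"]
    results = pipeline_with_gemini(test_accessions)
    for acc_id, score in (results or {}).items():
        print(acc_id, "->", score.get("country"), "|", score.get("sample_type"))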