diff --git "a/pipeline.py" "b/pipeline.py" --- "a/pipeline.py" +++ "b/pipeline.py" @@ -1,920 +1,1088 @@ -# test1: MJ17 direct -# test2: "A1YU101" thailand cross-ref -# test3: "EBK109" thailand cross-ref -# test4: "OQ731952"/"BST115" for search query title: "South Asian maternal and paternal lineages in southern Thailand and" -import data_preprocess -import model -import mtdna_classifier -#import app -import smart_fallback -import pandas as pd -from pathlib import Path -import subprocess -from NER.html import extractHTML -import os -import google.generativeai as genai -import re -import standardize_location -# Helper functions in for this pipeline -# Track time -import time -import multiprocessing -import gspread -from googleapiclient.discovery import build -from googleapiclient.http import MediaFileUpload, MediaIoBaseDownload -from google.oauth2.service_account import Credentials -from oauth2client.service_account import ServiceAccountCredentials -import io -import json -#––– Authentication setup ––– -GDRIVE_PARENT_FOLDER_NAME = "mtDNA-Location-Classifier" -GDRIVE_DATA_FOLDER_NAME = os.environ["GDRIVE_DATA_FOLDER_NAME"] -GCP_CREDS_DICT = json.loads(os.environ["GCP_CREDS_JSON"]) # from HF secrets -GDRIVE_CREDS = Credentials.from_service_account_info(GCP_CREDS_DICT, scopes=["https://www.googleapis.com/auth/drive"]) -drive_service = build("drive", "v3", credentials=GDRIVE_CREDS) - -def get_or_create_drive_folder(name, parent_id=None): - query = f"name='{name}' and mimeType='application/vnd.google-apps.folder'" - if parent_id: - query += f" and '{parent_id}' in parents" - results = drive_service.files().list(q=query, spaces='drive', fields="files(id, name)").execute() - items = results.get("files", []) - if items: - return items[0]["id"] - file_metadata = { - "name": name, - "mimeType": "application/vnd.google-apps.folder" - } - if parent_id: - file_metadata["parents"] = [parent_id] - file = drive_service.files().create(body=file_metadata, fields="id").execute() - return file["id"] -# def find_drive_file(filename, parent_id): -# """ -# Checks if a file with the given name exists inside the specified Google Drive folder. -# Returns the file ID if found, else None. -# """ -# query = f"'{parent_id}' in parents and name = '{filename}' and trashed = false" -# results = drive_service.files().list(q=query, spaces='drive', fields='files(id, name)', pageSize=1).execute() -# files = results.get('files', []) -# if files: -# return files[0]["id"] -# return None - -def find_drive_file(filename, parent_id): - """ - Checks if a file with the given name exists inside the specified Google Drive folder. - Returns the file ID if found, else None. 
- """ - try: - print(f"🔍 Searching for '{filename}' in folder: {parent_id}") - query = f"'{parent_id}' in parents and name = '{filename}' and trashed = false" - results = drive_service.files().list( - q=query, - spaces='drive', - fields='files(id, name)', - pageSize=1 - ).execute() - files = results.get('files', []) - if files: - print(f"✅ Found file: {files[0]['name']} with ID: {files[0]['id']}") - return files[0]["id"] - else: - print("⚠️ File not found.") - return None - except Exception as e: - print(f"❌ Error during find_drive_file: {e}") - return None - - - -# def upload_file_to_drive(local_path, remote_name, folder_id): -# file_metadata = {"name": remote_name, "parents": [folder_id]} -# media = MediaFileUpload(local_path, resumable=True) -# existing = drive_service.files().list(q=f"name='{remote_name}' and '{folder_id}' in parents", fields="files(id)").execute().get("files", []) -# if existing: -# drive_service.files().delete(fileId=existing[0]["id"]).execute() -# file = drive_service.files().create(body=file_metadata, media_body=media, fields="id").execute() -# result = drive_service.files().list(q=f"name='{remote_name}' and '{folder_id}' in parents", fields="files(id)").execute() -# if not result.get("files"): -# print(f"❌ Upload failed: File '{remote_name}' not found in folder after upload.") -# else: -# print(f"✅ Verified upload: {remote_name}") -# return file["id"] -def upload_file_to_drive(local_path, remote_name, folder_id): - try: - if not os.path.exists(local_path): - raise FileNotFoundError(f"❌ Local file does not exist: {local_path}") - - # Delete existing file on Drive if present - existing = drive_service.files().list( - q=f"name='{remote_name}' and '{folder_id}' in parents and trashed = false", - fields="files(id)" - ).execute().get("files", []) - - if existing: - drive_service.files().delete(fileId=existing[0]["id"]).execute() - print(f"🗑️ Deleted existing '{remote_name}' in Drive folder {folder_id}") - - file_metadata = {"name": remote_name, "parents": [folder_id]} - media = MediaFileUpload(local_path, resumable=True) - file = drive_service.files().create( - body=file_metadata, - media_body=media, - fields="id" - ).execute() - - print(f"✅ Uploaded '{remote_name}' to Google Drive folder ID: {folder_id}") - return file["id"] - - except Exception as e: - print(f"❌ Error during upload: {e}") - return None - - -def download_file_from_drive(remote_name, folder_id, local_path): - results = drive_service.files().list(q=f"name='{remote_name}' and '{folder_id}' in parents", fields="files(id)").execute() - files = results.get("files", []) - if not files: - return False - file_id = files[0]["id"] - request = drive_service.files().get_media(fileId=file_id) - fh = io.FileIO(local_path, 'wb') - downloader = MediaIoBaseDownload(fh, request) - done = False - while not done: - _, done = downloader.next_chunk() - return True -def download_drive_file_content(file_id): - request = drive_service.files().get_media(fileId=file_id) - fh = io.BytesIO() - downloader = MediaIoBaseDownload(fh, request) - done = False - while not done: - _, done = downloader.next_chunk() - fh.seek(0) - return fh.read().decode("utf-8") - -# def run_with_timeout(func, args=(), kwargs={}, timeout=20): -# """ -# Runs `func` with timeout in seconds. Kills if it exceeds. 
-# Returns: (success, result or None) -# """ -# def wrapper(q, *args, **kwargs): -# try: -# q.put(func(*args, **kwargs)) -# except Exception as e: -# q.put(e) - -# q = multiprocessing.Queue() -# p = multiprocessing.Process(target=wrapper, args=(q, *args), kwargs=kwargs) -# p.start() -# p.join(timeout) - -# if p.is_alive(): -# p.terminate() -# p.join() -# print(f"⏱️ Timeout exceeded ({timeout} sec) — function killed.") -# return False, None -# else: -# result = q.get() -# if isinstance(result, Exception): -# raise result -# return True, result -# def run_with_timeout(func, args=(), kwargs={}, timeout=30): -# import concurrent.futures -# with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor: -# future = executor.submit(func, *args, **kwargs) -# try: -# return True, future.result(timeout=timeout) -# except concurrent.futures.TimeoutError: -# print(f"⏱️ Timeout exceeded ({timeout} sec) — function killed.") -# return False, None - -import multiprocessing - -def run_with_timeout(func, args=(), kwargs={}, timeout=30): - def wrapper(q, *args, **kwargs): - try: - result = func(*args, **kwargs) - q.put((True, result)) - except Exception as e: - q.put((False, e)) - - q = multiprocessing.Queue() - p = multiprocessing.Process(target=wrapper, args=(q, *args), kwargs=kwargs) - p.start() - p.join(timeout) - - if p.is_alive(): - p.terminate() - p.join() - print(f"⏱️ Timeout exceeded ({timeout} sec) — function killed.") - return False, None - - if not q.empty(): - success, result = q.get() - if success: - return True, result - else: - raise result # re-raise exception if needed - - return False, None - - - -def time_it(func, *args, **kwargs): - """ - Measure how long a function takes to run and return its result + time. - """ - start = time.time() - result = func(*args, **kwargs) - end = time.time() - elapsed = end - start - print(f"⏱️ '{func.__name__}' took {elapsed:.3f} seconds") - return result, elapsed -# --- Define Pricing Constants (for Gemini 1.5 Flash & text-embedding-004) --- - -def unique_preserve_order(seq): - seen = set() - return [x for x in seq if not (x in seen or seen.add(x))] -# Main execution -def pipeline_with_gemini(accessions,stop_flag=None, niche_cases=None, save_df=None): - # output: country, sample_type, ethnic, location, money_cost, time_cost, explain - # there can be one accession number in the accessions - # Prices are per 1,000 tokens - # Before each big step: - if stop_flag is not None and stop_flag.value: - print(f"🛑 Stop detected before starting {accession}, aborting early...") - return {} - # PRICE_PER_1K_INPUT_LLM = 0.000075 # $0.075 per 1M tokens - # PRICE_PER_1K_OUTPUT_LLM = 0.0003 # $0.30 per 1M tokens - # PRICE_PER_1K_EMBEDDING_INPUT = 0.000025 # $0.025 per 1M tokens - # Gemini 2.5 Flash-Lite pricing per 1,000 tokens - PRICE_PER_1K_INPUT_LLM = 0.00010 # $0.10 per 1M input tokens - PRICE_PER_1K_OUTPUT_LLM = 0.00040 # $0.40 per 1M output tokens - - # Embedding-001 pricing per 1,000 input tokens - PRICE_PER_1K_EMBEDDING_INPUT = 0.00015 # $0.15 per 1M input tokens - if not accessions: - print("no input") - return None - else: - accs_output = {} - #genai.configure(api_key=os.getenv("GOOGLE_API_KEY")) - genai.configure(api_key=os.getenv("GOOGLE_API_KEY_BACKUP")) - for acc in accessions: - print("start gemini: ", acc) - start = time.time() - total_cost_title = 0 - jsonSM, links, article_text = {},[], "" - acc_score = { "isolate": "", - "country":{}, - "sample_type":{}, - #"specific_location":{}, - #"ethnicity":{}, - "query_cost":total_cost_title, - 
"time_cost":None, - "source":links, - "file_chunk":"", - "file_all_output":""} - if niche_cases: - for niche in niche_cases: - acc_score[niche] = {} - - meta = mtdna_classifier.fetch_ncbi_metadata(acc) - country, spe_loc, ethnic, sample_type, col_date, iso, title, doi, pudID, features = meta["country"], meta["specific_location"], meta["ethnicity"], meta["sample_type"], meta["collection_date"], meta["isolate"], meta["title"], meta["doi"], meta["pubmed_id"], meta["all_features"] - acc_score["isolate"] = iso - print("meta: ",meta) - meta_expand = smart_fallback.fetch_ncbi(acc) - print("meta expand: ", meta_expand) - # set up step: create the folder to save document - chunk, all_output = "","" - if pudID: - id = str(pudID) - saveTitle = title - else: - try: - author_name = meta_expand["authors"].split(',')[0] # Use last name only - except: - author_name = meta_expand["authors"] - saveTitle = title + "_" + col_date + "_" + author_name - if title.lower() == "unknown" and col_date.lower()=="unknown" and author_name.lower() == "unknown": - saveTitle += "_" + acc - id = "DirectSubmission" - # folder_path = Path("/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id)) - # if not folder_path.exists(): - # cmd = f'mkdir /content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/{id}' - # result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) - # print("data/"+str(id) +" created.") - # else: - # print("data/"+str(id) +" already exists.") - # saveLinkFolder = "/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id) - # parent_folder_id = get_or_create_drive_folder(GDRIVE_PARENT_FOLDER_NAME) - # data_folder_id = get_or_create_drive_folder(GDRIVE_DATA_FOLDER_NAME, parent_id=parent_folder_id) - # sample_folder_id = get_or_create_drive_folder(str(id), parent_id=data_folder_id) - data_folder_id = GDRIVE_DATA_FOLDER_NAME # Use the shared folder directly - sample_folder_id = get_or_create_drive_folder(str(id), parent_id=data_folder_id) - print("sample folder id: ", sample_folder_id) - - # Define document names - if len(saveTitle) > 50: - saveName = saveTitle[:50] - saveName = saveName.replace(" ", "_") - chunk_filename = f"{saveName}_merged_document.docx" - all_filename = f"{saveName}_all_merged_document.docx" - else: - saveName = saveTitle.replace(" ", "_") - chunk_filename = f"{saveName}_merged_document.docx" - all_filename = f"{saveName}_all_merged_document.docx" - print("chunk file name and all filename: ", chunk_filename, all_filename) - # Define local temp paths for reading/writing - # import tempfile - # tmp_dir = tempfile.mkdtemp() - LOCAL_TEMP_DIR = "/mnt/data/generated_docs" - os.makedirs(LOCAL_TEMP_DIR, exist_ok=True) - file_chunk_path = os.path.join(LOCAL_TEMP_DIR, chunk_filename) - file_all_path = os.path.join(LOCAL_TEMP_DIR, all_filename) - # file_chunk_path = os.path.join(tempfile.gettempdir(), chunk_filename) - # file_all_path = os.path.join(tempfile.gettempdir(), all_filename) - if stop_flag is not None and stop_flag.value: - print(f"🛑 Stop processing {accession}, aborting early...") - return {} - print("this is file chunk path: ", file_chunk_path) - chunk_id = find_drive_file(chunk_filename, sample_folder_id) - all_id = find_drive_file(all_filename, sample_folder_id) - - if chunk_id and all_id: - print("✅ Files already exist in Google Drive. 
Downloading them...") - chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path) - all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path) - acc_score["file_chunk"] = str(chunk_filename) - acc_score["file_all_output"] = str(all_filename) - print("chunk_id and all_id: ") - print(chunk_id, all_id) - print("file chunk and all output saved in acc score: ", acc_score["file_chunk"], acc_score["file_all_output"]) - file = drive_service.files().get(fileId="1LUJRTrq8yt4S4lLwCvTmlxaKqpr0nvEn", fields="id, name, parents, webViewLink").execute() - print("📄 Name:", file["name"]) - print("📁 Parent folder ID:", file["parents"][0]) - print("🔗 View link:", file["webViewLink"]) - - - # Read and parse these into `chunk` and `all_output` - else: - # 🔥 Remove any stale local copies - if os.path.exists(file_chunk_path): - os.remove(file_chunk_path) - print(f"🗑️ Removed stale: {file_chunk_path}") - if os.path.exists(file_all_path): - os.remove(file_all_path) - print(f"🗑️ Removed stale: {file_all_path}") - # 🔥 Remove the local file first if it exists - # if os.path.exists(file_chunk_path): - # os.remove(file_chunk_path) - # print("remove chunk path") - # if os.path.exists(file_all_path): - # os.remove(file_all_path) - # print("remove all path") - # Try to download if already exists on Drive - chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path) - all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path) - print("chunk exist: ", chunk_exists) - # first way: ncbi method - print("country.lower: ",country.lower()) - if country.lower() != "unknown": - stand_country = standardize_location.smart_country_lookup(country.lower()) - print("stand_country: ", stand_country) - if stand_country.lower() != "not found": - acc_score["country"][stand_country.lower()] = ["ncbi"] - else: acc_score["country"][country.lower()] = ["ncbi"] - # if spe_loc.lower() != "unknown": - # acc_score["specific_location"][spe_loc.lower()] = ["ncbi"] - # if ethnic.lower() != "unknown": - # acc_score["ethnicity"][ethnic.lower()] = ["ncbi"] - if sample_type.lower() != "unknown": - acc_score["sample_type"][sample_type.lower()] = ["ncbi"] - # second way: LLM model - # Preprocess the input token - print(acc_score) - accession, isolate = None, None - if acc != "unknown": accession = acc - if iso != "unknown": isolate = iso - if stop_flag is not None and stop_flag.value: - print(f"🛑 Stop processing {accession}, aborting early...") - return {} - # check doi first - print("chunk filename: ", chunk_filename) - if chunk_exists: - print("File chunk exists!") - if not chunk: - print("start to get chunk") - text, table, document_title = model.read_docx_text(file_chunk_path) - chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table)) - if str(chunk_filename) != "": - print("first time have chunk path at chunk exist: ", str(chunk_filename)) - acc_score["file_chunk"] = str(chunk_filename) - if all_exists: - print("File all output exists!") - if not all_output: - text_all, table_all, document_title_all = model.read_docx_text(file_all_path) - all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". 
".join(table_all)) - if str(all_filename) != "": - print("first time have all path at all exist: ", str(all_filename)) - acc_score["file_all_output"] = str(all_filename) - print("acc sscore for file all output and chunk: ", acc_score["file_all_output"], acc_score["file_chunk"]) - if len(acc_score["file_all_output"]) == 0 and len(acc_score["file_chunk"]) == 0: - if doi != "unknown": - link = 'https://doi.org/' + doi - # get the file to create listOfFile for each id - print("link of doi: ", link) - html = extractHTML.HTML("",link) - jsonSM = html.getSupMaterial() - article_text = html.getListSection() - if article_text: - if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower(): - links.append(link) - if jsonSM: - links += sum((jsonSM[key] for key in jsonSM),[]) - # no doi then google custom search api - if doi=="unknown" or len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower(): - # might find the article - print("no article text, start tem link") - #tem_links = mtdna_classifier.search_google_custom(title, 2) - tem_links = smart_fallback.smart_google_search(meta_expand) - print("tem links: ", tem_links) - tem_link_acc = smart_fallback.google_accession_search(acc) - tem_links += tem_link_acc - tem_links = unique_preserve_order(tem_links) - print("tem link before filtering: ", tem_links) - # filter the quality link - print("saveLinkFolder as sample folder id: ", sample_folder_id) - print("start the smart filter link") - if stop_flag is not None and stop_flag.value: - print(f"🛑 Stop processing {accession}, aborting early...") - return {} - # success_process, output_process = run_with_timeout(smart_fallback.filter_links_by_metadata,args=(tem_links,sample_folder_id),kwargs={"accession":acc}) - # if success_process: - # links = output_process - # print("yes succeed for smart filter link") - # else: - # print("no suceed, fallback to all tem links") - # links = tem_links - links = smart_fallback.filter_links_by_metadata(tem_links, saveLinkFolder=sample_folder_id, accession=acc, stop_flag=stop_flag) - print("this is links: ",links) - links = unique_preserve_order(links) - acc_score["source"] = links - else: - print("inside the try of reusing chunk or all output") - #print("chunk filename: ", str(chunks_filename)) - - try: - temp_source = False - if save_df is not None and not save_df.empty: - print("save df not none") - print("chunk file name: ",str(chunk_filename)) - print("all filename: ",str(all_filename)) - if acc_score["file_chunk"]: - link = save_df.loc[save_df["file_chunk"]==acc_score["file_chunk"],"Sources"].iloc[0] - #link = row["Sources"].iloc[0] - if "http" in link: - print("yeah http in save df source") - acc_score["source"] = [x for x in link.split("\n") if x.strip()]#row["Sources"].tolist() - else: # temporary - print("tempo source") - #acc_score["source"] = [str(all_filename), str(chunks_filename)] - temp_source = True - elif acc_score["file_all_output"]: - link = save_df.loc[save_df["file_all_output"]==acc_score["file_all_output"],"Sources"].iloc[0] - #link = row["Sources"].iloc[0] - print(link) - print("list of link") - print([x for x in link.split("\n") if x.strip()]) - if "http" in link: - print("yeah http in save df source") - acc_score["source"] = [x for x in link.split("\n") if x.strip()]#row["Sources"].tolist() - else: # temporary - print("tempo source") 
- #acc_score["source"] = [str(all_filename), str(chunks_filename)] - temp_source = True - else: # temporary - print("tempo source") - #acc_score["source"] = [str(file_all_path), str(file_chunk_path)] - temp_source = True - else: # temporary - print("tempo source") - #acc_score["source"] = [str(file_all_path), str(file_chunk_path)] - temp_source = True - if temp_source: - print("temp source is true so have to try again search link") - if doi != "unknown": - link = 'https://doi.org/' + doi - # get the file to create listOfFile for each id - print("link of doi: ", link) - html = extractHTML.HTML("",link) - jsonSM = html.getSupMaterial() - article_text = html.getListSection() - if article_text: - if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower(): - links.append(link) - if jsonSM: - links += sum((jsonSM[key] for key in jsonSM),[]) - # no doi then google custom search api - if doi=="unknown" or len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower(): - # might find the article - print("no article text, start tem link") - #tem_links = mtdna_classifier.search_google_custom(title, 2) - tem_links = smart_fallback.smart_google_search(meta_expand) - print("tem links: ", tem_links) - tem_link_acc = smart_fallback.google_accession_search(acc) - tem_links += tem_link_acc - tem_links = unique_preserve_order(tem_links) - print("tem link before filtering: ", tem_links) - # filter the quality link - print("saveLinkFolder as sample folder id: ", sample_folder_id) - print("start the smart filter link") - if stop_flag is not None and stop_flag.value: - print(f"🛑 Stop processing {accession}, aborting early...") - return {} - # success_process, output_process = run_with_timeout(smart_fallback.filter_links_by_metadata,args=(tem_links,sample_folder_id),kwargs={"accession":acc}) - # if success_process: - # links = output_process - # print("yes succeed for smart filter link") - # else: - # print("no suceed, fallback to all tem links") - # links = tem_links - links = smart_fallback.filter_links_by_metadata(tem_links, saveLinkFolder=sample_folder_id, accession=acc, stop_flag=stop_flag) - print("this is links: ",links) - links = unique_preserve_order(links) - acc_score["source"] = links - except: - print("except for source") - acc_score["source"] = [] - # chunk_path = "/"+saveTitle+"_merged_document.docx" - # all_path = "/"+saveTitle+"_all_merged_document.docx" - # # if chunk and all output not exist yet - # file_chunk_path = saveLinkFolder + chunk_path - # file_all_path = saveLinkFolder + all_path - # if os.path.exists(file_chunk_path): - # print("File chunk exists!") - # if not chunk: - # text, table, document_title = model.read_docx_text(file_chunk_path) - # chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table)) - # if os.path.exists(file_all_path): - # print("File all output exists!") - # if not all_output: - # text_all, table_all, document_title_all = model.read_docx_text(file_all_path) - # all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". 
".join(table_all)) - if stop_flag is not None and stop_flag.value: - print(f"🛑 Stop processing {accession}, aborting early...") - return {} - # print("chunk filename: ", chunk_filename) - # if chunk_exists: - # print("File chunk exists!") - # if not chunk: - # print("start to get chunk") - # text, table, document_title = model.read_docx_text(file_chunk_path) - # chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table)) - # if str(chunk_filename) != "": - # print("first time have chunk path at chunk exist: ", str(chunk_filename)) - # acc_score["file_chunk"] = str(chunk_filename) - # if all_exists: - # print("File all output exists!") - # if not all_output: - # text_all, table_all, document_title_all = model.read_docx_text(file_all_path) - # all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all)) - # if str(all_filename) != "": - # print("first time have all path at all exist: ", str(all_filename)) - # acc_score["file_all_output"] = str(all_filename) - if not chunk and not all_output: - print("not chunk and all output") - # else: check if we can reuse these chunk and all output of existed accession to find another - if str(chunk_filename) != "": - print("first time have chunk path: ", str(chunk_filename)) - acc_score["file_chunk"] = str(chunk_filename) - if str(all_filename) != "": - print("first time have all path: ", str(all_filename)) - acc_score["file_all_output"] = str(all_filename) - if links: - for link in links: - print(link) - # if len(all_output) > 1000*1000: - # all_output = data_preprocess.normalize_for_overlap(all_output) - # print("after normalizing all output: ", len(all_output)) - if len(data_preprocess.normalize_for_overlap(all_output)) > 600000: - print("break here") - break - if iso != "unknown": query_kw = iso - else: query_kw = acc - #text_link, tables_link, final_input_link = data_preprocess.preprocess_document(link,saveLinkFolder, isolate=query_kw) - success_process, output_process = run_with_timeout(data_preprocess.preprocess_document,args=(link,sample_folder_id),kwargs={"isolate":query_kw,"accession":acc},timeout=100) - if stop_flag is not None and stop_flag.value: - print(f"🛑 Stop processing {accession}, aborting early...") - return {} - if success_process: - text_link, tables_link, final_input_link = output_process[0], output_process[1], output_process[2] - print("yes succeed for process document") - else: text_link, tables_link, final_input_link = "", "", "" - context = data_preprocess.extract_context(final_input_link, query_kw) - if context != "Sample ID not found.": - if len(data_preprocess.normalize_for_overlap(chunk)) < 1000*1000: - success_chunk, the_output_chunk = run_with_timeout(data_preprocess.merge_texts_skipping_overlap,args=(chunk, context)) - if stop_flag is not None and stop_flag.value: - print(f"🛑 Stop processing {accession}, aborting early...") - return {} - if success_chunk: - chunk = the_output_chunk#data_preprocess.merge_texts_skipping_overlap(all_output, final_input_link) - print("yes succeed for chunk") - else: - chunk += context - print("len context: ", len(context)) - print("basic fall back") - print("len chunk after: ", len(chunk)) - if len(final_input_link) > 1000*1000: - if context != "Sample ID not found.": - final_input_link = context - else: - final_input_link = data_preprocess.normalize_for_overlap(final_input_link) - if len(final_input_link) > 1000 *1000: - final_input_link = final_input_link[:100000] - 
if len(data_preprocess.normalize_for_overlap(all_output)) < int(100000) and len(final_input_link)<100000: - print("Running merge_texts_skipping_overlap with timeout") - success, the_output = run_with_timeout(data_preprocess.merge_texts_skipping_overlap,args=(all_output, final_input_link),timeout=30) - if stop_flag is not None and stop_flag.value: - print(f"🛑 Stop processing {accession}, aborting early...") - return {} - print("Returned from timeout logic") - if success: - all_output = the_output#data_preprocess.merge_texts_skipping_overlap(all_output, final_input_link) - print("yes succeed") - else: - print("len all output: ", len(all_output)) - print("len final input link: ", len(final_input_link)) - all_output += final_input_link - print("len final input: ", len(final_input_link)) - print("basic fall back") - else: - print("both/either all output or final link too large more than 100000") - print("len all output: ", len(all_output)) - print("len final input link: ", len(final_input_link)) - all_output += final_input_link - print("len final input: ", len(final_input_link)) - print("basic fall back") - print("len all output after: ", len(all_output)) - #country_pro, chunk, all_output = data_preprocess.process_inputToken(links, saveLinkFolder, accession=accession, isolate=isolate) - if stop_flag is not None and stop_flag.value: - print(f"🛑 Stop processing {accession}, aborting early...") - return {} - else: - chunk = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features - all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features - if not chunk: chunk = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features - if not all_output: all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". 
Features: " + features - if len(all_output) > 1*1024*1024: - all_output = data_preprocess.normalize_for_overlap(all_output) - if len(all_output) > 1*1024*1024: - all_output = all_output[:1*1024*1024] - print("chunk len: ", len(chunk)) - print("all output len: ", len(all_output)) - data_preprocess.save_text_to_docx(chunk, file_chunk_path) - data_preprocess.save_text_to_docx(all_output, file_all_path) - # Later when saving new files - # data_preprocess.save_text_to_docx(chunk, chunk_filename, sample_folder_id) - # data_preprocess.save_text_to_docx(all_output, all_filename, sample_folder_id) - - # Upload to Drive - result_chunk_upload = upload_file_to_drive(file_chunk_path, chunk_filename, sample_folder_id) - result_all_upload = upload_file_to_drive(file_all_path, all_filename, sample_folder_id) - print("UPLOAD RESULT FOR CHUNK: ", result_chunk_upload) - print(f"🔗 Uploaded file: https://drive.google.com/file/d/{result_chunk_upload}/view") - print("here 1") - - # else: - # final_input = "" - # if all_output: - # final_input = all_output - # else: - # if chunk: final_input = chunk - # #data_preprocess.merge_texts_skipping_overlap(final_input, all_output) - # if final_input: - # keywords = [] - # if iso != "unknown": keywords.append(iso) - # if acc != "unknown": keywords.append(acc) - # for keyword in keywords: - # chunkBFS = data_preprocess.get_contextual_sentences_BFS(final_input, keyword) - # countryDFS, chunkDFS = data_preprocess.get_contextual_sentences_DFS(final_input, keyword) - # chunk = data_preprocess.merge_texts_skipping_overlap(chunk, chunkDFS) - # chunk = data_preprocess.merge_texts_skipping_overlap(chunk, chunkBFS) - - # Define paths for cached RAG assets - # faiss_index_path = saveLinkFolder+"/faiss_index.bin" - # document_chunks_path = saveLinkFolder+"/document_chunks.json" - # structured_lookup_path = saveLinkFolder+"/structured_lookup.json" - print("here 2") - faiss_filename = "faiss_index.bin" - chunks_filename = "document_chunks.json" - lookup_filename = "structured_lookup.json" - print("name of faiss: ", faiss_filename) - - faiss_index_path = os.path.join(LOCAL_TEMP_DIR, faiss_filename) - document_chunks_path = os.path.join(LOCAL_TEMP_DIR, chunks_filename) - structured_lookup_path = os.path.join(LOCAL_TEMP_DIR, lookup_filename) - print("name if faiss path: ", faiss_index_path) - # 🔥 Remove the local file first if it exists - print("start faiss id and also the sample folder id is: ", sample_folder_id) - faiss_id = find_drive_file(faiss_filename, sample_folder_id) - print("done faiss id") - document_id = find_drive_file(chunks_filename, sample_folder_id) - structure_id = find_drive_file(lookup_filename, sample_folder_id) - if faiss_id and document_id and structure_id: - print("✅ 3 Files already exist in Google Drive. 
Downloading them...") - download_file_from_drive(faiss_filename, sample_folder_id, faiss_index_path) - download_file_from_drive(chunks_filename, sample_folder_id, document_chunks_path) - download_file_from_drive(lookup_filename, sample_folder_id, structured_lookup_path) - # Read and parse these into `chunk` and `all_output` - else: - "one of id not exist" - if os.path.exists(faiss_index_path): - print("faiss index exist and start to remove: ", faiss_index_path) - os.remove(faiss_index_path) - if os.path.exists(document_chunks_path): - os.remove(document_chunks_path) - if os.path.exists(structured_lookup_path): - os.remove(structured_lookup_path) - print("start to download the faiss, chunk, lookup") - - download_file_from_drive(faiss_filename, sample_folder_id, faiss_index_path) - download_file_from_drive(chunks_filename, sample_folder_id, document_chunks_path) - download_file_from_drive(lookup_filename, sample_folder_id, structured_lookup_path) - try: - print("try gemini 2.5") - print("move to load rag") - master_structured_lookup, faiss_index, document_chunks = model.load_rag_assets( - faiss_index_path, document_chunks_path, structured_lookup_path - ) - - global_llm_model_for_counting_tokens = genai.GenerativeModel('gemini-1.5-flash-latest') - if not all_output: - if chunk: all_output = chunk - else: all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features - if faiss_index is None: - print("\nBuilding RAG assets (structured lookup, FAISS index, chunks)...") - total_doc_embedding_tokens = global_llm_model_for_counting_tokens.count_tokens( - all_output - ).total_tokens - - initial_embedding_cost = (total_doc_embedding_tokens / 1000) * PRICE_PER_1K_EMBEDDING_INPUT - total_cost_title += initial_embedding_cost - print(f"Initial one-time embedding cost for '{file_all_path}' ({total_doc_embedding_tokens} tokens): ${initial_embedding_cost:.6f}") - - - master_structured_lookup, faiss_index, document_chunks, plain_text_content = model.build_vector_index_and_data( - file_all_path, faiss_index_path, document_chunks_path, structured_lookup_path - ) - else: - print("\nRAG assets loaded from file. No re-embedding of entire document will occur.") - plain_text_content_all, table_strings_all, document_title_all = model.read_docx_text(file_all_path) - master_structured_lookup['document_title'] = master_structured_lookup.get('document_title', document_title_all) - if stop_flag is not None and stop_flag.value: - print(f"🛑 Stop processing {accession}, aborting early...") - return {} - primary_word = iso - alternative_word = acc - print(f"\n--- General Query: Primary='{primary_word}' (Alternative='{alternative_word}') ---") - if features.lower() not in all_output.lower(): - all_output += ". 
NCBI Features: " + features - # country, sample_type, method_used, ethnic, spe_loc, total_query_cost = model.query_document_info( - # primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks, - # model.call_llm_api, chunk=chunk, all_output=all_output) - print("this is chunk for the model") - print(chunk) - print("this is all output for the model") - print(all_output) - if stop_flag is not None and stop_flag.value: - print(f"🛑 Stop processing {accession}, aborting early...") - return {} - country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost = model.query_document_info( - primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks, - model.call_llm_api, chunk=chunk, all_output=all_output) - print("pass query of 2.5") - except: - print("try gemini 1.5") - country, sample_type, ethnic, spe_loc, method_used, country_explanation, sample_type_explanation, ethnicity_explanation, specific_loc_explanation, total_query_cost = model.query_document_info( - primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks, - model.call_llm_api, chunk=chunk, all_output=all_output, model_ai="gemini-1.5-flash-latest") - print("yeah pass the query of 1.5") - print("country using ai: ", country) - print("sample type using ai: ", sample_type) - # if len(country) == 0: country = "unknown" - # if len(sample_type) == 0: sample_type = "unknown" - # if country_explanation: country_explanation = "-"+country_explanation - # else: country_explanation = "" - # if sample_type_explanation: sample_type_explanation = "-"+sample_type_explanation - # else: sample_type_explanation = "" - if len(country) == 0: country = "unknown" - if len(sample_type) == 0: sample_type = "unknown" - if country_explanation and country_explanation!="unknown": country_explanation = "-"+country_explanation - else: country_explanation = "" - if sample_type_explanation and sample_type_explanation!="unknown": sample_type_explanation = "-"+sample_type_explanation - else: sample_type_explanation = "" - - if method_used == "unknown": method_used = "" - if country.lower() != "unknown": - stand_country = standardize_location.smart_country_lookup(country.lower()) - if stand_country.lower() != "not found": - if stand_country.lower() in acc_score["country"]: - if country_explanation: - acc_score["country"][stand_country.lower()].append(method_used + country_explanation) - else: - acc_score["country"][stand_country.lower()] = [method_used + country_explanation] - else: - if country.lower() in acc_score["country"]: - if country_explanation: - if len(method_used + country_explanation) > 0: - acc_score["country"][country.lower()].append(method_used + country_explanation) - else: - if len(method_used + country_explanation) > 0: - acc_score["country"][country.lower()] = [method_used + country_explanation] - # if spe_loc.lower() != "unknown": - # if spe_loc.lower() in acc_score["specific_location"]: - # acc_score["specific_location"][spe_loc.lower()].append(method_used) - # else: - # acc_score["specific_location"][spe_loc.lower()] = [method_used] - # if ethnic.lower() != "unknown": - # if ethnic.lower() in acc_score["ethnicity"]: - # acc_score["ethnicity"][ethnic.lower()].append(method_used) - # else: - # acc_score["ethnicity"][ethnic.lower()] = [method_used] - if sample_type.lower() != "unknown": - if sample_type.lower() in acc_score["sample_type"]: - if len(method_used + sample_type_explanation) > 0: - 
acc_score["sample_type"][sample_type.lower()].append(method_used + sample_type_explanation) - else: - if len(method_used + sample_type_explanation)> 0: - acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation] - total_cost_title += total_query_cost - if stop_flag is not None and stop_flag.value: - print(f"🛑 Stop processing {accession}, aborting early...") - return {} - # last resort: combine all information to give all output otherwise unknown - if len(acc_score["country"]) == 0 or len(acc_score["sample_type"]) == 0 or acc_score["country"] == "unknown" or acc_score["sample_type"] == "unknown": - text = "" - for key in meta_expand: - text += str(key) + ": " + meta_expand[key] + "\n" - if len(data_preprocess.normalize_for_overlap(all_output)) > 0: - text += data_preprocess.normalize_for_overlap(all_output) - if len(data_preprocess.normalize_for_overlap(chunk)) > 0: - text += data_preprocess.normalize_for_overlap(chunk) - text += ". NCBI Features: " + features - print("this is text for the last resort model") - print(text) - country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost = model.query_document_info( - primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks, - model.call_llm_api, chunk=text, all_output=text) - print("this is last resort results: ") - print("country: ", country) - print("sample type: ", sample_type) - if len(country) == 0: country = "unknown" - if len(sample_type) == 0: sample_type = "unknown" - # if country_explanation: country_explanation = "-"+country_explanation - # else: country_explanation = "" - # if sample_type_explanation: sample_type_explanation = "-"+sample_type_explanation - # else: sample_type_explanation = "" - if country_explanation and country_explanation!="unknown": country_explanation = "-"+country_explanation - else: country_explanation = "" - if sample_type_explanation and sample_type_explanation!="unknown": sample_type_explanation = "-"+sample_type_explanation - else: sample_type_explanation = "" - - if method_used == "unknown": method_used = "" - if country.lower() != "unknown": - stand_country = standardize_location.smart_country_lookup(country.lower()) - if stand_country.lower() != "not found": - if stand_country.lower() in acc_score["country"]: - if country_explanation: - acc_score["country"][stand_country.lower()].append(method_used + country_explanation) - else: - acc_score["country"][stand_country.lower()] = [method_used + country_explanation] - else: - if country.lower() in acc_score["country"]: - if country_explanation: - if len(method_used + country_explanation) > 0: - acc_score["country"][country.lower()].append(method_used + country_explanation) - else: - if len(method_used + country_explanation) > 0: - acc_score["country"][country.lower()] = [method_used + country_explanation] - if sample_type.lower() != "unknown": - if sample_type.lower() in acc_score["sample_type"]: - if len(method_used + sample_type_explanation) > 0: - acc_score["sample_type"][sample_type.lower()].append(method_used + sample_type_explanation) - else: - if len(method_used + sample_type_explanation)> 0: - acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation] - total_cost_title += total_query_cost - end = time.time() - #total_cost_title += total_query_cost - acc_score["query_cost"] = f"{total_cost_title:.6f}" - elapsed = end - start - acc_score["time_cost"] = f"{elapsed:.3f} seconds" - accs_output[acc] = acc_score - 
print(accs_output[acc]) - +# test1: MJ17 direct +# test2: "A1YU101" thailand cross-ref +# test3: "EBK109" thailand cross-ref +# test4: "OQ731952"/"BST115" for search query title: "South Asian maternal and paternal lineages in southern Thailand and" +import data_preprocess +import model +import mtdna_classifier +#import app +import smart_fallback +import pandas as pd +from pathlib import Path +import subprocess +from NER.html import extractHTML +import os +import google.generativeai as genai +import re +import standardize_location +# Helper functions in for this pipeline +# Track time +import time +import multiprocessing +import gspread +from googleapiclient.discovery import build +from googleapiclient.http import MediaFileUpload, MediaIoBaseDownload +from google.oauth2.service_account import Credentials +from oauth2client.service_account import ServiceAccountCredentials +import io +import json +#––– Authentication setup ––– +GDRIVE_PARENT_FOLDER_NAME = "mtDNA-Location-Classifier" +GDRIVE_DATA_FOLDER_NAME = os.environ["GDRIVE_DATA_FOLDER_NAME"] +GCP_CREDS_DICT = json.loads(os.environ["GCP_CREDS_JSON"]) # from HF secrets +GDRIVE_CREDS = Credentials.from_service_account_info(GCP_CREDS_DICT, scopes=["https://www.googleapis.com/auth/drive"]) +drive_service = build("drive", "v3", credentials=GDRIVE_CREDS) + +def get_or_create_drive_folder(name, parent_id=None): + query = f"name='{name}' and mimeType='application/vnd.google-apps.folder'" + if parent_id: + query += f" and '{parent_id}' in parents" + results = drive_service.files().list(q=query, spaces='drive', fields="files(id, name)").execute() + items = results.get("files", []) + if items: + return items[0]["id"] + file_metadata = { + "name": name, + "mimeType": "application/vnd.google-apps.folder" + } + if parent_id: + file_metadata["parents"] = [parent_id] + file = drive_service.files().create(body=file_metadata, fields="id").execute() + return file["id"] +# def find_drive_file(filename, parent_id): +# """ +# Checks if a file with the given name exists inside the specified Google Drive folder. +# Returns the file ID if found, else None. +# """ +# query = f"'{parent_id}' in parents and name = '{filename}' and trashed = false" +# results = drive_service.files().list(q=query, spaces='drive', fields='files(id, name)', pageSize=1).execute() +# files = results.get('files', []) +# if files: +# return files[0]["id"] +# return None + +def find_drive_file(filename, parent_id): + """ + Checks if a file with the given name exists inside the specified Google Drive folder. + Returns the file ID if found, else None. 
+ """ + try: + print(f"🔍 Searching for '{filename}' in folder: {parent_id}") + query = f"'{parent_id}' in parents and name = '{filename}' and trashed = false" + results = drive_service.files().list( + q=query, + spaces='drive', + fields='files(id, name)', + pageSize=1 + ).execute() + files = results.get('files', []) + if files: + print(f"✅ Found file: {files[0]['name']} with ID: {files[0]['id']}") + return files[0]["id"] + else: + print("⚠️ File not found.") + return None + except Exception as e: + print(f"❌ Error during find_drive_file: {e}") + return None + + + +# def upload_file_to_drive(local_path, remote_name, folder_id): +# file_metadata = {"name": remote_name, "parents": [folder_id]} +# media = MediaFileUpload(local_path, resumable=True) +# existing = drive_service.files().list(q=f"name='{remote_name}' and '{folder_id}' in parents", fields="files(id)").execute().get("files", []) +# if existing: +# drive_service.files().delete(fileId=existing[0]["id"]).execute() +# file = drive_service.files().create(body=file_metadata, media_body=media, fields="id").execute() +# result = drive_service.files().list(q=f"name='{remote_name}' and '{folder_id}' in parents", fields="files(id)").execute() +# if not result.get("files"): +# print(f"❌ Upload failed: File '{remote_name}' not found in folder after upload.") +# else: +# print(f"✅ Verified upload: {remote_name}") +# return file["id"] +def upload_file_to_drive(local_path, remote_name, folder_id): + try: + if not os.path.exists(local_path): + raise FileNotFoundError(f"❌ Local file does not exist: {local_path}") + + # Delete existing file on Drive if present + existing = drive_service.files().list( + q=f"name='{remote_name}' and '{folder_id}' in parents and trashed = false", + fields="files(id)" + ).execute().get("files", []) + + if existing: + drive_service.files().delete(fileId=existing[0]["id"]).execute() + print(f"🗑️ Deleted existing '{remote_name}' in Drive folder {folder_id}") + + file_metadata = {"name": remote_name, "parents": [folder_id]} + media = MediaFileUpload(local_path, resumable=True) + file = drive_service.files().create( + body=file_metadata, + media_body=media, + fields="id" + ).execute() + + print(f"✅ Uploaded '{remote_name}' to Google Drive folder ID: {folder_id}") + return file["id"] + + except Exception as e: + print(f"❌ Error during upload: {e}") + return None + + +def download_file_from_drive(remote_name, folder_id, local_path): + results = drive_service.files().list(q=f"name='{remote_name}' and '{folder_id}' in parents", fields="files(id)").execute() + files = results.get("files", []) + if not files: + return False + file_id = files[0]["id"] + request = drive_service.files().get_media(fileId=file_id) + fh = io.FileIO(local_path, 'wb') + downloader = MediaIoBaseDownload(fh, request) + done = False + while not done: + _, done = downloader.next_chunk() + return True +def download_drive_file_content(file_id): + request = drive_service.files().get_media(fileId=file_id) + fh = io.BytesIO() + downloader = MediaIoBaseDownload(fh, request) + done = False + while not done: + _, done = downloader.next_chunk() + fh.seek(0) + return fh.read().decode("utf-8") + +# def run_with_timeout(func, args=(), kwargs={}, timeout=20): +# """ +# Runs `func` with timeout in seconds. Kills if it exceeds. 
+# Returns: (success, result or None) +# """ +# def wrapper(q, *args, **kwargs): +# try: +# q.put(func(*args, **kwargs)) +# except Exception as e: +# q.put(e) + +# q = multiprocessing.Queue() +# p = multiprocessing.Process(target=wrapper, args=(q, *args), kwargs=kwargs) +# p.start() +# p.join(timeout) + +# if p.is_alive(): +# p.terminate() +# p.join() +# print(f"⏱️ Timeout exceeded ({timeout} sec) — function killed.") +# return False, None +# else: +# result = q.get() +# if isinstance(result, Exception): +# raise result +# return True, result +# def run_with_timeout(func, args=(), kwargs={}, timeout=30): +# import concurrent.futures +# with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor: +# future = executor.submit(func, *args, **kwargs) +# try: +# return True, future.result(timeout=timeout) +# except concurrent.futures.TimeoutError: +# print(f"⏱️ Timeout exceeded ({timeout} sec) — function killed.") +# return False, None + +import multiprocessing + +def run_with_timeout(func, args=(), kwargs={}, timeout=30): + def wrapper(q, *args, **kwargs): + try: + result = func(*args, **kwargs) + q.put((True, result)) + except Exception as e: + q.put((False, e)) + + q = multiprocessing.Queue() + p = multiprocessing.Process(target=wrapper, args=(q, *args), kwargs=kwargs) + p.start() + p.join(timeout) + + if p.is_alive(): + p.terminate() + p.join() + print(f"⏱️ Timeout exceeded ({timeout} sec) — function killed.") + return False, None + + if not q.empty(): + success, result = q.get() + if success: + return True, result + else: + raise result # re-raise exception if needed + + return False, None + + + +def time_it(func, *args, **kwargs): + """ + Measure how long a function takes to run and return its result + time. + """ + start = time.time() + result = func(*args, **kwargs) + end = time.time() + elapsed = end - start + print(f"⏱️ '{func.__name__}' took {elapsed:.3f} seconds") + return result, elapsed +# --- Define Pricing Constants (for Gemini 1.5 Flash & text-embedding-004) --- + +def unique_preserve_order(seq): + seen = set() + return [x for x in seq if not (x in seen or seen.add(x))] + +def sanitize_filename(filename, max_length=100): + # Remove characters that are not letters, numbers, spaces, underscores, or hyphens + filename = re.sub(r'[<>:"/\\|?*\n\r\t]', '', filename) + # Replace spaces with underscores + filename = filename.replace(" ", "_") + # Limit length + return filename[:max_length] +# Main execution +def pipeline_with_gemini(accessions,stop_flag=None, niche_cases=None, save_df=None): + # output: country, sample_type, ethnic, location, money_cost, time_cost, explain + # there can be one accession number in the accessions + # Prices are per 1,000 tokens + # Before each big step: + if stop_flag is not None and stop_flag.value: + print(f"🛑 Stop detected before starting {accessions}, aborting early...") + return {} + # PRICE_PER_1K_INPUT_LLM = 0.000075 # $0.075 per 1M tokens + # PRICE_PER_1K_OUTPUT_LLM = 0.0003 # $0.30 per 1M tokens + # PRICE_PER_1K_EMBEDDING_INPUT = 0.000025 # $0.025 per 1M tokens + # Gemini 2.5 Flash-Lite pricing per 1,000 tokens + PRICE_PER_1K_INPUT_LLM = 0.00010 # $0.10 per 1M input tokens + PRICE_PER_1K_OUTPUT_LLM = 0.00040 # $0.40 per 1M output tokens + + # Embedding-001 pricing per 1,000 input tokens + PRICE_PER_1K_EMBEDDING_INPUT = 0.00015 # $0.15 per 1M input tokens + if not accessions: + print("no input") + return None + else: + accs_output = {} + #genai.configure(api_key=os.getenv("GOOGLE_API_KEY")) + 
genai.configure(api_key=os.getenv("GOOGLE_API_KEY_BACKUP")) + for acc in accessions: + print("start gemini: ", acc) + start = time.time() + total_cost_title = 0 + jsonSM, links, article_text = {},[], "" + acc_score = { "isolate": "", + "country":{}, + "sample_type":{}, + #"specific_location":{}, + #"ethnicity":{}, + "query_cost":total_cost_title, + "time_cost":None, + "source":links, + "file_chunk":"", + "file_all_output":""} + if niche_cases: + for niche in niche_cases: + acc_score[niche] = {} + + meta = mtdna_classifier.fetch_ncbi_metadata(acc) + country, spe_loc, ethnic, sample_type, col_date, iso, title, doi, pudID, features = meta["country"], meta["specific_location"], meta["ethnicity"], meta["sample_type"], meta["collection_date"], meta["isolate"], meta["title"], meta["doi"], meta["pubmed_id"], meta["all_features"] + acc_score["isolate"] = iso + print("meta: ",meta) + meta_expand = smart_fallback.fetch_ncbi(acc) + print("meta expand: ", meta_expand) + # set up step: create the folder to save document + chunk, all_output, out_links = "","", {} + if pudID: + id = str(pudID) + saveTitle = title + else: + try: + author_name = meta_expand["authors"].split(',')[0] # Use last name only + except: + author_name = meta_expand["authors"] + saveTitle = title + "_" + col_date + "_" + author_name + if title.lower() == "unknown" and col_date.lower()=="unknown" and author_name.lower() == "unknown": + saveTitle += "_" + acc + id = "DirectSubmission" + # folder_path = Path("/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id)) + # if not folder_path.exists(): + # cmd = f'mkdir /content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/{id}' + # result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + # print("data/"+str(id) +" created.") + # else: + # print("data/"+str(id) +" already exists.") + # saveLinkFolder = "/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id) + # parent_folder_id = get_or_create_drive_folder(GDRIVE_PARENT_FOLDER_NAME) + # data_folder_id = get_or_create_drive_folder(GDRIVE_DATA_FOLDER_NAME, parent_id=parent_folder_id) + # sample_folder_id = get_or_create_drive_folder(str(id), parent_id=data_folder_id) + data_folder_id = GDRIVE_DATA_FOLDER_NAME # Use the shared folder directly + sample_folder_id = get_or_create_drive_folder(str(id), parent_id=data_folder_id) + print("sample folder id: ", sample_folder_id) + + # Define document names + # if len(saveTitle) > 50: + # saveName = saveTitle[:50] + # saveName = saveName.replace(" ", "_") + # chunk_filename = f"{saveName}_merged_document.docx" + # all_filename = f"{saveName}_all_merged_document.docx" + # else: + # saveName = saveTitle.replace(" ", "_") + # chunk_filename = f"{saveName}_merged_document.docx" + # all_filename = f"{saveName}_all_merged_document.docx" + safe_title = sanitize_filename(saveTitle, 50) + chunk_filename = f"{safe_title}_merged_document.docx" + all_filename = f"{safe_title}_all_merged_document.docx" + print("chunk file name and all filename: ", chunk_filename, all_filename) + # Define local temp paths for reading/writing + # import tempfile + # tmp_dir = tempfile.mkdtemp() + LOCAL_TEMP_DIR = "/mnt/data/generated_docs" + os.makedirs(LOCAL_TEMP_DIR, exist_ok=True) + file_chunk_path = os.path.join(LOCAL_TEMP_DIR, chunk_filename) + file_all_path = os.path.join(LOCAL_TEMP_DIR, all_filename) + # file_chunk_path = os.path.join(tempfile.gettempdir(), chunk_filename) + # file_all_path = 
os.path.join(tempfile.gettempdir(), all_filename) + if stop_flag is not None and stop_flag.value: + print(f"🛑 Stop processing {accession}, aborting early...") + return {} + print("this is file chunk path: ", file_chunk_path) + chunk_id = find_drive_file(chunk_filename, sample_folder_id) + all_id = find_drive_file(all_filename, sample_folder_id) + + if chunk_id and all_id: + print("✅ Files already exist in Google Drive. Downloading them...") + chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path) + all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path) + acc_score["file_chunk"] = str(chunk_filename) + acc_score["file_all_output"] = str(all_filename) + print("chunk_id and all_id: ") + print(chunk_id, all_id) + print("file chunk and all output saved in acc score: ", acc_score["file_chunk"], acc_score["file_all_output"]) + file = drive_service.files().get(fileId="1LUJRTrq8yt4S4lLwCvTmlxaKqpr0nvEn", fields="id, name, parents, webViewLink").execute() + print("📄 Name:", file["name"]) + print("📁 Parent folder ID:", file["parents"][0]) + print("🔗 View link:", file["webViewLink"]) + + + # Read and parse these into `chunk` and `all_output` + else: + # 🔥 Remove any stale local copies + if os.path.exists(file_chunk_path): + os.remove(file_chunk_path) + print(f"🗑️ Removed stale: {file_chunk_path}") + if os.path.exists(file_all_path): + os.remove(file_all_path) + print(f"🗑️ Removed stale: {file_all_path}") + # 🔥 Remove the local file first if it exists + # if os.path.exists(file_chunk_path): + # os.remove(file_chunk_path) + # print("remove chunk path") + # if os.path.exists(file_all_path): + # os.remove(file_all_path) + # print("remove all path") + # Try to download if already exists on Drive + chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path) + all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path) + print("chunk exist: ", chunk_exists) + # first way: ncbi method + print("country.lower: ",country.lower()) + if country.lower() != "unknown": + stand_country = standardize_location.smart_country_lookup(country.lower()) + print("stand_country: ", stand_country) + if stand_country.lower() != "not found": + acc_score["country"][stand_country.lower()] = ["ncbi"] + else: acc_score["country"][country.lower()] = ["ncbi"] + # if spe_loc.lower() != "unknown": + # acc_score["specific_location"][spe_loc.lower()] = ["ncbi"] + # if ethnic.lower() != "unknown": + # acc_score["ethnicity"][ethnic.lower()] = ["ncbi"] + if sample_type.lower() != "unknown": + acc_score["sample_type"][sample_type.lower()] = ["ncbi"] + # second way: LLM model + # Preprocess the input token + print(acc_score) + accession, isolate = None, None + if acc != "unknown": accession = acc + if iso != "unknown": isolate = iso + if stop_flag is not None and stop_flag.value: + print(f"🛑 Stop processing {accession}, aborting early...") + return {} + # check doi first + print("chunk filename: ", chunk_filename) + if chunk_exists: + print("File chunk exists!") + if not chunk: + print("start to get chunk") + text, table, document_title = model.read_docx_text(file_chunk_path) + chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". 
".join(table)) + if str(chunk_filename) != "": + print("first time have chunk path at chunk exist: ", str(chunk_filename)) + acc_score["file_chunk"] = str(chunk_filename) + if all_exists: + print("File all output exists!") + if not all_output: + text_all, table_all, document_title_all = model.read_docx_text(file_all_path) + all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all)) + if str(all_filename) != "": + print("first time have all path at all exist: ", str(all_filename)) + acc_score["file_all_output"] = str(all_filename) + print("acc sscore for file all output and chunk: ", acc_score["file_all_output"], acc_score["file_chunk"]) + if len(acc_score["file_all_output"]) == 0 and len(acc_score["file_chunk"]) == 0: + if doi != "unknown": + link = 'https://doi.org/' + doi + # get the file to create listOfFile for each id + print("link of doi: ", link) + html = extractHTML.HTML("",link) + jsonSM = html.getSupMaterial() + article_text = html.getListSection() + if article_text: + if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower(): + out_links[link] = article_text + links.append(link) + if jsonSM: + links += sum((jsonSM[key] for key in jsonSM),[]) + if links: + for l in links: + out_links[l] = "" + # no doi then google custom search api + if doi=="unknown" or len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower(): + # might find the article + print("no article text, start tem link") + #tem_links = mtdna_classifier.search_google_custom(title, 2) + tem_links = smart_fallback.smart_google_search(meta_expand) + print("tem links: ", tem_links) + tem_link_acc = smart_fallback.google_accession_search(acc) + tem_links += tem_link_acc + tem_links = unique_preserve_order(tem_links) + print("tem link before filtering: ", tem_links) + # filter the quality link + print("saveLinkFolder as sample folder id: ", sample_folder_id) + print("start the smart filter link") + if stop_flag is not None and stop_flag.value: + print(f"🛑 Stop processing {accession}, aborting early...") + return {} + # success_process, output_process = run_with_timeout(smart_fallback.filter_links_by_metadata,args=(tem_links,sample_folder_id),kwargs={"accession":acc}) + # if success_process: + # links = output_process + # print("yes succeed for smart filter link") + # else: + # print("no suceed, fallback to all tem links") + # links = tem_links + #links = smart_fallback.filter_links_by_metadata(tem_links, saveLinkFolder=sample_folder_id, accession=acc, stop_flag=stop_flag) + success_process, output_process = run_with_timeout(smart_fallback.filter_links_by_metadata,args=(tem_links,sample_folder_id),kwargs={"accession":acc},timeout=90) + if success_process: + out_links.update(output_process) + print("yeah we have out_link and len: ", len(out_links)) + print("yes succeed for smart filter link") + links += list(out_links.keys()) + print("link keys: ", links) + else: + print("no suceed, fallback to all tem links") + links += tem_links + print("this is links: ",links) + links = unique_preserve_order(links) + acc_score["source"] = links + else: + print("inside the try of reusing chunk or all output") + #print("chunk filename: ", str(chunks_filename)) + + try: + temp_source = False + if save_df is not None and not save_df.empty: + print("save df 
not none") + print("chunk file name: ",str(chunk_filename)) + print("all filename: ",str(all_filename)) + print("acc score for file chunk: ", acc_score["file_chunk"]) + print("acc score for file all output: ", acc_score["file_all_output"]) + if acc_score["file_chunk"]: + link = save_df.loc[save_df["file_chunk"]==acc_score["file_chunk"],"Sources"].iloc[0] + #link = row["Sources"].iloc[0] + if "http" in link: + print("yeah http in save df source") + acc_score["source"] = [x for x in link.split("\n") if x.strip()]#row["Sources"].tolist() + else: # temporary + print("tempo source") + #acc_score["source"] = [str(all_filename), str(chunks_filename)] + temp_source = True + elif acc_score["file_all_output"]: + link = save_df.loc[save_df["file_all_output"]==acc_score["file_all_output"],"Sources"].iloc[0] + #link = row["Sources"].iloc[0] + print(link) + print("list of link") + print([x for x in link.split("\n") if x.strip()]) + if "http" in link: + print("yeah http in save df source") + acc_score["source"] = [x for x in link.split("\n") if x.strip()]#row["Sources"].tolist() + else: # temporary + print("tempo source") + #acc_score["source"] = [str(all_filename), str(chunks_filename)] + temp_source = True + else: # temporary + print("tempo source") + #acc_score["source"] = [str(file_all_path), str(file_chunk_path)] + temp_source = True + else: # temporary + print("tempo source") + #acc_score["source"] = [str(file_all_path), str(file_chunk_path)] + temp_source = True + if temp_source: + print("temp source is true so have to try again search link") + if doi != "unknown": + link = 'https://doi.org/' + doi + # get the file to create listOfFile for each id + print("link of doi: ", link) + html = extractHTML.HTML("",link) + jsonSM = html.getSupMaterial() + article_text = html.getListSection() + if article_text: + if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower(): + out_links[link] = article_text + links.append(link) + if jsonSM: + links += sum((jsonSM[key] for key in jsonSM),[]) + if links: + for l in links: + out_links[l] = "" + # no doi then google custom search api + if doi=="unknown" or len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower(): + # might find the article + print("no article text, start tem link") + #tem_links = mtdna_classifier.search_google_custom(title, 2) + tem_links = smart_fallback.smart_google_search(meta_expand) + print("tem links: ", tem_links) + tem_link_acc = smart_fallback.google_accession_search(acc) + tem_links += tem_link_acc + tem_links = unique_preserve_order(tem_links) + print("tem link before filtering: ", tem_links) + # filter the quality link + print("saveLinkFolder as sample folder id: ", sample_folder_id) + print("start the smart filter link") + if stop_flag is not None and stop_flag.value: + print(f"🛑 Stop processing {accession}, aborting early...") + return {} + # success_process, output_process = run_with_timeout(smart_fallback.filter_links_by_metadata,args=(tem_links,sample_folder_id),kwargs={"accession":acc}) + # if success_process: + # links = output_process + # print("yes succeed for smart filter link") + # else: + # print("no suceed, fallback to all tem links") + # links = tem_links + #links = smart_fallback.filter_links_by_metadata(tem_links, saveLinkFolder=sample_folder_id, accession=acc, stop_flag=stop_flag) + success_process, 
output_process = run_with_timeout(smart_fallback.filter_links_by_metadata,args=(tem_links,sample_folder_id),kwargs={"accession":acc},timeout=90) + if success_process: + out_links.update(output_process) + print("yeah we have out_link and len: ", len(out_links)) + print("yes succeed for smart filter link") + links += list(out_links.keys()) + print("link keys: ", links) + else: + print("no suceed, fallback to all tem links") + links += tem_links + print("this is links: ",links) + links = unique_preserve_order(links) + acc_score["source"] = links + except: + try: + print("in the exception and start to get link") + if doi != "unknown": + link = 'https://doi.org/' + doi + # get the file to create listOfFile for each id + print("link of doi: ", link) + html = extractHTML.HTML("",link) + jsonSM = html.getSupMaterial() + article_text = html.getListSection() + if article_text: + if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower(): + out_links[link] = article_text + links.append(link) + if jsonSM: + links += sum((jsonSM[key] for key in jsonSM),[]) + if links: + for l in links: + out_links[l] = "" + # no doi then google custom search api + if doi=="unknown" or len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower(): + # might find the article + print("no article text, start tem link") + #tem_links = mtdna_classifier.search_google_custom(title, 2) + tem_links = smart_fallback.smart_google_search(meta_expand) + print("tem links: ", tem_links) + tem_link_acc = smart_fallback.google_accession_search(acc) + tem_links += tem_link_acc + tem_links = unique_preserve_order(tem_links) + print("tem link before filtering: ", tem_links) + # filter the quality link + print("saveLinkFolder as sample folder id: ", sample_folder_id) + print("start the smart filter link") + if stop_flag is not None and stop_flag.value: + print(f"🛑 Stop processing {accession}, aborting early...") + return {} + # success_process, output_process = run_with_timeout(smart_fallback.filter_links_by_metadata,args=(tem_links,sample_folder_id),kwargs={"accession":acc}) + # if success_process: + # links = output_process + # print("yes succeed for smart filter link") + # else: + # print("no suceed, fallback to all tem links") + # links = tem_links + #links = smart_fallback.filter_links_by_metadata(tem_links, saveLinkFolder=sample_folder_id, accession=acc, stop_flag=stop_flag) + success_process, output_process = run_with_timeout(smart_fallback.filter_links_by_metadata,args=(tem_links,sample_folder_id),kwargs={"accession":acc},timeout=90) + if success_process: + out_links.update(output_process) + print("yeah we have out_link and len: ", len(out_links)) + print("yes succeed for smart filter link") + links += list(out_links.keys()) + print("link keys: ", links) + else: + print("no suceed, fallback to all tem links") + links += tem_links + print("this is links: ",links) + links = unique_preserve_order(links) + acc_score["source"] = links + except: + print("except of except for source") + acc_score["source"] = [] + # chunk_path = "/"+saveTitle+"_merged_document.docx" + # all_path = "/"+saveTitle+"_all_merged_document.docx" + # # if chunk and all output not exist yet + # file_chunk_path = saveLinkFolder + chunk_path + # file_all_path = saveLinkFolder + all_path + # if os.path.exists(file_chunk_path): + # print("File chunk 
exists!") + # if not chunk: + # text, table, document_title = model.read_docx_text(file_chunk_path) + # chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table)) + # if os.path.exists(file_all_path): + # print("File all output exists!") + # if not all_output: + # text_all, table_all, document_title_all = model.read_docx_text(file_all_path) + # all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all)) + if stop_flag is not None and stop_flag.value: + print(f"🛑 Stop processing {accession}, aborting early...") + return {} + # print("chunk filename: ", chunk_filename) + # if chunk_exists: + # print("File chunk exists!") + # if not chunk: + # print("start to get chunk") + # text, table, document_title = model.read_docx_text(file_chunk_path) + # chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table)) + # if str(chunk_filename) != "": + # print("first time have chunk path at chunk exist: ", str(chunk_filename)) + # acc_score["file_chunk"] = str(chunk_filename) + # if all_exists: + # print("File all output exists!") + # if not all_output: + # text_all, table_all, document_title_all = model.read_docx_text(file_all_path) + # all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all)) + # if str(all_filename) != "": + # print("first time have all path at all exist: ", str(all_filename)) + # acc_score["file_all_output"] = str(all_filename) + if not chunk and not all_output: + print("not chunk and all output") + # else: check if we can reuse these chunk and all output of existed accession to find another + if str(chunk_filename) != "": + print("first time have chunk path: ", str(chunk_filename)) + acc_score["file_chunk"] = str(chunk_filename) + if str(all_filename) != "": + print("first time have all path: ", str(all_filename)) + acc_score["file_all_output"] = str(all_filename) + if links: + for link in links: + print(link) + # if len(all_output) > 1000*1000: + # all_output = data_preprocess.normalize_for_overlap(all_output) + # print("after normalizing all output: ", len(all_output)) + if len(data_preprocess.normalize_for_overlap(all_output)) > 600000: + print("break here") + break + if iso != "unknown": query_kw = iso + else: query_kw = acc + #text_link, tables_link, final_input_link = data_preprocess.preprocess_document(link,saveLinkFolder, isolate=query_kw) + + # success_process, output_process = run_with_timeout(data_preprocess.preprocess_document,args=(link,sample_folder_id),kwargs={"isolate":query_kw,"accession":acc},timeout=100) + # if stop_flag is not None and stop_flag.value: + # print(f"🛑 Stop processing {accession}, aborting early...") + # return {} + # if success_process: + # text_link, tables_link, final_input_link = output_process[0], output_process[1], output_process[2] + # print("yes succeed for process document") + # else: text_link, tables_link, final_input_link = "", "", "" + if out_links: + if link in out_links: + print("yeah art_text available") + art_text = out_links[link] + else: + art_text = None + else: + art_text = None + if art_text: + print("article text already available") + text_link = art_text + else: + try: + print("start preprocess and extract text") + text_link = data_preprocess.extract_text(link, sample_folder_id) + except: text_link = "" + try: + print("extract table start") + success, the_output = 
run_with_timeout(data_preprocess.extract_table,args=(link,sample_folder_id),timeout=10) + print("Returned from timeout logic") + if success: + tables_link = the_output#data_preprocess.merge_texts_skipping_overlap(all_output, final_input_link) + print("yes succeed for extract table") + else: + print("not suceed etxract table") + tables_link = [] + #tables = extract_table(link, saveFolder) + except: tables_link = [] + try: + # print("merge text and table start") + # success, the_output = pipeline.run_with_timeout(merge_text_and_tables,kwargs={"text":text,"tables":tables,"accession_id":accession, "isolate":isolate},timeout=30) + # print("Returned from timeout logic") + # if success: + # final_input = the_output#data_preprocess.merge_texts_skipping_overlap(all_output, final_input_link) + # print("yes succeed") + # else: + # print("not suceed") + print("just merge text and tables") + final_input_link = text_link + ", ".join(tables_link) + #final_input = pipeline.timeout(merge_text_and_tables(text, tables, max_tokens=12000, accession_id=accession, isolate=isolate) + except: + print("no succeed here in preprocess docu") + final_input_link = "" + + context = data_preprocess.extract_context(final_input_link, query_kw) + chunk += context + # if context != "Sample ID not found.": + # if len(data_preprocess.normalize_for_overlap(chunk)) < 1000*1000: + # success_chunk, the_output_chunk = run_with_timeout(data_preprocess.merge_texts_skipping_overlap,args=(chunk, context)) + # if stop_flag is not None and stop_flag.value: + # print(f"🛑 Stop processing {accession}, aborting early...") + # return {} + # if success_chunk: + # chunk = the_output_chunk#data_preprocess.merge_texts_skipping_overlap(all_output, final_input_link) + # print("yes succeed for chunk") + # else: + # chunk += context + # print("len context: ", len(context)) + # print("basic fall back") + # print("len chunk after: ", len(chunk)) + if len(final_input_link) > 1000*1000: + # if context != "Sample ID not found.": + # final_input_link = context + # else: + final_input_link = data_preprocess.normalize_for_overlap(final_input_link) + if len(final_input_link) > 1000 *1000: + final_input_link = final_input_link[:100000] + print("len normalized all output: ", len(data_preprocess.normalize_for_overlap(all_output))) + # if len(data_preprocess.normalize_for_overlap(all_output)) < int(100000) and len(final_input_link)<100000: + # print("Running merge_texts_skipping_overlap with timeout") + # success, the_output = run_with_timeout(data_preprocess.merge_texts_skipping_overlap,args=(all_output, final_input_link),timeout=30) + # if stop_flag is not None and stop_flag.value: + # print(f"🛑 Stop processing {accession}, aborting early...") + # return {} + # print("Returned from timeout logic") + # if success: + # all_output = the_output#data_preprocess.merge_texts_skipping_overlap(all_output, final_input_link) + # print("yes succeed") + # else: + # print("len all output: ", len(all_output)) + # print("len final input link: ", len(final_input_link)) + # all_output += final_input_link + # print("len final input: ", len(final_input_link)) + # print("basic fall back") + # else: + # print("both/either all output or final link too large more than 100000") + # print("len all output: ", len(all_output)) + # print("len final input link: ", len(final_input_link)) + # all_output += final_input_link + # print("len final input: ", len(final_input_link)) + # print("basic fall back") + + print("len all output: ", len(all_output)) + print("len final input link: ", 
len(final_input_link)) + all_output = data_preprocess.normalize_for_overlap(all_output) + final_input_link + print("len final input: ", len(final_input_link)) + print("basic fall back") + print("len all output after: ", len(all_output)) + #country_pro, chunk, all_output = data_preprocess.process_inputToken(links, saveLinkFolder, accession=accession, isolate=isolate) + else: + chunk = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features + all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features + if not chunk: chunk = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features + if not all_output: all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features + if len(all_output) > 1*1000*1000: + all_output = data_preprocess.normalize_for_overlap(all_output) + if len(all_output) > 1*1000*1000: + all_output = all_output[:1000000] + if len(chunk) > 1*1000*1000: + chunk = data_preprocess.normalize_for_overlap(chunk) + if len(chunk) > 1*1000*1000: + chunk = chunk[:1*1000*1000] + print("chunk len: ", len(chunk)) + print("all output len: ", len(all_output)) + data_preprocess.save_text_to_docx(chunk, file_chunk_path) + data_preprocess.save_text_to_docx(all_output, file_all_path) + # Later when saving new files + # data_preprocess.save_text_to_docx(chunk, chunk_filename, sample_folder_id) + # data_preprocess.save_text_to_docx(all_output, all_filename, sample_folder_id) + + # Upload to Drive + result_chunk_upload = upload_file_to_drive(file_chunk_path, chunk_filename, sample_folder_id) + result_all_upload = upload_file_to_drive(file_all_path, all_filename, sample_folder_id) + print("UPLOAD RESULT FOR CHUNK: ", result_chunk_upload) + print(f"🔗 Uploaded file: https://drive.google.com/file/d/{result_chunk_upload}/view") + print("here 1") + + # else: + # final_input = "" + # if all_output: + # final_input = all_output + # else: + # if chunk: final_input = chunk + # #data_preprocess.merge_texts_skipping_overlap(final_input, all_output) + # if final_input: + # keywords = [] + # if iso != "unknown": keywords.append(iso) + # if acc != "unknown": keywords.append(acc) + # for keyword in keywords: + # chunkBFS = data_preprocess.get_contextual_sentences_BFS(final_input, keyword) + # countryDFS, chunkDFS = data_preprocess.get_contextual_sentences_DFS(final_input, keyword) + # chunk = data_preprocess.merge_texts_skipping_overlap(chunk, chunkDFS) + # chunk = data_preprocess.merge_texts_skipping_overlap(chunk, chunkBFS) + + # Define paths for cached RAG assets + # faiss_index_path = saveLinkFolder+"/faiss_index.bin" + # document_chunks_path = saveLinkFolder+"/document_chunks.json" + # structured_lookup_path = saveLinkFolder+"/structured_lookup.json" + print("here 2") + + # faiss_filename = "faiss_index.bin" + # chunks_filename = "document_chunks.json" + # lookup_filename = "structured_lookup.json" + # print("name of faiss: ", faiss_filename) + + # faiss_index_path = os.path.join(LOCAL_TEMP_DIR, faiss_filename) + # document_chunks_path = os.path.join(LOCAL_TEMP_DIR, chunks_filename) + # structured_lookup_path = os.path.join(LOCAL_TEMP_DIR, lookup_filename) + # print("name if faiss path: ", faiss_index_path) + # # 🔥 Remove the local file first if it exists + # print("start faiss id and also the sample folder id is: ", sample_folder_id) + # faiss_id = find_drive_file(faiss_filename, sample_folder_id) + # print("done 
faiss id") + # document_id = find_drive_file(chunks_filename, sample_folder_id) + # structure_id = find_drive_file(lookup_filename, sample_folder_id) + # if faiss_id and document_id and structure_id: + # print("✅ 3 Files already exist in Google Drive. Downloading them...") + # download_file_from_drive(faiss_filename, sample_folder_id, faiss_index_path) + # download_file_from_drive(chunks_filename, sample_folder_id, document_chunks_path) + # download_file_from_drive(lookup_filename, sample_folder_id, structured_lookup_path) + # # Read and parse these into `chunk` and `all_output` + # else: + # "one of id not exist" + # if os.path.exists(faiss_index_path): + # print("faiss index exist and start to remove: ", faiss_index_path) + # os.remove(faiss_index_path) + # if os.path.exists(document_chunks_path): + # os.remove(document_chunks_path) + # if os.path.exists(structured_lookup_path): + # os.remove(structured_lookup_path) + # print("start to download the faiss, chunk, lookup") + + # download_file_from_drive(faiss_filename, sample_folder_id, faiss_index_path) + # download_file_from_drive(chunks_filename, sample_folder_id, document_chunks_path) + # download_file_from_drive(lookup_filename, sample_folder_id, structured_lookup_path) + try: + print("try gemini 2.5") + # print("move to load rag") + # master_structured_lookup, faiss_index, document_chunks = model.load_rag_assets( + # faiss_index_path, document_chunks_path, structured_lookup_path + # ) + + global_llm_model_for_counting_tokens = genai.GenerativeModel("gemini-2.5-flash-lite")#('gemini-1.5-flash-latest') + if not all_output: + if chunk: all_output = chunk + else: all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features + # if faiss_index is None: + # print("\nBuilding RAG assets (structured lookup, FAISS index, chunks)...") + # total_doc_embedding_tokens = global_llm_model_for_counting_tokens.count_tokens( + # all_output + # ).total_tokens + + # initial_embedding_cost = (total_doc_embedding_tokens / 1000) * PRICE_PER_1K_EMBEDDING_INPUT + # total_cost_title += initial_embedding_cost + # print(f"Initial one-time embedding cost for '{file_all_path}' ({total_doc_embedding_tokens} tokens): ${initial_embedding_cost:.6f}") + + + # master_structured_lookup, faiss_index, document_chunks, plain_text_content = model.build_vector_index_and_data( + # file_all_path, faiss_index_path, document_chunks_path, structured_lookup_path + # ) + # else: + # print("\nRAG assets loaded from file. No re-embedding of entire document will occur.") + # plain_text_content_all, table_strings_all, document_title_all = model.read_docx_text(file_all_path) + # master_structured_lookup['document_title'] = master_structured_lookup.get('document_title', document_title_all) + if stop_flag is not None and stop_flag.value: + print(f"🛑 Stop processing {accession}, aborting early...") + return {} + primary_word = iso + alternative_word = acc + print(f"\n--- General Query: Primary='{primary_word}' (Alternative='{alternative_word}') ---") + if features.lower() not in all_output.lower(): + all_output += ". 
NCBI Features: " + features + # country, sample_type, method_used, ethnic, spe_loc, total_query_cost = model.query_document_info( + # primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks, + # model.call_llm_api, chunk=chunk, all_output=all_output) + print("this is chunk for the model") + print(chunk) + print("this is all output for the model") + print(all_output) + if stop_flag is not None and stop_flag.value: + print(f"🛑 Stop processing {accession}, aborting early...") + return {} + # country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost = model.query_document_info( + # primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks, + # model.call_llm_api, chunk=chunk, all_output=all_output) + country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost = model.query_document_info( + query_word=primary_word, alternative_query_word=alternative_word, + metadata=meta, + master_structured_lookup=None, faiss_index=None, document_chunks=None, + llm_api_function=model.call_llm_api, chunk=chunk, all_output=all_output) + print("pass query of 2.5") + except: + print("try gemini 1.5") + # country, sample_type, ethnic, spe_loc, method_used, country_explanation, sample_type_explanation, ethnicity_explanation, specific_loc_explanation, total_query_cost = model.query_document_info( + # primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks, + # model.call_llm_api, chunk=chunk, all_output=all_output, model_ai="gemini-1.5-flash-latest") + country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost = model.query_document_info( + query_word=primary_word, alternative_query_word=alternative_word, + metadata=meta, + master_structured_lookup=None, faiss_index=None, document_chunks=None, + llm_api_function=model.call_llm_api, chunk=chunk, all_output=all_output, model_ai="gemini-1.5-flash-latest") + print("yeah pass the query of 1.5") + print("country using ai: ", country) + print("sample type using ai: ", sample_type) + # if len(country) == 0: country = "unknown" + # if len(sample_type) == 0: sample_type = "unknown" + # if country_explanation: country_explanation = "-"+country_explanation + # else: country_explanation = "" + # if sample_type_explanation: sample_type_explanation = "-"+sample_type_explanation + # else: sample_type_explanation = "" + if len(country) == 0: country = "unknown" + if len(sample_type) == 0: sample_type = "unknown" + if country_explanation and country_explanation!="unknown": country_explanation = "-"+country_explanation + else: country_explanation = "" + if sample_type_explanation and sample_type_explanation!="unknown": sample_type_explanation = "-"+sample_type_explanation + else: sample_type_explanation = "" + + if method_used == "unknown": method_used = "" + if country.lower() != "unknown": + stand_country = standardize_location.smart_country_lookup(country.lower()) + if stand_country.lower() != "not found": + if stand_country.lower() in acc_score["country"]: + if country_explanation: + acc_score["country"][stand_country.lower()].append(method_used + country_explanation) + else: + acc_score["country"][stand_country.lower()] = [method_used + country_explanation] + else: + if country.lower() in acc_score["country"]: + if country_explanation: + if len(method_used + country_explanation) > 0: + acc_score["country"][country.lower()].append(method_used + 
country_explanation) + else: + if len(method_used + country_explanation) > 0: + acc_score["country"][country.lower()] = [method_used + country_explanation] + # if spe_loc.lower() != "unknown": + # if spe_loc.lower() in acc_score["specific_location"]: + # acc_score["specific_location"][spe_loc.lower()].append(method_used) + # else: + # acc_score["specific_location"][spe_loc.lower()] = [method_used] + # if ethnic.lower() != "unknown": + # if ethnic.lower() in acc_score["ethnicity"]: + # acc_score["ethnicity"][ethnic.lower()].append(method_used) + # else: + # acc_score["ethnicity"][ethnic.lower()] = [method_used] + if sample_type.lower() != "unknown": + if sample_type.lower() in acc_score["sample_type"]: + if len(method_used + sample_type_explanation) > 0: + acc_score["sample_type"][sample_type.lower()].append(method_used + sample_type_explanation) + else: + if len(method_used + sample_type_explanation)> 0: + acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation] + total_cost_title += total_query_cost + if stop_flag is not None and stop_flag.value: + print(f"🛑 Stop processing {accession}, aborting early...") + return {} + # last resort: combine all information to give all output otherwise unknown + if len(acc_score["country"]) == 0 or len(acc_score["sample_type"]) == 0 or acc_score["country"] == "unknown" or acc_score["sample_type"] == "unknown": + text = "" + for key in meta_expand: + text += str(key) + ": " + meta_expand[key] + "\n" + if len(data_preprocess.normalize_for_overlap(all_output)) > 0: + text += data_preprocess.normalize_for_overlap(all_output) + if len(data_preprocess.normalize_for_overlap(chunk)) > 0: + text += data_preprocess.normalize_for_overlap(chunk) + text += ". NCBI Features: " + features + print("this is text for the last resort model") + print(text) + # country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost = model.query_document_info( + # primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks, + # model.call_llm_api, chunk=text, all_output=text) + country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost = model.query_document_info( + query_word=primary_word, alternative_query_word=alternative_word, + metadata=meta, + master_structured_lookup=None, faiss_index=None, document_chunks=None, + llm_api_function=model.call_llm_api, chunk=text, all_output=text) + print("this is last resort results: ") + print("country: ", country) + print("sample type: ", sample_type) + if len(country) == 0: country = "unknown" + if len(sample_type) == 0: sample_type = "unknown" + # if country_explanation: country_explanation = "-"+country_explanation + # else: country_explanation = "" + # if sample_type_explanation: sample_type_explanation = "-"+sample_type_explanation + # else: sample_type_explanation = "" + if country_explanation and country_explanation!="unknown": country_explanation = "-"+country_explanation + else: country_explanation = "" + if sample_type_explanation and sample_type_explanation!="unknown": sample_type_explanation = "-"+sample_type_explanation + else: sample_type_explanation = "" + + if method_used == "unknown": method_used = "" + if country.lower() != "unknown": + stand_country = standardize_location.smart_country_lookup(country.lower()) + if stand_country.lower() != "not found": + if stand_country.lower() in acc_score["country"]: + if country_explanation: + 
acc_score["country"][stand_country.lower()].append(method_used + country_explanation) + else: + acc_score["country"][stand_country.lower()] = [method_used + country_explanation] + else: + if country.lower() in acc_score["country"]: + if country_explanation: + if len(method_used + country_explanation) > 0: + acc_score["country"][country.lower()].append(method_used + country_explanation) + else: + if len(method_used + country_explanation) > 0: + acc_score["country"][country.lower()] = [method_used + country_explanation] + if sample_type.lower() != "unknown": + if sample_type.lower() in acc_score["sample_type"]: + if len(method_used + sample_type_explanation) > 0: + acc_score["sample_type"][sample_type.lower()].append(method_used + sample_type_explanation) + else: + if len(method_used + sample_type_explanation)> 0: + acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation] + total_cost_title += total_query_cost + end = time.time() + #total_cost_title += total_query_cost + acc_score["query_cost"] = f"{total_cost_title:.6f}" + elapsed = end - start + acc_score["time_cost"] = f"{elapsed:.3f} seconds" + accs_output[acc] = acc_score + print(accs_output[acc]) + return accs_output \ No newline at end of file