VyLala committed · verified
Commit efbe3bf
1 Parent(s): 8835144

Update pipeline.py

Files changed (1)
  1. pipeline.py +650 -648
pipeline.py CHANGED
@@ -1,649 +1,651 @@
[... unchanged lines hidden ...]
- meta = mtdna_classifier.fetch_ncbi_metadata("unknown")
[... unchanged lines hidden ...]
- meta_expand = smart_fallback.fetch_ncbi("unknown")
[... unchanged lines hidden ...]
- acc_score["country"][country.lower()].append(method_used + country_explanation)
573
- else:
574
- if len(method_used + country_explanation) > 0:
575
- acc_score["country"][country.lower()] = [method_used + country_explanation]
576
- # if spe_loc.lower() != "unknown":
577
- # if spe_loc.lower() in acc_score["specific_location"]:
578
- # acc_score["specific_location"][spe_loc.lower()].append(method_used)
579
- # else:
580
- # acc_score["specific_location"][spe_loc.lower()] = [method_used]
581
- # if ethnic.lower() != "unknown":
582
- # if ethnic.lower() in acc_score["ethnicity"]:
583
- # acc_score["ethnicity"][ethnic.lower()].append(method_used)
584
- # else:
585
- # acc_score["ethnicity"][ethnic.lower()] = [method_used]
586
- if sample_type.lower() != "unknown":
587
- if sample_type.lower() in acc_score["sample_type"]:
588
- if len(method_used + sample_type_explanation) > 0:
589
- acc_score["sample_type"][sample_type.lower()].append(method_used + sample_type_explanation)
590
- else:
591
- if len(method_used + sample_type_explanation)> 0:
592
- acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation]
593
- # last resort: combine all information to give all output otherwise unknown
594
- if len(acc_score["country"]) == 0 or len(acc_score["sample_type"]) == 0:
595
- text = ""
596
- for key in meta_expand:
597
- text += str(key) + ": " + meta_expand[key] + "\n"
598
- if len(data_preprocess.normalize_for_overlap(all_output)) > 0:
599
- text += data_preprocess.normalize_for_overlap(all_output)
600
- if len(data_preprocess.normalize_for_overlap(chunk)) > 0:
601
- text += data_preprocess.normalize_for_overlap(chunk)
602
- text += ". NCBI Features: " + features
603
- print("this is text for the last resort model")
604
- print(text)
605
- country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost = model.query_document_info(
606
- primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
607
- model.call_llm_api, chunk=text, all_output=text)
608
- print("this is last resort results: ")
609
- print("country: ", country)
610
- print("sample type: ", sample_type)
611
- if len(country) == 0: country = "unknown"
612
- if len(sample_type) == 0: sample_type = "unknown"
613
- if country_explanation: country_explanation = "-"+country_explanation
614
- else: country_explanation = ""
615
- if sample_type_explanation: sample_type_explanation = "-"+sample_type_explanation
616
- else: sample_type_explanation = ""
617
- if method_used == "unknown": method_used = ""
618
- if country.lower() != "unknown":
619
- stand_country = standardize_location.smart_country_lookup(country.lower())
620
- if stand_country.lower() != "not found":
621
- if stand_country.lower() in acc_score["country"]:
622
- if country_explanation:
623
- acc_score["country"][stand_country.lower()].append(method_used + country_explanation)
624
- else:
625
- acc_score["country"][stand_country.lower()] = [method_used + country_explanation]
626
- else:
627
- if country.lower() in acc_score["country"]:
628
- if country_explanation:
629
- if len(method_used + country_explanation) > 0:
630
- acc_score["country"][country.lower()].append(method_used + country_explanation)
631
- else:
632
- if len(method_used + country_explanation) > 0:
633
- acc_score["country"][country.lower()] = [method_used + country_explanation]
634
- if sample_type.lower() != "unknown":
635
- if sample_type.lower() in acc_score["sample_type"]:
636
- if len(method_used + sample_type_explanation) > 0:
637
- acc_score["sample_type"][sample_type.lower()].append(method_used + sample_type_explanation)
638
- else:
639
- if len(method_used + sample_type_explanation)> 0:
640
- acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation]
641
- end = time.time()
642
- total_cost_title += total_query_cost
643
- acc_score["query_cost"] = f"{total_cost_title:.6f}"
644
- elapsed = end - start
645
- acc_score["time_cost"] = f"{elapsed:.3f} seconds"
646
- accs_output[acc] = acc_score
647
- print(accs_output[acc])
648
-
 
 
649
  return accs_output
 
1
+ # test1: MJ17 direct
2
+ # test2: "A1YU101" thailand cross-ref
3
+ # test3: "EBK109" thailand cross-ref
4
+ # test4: "OQ731952"/"BST115" for search query title: "South Asian maternal and paternal lineages in southern Thailand and"
5
+ import data_preprocess
6
+ import model
7
+ import mtdna_classifier
8
+ #import app
9
+ import smart_fallback
10
+ import pandas as pd
11
+ from pathlib import Path
12
+ import subprocess
13
+ from NER.html import extractHTML
14
+ import os
15
+ import google.generativeai as genai
16
+ import re
17
+ import standardize_location
18
+ # Helper functions for this pipeline
19
+ # Track time
20
+ import time
21
+ import multiprocessing
22
+ import gspread
23
+ from googleapiclient.discovery import build
24
+ from googleapiclient.http import MediaFileUpload, MediaIoBaseDownload
25
+ from google.oauth2.service_account import Credentials
26
+ from oauth2client.service_account import ServiceAccountCredentials
27
+ import io
28
+ import json
29
+ #––– Authentication setup –––
30
+ GDRIVE_PARENT_FOLDER_NAME = "mtDNA-Location-Classifier"
31
+ GDRIVE_DATA_FOLDER_NAME = os.environ["GDRIVE_DATA_FOLDER_NAME"]
32
+ GCP_CREDS_DICT = json.loads(os.environ["GCP_CREDS_JSON"]) # from HF secrets
33
+ GDRIVE_CREDS = Credentials.from_service_account_info(GCP_CREDS_DICT, scopes=["https://www.googleapis.com/auth/drive"])
34
+ drive_service = build("drive", "v3", credentials=GDRIVE_CREDS)
35
+
36
+ def get_or_create_drive_folder(name, parent_id=None):
37
+ query = f"name='{name}' and mimeType='application/vnd.google-apps.folder'"
38
+ if parent_id:
39
+ query += f" and '{parent_id}' in parents"
40
+ results = drive_service.files().list(q=query, spaces='drive', fields="files(id, name)").execute()
41
+ items = results.get("files", [])
42
+ if items:
43
+ return items[0]["id"]
44
+ file_metadata = {
45
+ "name": name,
46
+ "mimeType": "application/vnd.google-apps.folder"
47
+ }
48
+ if parent_id:
49
+ file_metadata["parents"] = [parent_id]
50
+ file = drive_service.files().create(body=file_metadata, fields="id").execute()
51
+ return file["id"]
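+ # Returns the ID of the first matching Drive folder, creating it when absent; used below as
+ # get_or_create_drive_folder(str(id), parent_id=data_folder_id) to resolve the per-sample folder.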
52
+ def find_drive_file(filename, parent_id):
53
+ """
54
+ Checks if a file with the given name exists inside the specified Google Drive folder.
55
+ Returns the file ID if found, else None.
56
+ """
57
+ query = f"'{parent_id}' in parents and name = '{filename}' and trashed = false"
58
+ results = drive_service.files().list(q=query, spaces='drive', fields='files(id, name)', pageSize=1).execute()
59
+ files = results.get('files', [])
60
+ if files:
61
+ return files[0]["id"]
62
+ return None
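+ # Note: pageSize=1, so only the first match inside the parent folder is returned even if duplicates exist.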
63
+
64
+
65
+ # def upload_file_to_drive(local_path, remote_name, folder_id):
66
+ # file_metadata = {"name": remote_name, "parents": [folder_id]}
67
+ # media = MediaFileUpload(local_path, resumable=True)
68
+ # existing = drive_service.files().list(q=f"name='{remote_name}' and '{folder_id}' in parents", fields="files(id)").execute().get("files", [])
69
+ # if existing:
70
+ # drive_service.files().delete(fileId=existing[0]["id"]).execute()
71
+ # file = drive_service.files().create(body=file_metadata, media_body=media, fields="id").execute()
72
+ # result = drive_service.files().list(q=f"name='{remote_name}' and '{folder_id}' in parents", fields="files(id)").execute()
73
+ # if not result.get("files"):
74
+ # print(f"❌ Upload failed: File '{remote_name}' not found in folder after upload.")
75
+ # else:
76
+ # print(f"βœ… Verified upload: {remote_name}")
77
+ # return file["id"]
78
+ def upload_file_to_drive(local_path, remote_name, folder_id):
79
+ try:
80
+ if not os.path.exists(local_path):
81
+ raise FileNotFoundError(f"❌ Local file does not exist: {local_path}")
82
+
83
+ # Delete existing file on Drive if present
84
+ existing = drive_service.files().list(
85
+ q=f"name='{remote_name}' and '{folder_id}' in parents and trashed = false",
86
+ fields="files(id)"
87
+ ).execute().get("files", [])
88
+
89
+ if existing:
90
+ drive_service.files().delete(fileId=existing[0]["id"]).execute()
91
+ print(f"πŸ—‘οΈ Deleted existing '{remote_name}' in Drive folder {folder_id}")
92
+
93
+ file_metadata = {"name": remote_name, "parents": [folder_id]}
94
+ media = MediaFileUpload(local_path, resumable=True)
95
+ file = drive_service.files().create(
96
+ body=file_metadata,
97
+ media_body=media,
98
+ fields="id"
99
+ ).execute()
100
+
101
+ print(f"βœ… Uploaded '{remote_name}' to Google Drive folder ID: {folder_id}")
102
+ return file["id"]
103
+
104
+ except Exception as e:
105
+ print(f"❌ Error during upload: {e}")
106
+ return None
107
+
108
+
109
+ def download_file_from_drive(remote_name, folder_id, local_path):
110
+ results = drive_service.files().list(q=f"name='{remote_name}' and '{folder_id}' in parents", fields="files(id)").execute()
111
+ files = results.get("files", [])
112
+ if not files:
113
+ return False
114
+ file_id = files[0]["id"]
115
+ request = drive_service.files().get_media(fileId=file_id)
116
+ fh = io.FileIO(local_path, 'wb')
117
+ downloader = MediaIoBaseDownload(fh, request)
118
+ done = False
119
+ while not done:
120
+ _, done = downloader.next_chunk()
121
+ return True
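+ # Streams the first Drive file named remote_name into local_path via MediaIoBaseDownload;
+ # returns False when no such file exists, True once the download completes.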
122
+ def download_drive_file_content(file_id):
123
+ request = drive_service.files().get_media(fileId=file_id)
124
+ fh = io.BytesIO()
125
+ downloader = MediaIoBaseDownload(fh, request)
126
+ done = False
127
+ while not done:
128
+ _, done = downloader.next_chunk()
129
+ fh.seek(0)
130
+ return fh.read().decode("utf-8")
131
+
132
+ # def run_with_timeout(func, args=(), kwargs={}, timeout=20):
133
+ # """
134
+ # Runs `func` with timeout in seconds. Kills if it exceeds.
135
+ # Returns: (success, result or None)
136
+ # """
137
+ # def wrapper(q, *args, **kwargs):
138
+ # try:
139
+ # q.put(func(*args, **kwargs))
140
+ # except Exception as e:
141
+ # q.put(e)
142
+
143
+ # q = multiprocessing.Queue()
144
+ # p = multiprocessing.Process(target=wrapper, args=(q, *args), kwargs=kwargs)
145
+ # p.start()
146
+ # p.join(timeout)
147
+
148
+ # if p.is_alive():
149
+ # p.terminate()
150
+ # p.join()
151
+ # print(f"⏱️ Timeout exceeded ({timeout} sec) β€” function killed.")
152
+ # return False, None
153
+ # else:
154
+ # result = q.get()
155
+ # if isinstance(result, Exception):
156
+ # raise result
157
+ # return True, result
158
+ def run_with_timeout(func, args=(), kwargs={}, timeout=30):
159
+ import concurrent.futures
160
+ with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
161
+ future = executor.submit(func, *args, **kwargs)
162
+ try:
163
+ return True, future.result(timeout=timeout)
164
+ except concurrent.futures.TimeoutError:
165
+ print(f"⏱️ Timeout exceeded ({timeout} sec) β€” function killed.")
166
+ return False, None
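+ # Caveat: unlike the multiprocessing variant commented out above, a ThreadPoolExecutor cannot kill the
+ # worker, so on timeout the call is only abandoned and may keep running in a background thread.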
167
+
168
+ def time_it(func, *args, **kwargs):
169
+ """
170
+ Measure how long a function takes to run and return its result + time.
171
+ """
172
+ start = time.time()
173
+ result = func(*args, **kwargs)
174
+ end = time.time()
175
+ elapsed = end - start
176
+ print(f"⏱️ '{func.__name__}' took {elapsed:.3f} seconds")
177
+ return result, elapsed
178
+ # --- Define Pricing Constants (for Gemini 1.5 Flash & text-embedding-004) ---
179
+ def track_gemini_cost():
180
+ # Prices are per 1,000 tokens
181
+ PRICE_PER_1K_INPUT_LLM = 0.000075 # $0.075 per 1M tokens
182
+ PRICE_PER_1K_OUTPUT_LLM = 0.0003 # $0.30 per 1M tokens
183
+ PRICE_PER_1K_EMBEDDING_INPUT = 0.000025 # $0.025 per 1M tokens
184
+ return True
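+ # These constants are informational here; the actual cost accounting happens in pipeline_with_gemini
+ # via total_cost_title and total_query_cost.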
185
+
186
+ def unique_preserve_order(seq):
187
+ seen = set()
188
+ return [x for x in seq if not (x in seen or seen.add(x))]
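+ # Example: unique_preserve_order(["a", "b", "a", "c"]) -> ["a", "b", "c"] (first occurrences kept, order preserved).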
189
+ # Main execution
190
+ def pipeline_with_gemini(accessions):
191
+ # output: country, sample_type, ethnic, location, money_cost, time_cost, explain
192
+ # there can be one accession number in the accessions
193
+ # Prices are per 1,000 tokens
194
+ PRICE_PER_1K_INPUT_LLM = 0.000075 # $0.075 per 1M tokens
195
+ PRICE_PER_1K_OUTPUT_LLM = 0.0003 # $0.30 per 1M tokens
196
+ PRICE_PER_1K_EMBEDDING_INPUT = 0.000025 # $0.025 per 1M tokens
197
+ if not accessions:
198
+ print("no input")
199
+ return None
200
+ else:
201
+ accs_output = {}
202
+ genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
203
+ for acc in accessions:
204
+ print("start gemini: ", acc)
205
+ start = time.time()
206
+ total_cost_title = 0
207
+ jsonSM, links, article_text = {},[], ""
208
+ acc_score = { "isolate": "",
209
+ "country":{},
210
+ "sample_type":{},
211
+ #"specific_location":{},
212
+ #"ethnicity":{},
213
+ "query_cost":total_cost_title,
214
+ "time_cost":None,
215
+ "source":links}
216
+ meta = mtdna_classifier.fetch_ncbi_metadata(acc)
217
+ country, spe_loc, ethnic, sample_type, col_date, iso, title, doi, pudID, features = meta["country"], meta["specific_location"], meta["ethnicity"], meta["sample_type"], meta["collection_date"], meta["isolate"], meta["title"], meta["doi"], meta["pubmed_id"], meta["all_features"]
218
+ acc_score["isolate"] = iso
219
+ print("meta: ",meta)
220
+ meta_expand = smart_fallback.fetch_ncbi(acc)
221
+ print("meta expand: ", meta_expand)
222
+ # set up step: create the folder to save document
223
+ chunk, all_output = "",""
224
+ if pudID:
225
+ id = str(pudID)
226
+ saveTitle = title
227
+ else:
228
+ try:
229
+ author_name = meta_expand["authors"].split(',')[0] # Use last name only
230
+ except Exception:
231
+ author_name = meta_expand["authors"]
232
+ saveTitle = title + "_" + col_date + "_" + author_name
233
+ if title.lower() == "unknown" and col_date.lower()=="unknown" and author_name.lower() == "unknown":
234
+ saveTitle += "_" + acc
235
+ id = "DirectSubmission"
236
+ # folder_path = Path("/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id))
237
+ # if not folder_path.exists():
238
+ # cmd = f'mkdir /content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/{id}'
239
+ # result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
240
+ # print("data/"+str(id) +" created.")
241
+ # else:
242
+ # print("data/"+str(id) +" already exists.")
243
+ # saveLinkFolder = "/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id)
244
+ # parent_folder_id = get_or_create_drive_folder(GDRIVE_PARENT_FOLDER_NAME)
245
+ # data_folder_id = get_or_create_drive_folder(GDRIVE_DATA_FOLDER_NAME, parent_id=parent_folder_id)
246
+ # sample_folder_id = get_or_create_drive_folder(str(id), parent_id=data_folder_id)
247
+ data_folder_id = GDRIVE_DATA_FOLDER_NAME # Use the shared folder directly
248
+ sample_folder_id = get_or_create_drive_folder(str(id), parent_id=data_folder_id)
249
+ print("sample folder id: ", sample_folder_id)
250
+
251
+ # Define document names
252
+ if len(saveTitle) > 50:
253
+ saveName = saveTitle[:50]
254
+ saveName = saveName.replace(" ", "_")
255
+ chunk_filename = f"{saveName}_merged_document.docx"
256
+ all_filename = f"{saveName}_all_merged_document.docx"
257
+ else:
258
+ saveName = saveTitle.replace(" ", "_")
259
+ chunk_filename = f"{saveName}_merged_document.docx"
260
+ all_filename = f"{saveName}_all_merged_document.docx"
261
+ print(chunk_filename, all_filename)
262
+ # Define local temp paths for reading/writing
263
+ # import tempfile
264
+ # tmp_dir = tempfile.mkdtemp()
265
+ LOCAL_TEMP_DIR = "/mnt/data/generated_docs"
266
+ os.makedirs(LOCAL_TEMP_DIR, exist_ok=True)
267
+ file_chunk_path = os.path.join(LOCAL_TEMP_DIR, chunk_filename)
268
+ file_all_path = os.path.join(LOCAL_TEMP_DIR, all_filename)
269
+ # file_chunk_path = os.path.join(tempfile.gettempdir(), chunk_filename)
270
+ # file_all_path = os.path.join(tempfile.gettempdir(), all_filename)
271
+ print(file_chunk_path)
272
+ chunk_id = find_drive_file(chunk_filename, sample_folder_id)
273
+ all_id = find_drive_file(all_filename, sample_folder_id)
274
+
275
+ if chunk_id and all_id:
276
+ print("βœ… Files already exist in Google Drive. Downloading them...")
277
+ chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path)
278
+ all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
279
+ print("chunk_id and all_id: ")
280
+ print(chunk_id, all_id)
281
+ file = drive_service.files().get(fileId="1LUJRTrq8yt4S4lLwCvTmlxaKqpr0nvEn", fields="id, name, parents, webViewLink").execute()
282
+ print("πŸ“„ Name:", file["name"])
283
+ print("πŸ“ Parent folder ID:", file["parents"][0])
284
+ print("πŸ”— View link:", file["webViewLink"])
285
+
286
+
287
+ # Read and parse these into `chunk` and `all_output`
288
+ else:
289
+ # 🔥 Remove any stale local copies
290
+ if os.path.exists(file_chunk_path):
291
+ os.remove(file_chunk_path)
292
+ print(f"πŸ—‘οΈ Removed stale: {file_chunk_path}")
293
+ if os.path.exists(file_all_path):
294
+ os.remove(file_all_path)
295
+ print(f"πŸ—‘οΈ Removed stale: {file_all_path}")
296
+ # 🔥 Remove the local file first if it exists
297
+ # if os.path.exists(file_chunk_path):
298
+ # os.remove(file_chunk_path)
299
+ # print("remove chunk path")
300
+ # if os.path.exists(file_all_path):
301
+ # os.remove(file_all_path)
302
+ # print("remove all path")
303
+ # Try to download if already exists on Drive
304
+ chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path)
305
+ all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
306
+ print("chunk exist: ", chunk_exists)
307
+ # first way: ncbi method
308
+ print("country.lower: ",country.lower())
309
+ if country.lower() != "unknown":
310
+ stand_country = standardize_location.smart_country_lookup(country.lower())
311
+ print("stand_country: ", stand_country)
312
+ if stand_country.lower() != "not found":
313
+ acc_score["country"][stand_country.lower()] = ["ncbi"]
314
+ else: acc_score["country"][country.lower()] = ["ncbi"]
315
+ # if spe_loc.lower() != "unknown":
316
+ # acc_score["specific_location"][spe_loc.lower()] = ["ncbi"]
317
+ # if ethnic.lower() != "unknown":
318
+ # acc_score["ethnicity"][ethnic.lower()] = ["ncbi"]
319
+ if sample_type.lower() != "unknown":
320
+ acc_score["sample_type"][sample_type.lower()] = ["ncbi"]
321
+ # second way: LLM model
322
+ # Preprocess the input token
323
+ print(acc_score)
324
+ accession, isolate = None, None
325
+ if acc != "unknown": accession = acc
326
+ if iso != "unknown": isolate = iso
327
+ # check doi first
328
+ if doi != "unknown":
329
+ link = 'https://doi.org/' + doi
330
+ # get the file to create listOfFile for each id
331
+ print("link of doi: ", link)
332
+ html = extractHTML.HTML("",link)
333
+ jsonSM = html.getSupMaterial()
334
+ article_text = html.getListSection()
335
+ if article_text:
336
+ if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower():
337
+ links.append(link)
338
+ if jsonSM:
339
+ links += sum((jsonSM[key] for key in jsonSM),[])
340
+ # no doi then google custom search api
341
+ if doi=="unknown" or len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower():
342
+ # might find the article
343
+ print("no article text, start tem link")
344
+ #tem_links = mtdna_classifier.search_google_custom(title, 2)
345
+ tem_links = smart_fallback.smart_google_search(meta_expand)
346
+ print("tem links: ", tem_links)
347
+ tem_link_acc = smart_fallback.google_accession_search(acc)
348
+ tem_links += tem_link_acc
349
+ tem_links = unique_preserve_order(tem_links)
350
+ print("tem link before filtering: ", tem_links)
351
+ # filter the quality link
352
+ print("saveLinkFolder as sample folder id: ", sample_folder_id)
353
+ links = smart_fallback.filter_links_by_metadata(tem_links, saveLinkFolder=sample_folder_id, accession=acc)
354
+ print("this is links: ",links)
355
+ links = unique_preserve_order(links)
356
+ acc_score["source"] = links
357
+ # chunk_path = "/"+saveTitle+"_merged_document.docx"
358
+ # all_path = "/"+saveTitle+"_all_merged_document.docx"
359
+ # # if chunk and all output not exist yet
360
+ # file_chunk_path = saveLinkFolder + chunk_path
361
+ # file_all_path = saveLinkFolder + all_path
362
+ # if os.path.exists(file_chunk_path):
363
+ # print("File chunk exists!")
364
+ # if not chunk:
365
+ # text, table, document_title = model.read_docx_text(file_chunk_path)
366
+ # chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table))
367
+ # if os.path.exists(file_all_path):
368
+ # print("File all output exists!")
369
+ # if not all_output:
370
+ # text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
371
+ # all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
372
+ if chunk_exists:
373
+ print("File chunk exists!")
374
+ if not chunk:
375
+ print("start to get chunk")
376
+ text, table, document_title = model.read_docx_text(file_chunk_path)
377
+ chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table))
378
+ if all_exists:
379
+ print("File all output exists!")
380
+ if not all_output:
381
+ text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
382
+ all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
383
+ if not chunk and not all_output:
384
+ print("not chunk and all output")
385
+ # else: check if we can reuse these chunk and all output of existed accession to find another
386
+ if links:
387
+ for link in links:
388
+ print(link)
389
+ # if len(all_output) > 1000*1000:
390
+ # all_output = data_preprocess.normalize_for_overlap(all_output)
391
+ # print("after normalizing all output: ", len(all_output))
392
+ if len(data_preprocess.normalize_for_overlap(all_output)) > 600000:
393
+ print("break here")
394
+ break
395
+ if iso != "unknown": query_kw = iso
396
+ else: query_kw = acc
397
+ #text_link, tables_link, final_input_link = data_preprocess.preprocess_document(link,saveLinkFolder, isolate=query_kw)
398
+ success_process, output_process = run_with_timeout(data_preprocess.preprocess_document,args=(link,sample_folder_id),kwargs={"isolate":query_kw,"accession":acc},timeout=180)
399
+ if success_process:
400
+ text_link, tables_link, final_input_link = output_process[0], output_process[1], output_process[2]
401
+ print("yes succeed for process document")
402
+ else: text_link, tables_link, final_input_link = "", "", ""
403
+ context = data_preprocess.extract_context(final_input_link, query_kw)
404
+ if context != "Sample ID not found.":
405
+ if len(data_preprocess.normalize_for_overlap(chunk)) < 1000*1000:
406
+ success_chunk, the_output_chunk = run_with_timeout(data_preprocess.merge_texts_skipping_overlap,args=(chunk, context))
407
+ if success_chunk:
408
+ chunk = the_output_chunk#data_preprocess.merge_texts_skipping_overlap(all_output, final_input_link)
409
+ print("yes succeed for chunk")
410
+ else:
411
+ chunk += context
412
+ print("len context: ", len(context))
413
+ print("basic fall back")
414
+ print("len chunk after: ", len(chunk))
415
+ if len(final_input_link) > 1000*1000:
416
+ if context != "Sample ID not found.":
417
+ final_input_link = context
418
+ else:
419
+ final_input_link = data_preprocess.normalize_for_overlap(final_input_link)
420
+ if len(final_input_link) > 1000 *1000:
421
+ final_input_link = final_input_link[:100000]
422
+ if len(data_preprocess.normalize_for_overlap(all_output)) < 1000*1000:
423
+ success, the_output = run_with_timeout(data_preprocess.merge_texts_skipping_overlap,args=(all_output, final_input_link))
424
+ if success:
425
+ all_output = the_output#data_preprocess.merge_texts_skipping_overlap(all_output, final_input_link)
426
+ print("yes succeed")
427
+ else:
428
+ all_output += final_input_link
429
+ print("len final input: ", len(final_input_link))
430
+ print("basic fall back")
431
+ print("len all output after: ", len(all_output))
432
+ #country_pro, chunk, all_output = data_preprocess.process_inputToken(links, saveLinkFolder, accession=accession, isolate=isolate)
433
+
434
+ else:
435
+ chunk = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
436
+ all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
437
+ if not chunk: chunk = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
438
+ if not all_output: all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
439
+ if len(all_output) > 1*1024*1024:
440
+ all_output = data_preprocess.normalize_for_overlap(all_output)
441
+ if len(all_output) > 1*1024*1024:
442
+ all_output = all_output[:1*1024*1024]
443
+ print("chunk len: ", len(chunk))
444
+ print("all output len: ", len(all_output))
445
+ data_preprocess.save_text_to_docx(chunk, file_chunk_path)
446
+ data_preprocess.save_text_to_docx(all_output, file_all_path)
447
+ # Later when saving new files
448
+ # data_preprocess.save_text_to_docx(chunk, chunk_filename, sample_folder_id)
449
+ # data_preprocess.save_text_to_docx(all_output, all_filename, sample_folder_id)
450
+
451
+ # Upload to Drive
452
+ result_chunk_upload = upload_file_to_drive(file_chunk_path, chunk_filename, sample_folder_id)
453
+ result_all_upload = upload_file_to_drive(file_all_path, all_filename, sample_folder_id)
454
+ print("UPLOAD RESULT FOR CHUNK: ", result_chunk_upload)
455
+ print(f"πŸ”— Uploaded file: https://drive.google.com/file/d/{result_chunk_upload}/view")
456
+ print("here 1")
457
+
458
+ # else:
459
+ # final_input = ""
460
+ # if all_output:
461
+ # final_input = all_output
462
+ # else:
463
+ # if chunk: final_input = chunk
464
+ # #data_preprocess.merge_texts_skipping_overlap(final_input, all_output)
465
+ # if final_input:
466
+ # keywords = []
467
+ # if iso != "unknown": keywords.append(iso)
468
+ # if acc != "unknown": keywords.append(acc)
469
+ # for keyword in keywords:
470
+ # chunkBFS = data_preprocess.get_contextual_sentences_BFS(final_input, keyword)
471
+ # countryDFS, chunkDFS = data_preprocess.get_contextual_sentences_DFS(final_input, keyword)
472
+ # chunk = data_preprocess.merge_texts_skipping_overlap(chunk, chunkDFS)
473
+ # chunk = data_preprocess.merge_texts_skipping_overlap(chunk, chunkBFS)
474
+
475
+ # Define paths for cached RAG assets
476
+ # faiss_index_path = saveLinkFolder+"/faiss_index.bin"
477
+ # document_chunks_path = saveLinkFolder+"/document_chunks.json"
478
+ # structured_lookup_path = saveLinkFolder+"/structured_lookup.json"
479
+ print("here 2")
480
+ faiss_filename = "faiss_index.bin"
481
+ chunks_filename = "document_chunks.json"
482
+ lookup_filename = "structured_lookup.json"
483
+ print("name of faiss: ", faiss_filename)
484
+
485
+ faiss_index_path = os.path.join(LOCAL_TEMP_DIR, faiss_filename)
486
+ document_chunks_path = os.path.join(LOCAL_TEMP_DIR, chunks_filename)
487
+ structured_lookup_path = os.path.join(LOCAL_TEMP_DIR, lookup_filename)
488
+ print("name if faiss path: ", faiss_index_path)
489
+ # 🔥 Remove the local file first if it exists
490
+ faiss_id = find_drive_file(faiss_filename, sample_folder_id)
491
+ document_id = find_drive_file(chunks_filename, sample_folder_id)
492
+ structure_id = find_drive_file(lookup_filename, sample_folder_id)
493
+ if faiss_id and document_id and structure_id:
494
+ print("βœ… 3 Files already exist in Google Drive. Downloading them...")
495
+ download_file_from_drive(faiss_filename, sample_folder_id, faiss_index_path)
496
+ download_file_from_drive(chunks_filename, sample_folder_id, document_chunks_path)
497
+ download_file_from_drive(lookup_filename, sample_folder_id, structured_lookup_path)
498
+ # Read and parse these into `chunk` and `all_output`
499
+ else:
500
+ if os.path.exists(faiss_index_path):
501
+ os.remove(faiss_index_path)
502
+ if os.path.exists(document_chunks_path):
503
+ os.remove(document_chunks_path)
504
+ if os.path.exists(structured_lookup_path):
505
+ os.remove(structured_lookup_path)
506
+ download_file_from_drive(faiss_filename, sample_folder_id, faiss_index_path)
507
+ download_file_from_drive(chunks_filename, sample_folder_id, document_chunks_path)
508
+ download_file_from_drive(lookup_filename, sample_folder_id, structured_lookup_path)
+
+ print("move to load rag")
+ master_structured_lookup, faiss_index, document_chunks = model.load_rag_assets(
+ faiss_index_path, document_chunks_path, structured_lookup_path
+ )
+
+ global_llm_model_for_counting_tokens = genai.GenerativeModel('gemini-1.5-flash-latest')
+ if not all_output:
+ if chunk: all_output = chunk
+ else: all_output = "Collection_date: " + col_date + ". Isolate: " + iso + ". Title: " + title + ". Features: " + features
+ if faiss_index is None:
+ print("\nBuilding RAG assets (structured lookup, FAISS index, chunks)...")
+ total_doc_embedding_tokens = global_llm_model_for_counting_tokens.count_tokens(
+ all_output
+ ).total_tokens
+
+ initial_embedding_cost = (total_doc_embedding_tokens / 1000) * PRICE_PER_1K_EMBEDDING_INPUT
+ total_cost_title += initial_embedding_cost
+ print(f"Initial one-time embedding cost for '{file_all_path}' ({total_doc_embedding_tokens} tokens): ${initial_embedding_cost:.6f}")
+
+
+ master_structured_lookup, faiss_index, document_chunks, plain_text_content = model.build_vector_index_and_data(
+ file_all_path, faiss_index_path, document_chunks_path, structured_lookup_path
+ )
+ else:
+ print("\nRAG assets loaded from file. No re-embedding of entire document will occur.")
+ plain_text_content_all, table_strings_all, document_title_all = model.read_docx_text(file_all_path)
+ master_structured_lookup['document_title'] = master_structured_lookup.get('document_title', document_title_all)
+
+ primary_word = iso
+ alternative_word = acc
+ print(f"\n--- General Query: Primary='{primary_word}' (Alternative='{alternative_word}') ---")
+ if features.lower() not in all_output.lower():
+ all_output += ". NCBI Features: " + features
+ # country, sample_type, method_used, ethnic, spe_loc, total_query_cost = model.query_document_info(
+ # primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
+ # model.call_llm_api, chunk=chunk, all_output=all_output)
+ print("this is the chunk passed to the model")
+ print(chunk)
+ print("this is all_output passed to the model")
+ print(all_output)
+ country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost = model.query_document_info(
+ primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
+ model.call_llm_api, chunk=chunk, all_output=all_output)
+ print("country using ai: ", country)
+ print("sample type using ai: ", sample_type)
+ if len(country) == 0: country = "unknown"
+ if len(sample_type) == 0: sample_type = "unknown"
+ if country_explanation: country_explanation = "-" + country_explanation
+ else: country_explanation = ""
+ if sample_type_explanation: sample_type_explanation = "-" + sample_type_explanation
+ else: sample_type_explanation = ""
+ if method_used == "unknown": method_used = ""
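# Editor's note: a minimal sketch (not part of this commit). The post-processing of the
# LLM answer above (empty -> "unknown", "-" prefix on explanations, blank out an
# "unknown" method) is repeated verbatim in the last-resort branch below; a small helper
# with an editor-chosen name would keep the two call sites identical.
def tidy_llm_fields(country, sample_type, method_used, country_expl, sample_type_expl):
    country = country if country else "unknown"
    sample_type = sample_type if sample_type else "unknown"
    country_expl = "-" + country_expl if country_expl else ""
    sample_type_expl = "-" + sample_type_expl if sample_type_expl else ""
    method_used = "" if method_used == "unknown" else method_used
    return country, sample_type, method_used, country_expl, sample_type_expl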
+ if country.lower() != "unknown":
+ stand_country = standardize_location.smart_country_lookup(country.lower())
+ if stand_country.lower() != "not found":
+ if stand_country.lower() in acc_score["country"]:
+ if country_explanation:
+ acc_score["country"][stand_country.lower()].append(method_used + country_explanation)
+ else:
+ acc_score["country"][stand_country.lower()] = [method_used + country_explanation]
+ else:
+ if country.lower() in acc_score["country"]:
+ if country_explanation:
+ if len(method_used + country_explanation) > 0:
+ acc_score["country"][country.lower()].append(method_used + country_explanation)
+ else:
+ if len(method_used + country_explanation) > 0:
+ acc_score["country"][country.lower()] = [method_used + country_explanation]
+ # if spe_loc.lower() != "unknown":
+ # if spe_loc.lower() in acc_score["specific_location"]:
+ # acc_score["specific_location"][spe_loc.lower()].append(method_used)
+ # else:
+ # acc_score["specific_location"][spe_loc.lower()] = [method_used]
+ # if ethnic.lower() != "unknown":
+ # if ethnic.lower() in acc_score["ethnicity"]:
+ # acc_score["ethnicity"][ethnic.lower()].append(method_used)
+ # else:
+ # acc_score["ethnicity"][ethnic.lower()] = [method_used]
+ if sample_type.lower() != "unknown":
+ if sample_type.lower() in acc_score["sample_type"]:
+ if len(method_used + sample_type_explanation) > 0:
+ acc_score["sample_type"][sample_type.lower()].append(method_used + sample_type_explanation)
+ else:
+ if len(method_used + sample_type_explanation) > 0:
+ acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation]
+ # Last resort: if no country or sample type was scored, combine every available piece
+ # of information into one text and query the model once more; otherwise the fields stay unknown.
+ if len(acc_score["country"]) == 0 or len(acc_score["sample_type"]) == 0:
+ text = ""
+ for key in meta_expand:
+ text += str(key) + ": " + meta_expand[key] + "\n"
+ if len(data_preprocess.normalize_for_overlap(all_output)) > 0:
+ text += data_preprocess.normalize_for_overlap(all_output)
+ if len(data_preprocess.normalize_for_overlap(chunk)) > 0:
+ text += data_preprocess.normalize_for_overlap(chunk)
+ text += ". NCBI Features: " + features
+ print("this is the text for the last-resort model")
+ print(text)
+ country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost = model.query_document_info(
+ primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
+ model.call_llm_api, chunk=text, all_output=text)
+ print("these are the last-resort results: ")
+ print("country: ", country)
+ print("sample type: ", sample_type)
+ if len(country) == 0: country = "unknown"
+ if len(sample_type) == 0: sample_type = "unknown"
+ if country_explanation: country_explanation = "-" + country_explanation
+ else: country_explanation = ""
+ if sample_type_explanation: sample_type_explanation = "-" + sample_type_explanation
+ else: sample_type_explanation = ""
+ if method_used == "unknown": method_used = ""
+ if country.lower() != "unknown":
+ stand_country = standardize_location.smart_country_lookup(country.lower())
+ if stand_country.lower() != "not found":
+ if stand_country.lower() in acc_score["country"]:
+ if country_explanation:
+ acc_score["country"][stand_country.lower()].append(method_used + country_explanation)
+ else:
+ acc_score["country"][stand_country.lower()] = [method_used + country_explanation]
+ else:
+ if country.lower() in acc_score["country"]:
+ if country_explanation:
+ if len(method_used + country_explanation) > 0:
+ acc_score["country"][country.lower()].append(method_used + country_explanation)
+ else:
+ if len(method_used + country_explanation) > 0:
+ acc_score["country"][country.lower()] = [method_used + country_explanation]
+ if sample_type.lower() != "unknown":
+ if sample_type.lower() in acc_score["sample_type"]:
+ if len(method_used + sample_type_explanation) > 0:
+ acc_score["sample_type"][sample_type.lower()].append(method_used + sample_type_explanation)
+ else:
+ if len(method_used + sample_type_explanation) > 0:
+ acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation]
+ end = time.time()
+ total_cost_title += total_query_cost
+ acc_score["query_cost"] = f"{total_cost_title:.6f}"
+ elapsed = end - start
+ acc_score["time_cost"] = f"{elapsed:.3f} seconds"
+ accs_output[acc] = acc_score
+ print(accs_output[acc])
+
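# Editor's note: illustrative example (not part of this commit) of what one accs_output
# entry looks like after the bookkeeping above. Only the keys shown are taken from this
# function; acc_score may carry additional keys initialized earlier in the pipeline, and
# the accession, values, and evidence strings below are made up.
# accs_output["ACC0001"] = {
#     "country":     {"thailand": ["RAG-LLM - matched the isolate to a Thai cohort"]},
#     "sample_type": {"modern human": ["RAG-LLM - sampling described in the methods"]},
#     "query_cost":  "0.004213",
#     "time_cost":   "12.345 seconds",
# }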
  return accs_output