Update pipeline.py

pipeline.py  (CHANGED: +650 -648)

@@ -1,649 +1,651 @@
[Deleted side of the diff: the previous 649-line version of pipeline.py. Its imports, Drive helpers, and timing utilities (old lines 1-232) match the new version below verbatim; the remainder of the old file did not survive extraction from the diff view.]
# test1: MJ17 direct
# test2: "A1YU101" thailand cross-ref
# test3: "EBK109" thailand cross-ref
# test4: "OQ731952"/"BST115" for search query title: "South Asian maternal and paternal lineages in southern Thailand and"
import data_preprocess
import model
import mtdna_classifier
#import app
import smart_fallback
import pandas as pd
from pathlib import Path
import subprocess
from NER.html import extractHTML
import os
import google.generativeai as genai
import re
import standardize_location
# Helper functions for this pipeline
# Track time
import time
import multiprocessing
import gspread
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload, MediaIoBaseDownload
from google.oauth2.service_account import Credentials
from oauth2client.service_account import ServiceAccountCredentials
import io
import json
# ─── Authentication setup ───
GDRIVE_PARENT_FOLDER_NAME = "mtDNA-Location-Classifier"
GDRIVE_DATA_FOLDER_NAME = os.environ["GDRIVE_DATA_FOLDER_NAME"]
GCP_CREDS_DICT = json.loads(os.environ["GCP_CREDS_JSON"])  # from HF secrets
GDRIVE_CREDS = Credentials.from_service_account_info(GCP_CREDS_DICT, scopes=["https://www.googleapis.com/auth/drive"])
drive_service = build("drive", "v3", credentials=GDRIVE_CREDS)

def get_or_create_drive_folder(name, parent_id=None):
    query = f"name='{name}' and mimeType='application/vnd.google-apps.folder'"
    if parent_id:
        query += f" and '{parent_id}' in parents"
    results = drive_service.files().list(q=query, spaces='drive', fields="files(id, name)").execute()
    items = results.get("files", [])
    if items:
        return items[0]["id"]
    file_metadata = {
        "name": name,
        "mimeType": "application/vnd.google-apps.folder"
    }
    if parent_id:
        file_metadata["parents"] = [parent_id]
    file = drive_service.files().create(body=file_metadata, fields="id").execute()
    return file["id"]

def find_drive_file(filename, parent_id):
    """
    Checks if a file with the given name exists inside the specified Google Drive folder.
    Returns the file ID if found, else None.
    """
    query = f"'{parent_id}' in parents and name = '{filename}' and trashed = false"
    results = drive_service.files().list(q=query, spaces='drive', fields='files(id, name)', pageSize=1).execute()
    files = results.get('files', [])
    if files:
        return files[0]["id"]
    return None
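# Usage sketch for the two helpers above (illustrative; the folder and file
# names here are hypothetical, not part of the pipeline):
#   folder_id = get_or_create_drive_folder("my_samples", parent_id=GDRIVE_DATA_FOLDER_NAME)
#   file_id = find_drive_file("report.docx", folder_id)   # None when absent
# Note that get_or_create_drive_folder matches by name only when parent_id is
# omitted, so a same-named folder anywhere in the Drive can be picked up.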

# def upload_file_to_drive(local_path, remote_name, folder_id):
#     file_metadata = {"name": remote_name, "parents": [folder_id]}
#     media = MediaFileUpload(local_path, resumable=True)
#     existing = drive_service.files().list(q=f"name='{remote_name}' and '{folder_id}' in parents", fields="files(id)").execute().get("files", [])
#     if existing:
#         drive_service.files().delete(fileId=existing[0]["id"]).execute()
#     file = drive_service.files().create(body=file_metadata, media_body=media, fields="id").execute()
#     result = drive_service.files().list(q=f"name='{remote_name}' and '{folder_id}' in parents", fields="files(id)").execute()
#     if not result.get("files"):
#         print(f"❌ Upload failed: File '{remote_name}' not found in folder after upload.")
#     else:
#         print(f"✅ Verified upload: {remote_name}")
#     return file["id"]
def upload_file_to_drive(local_path, remote_name, folder_id):
    try:
        if not os.path.exists(local_path):
            raise FileNotFoundError(f"❌ Local file does not exist: {local_path}")

        # Delete existing file on Drive if present
        existing = drive_service.files().list(
            q=f"name='{remote_name}' and '{folder_id}' in parents and trashed = false",
            fields="files(id)"
        ).execute().get("files", [])

        if existing:
            drive_service.files().delete(fileId=existing[0]["id"]).execute()
            print(f"🗑️ Deleted existing '{remote_name}' in Drive folder {folder_id}")

        file_metadata = {"name": remote_name, "parents": [folder_id]}
        media = MediaFileUpload(local_path, resumable=True)
        file = drive_service.files().create(
            body=file_metadata,
            media_body=media,
            fields="id"
        ).execute()

        print(f"✅ Uploaded '{remote_name}' to Google Drive folder ID: {folder_id}")
        return file["id"]

    except Exception as e:
        print(f"❌ Error during upload: {e}")
        return None
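# Minimal usage sketch (illustrative path; assumes the Drive globals above are
# configured):
#   new_id = upload_file_to_drive("/tmp/report.docx", "report.docx", sample_folder_id)
#   new_id is None when the upload failed (the error is already printed).
# Because the helper deletes any same-named file before creating a new one, the
# returned file ID changes on every re-upload; callers should not cache it.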


def download_file_from_drive(remote_name, folder_id, local_path):
    results = drive_service.files().list(q=f"name='{remote_name}' and '{folder_id}' in parents", fields="files(id)").execute()
    files = results.get("files", [])
    if not files:
        return False
    file_id = files[0]["id"]
    request = drive_service.files().get_media(fileId=file_id)
    fh = io.FileIO(local_path, 'wb')
    downloader = MediaIoBaseDownload(fh, request)
    done = False
    while not done:
        _, done = downloader.next_chunk()
    return True
def download_drive_file_content(file_id):
    request = drive_service.files().get_media(fileId=file_id)
    fh = io.BytesIO()
    downloader = MediaIoBaseDownload(fh, request)
    done = False
    while not done:
        _, done = downloader.next_chunk()
    fh.seek(0)
    return fh.read().decode("utf-8")
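# Note: download_file_from_drive doubles as an existence probe later in the
# pipeline (its boolean return feeds chunk_exists / all_exists), so a False
# here means "not cached on Drive yet" rather than a hard error.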

# def run_with_timeout(func, args=(), kwargs={}, timeout=20):
#     """
#     Runs `func` with timeout in seconds. Kills if it exceeds.
#     Returns: (success, result or None)
#     """
#     def wrapper(q, *args, **kwargs):
#         try:
#             q.put(func(*args, **kwargs))
#         except Exception as e:
#             q.put(e)
#
#     q = multiprocessing.Queue()
#     p = multiprocessing.Process(target=wrapper, args=(q, *args), kwargs=kwargs)
#     p.start()
#     p.join(timeout)
#
#     if p.is_alive():
#         p.terminate()
#         p.join()
#         print(f"⏱️ Timeout exceeded ({timeout} sec) - function killed.")
#         return False, None
#     else:
#         result = q.get()
#         if isinstance(result, Exception):
#             raise result
#         return True, result
def run_with_timeout(func, args=(), kwargs={}, timeout=30):
    import concurrent.futures
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
        future = executor.submit(func, *args, **kwargs)
        try:
            return True, future.result(timeout=timeout)
        except concurrent.futures.TimeoutError:
            print(f"⏱️ Timeout exceeded ({timeout} sec) - function killed.")
            return False, None
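# Caveat on the thread-based version above: a ThreadPoolExecutor cannot kill a
# running thread, and leaving the `with` block calls shutdown(wait=True), so on
# timeout the call still blocks until `func` finishes; it only *reports* early.
# Usage sketch (illustrative):
#   ok, doc = run_with_timeout(data_preprocess.preprocess_document,
#                              args=(link, folder_id),
#                              kwargs={"isolate": kw}, timeout=180)
#   ok is False and doc is None when the timeout was exceeded.
# The commented-out multiprocessing variant above can genuinely terminate the
# worker, at the cost of requiring picklable arguments and results.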

def time_it(func, *args, **kwargs):
    """
    Measure how long a function takes to run and return its result + time.
    """
    start = time.time()
    result = func(*args, **kwargs)
    end = time.time()
    elapsed = end - start
    print(f"⏱️ '{func.__name__}' took {elapsed:.3f} seconds")
    return result, elapsed
# --- Define Pricing Constants (for Gemini 1.5 Flash & text-embedding-004) ---
def track_gemini_cost():
    # Prices are per 1,000 tokens
    PRICE_PER_1K_INPUT_LLM = 0.000075  # $0.075 per 1M tokens
    PRICE_PER_1K_OUTPUT_LLM = 0.0003  # $0.30 per 1M tokens
    PRICE_PER_1K_EMBEDDING_INPUT = 0.000025  # $0.025 per 1M tokens
    return True
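# Cost arithmetic implied by the constants above (illustrative): a call with
# 10,000 input tokens and 1,000 output tokens costs
#   (10000/1000) * 0.000075 + (1000/1000) * 0.0003 = $0.00105,
# and embedding a 50,000-token document costs (50000/1000) * 0.000025 = $0.00125.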

def unique_preserve_order(seq):
    seen = set()
    return [x for x in seq if not (x in seen or seen.add(x))]
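# e.g. unique_preserve_order(["doi.org/a", "x.com", "doi.org/a"]) returns
# ["doi.org/a", "x.com"]: set.add returns None, so the `or` keeps each first sighting.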
# Main execution
def pipeline_with_gemini(accessions):
    # output: country, sample_type, ethnic, location, money_cost, time_cost, explain
    # there can be one accession number in the accessions
    # Prices are per 1,000 tokens
    PRICE_PER_1K_INPUT_LLM = 0.000075  # $0.075 per 1M tokens
    PRICE_PER_1K_OUTPUT_LLM = 0.0003  # $0.30 per 1M tokens
    PRICE_PER_1K_EMBEDDING_INPUT = 0.000025  # $0.025 per 1M tokens
    if not accessions:
        print("no input")
        return None
    else:
        accs_output = {}
        genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
        for acc in accessions:
            print("start gemini: ", acc)
            start = time.time()
            total_cost_title = 0
            jsonSM, links, article_text = {}, [], ""
            acc_score = {"isolate": "",
                         "country": {},
                         "sample_type": {},
                         #"specific_location": {},
                         #"ethnicity": {},
                         "query_cost": total_cost_title,
                         "time_cost": None,
                         "source": links}
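            # Each candidate country/sample_type key maps to a list of evidence
            # tags: "ncbi" for values taken directly from NCBI metadata, or the
            # LLM method name plus its explanation for model-derived answers
            # (both appended further below).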
            meta = mtdna_classifier.fetch_ncbi_metadata(acc)
            country, spe_loc, ethnic, sample_type, col_date, iso, title, doi, pudID, features = meta["country"], meta["specific_location"], meta["ethnicity"], meta["sample_type"], meta["collection_date"], meta["isolate"], meta["title"], meta["doi"], meta["pubmed_id"], meta["all_features"]
            acc_score["isolate"] = iso
            print("meta: ", meta)
            meta_expand = smart_fallback.fetch_ncbi(acc)
            print("meta expand: ", meta_expand)
            # set up step: create the folder to save document
            chunk, all_output = "", ""
            if pudID:
                id = str(pudID)
                saveTitle = title
            else:
                try:
                    author_name = meta_expand["authors"].split(',')[0]  # Use last name only
                except:
                    author_name = meta_expand["authors"]
                saveTitle = title + "_" + col_date + "_" + author_name
                if title.lower() == "unknown" and col_date.lower() == "unknown" and author_name.lower() == "unknown":
                    saveTitle += "_" + acc
                id = "DirectSubmission"
            # folder_path = Path("/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id))
            # if not folder_path.exists():
            #     cmd = f'mkdir /content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/{id}'
            #     result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
            #     print("data/"+str(id) +" created.")
            # else:
            #     print("data/"+str(id) +" already exists.")
            # saveLinkFolder = "/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id)
            # parent_folder_id = get_or_create_drive_folder(GDRIVE_PARENT_FOLDER_NAME)
            # data_folder_id = get_or_create_drive_folder(GDRIVE_DATA_FOLDER_NAME, parent_id=parent_folder_id)
            # sample_folder_id = get_or_create_drive_folder(str(id), parent_id=data_folder_id)
            data_folder_id = GDRIVE_DATA_FOLDER_NAME  # Use the shared folder directly
            sample_folder_id = get_or_create_drive_folder(str(id), parent_id=data_folder_id)
            print("sample folder id: ", sample_folder_id)

            # Define document names
            if len(saveTitle) > 50:
                saveName = saveTitle[:50]
                saveName = saveName.replace(" ", "_")
                chunk_filename = f"{saveName}_merged_document.docx"
                all_filename = f"{saveName}_all_merged_document.docx"
            else:
                saveName = saveTitle.replace(" ", "_")
                chunk_filename = f"{saveName}_merged_document.docx"
                all_filename = f"{saveName}_all_merged_document.docx"
            print(chunk_filename, all_filename)
            # Define local temp paths for reading/writing
            # import tempfile
            # tmp_dir = tempfile.mkdtemp()
            LOCAL_TEMP_DIR = "/mnt/data/generated_docs"
            os.makedirs(LOCAL_TEMP_DIR, exist_ok=True)
            file_chunk_path = os.path.join(LOCAL_TEMP_DIR, chunk_filename)
            file_all_path = os.path.join(LOCAL_TEMP_DIR, all_filename)
            # file_chunk_path = os.path.join(tempfile.gettempdir(), chunk_filename)
            # file_all_path = os.path.join(tempfile.gettempdir(), all_filename)
            print(file_chunk_path)
            chunk_id = find_drive_file(chunk_filename, sample_folder_id)
            all_id = find_drive_file(all_filename, sample_folder_id)

            if chunk_id and all_id:
                print("✅ Files already exist in Google Drive. Downloading them...")
                chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path)
                all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
                print("chunk_id and all_id: ")
                print(chunk_id, all_id)
                file = drive_service.files().get(fileId="1LUJRTrq8yt4S4lLwCvTmlxaKqpr0nvEn", fields="id, name, parents, webViewLink").execute()
                print("📄 Name:", file["name"])
                print("📁 Parent folder ID:", file["parents"][0])
                print("🔗 View link:", file["webViewLink"])

                # Read and parse these into `chunk` and `all_output`
            else:
                # 🔥 Remove any stale local copies
                if os.path.exists(file_chunk_path):
                    os.remove(file_chunk_path)
                    print(f"🗑️ Removed stale: {file_chunk_path}")
                if os.path.exists(file_all_path):
                    os.remove(file_all_path)
                    print(f"🗑️ Removed stale: {file_all_path}")
                # 🔥 Remove the local file first if it exists
                # if os.path.exists(file_chunk_path):
                #     os.remove(file_chunk_path)
                #     print("remove chunk path")
                # if os.path.exists(file_all_path):
                #     os.remove(file_all_path)
                #     print("remove all path")
                # Try to download if already exists on Drive
                chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path)
                all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
            print("chunk exist: ", chunk_exists)
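            # Caching strategy: the generated .docx artifacts live in a
            # per-sample Drive folder named after the PubMed ID (or
            # "DirectSubmission"), so re-running the same accession reuses the
            # merged documents instead of re-scraping and re-merging sources.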
            # first way: ncbi method
            print("country.lower: ", country.lower())
            if country.lower() != "unknown":
                stand_country = standardize_location.smart_country_lookup(country.lower())
                print("stand_country: ", stand_country)
                if stand_country.lower() != "not found":
                    acc_score["country"][stand_country.lower()] = ["ncbi"]
                else: acc_score["country"][country.lower()] = ["ncbi"]
            # if spe_loc.lower() != "unknown":
            #     acc_score["specific_location"][spe_loc.lower()] = ["ncbi"]
            # if ethnic.lower() != "unknown":
            #     acc_score["ethnicity"][ethnic.lower()] = ["ncbi"]
            if sample_type.lower() != "unknown":
                acc_score["sample_type"][sample_type.lower()] = ["ncbi"]
            # second way: LLM model
            # Preprocess the input token
            print(acc_score)
            accession, isolate = None, None
            if acc != "unknown": accession = acc
            if iso != "unknown": isolate = iso
            # check doi first
            if doi != "unknown":
                link = 'https://doi.org/' + doi
                # get the file to create listOfFile for each id
                print("link of doi: ", link)
                html = extractHTML.HTML("", link)
                jsonSM = html.getSupMaterial()
                article_text = html.getListSection()
                if article_text:
                    if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower():
                        links.append(link)
                        if jsonSM:
                            links += sum((jsonSM[key] for key in jsonSM), [])
            # no doi then google custom search api
            if doi == "unknown" or len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower():
                # might find the article
                print("no article text, start tem link")
                #tem_links = mtdna_classifier.search_google_custom(title, 2)
                tem_links = smart_fallback.smart_google_search(meta_expand)
                print("tem links: ", tem_links)
                tem_link_acc = smart_fallback.google_accession_search(acc)
                tem_links += tem_link_acc
                tem_links = unique_preserve_order(tem_links)
                print("tem link before filtering: ", tem_links)
                # filter the quality link
                print("saveLinkFolder as sample folder id: ", sample_folder_id)
                links = smart_fallback.filter_links_by_metadata(tem_links, saveLinkFolder=sample_folder_id, accession=acc)
                print("this is links: ", links)
                links = unique_preserve_order(links)
            acc_score["source"] = links
            # chunk_path = "/"+saveTitle+"_merged_document.docx"
            # all_path = "/"+saveTitle+"_all_merged_document.docx"
            # # if chunk and all output not exist yet
            # file_chunk_path = saveLinkFolder + chunk_path
            # file_all_path = saveLinkFolder + all_path
            # if os.path.exists(file_chunk_path):
            #     print("File chunk exists!")
            #     if not chunk:
            #         text, table, document_title = model.read_docx_text(file_chunk_path)
            #         chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table))
            # if os.path.exists(file_all_path):
            #     print("File all output exists!")
            #     if not all_output:
            #         text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
            #         all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
            if chunk_exists:
                print("File chunk exists!")
                if not chunk:
                    print("start to get chunk")
                    text, table, document_title = model.read_docx_text(file_chunk_path)
                    chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table))
            if all_exists:
                print("File all output exists!")
                if not all_output:
                    text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
                    all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
            if not chunk and not all_output:
                print("not chunk and all output")
                # else: check if we can reuse these chunk and all output of existed accession to find another
                if links:
                    for link in links:
                        print(link)
                        # if len(all_output) > 1000*1000:
                        #     all_output = data_preprocess.normalize_for_overlap(all_output)
                        #     print("after normalizing all output: ", len(all_output))
                        if len(data_preprocess.normalize_for_overlap(all_output)) > 600000:
                            print("break here")
                            break
                        if iso != "unknown": query_kw = iso
                        else: query_kw = acc
                        #text_link, tables_link, final_input_link = data_preprocess.preprocess_document(link, saveLinkFolder, isolate=query_kw)
                        success_process, output_process = run_with_timeout(data_preprocess.preprocess_document, args=(link, sample_folder_id), kwargs={"isolate": query_kw, "accession": acc}, timeout=180)
                        if success_process:
                            text_link, tables_link, final_input_link = output_process[0], output_process[1], output_process[2]
                            print("yes succeed for process document")
                        else: text_link, tables_link, final_input_link = "", "", ""
                        context = data_preprocess.extract_context(final_input_link, query_kw)
                        if context != "Sample ID not found.":
                            if len(data_preprocess.normalize_for_overlap(chunk)) < 1000*1000:
                                success_chunk, the_output_chunk = run_with_timeout(data_preprocess.merge_texts_skipping_overlap, args=(chunk, context))
                                if success_chunk:
                                    chunk = the_output_chunk  #data_preprocess.merge_texts_skipping_overlap(all_output, final_input_link)
                                    print("yes succeed for chunk")
                                else:
                                    chunk += context
                                    print("len context: ", len(context))
                                    print("basic fall back")
                            print("len chunk after: ", len(chunk))
                        if len(final_input_link) > 1000*1000:
                            if context != "Sample ID not found.":
                                final_input_link = context
                            else:
                                final_input_link = data_preprocess.normalize_for_overlap(final_input_link)
                                if len(final_input_link) > 1000*1000:
                                    final_input_link = final_input_link[:100000]
                        if len(data_preprocess.normalize_for_overlap(all_output)) < 1000*1000:
                            success, the_output = run_with_timeout(data_preprocess.merge_texts_skipping_overlap, args=(all_output, final_input_link))
                            if success:
                                all_output = the_output  #data_preprocess.merge_texts_skipping_overlap(all_output, final_input_link)
                                print("yes succeed")
                            else:
                                all_output += final_input_link
                                print("len final input: ", len(final_input_link))
                                print("basic fall back")
                        print("len all output after: ", len(all_output))
                    #country_pro, chunk, all_output = data_preprocess.process_inputToken(links, saveLinkFolder, accession=accession, isolate=isolate)

                else:
                    chunk = "Collection_date: " + col_date + ". Isolate: " + iso + ". Title: " + title + ". Features: " + features
                    all_output = "Collection_date: " + col_date + ". Isolate: " + iso + ". Title: " + title + ". Features: " + features
                if not chunk: chunk = "Collection_date: " + col_date + ". Isolate: " + iso + ". Title: " + title + ". Features: " + features
                if not all_output: all_output = "Collection_date: " + col_date + ". Isolate: " + iso + ". Title: " + title + ". Features: " + features
                if len(all_output) > 1*1024*1024:
                    all_output = data_preprocess.normalize_for_overlap(all_output)
                    if len(all_output) > 1*1024*1024:
                        all_output = all_output[:1*1024*1024]
                print("chunk len: ", len(chunk))
                print("all output len: ", len(all_output))
                data_preprocess.save_text_to_docx(chunk, file_chunk_path)
                data_preprocess.save_text_to_docx(all_output, file_all_path)
                # Later when saving new files
                # data_preprocess.save_text_to_docx(chunk, chunk_filename, sample_folder_id)
                # data_preprocess.save_text_to_docx(all_output, all_filename, sample_folder_id)

                # Upload to Drive
                result_chunk_upload = upload_file_to_drive(file_chunk_path, chunk_filename, sample_folder_id)
                result_all_upload = upload_file_to_drive(file_all_path, all_filename, sample_folder_id)
                print("UPLOAD RESULT FOR CHUNK: ", result_chunk_upload)
                print(f"🔗 Uploaded file: https://drive.google.com/file/d/{result_chunk_upload}/view")
                print("here 1")
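            # The repeated 1000*1000-character guards above cap how much text is
            # accumulated before merging, keeping the merged documents (and the
            # later embedding/LLM calls) to roughly a megabyte of input each.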

            # else:
            #     final_input = ""
            #     if all_output:
            #         final_input = all_output
            #     else:
            #         if chunk: final_input = chunk
            #     #data_preprocess.merge_texts_skipping_overlap(final_input, all_output)
            #     if final_input:
            #         keywords = []
            #         if iso != "unknown": keywords.append(iso)
            #         if acc != "unknown": keywords.append(acc)
            #         for keyword in keywords:
            #             chunkBFS = data_preprocess.get_contextual_sentences_BFS(final_input, keyword)
            #             countryDFS, chunkDFS = data_preprocess.get_contextual_sentences_DFS(final_input, keyword)
            #             chunk = data_preprocess.merge_texts_skipping_overlap(chunk, chunkDFS)
            #             chunk = data_preprocess.merge_texts_skipping_overlap(chunk, chunkBFS)

            # Define paths for cached RAG assets
            # faiss_index_path = saveLinkFolder+"/faiss_index.bin"
            # document_chunks_path = saveLinkFolder+"/document_chunks.json"
            # structured_lookup_path = saveLinkFolder+"/structured_lookup.json"
            print("here 2")
            faiss_filename = "faiss_index.bin"
            chunks_filename = "document_chunks.json"
            lookup_filename = "structured_lookup.json"
            print("name of faiss: ", faiss_filename)

            faiss_index_path = os.path.join(LOCAL_TEMP_DIR, faiss_filename)
            document_chunks_path = os.path.join(LOCAL_TEMP_DIR, chunks_filename)
            structured_lookup_path = os.path.join(LOCAL_TEMP_DIR, lookup_filename)
            print("name of faiss path: ", faiss_index_path)
            # 🔥 Remove the local file first if it exists
            faiss_id = find_drive_file(faiss_filename, sample_folder_id)
            document_id = find_drive_file(chunks_filename, sample_folder_id)
            structure_id = find_drive_file(lookup_filename, sample_folder_id)
            if faiss_id and document_id and structure_id:
                print("✅ 3 Files already exist in Google Drive. Downloading them...")
                download_file_from_drive(faiss_filename, sample_folder_id, faiss_index_path)
                download_file_from_drive(chunks_filename, sample_folder_id, document_chunks_path)
                download_file_from_drive(lookup_filename, sample_folder_id, structured_lookup_path)
                # Read and parse these into `chunk` and `all_output`
            else:
                if os.path.exists(faiss_index_path):
                    os.remove(faiss_index_path)
                if os.path.exists(document_chunks_path):
                    os.remove(document_chunks_path)
                if os.path.exists(structured_lookup_path):
                    os.remove(structured_lookup_path)
                download_file_from_drive(faiss_filename, sample_folder_id, faiss_index_path)
                download_file_from_drive(chunks_filename, sample_folder_id, document_chunks_path)
                download_file_from_drive(lookup_filename, sample_folder_id, structured_lookup_path)

            print("move to load rag")
            master_structured_lookup, faiss_index, document_chunks = model.load_rag_assets(
                faiss_index_path, document_chunks_path, structured_lookup_path
            )
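            # load_rag_assets appears to return (lookup, faiss_index, chunks),
            # with missing assets coming back as None; the branch below uses a
            # None faiss_index to choose between a one-time re-embedding and
            # reusing the cached index.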

            global_llm_model_for_counting_tokens = genai.GenerativeModel('gemini-1.5-flash-latest')
            if not all_output:
                if chunk: all_output = chunk
                else: all_output = "Collection_date: " + col_date + ". Isolate: " + iso + ". Title: " + title + ". Features: " + features
            if faiss_index is None:
                print("\nBuilding RAG assets (structured lookup, FAISS index, chunks)...")
                total_doc_embedding_tokens = global_llm_model_for_counting_tokens.count_tokens(
                    all_output
                ).total_tokens

                initial_embedding_cost = (total_doc_embedding_tokens / 1000) * PRICE_PER_1K_EMBEDDING_INPUT
                total_cost_title += initial_embedding_cost
                print(f"Initial one-time embedding cost for '{file_all_path}' ({total_doc_embedding_tokens} tokens): ${initial_embedding_cost:.6f}")

                master_structured_lookup, faiss_index, document_chunks, plain_text_content = model.build_vector_index_and_data(
                    file_all_path, faiss_index_path, document_chunks_path, structured_lookup_path
                )
            else:
                print("\nRAG assets loaded from file. No re-embedding of entire document will occur.")
                plain_text_content_all, table_strings_all, document_title_all = model.read_docx_text(file_all_path)
                master_structured_lookup['document_title'] = master_structured_lookup.get('document_title', document_title_all)

            primary_word = iso
            alternative_word = acc
            print(f"\n--- General Query: Primary='{primary_word}' (Alternative='{alternative_word}') ---")
            if features.lower() not in all_output.lower():
                all_output += ". NCBI Features: " + features
            # country, sample_type, method_used, ethnic, spe_loc, total_query_cost = model.query_document_info(
            #     primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
            #     model.call_llm_api, chunk=chunk, all_output=all_output)
            print("this is chunk for the model")
            print(chunk)
            print("this is all output for the model")
            print(all_output)
            country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost = model.query_document_info(
                primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
                model.call_llm_api, chunk=chunk, all_output=all_output)
            print("country using ai: ", country)
            print("sample type using ai: ", sample_type)
            if len(country) == 0: country = "unknown"
            if len(sample_type) == 0: sample_type = "unknown"
            if country_explanation: country_explanation = "-" + country_explanation
            else: country_explanation = ""
            if sample_type_explanation: sample_type_explanation = "-" + sample_type_explanation
            else: sample_type_explanation = ""
            if method_used == "unknown": method_used = ""
            if country.lower() != "unknown":
                stand_country = standardize_location.smart_country_lookup(country.lower())
                if stand_country.lower() != "not found":
                    if stand_country.lower() in acc_score["country"]:
                        if country_explanation:
                            acc_score["country"][stand_country.lower()].append(method_used + country_explanation)
                    else:
                        acc_score["country"][stand_country.lower()] = [method_used + country_explanation]
                else:
                    if country.lower() in acc_score["country"]:
                        if country_explanation:
                            if len(method_used + country_explanation) > 0:
                                acc_score["country"][country.lower()].append(method_used + country_explanation)
                    else:
                        if len(method_used + country_explanation) > 0:
                            acc_score["country"][country.lower()] = [method_used + country_explanation]
            # if spe_loc.lower() != "unknown":
            #     if spe_loc.lower() in acc_score["specific_location"]:
            #         acc_score["specific_location"][spe_loc.lower()].append(method_used)
            #     else:
            #         acc_score["specific_location"][spe_loc.lower()] = [method_used]
            # if ethnic.lower() != "unknown":
            #     if ethnic.lower() in acc_score["ethnicity"]:
            #         acc_score["ethnicity"][ethnic.lower()].append(method_used)
            #     else:
            #         acc_score["ethnicity"][ethnic.lower()] = [method_used]
            if sample_type.lower() != "unknown":
                if sample_type.lower() in acc_score["sample_type"]:
                    if len(method_used + sample_type_explanation) > 0:
                        acc_score["sample_type"][sample_type.lower()].append(method_used + sample_type_explanation)
                else:
                    if len(method_used + sample_type_explanation) > 0:
                        acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation]
            # last resort: combine all information to give all output otherwise unknown
            if len(acc_score["country"]) == 0 or len(acc_score["sample_type"]) == 0:
                text = ""
                for key in meta_expand:
                    text += str(key) + ": " + meta_expand[key] + "\n"
                if len(data_preprocess.normalize_for_overlap(all_output)) > 0:
                    text += data_preprocess.normalize_for_overlap(all_output)
                if len(data_preprocess.normalize_for_overlap(chunk)) > 0:
                    text += data_preprocess.normalize_for_overlap(chunk)
                text += ". NCBI Features: " + features
                print("this is text for the last resort model")
                print(text)
                country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost = model.query_document_info(
                    primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
                    model.call_llm_api, chunk=text, all_output=text)
                print("this is last resort results: ")
                print("country: ", country)
                print("sample type: ", sample_type)
                if len(country) == 0: country = "unknown"
                if len(sample_type) == 0: sample_type = "unknown"
                if country_explanation: country_explanation = "-" + country_explanation
                else: country_explanation = ""
                if sample_type_explanation: sample_type_explanation = "-" + sample_type_explanation
                else: sample_type_explanation = ""
                if method_used == "unknown": method_used = ""
                if country.lower() != "unknown":
                    stand_country = standardize_location.smart_country_lookup(country.lower())
                    if stand_country.lower() != "not found":
                        if stand_country.lower() in acc_score["country"]:
                            if country_explanation:
                                acc_score["country"][stand_country.lower()].append(method_used + country_explanation)
                        else:
                            acc_score["country"][stand_country.lower()] = [method_used + country_explanation]
                    else:
                        if country.lower() in acc_score["country"]:
                            if country_explanation:
                                if len(method_used + country_explanation) > 0:
                                    acc_score["country"][country.lower()].append(method_used + country_explanation)
                        else:
                            if len(method_used + country_explanation) > 0:
                                acc_score["country"][country.lower()] = [method_used + country_explanation]
                if sample_type.lower() != "unknown":
                    if sample_type.lower() in acc_score["sample_type"]:
                        if len(method_used + sample_type_explanation) > 0:
                            acc_score["sample_type"][sample_type.lower()].append(method_used + sample_type_explanation)
                    else:
                        if len(method_used + sample_type_explanation) > 0:
                            acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation]
            end = time.time()
            total_cost_title += total_query_cost
            acc_score["query_cost"] = f"{total_cost_title:.6f}"
            elapsed = end - start
            acc_score["time_cost"] = f"{elapsed:.3f} seconds"
            accs_output[acc] = acc_score
            print(accs_output[acc])

    return accs_output
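# Minimal usage sketch (hypothetical invocation; the accession is taken from
# the test notes at the top of this file):
#   results = pipeline_with_gemini(["OQ731952"])
#   results["OQ731952"]["country"]     -> {candidate: [evidence tags, ...]}
#   results["OQ731952"]["query_cost"]  -> formatted USD total for the run
#   results["OQ731952"]["time_cost"]   -> elapsed wall-clock time as a string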