VyLala committed
Commit ccbc9f4 · verified · 1 Parent(s): 997d78d

Update pipeline.py

Files changed (1)
  1. pipeline.py +345 -346
pipeline.py CHANGED
@@ -1,347 +1,346 @@
- # test1: MJ17 direct
- # test2: "A1YU101" thailand cross-ref
- # test3: "EBK109" thailand cross-ref
- # test4: "OQ731952"/"BST115" for search query title: "South Asian maternal and paternal lineages in southern Thailand and"
- from iterate3 import data_preprocess, model
- import mtdna_classifier
- import app
- import pandas as pd
- from pathlib import Path
- import subprocess
- from NER.html import extractHTML
- import os
- import google.generativeai as genai
- import re
- import standardize_location
- # Helper functions in for this pipeline
- # Track time
- import time
- import multiprocessing
-
- def run_with_timeout(func, args=(), kwargs={}, timeout=20):
- """
- Runs `func` with timeout in seconds. Kills if it exceeds.
- Returns: (success, result or None)
- """
- def wrapper(q, *args, **kwargs):
- try:
- q.put(func(*args, **kwargs))
- except Exception as e:
- q.put(e)
-
- q = multiprocessing.Queue()
- p = multiprocessing.Process(target=wrapper, args=(q, *args), kwargs=kwargs)
- p.start()
- p.join(timeout)
-
- if p.is_alive():
- p.terminate()
- p.join()
- print(f"⏱️ Timeout exceeded ({timeout} sec) — function killed.")
- return False, None
- else:
- result = q.get()
- if isinstance(result, Exception):
- raise result
- return True, result
-
- def time_it(func, *args, **kwargs):
- """
- Measure how long a function takes to run and return its result + time.
- """
- start = time.time()
- result = func(*args, **kwargs)
- end = time.time()
- elapsed = end - start
- print(f"⏱️ '{func.__name__}' took {elapsed:.3f} seconds")
- return result, elapsed
- # --- Define Pricing Constants (for Gemini 1.5 Flash & text-embedding-004) ---
- def track_gemini_cost():
- # Prices are per 1,000 tokens
- PRICE_PER_1K_INPUT_LLM = 0.000075 # $0.075 per 1M tokens
- PRICE_PER_1K_OUTPUT_LLM = 0.0003 # $0.30 per 1M tokens
- PRICE_PER_1K_EMBEDDING_INPUT = 0.000025 # $0.025 per 1M tokens
- return True
-
- def unique_preserve_order(seq):
- seen = set()
- return [x for x in seq if not (x in seen or seen.add(x))]
- # Main execution
- def pipeline_with_gemini(accessions):
- # output: country, sample_type, ethnic, location, money_cost, time_cost, explain
- # there can be one accession number in the accessions
- # Prices are per 1,000 tokens
- PRICE_PER_1K_INPUT_LLM = 0.000075 # $0.075 per 1M tokens
- PRICE_PER_1K_OUTPUT_LLM = 0.0003 # $0.30 per 1M tokens
- PRICE_PER_1K_EMBEDDING_INPUT = 0.000025 # $0.025 per 1M tokens
- if not accessions:
- print("no input")
- return None
- else:
- accs_output = {}
- os.environ["GOOGLE_API_KEY"] = "AIzaSyDi0CNKBgEtnr6YuPaY6YNEuC5wT0cdKhk"
- genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
- for acc in accessions:
- start = time.time()
- total_cost_title = 0
- jsonSM, links, article_text = {},[], ""
- acc_score = { "isolate": "",
- "country":{},
- "sample_type":{},
- #"specific_location":{},
- #"ethnicity":{},
- "query_cost":total_cost_title,
- "time_cost":None,
- "source":links}
- meta = mtdna_classifier.fetch_ncbi_metadata(acc)
- country, spe_loc, ethnic, sample_type, col_date, iso, title, doi, pudID, features = meta["country"], meta["specific_location"], meta["ethnicity"], meta["sample_type"], meta["collection_date"], meta["isolate"], meta["title"], meta["doi"], meta["pubmed_id"], meta["all_features"]
- acc_score["isolate"] = iso
- # set up step: create the folder to save document
- chunk, all_output = "",""
- if pudID:
- id = pudID
- saveTitle = title
- else:
- saveTitle = title + "_" + col_date
- id = "DirectSubmission"
- folder_path = Path("/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id))
- if not folder_path.exists():
- cmd = f'mkdir /content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/{id}'
- result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
- print("data/"+str(id) +" created.")
- else:
- print("data/"+str(id) +" already exists.")
- saveLinkFolder = "/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id)
- # first way: ncbi method
- if country.lower() != "unknown":
- stand_country = standardize_location.smart_country_lookup(country.lower())
- if stand_country.lower() != "not found":
- acc_score["country"][stand_country.lower()] = ["ncbi"]
- else: acc_score["country"][country.lower()] = ["ncbi"]
- # if spe_loc.lower() != "unknown":
- # acc_score["specific_location"][spe_loc.lower()] = ["ncbi"]
- # if ethnic.lower() != "unknown":
- # acc_score["ethnicity"][ethnic.lower()] = ["ncbi"]
- if sample_type.lower() != "unknown":
- acc_score["sample_type"][sample_type.lower()] = ["ncbi"]
- # second way: LLM model
- # Preprocess the input token
- accession, isolate = None, None
- if acc != "unknown": accession = acc
- if iso != "unknown": isolate = iso
- # check doi first
- if doi != "unknown":
- link = 'https://doi.org/' + doi
- # get the file to create listOfFile for each id
- html = extractHTML.HTML("",link)
- jsonSM = html.getSupMaterial()
- article_text = html.getListSection()
- if article_text:
- if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower():
- links.append(link)
- if jsonSM:
- links += sum((jsonSM[key] for key in jsonSM),[])
- # no doi then google custom search api
- if len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower():
- # might find the article
- tem_links = mtdna_classifier.search_google_custom(title, 2)
- # get supplementary of that article
- for link in tem_links:
- html = extractHTML.HTML("",link)
- jsonSM = html.getSupMaterial()
- article_text_tem = html.getListSection()
- if article_text_tem:
- if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text_tem.lower() or "403 Forbidden Request".lower() not in article_text_tem.lower():
- links.append(link)
- if jsonSM:
- links += sum((jsonSM[key] for key in jsonSM),[])
- print(links)
- links = unique_preserve_order(links)
- acc_score["source"] = links
- chunk_path = "/"+saveTitle+"_merged_document.docx"
- all_path = "/"+saveTitle+"_all_merged_document.docx"
- # if chunk and all output not exist yet
- file_chunk_path = saveLinkFolder + chunk_path
- file_all_path = saveLinkFolder + all_path
- if os.path.exists(file_chunk_path):
- print("File chunk exists!")
- if not chunk:
- text, table, document_title = model.read_docx_text(file_chunk_path)
- chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table))
- if os.path.exists(file_all_path):
- print("File all output exists!")
- if not all_output:
- text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
- all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
- if not chunk and not all_output:
- # else: check if we can reuse these chunk and all output of existed accession to find another
- if links:
- for link in links:
- print(link)
- # if len(all_output) > 1000*1000:
- # all_output = data_preprocess.normalize_for_overlap(all_output)
- # print("after normalizing all output: ", len(all_output))
- if len(data_preprocess.normalize_for_overlap(all_output)) > 600000:
- print("break here")
- break
- if iso != "unknown": query_kw = iso
- else: query_kw = acc
- #text_link, tables_link, final_input_link = data_preprocess.preprocess_document(link,saveLinkFolder, isolate=query_kw)
- success_process, output_process = run_with_timeout(data_preprocess.preprocess_document,args=(link,saveLinkFolder),kwargs={"isolate":query_kw},timeout=180)
- if success_process:
- text_link, tables_link, final_input_link = output_process[0], output_process[1], output_process[2]
- print("yes succeed for process document")
- else: text_link, tables_link, final_input_link = "", "", ""
- context = data_preprocess.extract_context(final_input_link, query_kw)
- if context != "Sample ID not found.":
- if len(data_preprocess.normalize_for_overlap(chunk)) < 1000*1000:
- success_chunk, the_output_chunk = run_with_timeout(data_preprocess.merge_texts_skipping_overlap,args=(chunk, context))
- if success_chunk:
- chunk = the_output_chunk#data_preprocess.merge_texts_skipping_overlap(all_output, final_input_link)
- print("yes succeed for chunk")
- else:
- chunk += context
- print("len context: ", len(context))
- print("basic fall back")
- print("len chunk after: ", len(chunk))
- if len(final_input_link) > 1000*1000:
- if context != "Sample ID not found.":
- final_input_link = context
- else:
- final_input_link = data_preprocess.normalize_for_overlap(final_input_link)
- if len(final_input_link) > 1000 *1000:
- final_input_link = final_input_link[:100000]
- if len(data_preprocess.normalize_for_overlap(all_output)) < 1000*1000:
- success, the_output = run_with_timeout(data_preprocess.merge_texts_skipping_overlap,args=(all_output, final_input_link))
- if success:
- all_output = the_output#data_preprocess.merge_texts_skipping_overlap(all_output, final_input_link)
- print("yes succeed")
- else:
- all_output += final_input_link
- print("len final input: ", len(final_input_link))
- print("basic fall back")
- print("len all output after: ", len(all_output))
- #country_pro, chunk, all_output = data_preprocess.process_inputToken(links, saveLinkFolder, accession=accession, isolate=isolate)
-
- else:
- chunk = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
- all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
- if not chunk: chunk = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
- if not all_output: all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
- if len(all_output) > 1*1024*1024:
- all_output = data_preprocess.normalize_for_overlap(all_output)
- if len(all_output) > 1*1024*1024:
- all_output = all_output[:1*1024*1024]
- print("chunk len: ", len(chunk))
- print("all output len: ", len(all_output))
- data_preprocess.save_text_to_docx(chunk, file_chunk_path)
- data_preprocess.save_text_to_docx(all_output, file_all_path)
- # else:
- # final_input = ""
- # if all_output:
- # final_input = all_output
- # else:
- # if chunk: final_input = chunk
- # #data_preprocess.merge_texts_skipping_overlap(final_input, all_output)
- # if final_input:
- # keywords = []
- # if iso != "unknown": keywords.append(iso)
- # if acc != "unknown": keywords.append(acc)
- # for keyword in keywords:
- # chunkBFS = data_preprocess.get_contextual_sentences_BFS(final_input, keyword)
- # countryDFS, chunkDFS = data_preprocess.get_contextual_sentences_DFS(final_input, keyword)
- # chunk = data_preprocess.merge_texts_skipping_overlap(chunk, chunkDFS)
- # chunk = data_preprocess.merge_texts_skipping_overlap(chunk, chunkBFS)
-
- # Define paths for cached RAG assets
- faiss_index_path = saveLinkFolder+"/faiss_index.bin"
- document_chunks_path = saveLinkFolder+"/document_chunks.json"
- structured_lookup_path = saveLinkFolder+"/structured_lookup.json"
-
- master_structured_lookup, faiss_index, document_chunks = model.load_rag_assets(
- faiss_index_path, document_chunks_path, structured_lookup_path
- )
-
- global_llm_model_for_counting_tokens = genai.GenerativeModel('gemini-1.5-flash-latest')
- if not all_output:
- if chunk: all_output = chunk
- else: all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
- if faiss_index is None:
- print("\nBuilding RAG assets (structured lookup, FAISS index, chunks)...")
- total_doc_embedding_tokens = global_llm_model_for_counting_tokens.count_tokens(
- all_output
- ).total_tokens
-
- initial_embedding_cost = (total_doc_embedding_tokens / 1000) * PRICE_PER_1K_EMBEDDING_INPUT
- total_cost_title += initial_embedding_cost
- print(f"Initial one-time embedding cost for '{file_all_path}' ({total_doc_embedding_tokens} tokens): ${initial_embedding_cost:.6f}")
-
-
- master_structured_lookup, faiss_index, document_chunks, plain_text_content = model.build_vector_index_and_data(
- file_all_path, faiss_index_path, document_chunks_path, structured_lookup_path
- )
- else:
- print("\nRAG assets loaded from file. No re-embedding of entire document will occur.")
- plain_text_content_all, table_strings_all, document_title_all = model.read_docx_text(file_all_path)
- master_structured_lookup['document_title'] = master_structured_lookup.get('document_title', document_title_all)
-
- primary_word = iso
- alternative_word = acc
- print(f"\n--- General Query: Primary='{primary_word}' (Alternative='{alternative_word}') ---")
- if features.lower() not in all_output.lower():
- all_output += ". NCBI Features: " + features
- # country, sample_type, method_used, ethnic, spe_loc, total_query_cost = model.query_document_info(
- # primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
- # model.call_llm_api, chunk=chunk, all_output=all_output)
- country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost = model.query_document_info(
- primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
- model.call_llm_api, chunk=chunk, all_output=all_output)
- if len(country) == 0: country = "unknown"
- if len(sample_type) == 0: sample_type = "unknown"
- if country_explanation: country_explanation = "-"+country_explanation
- else: country_explanation = ""
- if sample_type_explanation: sample_type_explanation = "-"+sample_type_explanation
- else: sample_type_explanation = ""
- if method_used == "unknown": method_used = ""
- if country.lower() != "unknown":
- stand_country = standardize_location.smart_country_lookup(country.lower())
- if stand_country.lower() != "not found":
- if stand_country.lower() in acc_score["country"]:
- if country_explanation:
- acc_score["country"][stand_country.lower()].append(method_used + country_explanation)
- else:
- acc_score["country"][stand_country.lower()] = [method_used + country_explanation]
- else:
- if country.lower() in acc_score["country"]:
- if country_explanation:
- if len(method_used + country_explanation) > 0:
- acc_score["country"][country.lower()].append(method_used + country_explanation)
- else:
- if len(method_used + country_explanation) > 0:
- acc_score["country"][country.lower()] = [method_used + country_explanation]
- # if spe_loc.lower() != "unknown":
- # if spe_loc.lower() in acc_score["specific_location"]:
- # acc_score["specific_location"][spe_loc.lower()].append(method_used)
- # else:
- # acc_score["specific_location"][spe_loc.lower()] = [method_used]
- # if ethnic.lower() != "unknown":
- # if ethnic.lower() in acc_score["ethnicity"]:
- # acc_score["ethnicity"][ethnic.lower()].append(method_used)
- # else:
- # acc_score["ethnicity"][ethnic.lower()] = [method_used]
- if sample_type.lower() != "unknown":
- if sample_type.lower() in acc_score["sample_type"]:
- if len(method_used + sample_type_explanation) > 0:
- acc_score["sample_type"][sample_type.lower()].append(method_used + sample_type_explanation)
- else:
- if len(method_used + sample_type_explanation)> 0:
- acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation]
- end = time.time()
- total_cost_title += total_query_cost
- acc_score["query_cost"] = total_cost_title
- elapsed = end - start
- acc_score["time_cost"] = f"{elapsed:.3f} seconds"
- accs_output[acc] = acc_score
- print(accs_output[acc])
-
+ # test1: MJ17 direct
+ # test2: "A1YU101" thailand cross-ref
+ # test3: "EBK109" thailand cross-ref
+ # test4: "OQ731952"/"BST115" for search query title: "South Asian maternal and paternal lineages in southern Thailand and"
+ from iterate3 import data_preprocess, model
+ import mtdna_classifier
+ import app
+ import pandas as pd
+ from pathlib import Path
+ import subprocess
+ from NER.html import extractHTML
+ import os
+ import google.generativeai as genai
+ import re
+ import standardize_location
+ # Helper functions in for this pipeline
+ # Track time
+ import time
+ import multiprocessing
+
+ def run_with_timeout(func, args=(), kwargs={}, timeout=20):
+ """
+ Runs `func` with timeout in seconds. Kills if it exceeds.
+ Returns: (success, result or None)
+ """
+ def wrapper(q, *args, **kwargs):
+ try:
+ q.put(func(*args, **kwargs))
+ except Exception as e:
+ q.put(e)
+
+ q = multiprocessing.Queue()
+ p = multiprocessing.Process(target=wrapper, args=(q, *args), kwargs=kwargs)
+ p.start()
+ p.join(timeout)
+
+ if p.is_alive():
+ p.terminate()
+ p.join()
+ print(f"⏱️ Timeout exceeded ({timeout} sec) — function killed.")
+ return False, None
+ else:
+ result = q.get()
+ if isinstance(result, Exception):
+ raise result
+ return True, result
+
+ def time_it(func, *args, **kwargs):
+ """
+ Measure how long a function takes to run and return its result + time.
+ """
+ start = time.time()
+ result = func(*args, **kwargs)
+ end = time.time()
+ elapsed = end - start
+ print(f"⏱️ '{func.__name__}' took {elapsed:.3f} seconds")
+ return result, elapsed
+ # --- Define Pricing Constants (for Gemini 1.5 Flash & text-embedding-004) ---
+ def track_gemini_cost():
+ # Prices are per 1,000 tokens
+ PRICE_PER_1K_INPUT_LLM = 0.000075 # $0.075 per 1M tokens
+ PRICE_PER_1K_OUTPUT_LLM = 0.0003 # $0.30 per 1M tokens
+ PRICE_PER_1K_EMBEDDING_INPUT = 0.000025 # $0.025 per 1M tokens
+ return True
+
+ def unique_preserve_order(seq):
+ seen = set()
+ return [x for x in seq if not (x in seen or seen.add(x))]
+ # Main execution
+ def pipeline_with_gemini(accessions):
+ # output: country, sample_type, ethnic, location, money_cost, time_cost, explain
+ # there can be one accession number in the accessions
+ # Prices are per 1,000 tokens
+ PRICE_PER_1K_INPUT_LLM = 0.000075 # $0.075 per 1M tokens
+ PRICE_PER_1K_OUTPUT_LLM = 0.0003 # $0.30 per 1M tokens
+ PRICE_PER_1K_EMBEDDING_INPUT = 0.000025 # $0.025 per 1M tokens
+ if not accessions:
+ print("no input")
+ return None
+ else:
+ accs_output = {}
+ genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
+ for acc in accessions:
+ start = time.time()
+ total_cost_title = 0
+ jsonSM, links, article_text = {},[], ""
+ acc_score = { "isolate": "",
+ "country":{},
+ "sample_type":{},
+ #"specific_location":{},
+ #"ethnicity":{},
+ "query_cost":total_cost_title,
+ "time_cost":None,
+ "source":links}
+ meta = mtdna_classifier.fetch_ncbi_metadata(acc)
+ country, spe_loc, ethnic, sample_type, col_date, iso, title, doi, pudID, features = meta["country"], meta["specific_location"], meta["ethnicity"], meta["sample_type"], meta["collection_date"], meta["isolate"], meta["title"], meta["doi"], meta["pubmed_id"], meta["all_features"]
+ acc_score["isolate"] = iso
+ # set up step: create the folder to save document
+ chunk, all_output = "",""
+ if pudID:
+ id = pudID
+ saveTitle = title
+ else:
+ saveTitle = title + "_" + col_date
+ id = "DirectSubmission"
+ folder_path = Path("/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id))
+ if not folder_path.exists():
+ cmd = f'mkdir /content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/{id}'
+ result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+ print("data/"+str(id) +" created.")
+ else:
+ print("data/"+str(id) +" already exists.")
+ saveLinkFolder = "/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id)
+ # first way: ncbi method
+ if country.lower() != "unknown":
+ stand_country = standardize_location.smart_country_lookup(country.lower())
+ if stand_country.lower() != "not found":
+ acc_score["country"][stand_country.lower()] = ["ncbi"]
+ else: acc_score["country"][country.lower()] = ["ncbi"]
+ # if spe_loc.lower() != "unknown":
+ # acc_score["specific_location"][spe_loc.lower()] = ["ncbi"]
+ # if ethnic.lower() != "unknown":
+ # acc_score["ethnicity"][ethnic.lower()] = ["ncbi"]
+ if sample_type.lower() != "unknown":
+ acc_score["sample_type"][sample_type.lower()] = ["ncbi"]
+ # second way: LLM model
+ # Preprocess the input token
+ accession, isolate = None, None
+ if acc != "unknown": accession = acc
+ if iso != "unknown": isolate = iso
+ # check doi first
+ if doi != "unknown":
+ link = 'https://doi.org/' + doi
+ # get the file to create listOfFile for each id
+ html = extractHTML.HTML("",link)
+ jsonSM = html.getSupMaterial()
+ article_text = html.getListSection()
+ if article_text:
+ if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower():
+ links.append(link)
+ if jsonSM:
+ links += sum((jsonSM[key] for key in jsonSM),[])
+ # no doi then google custom search api
+ if len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower():
+ # might find the article
+ tem_links = mtdna_classifier.search_google_custom(title, 2)
+ # get supplementary of that article
+ for link in tem_links:
+ html = extractHTML.HTML("",link)
+ jsonSM = html.getSupMaterial()
+ article_text_tem = html.getListSection()
+ if article_text_tem:
+ if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text_tem.lower() or "403 Forbidden Request".lower() not in article_text_tem.lower():
+ links.append(link)
+ if jsonSM:
+ links += sum((jsonSM[key] for key in jsonSM),[])
+ print(links)
+ links = unique_preserve_order(links)
+ acc_score["source"] = links
+ chunk_path = "/"+saveTitle+"_merged_document.docx"
+ all_path = "/"+saveTitle+"_all_merged_document.docx"
+ # if chunk and all output not exist yet
+ file_chunk_path = saveLinkFolder + chunk_path
+ file_all_path = saveLinkFolder + all_path
+ if os.path.exists(file_chunk_path):
+ print("File chunk exists!")
+ if not chunk:
+ text, table, document_title = model.read_docx_text(file_chunk_path)
+ chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table))
+ if os.path.exists(file_all_path):
+ print("File all output exists!")
+ if not all_output:
+ text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
+ all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
+ if not chunk and not all_output:
+ # else: check if we can reuse these chunk and all output of existed accession to find another
+ if links:
+ for link in links:
+ print(link)
+ # if len(all_output) > 1000*1000:
+ # all_output = data_preprocess.normalize_for_overlap(all_output)
+ # print("after normalizing all output: ", len(all_output))
+ if len(data_preprocess.normalize_for_overlap(all_output)) > 600000:
+ print("break here")
+ break
+ if iso != "unknown": query_kw = iso
+ else: query_kw = acc
+ #text_link, tables_link, final_input_link = data_preprocess.preprocess_document(link,saveLinkFolder, isolate=query_kw)
+ success_process, output_process = run_with_timeout(data_preprocess.preprocess_document,args=(link,saveLinkFolder),kwargs={"isolate":query_kw},timeout=180)
+ if success_process:
+ text_link, tables_link, final_input_link = output_process[0], output_process[1], output_process[2]
+ print("yes succeed for process document")
+ else: text_link, tables_link, final_input_link = "", "", ""
+ context = data_preprocess.extract_context(final_input_link, query_kw)
+ if context != "Sample ID not found.":
+ if len(data_preprocess.normalize_for_overlap(chunk)) < 1000*1000:
+ success_chunk, the_output_chunk = run_with_timeout(data_preprocess.merge_texts_skipping_overlap,args=(chunk, context))
+ if success_chunk:
+ chunk = the_output_chunk#data_preprocess.merge_texts_skipping_overlap(all_output, final_input_link)
+ print("yes succeed for chunk")
+ else:
+ chunk += context
+ print("len context: ", len(context))
+ print("basic fall back")
+ print("len chunk after: ", len(chunk))
+ if len(final_input_link) > 1000*1000:
+ if context != "Sample ID not found.":
+ final_input_link = context
+ else:
+ final_input_link = data_preprocess.normalize_for_overlap(final_input_link)
+ if len(final_input_link) > 1000 *1000:
+ final_input_link = final_input_link[:100000]
+ if len(data_preprocess.normalize_for_overlap(all_output)) < 1000*1000:
+ success, the_output = run_with_timeout(data_preprocess.merge_texts_skipping_overlap,args=(all_output, final_input_link))
+ if success:
+ all_output = the_output#data_preprocess.merge_texts_skipping_overlap(all_output, final_input_link)
+ print("yes succeed")
+ else:
+ all_output += final_input_link
+ print("len final input: ", len(final_input_link))
+ print("basic fall back")
+ print("len all output after: ", len(all_output))
+ #country_pro, chunk, all_output = data_preprocess.process_inputToken(links, saveLinkFolder, accession=accession, isolate=isolate)
+
+ else:
+ chunk = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
+ all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
+ if not chunk: chunk = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
+ if not all_output: all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
+ if len(all_output) > 1*1024*1024:
+ all_output = data_preprocess.normalize_for_overlap(all_output)
+ if len(all_output) > 1*1024*1024:
+ all_output = all_output[:1*1024*1024]
+ print("chunk len: ", len(chunk))
+ print("all output len: ", len(all_output))
+ data_preprocess.save_text_to_docx(chunk, file_chunk_path)
+ data_preprocess.save_text_to_docx(all_output, file_all_path)
+ # else:
+ # final_input = ""
+ # if all_output:
+ # final_input = all_output
+ # else:
+ # if chunk: final_input = chunk
+ # #data_preprocess.merge_texts_skipping_overlap(final_input, all_output)
+ # if final_input:
+ # keywords = []
+ # if iso != "unknown": keywords.append(iso)
+ # if acc != "unknown": keywords.append(acc)
+ # for keyword in keywords:
+ # chunkBFS = data_preprocess.get_contextual_sentences_BFS(final_input, keyword)
+ # countryDFS, chunkDFS = data_preprocess.get_contextual_sentences_DFS(final_input, keyword)
+ # chunk = data_preprocess.merge_texts_skipping_overlap(chunk, chunkDFS)
+ # chunk = data_preprocess.merge_texts_skipping_overlap(chunk, chunkBFS)
+
+ # Define paths for cached RAG assets
+ faiss_index_path = saveLinkFolder+"/faiss_index.bin"
+ document_chunks_path = saveLinkFolder+"/document_chunks.json"
+ structured_lookup_path = saveLinkFolder+"/structured_lookup.json"
+
+ master_structured_lookup, faiss_index, document_chunks = model.load_rag_assets(
+ faiss_index_path, document_chunks_path, structured_lookup_path
+ )
+
+ global_llm_model_for_counting_tokens = genai.GenerativeModel('gemini-1.5-flash-latest')
+ if not all_output:
+ if chunk: all_output = chunk
+ else: all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
+ if faiss_index is None:
+ print("\nBuilding RAG assets (structured lookup, FAISS index, chunks)...")
+ total_doc_embedding_tokens = global_llm_model_for_counting_tokens.count_tokens(
+ all_output
+ ).total_tokens
+
+ initial_embedding_cost = (total_doc_embedding_tokens / 1000) * PRICE_PER_1K_EMBEDDING_INPUT
+ total_cost_title += initial_embedding_cost
+ print(f"Initial one-time embedding cost for '{file_all_path}' ({total_doc_embedding_tokens} tokens): ${initial_embedding_cost:.6f}")
+
+
+ master_structured_lookup, faiss_index, document_chunks, plain_text_content = model.build_vector_index_and_data(
+ file_all_path, faiss_index_path, document_chunks_path, structured_lookup_path
+ )
+ else:
+ print("\nRAG assets loaded from file. No re-embedding of entire document will occur.")
+ plain_text_content_all, table_strings_all, document_title_all = model.read_docx_text(file_all_path)
+ master_structured_lookup['document_title'] = master_structured_lookup.get('document_title', document_title_all)
+
+ primary_word = iso
+ alternative_word = acc
+ print(f"\n--- General Query: Primary='{primary_word}' (Alternative='{alternative_word}') ---")
+ if features.lower() not in all_output.lower():
+ all_output += ". NCBI Features: " + features
+ # country, sample_type, method_used, ethnic, spe_loc, total_query_cost = model.query_document_info(
+ # primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
+ # model.call_llm_api, chunk=chunk, all_output=all_output)
+ country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost = model.query_document_info(
+ primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
+ model.call_llm_api, chunk=chunk, all_output=all_output)
+ if len(country) == 0: country = "unknown"
+ if len(sample_type) == 0: sample_type = "unknown"
+ if country_explanation: country_explanation = "-"+country_explanation
+ else: country_explanation = ""
+ if sample_type_explanation: sample_type_explanation = "-"+sample_type_explanation
+ else: sample_type_explanation = ""
+ if method_used == "unknown": method_used = ""
+ if country.lower() != "unknown":
+ stand_country = standardize_location.smart_country_lookup(country.lower())
+ if stand_country.lower() != "not found":
+ if stand_country.lower() in acc_score["country"]:
+ if country_explanation:
+ acc_score["country"][stand_country.lower()].append(method_used + country_explanation)
+ else:
+ acc_score["country"][stand_country.lower()] = [method_used + country_explanation]
+ else:
+ if country.lower() in acc_score["country"]:
+ if country_explanation:
+ if len(method_used + country_explanation) > 0:
+ acc_score["country"][country.lower()].append(method_used + country_explanation)
+ else:
+ if len(method_used + country_explanation) > 0:
+ acc_score["country"][country.lower()] = [method_used + country_explanation]
+ # if spe_loc.lower() != "unknown":
+ # if spe_loc.lower() in acc_score["specific_location"]:
+ # acc_score["specific_location"][spe_loc.lower()].append(method_used)
+ # else:
+ # acc_score["specific_location"][spe_loc.lower()] = [method_used]
+ # if ethnic.lower() != "unknown":
+ # if ethnic.lower() in acc_score["ethnicity"]:
+ # acc_score["ethnicity"][ethnic.lower()].append(method_used)
+ # else:
+ # acc_score["ethnicity"][ethnic.lower()] = [method_used]
+ if sample_type.lower() != "unknown":
+ if sample_type.lower() in acc_score["sample_type"]:
+ if len(method_used + sample_type_explanation) > 0:
+ acc_score["sample_type"][sample_type.lower()].append(method_used + sample_type_explanation)
+ else:
+ if len(method_used + sample_type_explanation)> 0:
+ acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation]
+ end = time.time()
+ total_cost_title += total_query_cost
+ acc_score["query_cost"] = total_cost_title
+ elapsed = end - start
+ acc_score["time_cost"] = f"{elapsed:.3f} seconds"
+ accs_output[acc] = acc_score
+ print(accs_output[acc])
+
  return accs_output
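
Since this commit drops the hardcoded key and reads GOOGLE_API_KEY from the environment instead, callers now need to export the key before running the pipeline. A minimal usage sketch, assuming pipeline.py is importable as `pipeline` and reusing one of the test accessions from the header comments (the key value and accession here are placeholders, not project-specific values):

import os
from pipeline import pipeline_with_gemini

# Supply the Gemini key via the environment; never hardcode it in source.
os.environ.setdefault("GOOGLE_API_KEY", "your-key-here")  # hypothetical placeholder

# Run the pipeline on a single accession; it returns a dict keyed by accession,
# with "country", "sample_type", "query_cost", "time_cost", and "source" entries.
results = pipeline_with_gemini(["OQ731952"])
print(results)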