Spaces:
Running
Running
Upload 3 files
Browse files- mtdna_backend.py +32 -21
- pipeline.py +82 -8
- smart_fallback.py +156 -0
mtdna_backend.py
CHANGED
@@ -139,12 +139,12 @@ def summarize_results(accession):
|
|
139 |
if cached:
|
140 |
print(f"✅ Using cached result for {accession}")
|
141 |
return [[
|
142 |
-
cached["Sample ID"],
|
143 |
-
cached["Predicted Country"],
|
144 |
-
cached["Country Explanation"],
|
145 |
-
cached["Predicted Sample Type"],
|
146 |
-
cached["Sample Type Explanation"],
|
147 |
-
cached["Sources"],
|
148 |
cached["Time cost"]
|
149 |
]]
|
150 |
# only run when nothing in the cache
|
@@ -175,13 +175,15 @@ def summarize_results(accession):
|
|
175 |
pred_country, pred_sample, country_explanation, sample_explanation = "unknown","unknown","unknown","unknown"
|
176 |
for section, results in outputs[key].items():
|
177 |
if section == "country" or section =="sample_type":
|
178 |
-
pred_output = "\n".join(list(results.keys()))
|
179 |
output_explanation = ""
|
180 |
for result, content in results.items():
|
181 |
if len(result) == 0: result = "unknown"
|
182 |
if len(content) == 0: output_explanation = "unknown"
|
183 |
else:
|
184 |
output_explanation += 'Method: ' + "\nMethod: ".join(content) + "\n"
|
|
|
|
|
185 |
if section == "country":
|
186 |
pred_country, country_explanation = pred_output, output_explanation
|
187 |
elif section == "sample_type":
|
@@ -191,24 +193,24 @@ def summarize_results(accession):
|
|
191 |
else: label = key
|
192 |
if len(outputs[key]["source"]) == 0: outputs[key]["source"] = ["No Links"]
|
193 |
row = {
|
194 |
-
"Sample ID": label,
|
195 |
-
"Predicted Country": pred_country,
|
196 |
-
"Country Explanation": country_explanation,
|
197 |
-
"Predicted Sample Type":pred_sample,
|
198 |
-
"Sample Type Explanation":sample_explanation,
|
199 |
-
"Sources": "\n".join(outputs[key]["source"]),
|
200 |
"Time cost": outputs[key]["time_cost"]
|
201 |
}
|
202 |
#row_score.append(row)
|
203 |
rows.append(list(row.values()))
|
204 |
|
205 |
save_row = {
|
206 |
-
"Sample ID": label,
|
207 |
-
"Predicted Country": pred_country,
|
208 |
-
"Country Explanation": country_explanation,
|
209 |
-
"Predicted Sample Type":pred_sample,
|
210 |
-
"Sample Type Explanation":sample_explanation,
|
211 |
-
"Sources": "\n".join(outputs[key]["source"]),
|
212 |
"Query_cost": outputs[key]["query_cost"],
|
213 |
"Time cost": outputs[key]["time_cost"]
|
214 |
}
|
@@ -530,14 +532,23 @@ def check_known_output(accession):
|
|
530 |
if "Sample ID" not in df.columns:
|
531 |
print("❌ Column 'Sample ID' not found in Google Sheet.")
|
532 |
return None
|
533 |
-
|
534 |
match = re.search(r"\b[A-Z]{2,4}\d{4,}", accession)
|
535 |
if match:
|
536 |
accession = match.group(0)
|
537 |
|
538 |
matched = df[df["Sample ID"].str.contains(accession, case=False, na=False)]
|
539 |
if not matched.empty:
|
540 |
-
return matched.iloc[0].to_dict()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
541 |
else:
|
542 |
print(f"🔍 Accession {accession} not found in known_samples.")
|
543 |
return None
|
|
|
139 |
if cached:
|
140 |
print(f"✅ Using cached result for {accession}")
|
141 |
return [[
|
142 |
+
cached["Sample ID"] or "unknown",
|
143 |
+
cached["Predicted Country"] or "unknown",
|
144 |
+
cached["Country Explanation"] or "unknown",
|
145 |
+
cached["Predicted Sample Type"] or "unknown",
|
146 |
+
cached["Sample Type Explanation"] or "unknown",
|
147 |
+
cached["Sources"] or "No Links",
|
148 |
cached["Time cost"]
|
149 |
]]
|
150 |
# only run when nothing in the cache
|
|
|
175 |
pred_country, pred_sample, country_explanation, sample_explanation = "unknown","unknown","unknown","unknown"
|
176 |
for section, results in outputs[key].items():
|
177 |
if section == "country" or section =="sample_type":
|
178 |
+
pred_output = []#"\n".join(list(results.keys()))
|
179 |
output_explanation = ""
|
180 |
for result, content in results.items():
|
181 |
if len(result) == 0: result = "unknown"
|
182 |
if len(content) == 0: output_explanation = "unknown"
|
183 |
else:
|
184 |
output_explanation += 'Method: ' + "\nMethod: ".join(content) + "\n"
|
185 |
+
pred_output.append(result)
|
186 |
+
pred_output = "\n".join(pred_output)
|
187 |
if section == "country":
|
188 |
pred_country, country_explanation = pred_output, output_explanation
|
189 |
elif section == "sample_type":
|
|
|
193 |
else: label = key
|
194 |
if len(outputs[key]["source"]) == 0: outputs[key]["source"] = ["No Links"]
|
195 |
row = {
|
196 |
+
"Sample ID": label or "unknown",
|
197 |
+
"Predicted Country": pred_country or "unknown",
|
198 |
+
"Country Explanation": country_explanation or "unknown",
|
199 |
+
"Predicted Sample Type":pred_sample or "unknown",
|
200 |
+
"Sample Type Explanation":sample_explanation or "unknown",
|
201 |
+
"Sources": "\n".join(outputs[key]["source"]) or "No Links",
|
202 |
"Time cost": outputs[key]["time_cost"]
|
203 |
}
|
204 |
#row_score.append(row)
|
205 |
rows.append(list(row.values()))
|
206 |
|
207 |
save_row = {
|
208 |
+
"Sample ID": label or "unknown",
|
209 |
+
"Predicted Country": pred_country or "unknown",
|
210 |
+
"Country Explanation": country_explanation or "unknown",
|
211 |
+
"Predicted Sample Type":pred_sample or "unknown",
|
212 |
+
"Sample Type Explanation":sample_explanation or "unknown",
|
213 |
+
"Sources": "\n".join(outputs[key]["source"]) or "No Links",
|
214 |
"Query_cost": outputs[key]["query_cost"],
|
215 |
"Time cost": outputs[key]["time_cost"]
|
216 |
}
|
|
|
532 |
if "Sample ID" not in df.columns:
|
533 |
print("❌ Column 'Sample ID' not found in Google Sheet.")
|
534 |
return None
|
535 |
+
|
536 |
match = re.search(r"\b[A-Z]{2,4}\d{4,}", accession)
|
537 |
if match:
|
538 |
accession = match.group(0)
|
539 |
|
540 |
matched = df[df["Sample ID"].str.contains(accession, case=False, na=False)]
|
541 |
if not matched.empty:
|
542 |
+
#return matched.iloc[0].to_dict()
|
543 |
+
row = matched.iloc[0]
|
544 |
+
country = row.get("Predicted Country", "").strip().lower()
|
545 |
+
sample_type = row.get("Predicted Sample Type", "").strip().lower()
|
546 |
+
|
547 |
+
if country and country != "unknown" and sample_type and sample_type != "unknown":
|
548 |
+
return row.to_dict()
|
549 |
+
else:
|
550 |
+
print(f"⚠️ Accession {accession} found but country/sample_type is unknown or empty.")
|
551 |
+
return None
|
552 |
else:
|
553 |
print(f"🔍 Accession {accession} not found in known_samples.")
|
554 |
return None
|
pipeline.py
CHANGED
@@ -6,6 +6,7 @@ import data_preprocess
|
|
6 |
import model
|
7 |
import mtdna_classifier
|
8 |
#import app
|
|
|
9 |
import pandas as pd
|
10 |
from pathlib import Path
|
11 |
import subprocess
|
@@ -27,7 +28,7 @@ import io
|
|
27 |
import json
|
28 |
#––– Authentication setup –––
|
29 |
GDRIVE_PARENT_FOLDER_NAME = "mtDNA-Location-Classifier"
|
30 |
-
GDRIVE_DATA_FOLDER_NAME = "
|
31 |
GCP_CREDS_DICT = json.loads(os.environ["GCP_CREDS_JSON"]) # from HF secrets
|
32 |
GDRIVE_CREDS = Credentials.from_service_account_info(GCP_CREDS_DICT, scopes=["https://www.googleapis.com/auth/drive"])
|
33 |
drive_service = build("drive", "v3", credentials=GDRIVE_CREDS)
|
@@ -216,13 +217,18 @@ def pipeline_with_gemini(accessions):
|
|
216 |
country, spe_loc, ethnic, sample_type, col_date, iso, title, doi, pudID, features = meta["country"], meta["specific_location"], meta["ethnicity"], meta["sample_type"], meta["collection_date"], meta["isolate"], meta["title"], meta["doi"], meta["pubmed_id"], meta["all_features"]
|
217 |
acc_score["isolate"] = iso
|
218 |
print(meta)
|
|
|
219 |
# set up step: create the folder to save document
|
220 |
chunk, all_output = "",""
|
221 |
if pudID:
|
222 |
id = str(pudID)
|
223 |
saveTitle = title
|
224 |
else:
|
225 |
-
|
|
|
|
|
|
|
|
|
226 |
id = "DirectSubmission"
|
227 |
# folder_path = Path("/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id))
|
228 |
# if not folder_path.exists():
|
@@ -232,10 +238,13 @@ def pipeline_with_gemini(accessions):
|
|
232 |
# else:
|
233 |
# print("data/"+str(id) +" already exists.")
|
234 |
# saveLinkFolder = "/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id)
|
235 |
-
parent_folder_id = get_or_create_drive_folder(GDRIVE_PARENT_FOLDER_NAME)
|
236 |
-
data_folder_id = get_or_create_drive_folder(GDRIVE_DATA_FOLDER_NAME, parent_id=parent_folder_id)
|
|
|
|
|
237 |
sample_folder_id = get_or_create_drive_folder(str(id), parent_id=data_folder_id)
|
238 |
-
|
|
|
239 |
# Define document names
|
240 |
if len(saveTitle) > 50:
|
241 |
saveName = saveTitle[:50]
|
@@ -264,6 +273,14 @@ def pipeline_with_gemini(accessions):
|
|
264 |
print("✅ Files already exist in Google Drive. Downloading them...")
|
265 |
chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path)
|
266 |
all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
267 |
# Read and parse these into `chunk` and `all_output`
|
268 |
else:
|
269 |
# 🔥 Remove any stale local copies
|
@@ -321,7 +338,8 @@ def pipeline_with_gemini(accessions):
|
|
321 |
if len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower():
|
322 |
# might find the article
|
323 |
print("no article text")
|
324 |
-
tem_links = mtdna_classifier.search_google_custom(title, 2)
|
|
|
325 |
# get supplementary of that article
|
326 |
print("tem links length ", len(tem_links))
|
327 |
for link in tem_links:
|
@@ -436,8 +454,10 @@ def pipeline_with_gemini(accessions):
|
|
436 |
# data_preprocess.save_text_to_docx(all_output, all_filename, sample_folder_id)
|
437 |
|
438 |
# Upload to Drive
|
439 |
-
upload_file_to_drive(file_chunk_path, chunk_filename, sample_folder_id)
|
440 |
-
upload_file_to_drive(file_all_path, all_filename, sample_folder_id)
|
|
|
|
|
441 |
print("here 1")
|
442 |
|
443 |
# else:
|
@@ -528,9 +548,15 @@ def pipeline_with_gemini(accessions):
|
|
528 |
# country, sample_type, method_used, ethnic, spe_loc, total_query_cost = model.query_document_info(
|
529 |
# primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
|
530 |
# model.call_llm_api, chunk=chunk, all_output=all_output)
|
|
|
|
|
|
|
|
|
531 |
country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost = model.query_document_info(
|
532 |
primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
|
533 |
model.call_llm_api, chunk=chunk, all_output=all_output)
|
|
|
|
|
534 |
if len(country) == 0: country = "unknown"
|
535 |
if len(sample_type) == 0: sample_type = "unknown"
|
536 |
if country_explanation: country_explanation = "-"+country_explanation
|
@@ -571,6 +597,54 @@ def pipeline_with_gemini(accessions):
|
|
571 |
else:
|
572 |
if len(method_used + sample_type_explanation)> 0:
|
573 |
acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
574 |
end = time.time()
|
575 |
total_cost_title += total_query_cost
|
576 |
acc_score["query_cost"] = f"{total_cost_title:.6f}"
|
|
|
6 |
import model
|
7 |
import mtdna_classifier
|
8 |
#import app
|
9 |
+
import smart_fallback
|
10 |
import pandas as pd
|
11 |
from pathlib import Path
|
12 |
import subprocess
|
|
|
28 |
import json
|
29 |
#––– Authentication setup –––
|
30 |
GDRIVE_PARENT_FOLDER_NAME = "mtDNA-Location-Classifier"
|
31 |
+
GDRIVE_DATA_FOLDER_NAME = os.environ["GDRIVE_DATA_FOLDER_NAME"]
|
32 |
GCP_CREDS_DICT = json.loads(os.environ["GCP_CREDS_JSON"]) # from HF secrets
|
33 |
GDRIVE_CREDS = Credentials.from_service_account_info(GCP_CREDS_DICT, scopes=["https://www.googleapis.com/auth/drive"])
|
34 |
drive_service = build("drive", "v3", credentials=GDRIVE_CREDS)
|
|
|
217 |
country, spe_loc, ethnic, sample_type, col_date, iso, title, doi, pudID, features = meta["country"], meta["specific_location"], meta["ethnicity"], meta["sample_type"], meta["collection_date"], meta["isolate"], meta["title"], meta["doi"], meta["pubmed_id"], meta["all_features"]
|
218 |
acc_score["isolate"] = iso
|
219 |
print(meta)
|
220 |
+
meta_expand = smart_fallback.fetch_ncbi(acc)
|
221 |
# set up step: create the folder to save document
|
222 |
chunk, all_output = "",""
|
223 |
if pudID:
|
224 |
id = str(pudID)
|
225 |
saveTitle = title
|
226 |
else:
|
227 |
+
try:
|
228 |
+
author_name = meta_expand["authors"].split(',')[0] # Use last name only
|
229 |
+
except:
|
230 |
+
author_name = meta_expand["authors"]
|
231 |
+
saveTitle = title + "_" + col_date + "_" + author_name
|
232 |
id = "DirectSubmission"
|
233 |
# folder_path = Path("/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id))
|
234 |
# if not folder_path.exists():
|
|
|
238 |
# else:
|
239 |
# print("data/"+str(id) +" already exists.")
|
240 |
# saveLinkFolder = "/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id)
|
241 |
+
# parent_folder_id = get_or_create_drive_folder(GDRIVE_PARENT_FOLDER_NAME)
|
242 |
+
# data_folder_id = get_or_create_drive_folder(GDRIVE_DATA_FOLDER_NAME, parent_id=parent_folder_id)
|
243 |
+
# sample_folder_id = get_or_create_drive_folder(str(id), parent_id=data_folder_id)
|
244 |
+
data_folder_id = GDRIVE_DATA_FOLDER_NAME # Use the shared folder directly
|
245 |
sample_folder_id = get_or_create_drive_folder(str(id), parent_id=data_folder_id)
|
246 |
+
print("sample folder id: ", sample_folder_id)
|
247 |
+
|
248 |
# Define document names
|
249 |
if len(saveTitle) > 50:
|
250 |
saveName = saveTitle[:50]
|
|
|
273 |
print("✅ Files already exist in Google Drive. Downloading them...")
|
274 |
chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path)
|
275 |
all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
|
276 |
+
print("chunk_id and all_id: ")
|
277 |
+
print(chunk_id, all_id)
|
278 |
+
file = drive_service.files().get(fileId="1LUJRTrq8yt4S4lLwCvTmlxaKqpr0nvEn", fields="id, name, parents, webViewLink").execute()
|
279 |
+
print("📄 Name:", file["name"])
|
280 |
+
print("📁 Parent folder ID:", file["parents"][0])
|
281 |
+
print("🔗 View link:", file["webViewLink"])
|
282 |
+
|
283 |
+
|
284 |
# Read and parse these into `chunk` and `all_output`
|
285 |
else:
|
286 |
# 🔥 Remove any stale local copies
|
|
|
338 |
if len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower():
|
339 |
# might find the article
|
340 |
print("no article text")
|
341 |
+
#tem_links = mtdna_classifier.search_google_custom(title, 2)
|
342 |
+
tem_links = smart_fallback.smart_google_search(meta_expand)
|
343 |
# get supplementary of that article
|
344 |
print("tem links length ", len(tem_links))
|
345 |
for link in tem_links:
|
|
|
454 |
# data_preprocess.save_text_to_docx(all_output, all_filename, sample_folder_id)
|
455 |
|
456 |
# Upload to Drive
|
457 |
+
result_chunk_upload = upload_file_to_drive(file_chunk_path, chunk_filename, sample_folder_id)
|
458 |
+
result_all_upload = upload_file_to_drive(file_all_path, all_filename, sample_folder_id)
|
459 |
+
print("UPLOAD RESULT FOR CHUNK: ", result_chunk_upload)
|
460 |
+
print(f"🔗 Uploaded file: https://drive.google.com/file/d/{result_chunk_upload}/view")
|
461 |
print("here 1")
|
462 |
|
463 |
# else:
|
|
|
548 |
# country, sample_type, method_used, ethnic, spe_loc, total_query_cost = model.query_document_info(
|
549 |
# primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
|
550 |
# model.call_llm_api, chunk=chunk, all_output=all_output)
|
551 |
+
print("this is chunk for the model")
|
552 |
+
print(chunk)
|
553 |
+
print("this is all output for the model")
|
554 |
+
print(all_output)
|
555 |
country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost = model.query_document_info(
|
556 |
primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
|
557 |
model.call_llm_api, chunk=chunk, all_output=all_output)
|
558 |
+
print("country using ai: ", country)
|
559 |
+
print("sample type using ai: ", sample_type)
|
560 |
if len(country) == 0: country = "unknown"
|
561 |
if len(sample_type) == 0: sample_type = "unknown"
|
562 |
if country_explanation: country_explanation = "-"+country_explanation
|
|
|
597 |
else:
|
598 |
if len(method_used + sample_type_explanation)> 0:
|
599 |
acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation]
|
600 |
+
# last resort: combine all information to give all output otherwise unknown
|
601 |
+
if len(acc_score["country"]) == 0 or len(acc_score["sample_type"]) == 0:
|
602 |
+
text = ""
|
603 |
+
for key in meta_expand:
|
604 |
+
text += str(key) + ": " + meta_expand[key] + "\n"
|
605 |
+
if len(data_preprocess.normalize_for_overlap(all_output)) > 0:
|
606 |
+
text += data_preprocess.normalize_for_overlap(all_output)
|
607 |
+
if len(data_preprocess.normalize_for_overlap(chunk)) > 0:
|
608 |
+
text += data_preprocess.normalize_for_overlap(chunk)
|
609 |
+
text += ". NCBI Features: " + features
|
610 |
+
print("this is text for the last resort model")
|
611 |
+
print(text)
|
612 |
+
country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost = model.query_document_info(
|
613 |
+
primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
|
614 |
+
model.call_llm_api, chunk=text, all_output=text)
|
615 |
+
print("this is last resort results: ")
|
616 |
+
print("country: ", country)
|
617 |
+
print("sample type: ", sample_type)
|
618 |
+
if len(country) == 0: country = "unknown"
|
619 |
+
if len(sample_type) == 0: sample_type = "unknown"
|
620 |
+
if country_explanation: country_explanation = "-"+country_explanation
|
621 |
+
else: country_explanation = ""
|
622 |
+
if sample_type_explanation: sample_type_explanation = "-"+sample_type_explanation
|
623 |
+
else: sample_type_explanation = ""
|
624 |
+
if method_used == "unknown": method_used = ""
|
625 |
+
if country.lower() != "unknown":
|
626 |
+
stand_country = standardize_location.smart_country_lookup(country.lower())
|
627 |
+
if stand_country.lower() != "not found":
|
628 |
+
if stand_country.lower() in acc_score["country"]:
|
629 |
+
if country_explanation:
|
630 |
+
acc_score["country"][stand_country.lower()].append(method_used + country_explanation)
|
631 |
+
else:
|
632 |
+
acc_score["country"][stand_country.lower()] = [method_used + country_explanation]
|
633 |
+
else:
|
634 |
+
if country.lower() in acc_score["country"]:
|
635 |
+
if country_explanation:
|
636 |
+
if len(method_used + country_explanation) > 0:
|
637 |
+
acc_score["country"][country.lower()].append(method_used + country_explanation)
|
638 |
+
else:
|
639 |
+
if len(method_used + country_explanation) > 0:
|
640 |
+
acc_score["country"][country.lower()] = [method_used + country_explanation]
|
641 |
+
if sample_type.lower() != "unknown":
|
642 |
+
if sample_type.lower() in acc_score["sample_type"]:
|
643 |
+
if len(method_used + sample_type_explanation) > 0:
|
644 |
+
acc_score["sample_type"][sample_type.lower()].append(method_used + sample_type_explanation)
|
645 |
+
else:
|
646 |
+
if len(method_used + sample_type_explanation)> 0:
|
647 |
+
acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation]
|
648 |
end = time.time()
|
649 |
total_cost_title += total_query_cost
|
650 |
acc_score["query_cost"] = f"{total_cost_title:.6f}"
|
smart_fallback.py
ADDED
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from Bio import Entrez, Medline
|
2 |
+
import model
|
3 |
+
import mtdna_classifier
|
4 |
+
# Setup
|
5 |
+
def _unknown_ncbi_metadata():
    """Return a fresh metadata dict with every field set to the "unknown" placeholder."""
    return {
        "authors": "unknown",
        "institution": "unknown",
        "isolate": "unknown",
        "definition": "unknown",
        "title": "unknown",
        "seq_comment": "unknown",
        "collection_date": "unknown",
    }


def fetch_ncbi(accession_number):
    """Fetch a GenBank record from NCBI and extract descriptive metadata.

    The record is requested from the ``nucleotide`` database as GenBank XML
    and parsed with ``Entrez.read``.

    Args:
        accession_number: Accession to look up; it is converted with ``str()``.

    Returns:
        A dict with the keys ``authors``, ``institution``, ``isolate``,
        ``definition``, ``title``, ``seq_comment`` and ``collection_date``.
        Fields that are not present in the record keep the value
        ``"unknown"``.  If the record has an unexpected structure, or parsing
        fails, every field is ``"unknown"``.

    Raises:
        Whatever ``Entrez.efetch`` / ``Entrez.read`` raise (e.g. network
        errors); those calls are not guarded, exactly as before.
    """
    # NOTE(review): this address looks like a redacted placeholder; NCBI asks
    # for a real contact email.
    Entrez.email = "[email protected]"  # Required by NCBI, REPLACE WITH YOUR EMAIL
    handle = Entrez.efetch(db="nucleotide", id=str(accession_number), rettype="gb", retmode="xml")
    try:
        record = Entrez.read(handle)
    finally:
        # Close the handle even when parsing the response fails.
        handle.close()

    # Validate record structure: it should be a non-empty list whose first
    # element is a dict.
    if not (isinstance(record, list) and len(record) > 0):
        print(f"Warning: No valid record or empty record list from NCBI for {accession_number}.")
        return _unknown_ncbi_metadata()
    gb_seq = record[0]
    if not isinstance(gb_seq, dict):
        print(f"Warning: record[0] is not a dictionary for {accession_number}. Type: {type(record[0])}")
        return _unknown_ncbi_metadata()

    outputs = _unknown_ncbi_metadata()
    try:
        # "collection_date" is filled from the record's create date, falling
        # back to its update date.
        if "GBSeq_create-date" in gb_seq:
            outputs["collection_date"] = gb_seq["GBSeq_create-date"]
        elif "GBSeq_update-date" in gb_seq:
            outputs["collection_date"] = gb_seq["GBSeq_update-date"]

        if "GBSeq_definition" in gb_seq:
            outputs["definition"] = gb_seq["GBSeq_definition"]

        # For each reference-derived field, the first reference that
        # provides it wins.
        for ref in gb_seq.get("GBSeq_references", []):
            if "GBReference_authors" in ref and outputs["authors"] == "unknown":
                # NOTE(review): the separator has no leading space, so names
                # are fused ("A,B.and C,D."); kept as-is because callers
                # derive names from this string.
                outputs["authors"] = "and ".join(ref["GBReference_authors"])
            if "GBReference_title" in ref and outputs["title"] == "unknown":
                outputs["title"] = ref["GBReference_title"]
            # The reference's journal field is stored as "institution".
            if "GBReference_journal" in ref and outputs["institution"] == "unknown":
                outputs["institution"] = ref["GBReference_journal"]

        if "GBSeq_comment" in gb_seq:
            outputs["seq_comment"] = gb_seq["GBSeq_comment"]

        # The isolate is read from the qualifiers of the first feature only.
        features = gb_seq.get("GBSeq_feature-table") or []
        if features and "GBFeature_quals" in features[0]:
            for qual in features[0]["GBFeature_quals"]:
                if qual.get("GBQualifier_name") == "isolate" and outputs["isolate"] == "unknown":
                    outputs["isolate"] = qual["GBQualifier_value"]
    except Exception as err:
        # Best-effort parsing: any malformed field resets the result to the
        # defaults, but KeyboardInterrupt/SystemExit are no longer swallowed.
        print(f"error in fetching ncbi data for {accession_number}: {err!r}")
        return _unknown_ncbi_metadata()
    return outputs
|
77 |
+
# Method 1: Smarter Google
|
78 |
+
def smart_google_queries(metadata: dict):
    """Build a list of Google search queries from record metadata.

    Args:
        metadata: Mapping that may contain the keys ``isolate``, ``authors``,
            ``institution`` and ``title``.

    Returns:
        A list of query strings: two per usable isolate, author and
        institution (in that order), followed by the title itself.  A field
        is skipped when it is missing, empty, or the ``"unknown"``
        placeholder, so no ``None`` or ``"unknown"`` query is ever produced.
    """

    def usable(value):
        # Normalise a metadata value to stripped text; "" means "skip it".
        if value is None:
            return ""
        text = str(value).strip()
        return "" if text.lower() == "unknown" else text

    def first_part(text):
        # Text before the first comma (author last name / leading part of
        # the institution string).
        return text.split(",")[0]

    queries = []

    isolate = usable(metadata.get("isolate"))
    if isolate:
        queries.append(f'"{isolate}" mitochondrial DNA')
        queries.append(f'"{isolate}" site:ncbi.nlm.nih.gov')

    author = usable(metadata.get("authors"))
    if author:
        author_name = first_part(author)  # Use last name only
        queries.append(f'"{author_name}" mitochondrial DNA')
        queries.append(f'"{author_name}" mtDNA site:researchgate.net')

    institution = usable(metadata.get("institution"))
    if institution:
        short_inst = first_part(institution)  # Take first part of institution
        queries.append(f'"{short_inst}" mtDNA sequence')
        queries.append(f'"{short_inst}" isolate site:nature.com')

    title = usable(metadata.get("title"))
    if title:
        queries.append(title)
    return queries
|
110 |
+
|
111 |
+
def filter_links_by_metadata(search_results):
    """Keep only links that look trustworthy or relevant.

    A link is kept when it contains one of the trusted domains or one of the
    relevance keywords.  Matching is a case-insensitive substring test on the
    link text itself.

    Args:
        search_results: Iterable of link strings; ``None`` or an empty
            iterable gives an empty result.

    Returns:
        The kept links in their original order and spelling, without
        duplicates.
    """
    trusted_domains = (
        "ncbi.nlm.nih.gov",
        "pubmed.ncbi.nlm.nih.gov",
        "pmc.ncbi.nlm.nih.gov",
        "biorxiv.org",
        "researchgate.net",
        "nature.com",
        "sciencedirect.com",
    )
    # Keywords are lower-case because they are compared with the lower-cased
    # link; mixed-case entries such as "mtDNA" could never match.
    keywords = ("mtdna", "mitochondrial", "accession", "isolate", "homo sapiens", "sequence")

    filtered = []
    for link in search_results or []:
        if link in filtered:
            continue
        lowered = link.lower()
        is_trusted = any(domain in lowered for domain in trusted_domains)
        is_relevant = any(keyword in lowered for keyword in keywords)
        if is_trusted or is_relevant:
            filtered.append(link)
    return filtered
|
142 |
+
|
143 |
+
def smart_google_search(metadata):
    """Run every metadata-derived query and return the filtered links.

    Queries come from ``smart_google_queries(metadata)``.  Each one is sent to
    ``mtdna_classifier.search_google_custom`` with ``2`` as the second
    argument, and the de-duplicated links are passed through
    ``filter_links_by_metadata``.

    Args:
        metadata: Metadata mapping forwarded to ``smart_google_queries``.

    Returns:
        The list returned by ``filter_links_by_metadata`` for the collected
        links.
    """
    links = []
    for query in smart_google_queries(metadata):
        # Never forward a missing or empty query (e.g. an absent title) to
        # the search API.
        if not query:
            continue
        for link in mtdna_classifier.search_google_custom(query, 2):
            if link not in links:
                links.append(link)
    return filter_links_by_metadata(links)
|
155 |
+
# Method 2 (TODO): improve the LLM prompt, or use a better AI search API,
|
156 |
+
# feeding it all the information gathered from NCBI and from every search.
|