VyLala committed
Commit 0767cb0 · verified · 1 Parent(s): d79f1bd

Upload 3 files

Files changed (3):
  1. mtdna_backend.py +32 -21
  2. pipeline.py +82 -8
  3. smart_fallback.py +156 -0
mtdna_backend.py CHANGED
@@ -139,12 +139,12 @@ def summarize_results(accession):
    if cached:
        print(f"✅ Using cached result for {accession}")
        return [[
-           cached["Sample ID"],
-           cached["Predicted Country"],
-           cached["Country Explanation"],
-           cached["Predicted Sample Type"],
-           cached["Sample Type Explanation"],
-           cached["Sources"],
+           cached["Sample ID"] or "unknown",
+           cached["Predicted Country"] or "unknown",
+           cached["Country Explanation"] or "unknown",
+           cached["Predicted Sample Type"] or "unknown",
+           cached["Sample Type Explanation"] or "unknown",
+           cached["Sources"] or "No Links",
            cached["Time cost"]
        ]]
    # only run when nothing in the cache
@@ -175,13 +175,15 @@ def summarize_results(accession):
        pred_country, pred_sample, country_explanation, sample_explanation = "unknown","unknown","unknown","unknown"
        for section, results in outputs[key].items():
            if section == "country" or section =="sample_type":
-               pred_output = "\n".join(list(results.keys()))
+               pred_output = []#"\n".join(list(results.keys()))
                output_explanation = ""
                for result, content in results.items():
                    if len(result) == 0: result = "unknown"
                    if len(content) == 0: output_explanation = "unknown"
                    else:
                        output_explanation += 'Method: ' + "\nMethod: ".join(content) + "\n"
+                   pred_output.append(result)
+               pred_output = "\n".join(pred_output)
                if section == "country":
                    pred_country, country_explanation = pred_output, output_explanation
                elif section == "sample_type":
@@ -191,24 +193,24 @@ def summarize_results(accession):
        else: label = key
        if len(outputs[key]["source"]) == 0: outputs[key]["source"] = ["No Links"]
        row = {
-           "Sample ID": label,
-           "Predicted Country": pred_country,
-           "Country Explanation": country_explanation,
-           "Predicted Sample Type":pred_sample,
-           "Sample Type Explanation":sample_explanation,
-           "Sources": "\n".join(outputs[key]["source"]),
+           "Sample ID": label or "unknown",
+           "Predicted Country": pred_country or "unknown",
+           "Country Explanation": country_explanation or "unknown",
+           "Predicted Sample Type":pred_sample or "unknown",
+           "Sample Type Explanation":sample_explanation or "unknown",
+           "Sources": "\n".join(outputs[key]["source"]) or "No Links",
            "Time cost": outputs[key]["time_cost"]
        }
        #row_score.append(row)
        rows.append(list(row.values()))

        save_row = {
-           "Sample ID": label,
-           "Predicted Country": pred_country,
-           "Country Explanation": country_explanation,
-           "Predicted Sample Type":pred_sample,
-           "Sample Type Explanation":sample_explanation,
-           "Sources": "\n".join(outputs[key]["source"]),
+           "Sample ID": label or "unknown",
+           "Predicted Country": pred_country or "unknown",
+           "Country Explanation": country_explanation or "unknown",
+           "Predicted Sample Type":pred_sample or "unknown",
+           "Sample Type Explanation":sample_explanation or "unknown",
+           "Sources": "\n".join(outputs[key]["source"]) or "No Links",
            "Query_cost": outputs[key]["query_cost"],
            "Time cost": outputs[key]["time_cost"]
        }
@@ -530,14 +532,23 @@ def check_known_output(accession):
    if "Sample ID" not in df.columns:
        print("❌ Column 'Sample ID' not found in Google Sheet.")
        return None
-
+
    match = re.search(r"\b[A-Z]{2,4}\d{4,}", accession)
    if match:
        accession = match.group(0)

    matched = df[df["Sample ID"].str.contains(accession, case=False, na=False)]
    if not matched.empty:
-       return matched.iloc[0].to_dict()
+       #return matched.iloc[0].to_dict()
+       row = matched.iloc[0]
+       country = row.get("Predicted Country", "").strip().lower()
+       sample_type = row.get("Predicted Sample Type", "").strip().lower()
+
+       if country and country != "unknown" and sample_type and sample_type != "unknown":
+           return row.to_dict()
+       else:
+           print(f"⚠️ Accession {accession} found but country/sample_type is unknown or empty.")
+           return None
    else:
        print(f"🔍 Accession {accession} not found in known_samples.")
        return None
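
Note on the mtdna_backend.py change: check_known_output() now treats a cached row as usable only when both "Predicted Country" and "Predicted Sample Type" are present and not "unknown"; otherwise it returns None and summarize_results() falls through to a fresh pipeline run. A minimal caller sketch (the accession and the surrounding code are illustrative, not part of this commit):

    # hypothetical caller; accession chosen for illustration only
    cached = check_known_output("KU131308")
    if cached:
        print(cached["Predicted Country"], cached["Predicted Sample Type"])
    else:
        print("No trusted cached prediction, running the full pipeline")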
pipeline.py CHANGED
@@ -6,6 +6,7 @@ import data_preprocess
import model
import mtdna_classifier
#import app
+import smart_fallback
import pandas as pd
from pathlib import Path
import subprocess
@@ -27,7 +28,7 @@ import io
import json
#––– Authentication setup –––
GDRIVE_PARENT_FOLDER_NAME = "mtDNA-Location-Classifier"
-GDRIVE_DATA_FOLDER_NAME = "data"
+GDRIVE_DATA_FOLDER_NAME = os.environ["GDRIVE_DATA_FOLDER_NAME"]
GCP_CREDS_DICT = json.loads(os.environ["GCP_CREDS_JSON"]) # from HF secrets
GDRIVE_CREDS = Credentials.from_service_account_info(GCP_CREDS_DICT, scopes=["https://www.googleapis.com/auth/drive"])
drive_service = build("drive", "v3", credentials=GDRIVE_CREDS)
@@ -216,13 +217,18 @@ def pipeline_with_gemini(accessions):
        country, spe_loc, ethnic, sample_type, col_date, iso, title, doi, pudID, features = meta["country"], meta["specific_location"], meta["ethnicity"], meta["sample_type"], meta["collection_date"], meta["isolate"], meta["title"], meta["doi"], meta["pubmed_id"], meta["all_features"]
        acc_score["isolate"] = iso
        print(meta)
+       meta_expand = smart_fallback.fetch_ncbi(acc)
        # set up step: create the folder to save document
        chunk, all_output = "",""
        if pudID:
            id = str(pudID)
            saveTitle = title
        else:
-           saveTitle = title + "_" + col_date
+           try:
+               author_name = meta_expand["authors"].split(',')[0] # Use last name only
+           except:
+               author_name = meta_expand["authors"]
+           saveTitle = title + "_" + col_date + "_" + author_name
            id = "DirectSubmission"
        # folder_path = Path("/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id))
        # if not folder_path.exists():
@@ -232,10 +238,13 @@ def pipeline_with_gemini(accessions):
        # else:
        #     print("data/"+str(id) +" already exists.")
        # saveLinkFolder = "/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id)
-       parent_folder_id = get_or_create_drive_folder(GDRIVE_PARENT_FOLDER_NAME)
-       data_folder_id = get_or_create_drive_folder(GDRIVE_DATA_FOLDER_NAME, parent_id=parent_folder_id)
+       # parent_folder_id = get_or_create_drive_folder(GDRIVE_PARENT_FOLDER_NAME)
+       # data_folder_id = get_or_create_drive_folder(GDRIVE_DATA_FOLDER_NAME, parent_id=parent_folder_id)
+       # sample_folder_id = get_or_create_drive_folder(str(id), parent_id=data_folder_id)
+       data_folder_id = GDRIVE_DATA_FOLDER_NAME # Use the shared folder directly
        sample_folder_id = get_or_create_drive_folder(str(id), parent_id=data_folder_id)
-
+       print("sample folder id: ", sample_folder_id)
+
        # Define document names
        if len(saveTitle) > 50:
            saveName = saveTitle[:50]
@@ -264,6 +273,14 @@ def pipeline_with_gemini(accessions):
            print("✅ Files already exist in Google Drive. Downloading them...")
            chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path)
            all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
+           print("chunk_id and all_id: ")
+           print(chunk_id, all_id)
+           file = drive_service.files().get(fileId="1LUJRTrq8yt4S4lLwCvTmlxaKqpr0nvEn", fields="id, name, parents, webViewLink").execute()
+           print("📄 Name:", file["name"])
+           print("📁 Parent folder ID:", file["parents"][0])
+           print("🔗 View link:", file["webViewLink"])
+
+
            # Read and parse these into `chunk` and `all_output`
        else:
            # 🔥 Remove any stale local copies
@@ -321,7 +338,8 @@ def pipeline_with_gemini(accessions):
            if len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower():
                # might find the article
                print("no article text")
-               tem_links = mtdna_classifier.search_google_custom(title, 2)
+               #tem_links = mtdna_classifier.search_google_custom(title, 2)
+               tem_links = smart_fallback.smart_google_search(meta_expand)
                # get supplementary of that article
                print("tem links length ", len(tem_links))
                for link in tem_links:
@@ -436,8 +454,10 @@ def pipeline_with_gemini(accessions):
            # data_preprocess.save_text_to_docx(all_output, all_filename, sample_folder_id)

            # Upload to Drive
-           upload_file_to_drive(file_chunk_path, chunk_filename, sample_folder_id)
-           upload_file_to_drive(file_all_path, all_filename, sample_folder_id)
+           result_chunk_upload = upload_file_to_drive(file_chunk_path, chunk_filename, sample_folder_id)
+           result_all_upload = upload_file_to_drive(file_all_path, all_filename, sample_folder_id)
+           print("UPLOAD RESULT FOR CHUNK: ", result_chunk_upload)
+           print(f"🔗 Uploaded file: https://drive.google.com/file/d/{result_chunk_upload}/view")
            print("here 1")

        # else:
@@ -528,9 +548,15 @@ def pipeline_with_gemini(accessions):
        # country, sample_type, method_used, ethnic, spe_loc, total_query_cost = model.query_document_info(
        #     primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
        #     model.call_llm_api, chunk=chunk, all_output=all_output)
+       print("this is chunk for the model")
+       print(chunk)
+       print("this is all output for the model")
+       print(all_output)
        country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost = model.query_document_info(
            primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
            model.call_llm_api, chunk=chunk, all_output=all_output)
+       print("country using ai: ", country)
+       print("sample type using ai: ", sample_type)
        if len(country) == 0: country = "unknown"
        if len(sample_type) == 0: sample_type = "unknown"
        if country_explanation: country_explanation = "-"+country_explanation
@@ -571,6 +597,54 @@ def pipeline_with_gemini(accessions):
            else:
                if len(method_used + sample_type_explanation)> 0:
                    acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation]
+       # last resort: combine all information to give all output otherwise unknown
+       if len(acc_score["country"]) == 0 or len(acc_score["sample_type"]) == 0:
+           text = ""
+           for key in meta_expand:
+               text += str(key) + ": " + meta_expand[key] + "\n"
+           if len(data_preprocess.normalize_for_overlap(all_output)) > 0:
+               text += data_preprocess.normalize_for_overlap(all_output)
+           if len(data_preprocess.normalize_for_overlap(chunk)) > 0:
+               text += data_preprocess.normalize_for_overlap(chunk)
+           text += ". NCBI Features: " + features
+           print("this is text for the last resort model")
+           print(text)
+           country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost = model.query_document_info(
+               primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
+               model.call_llm_api, chunk=text, all_output=text)
+           print("this is last resort results: ")
+           print("country: ", country)
+           print("sample type: ", sample_type)
+           if len(country) == 0: country = "unknown"
+           if len(sample_type) == 0: sample_type = "unknown"
+           if country_explanation: country_explanation = "-"+country_explanation
+           else: country_explanation = ""
+           if sample_type_explanation: sample_type_explanation = "-"+sample_type_explanation
+           else: sample_type_explanation = ""
+           if method_used == "unknown": method_used = ""
+           if country.lower() != "unknown":
+               stand_country = standardize_location.smart_country_lookup(country.lower())
+               if stand_country.lower() != "not found":
+                   if stand_country.lower() in acc_score["country"]:
+                       if country_explanation:
+                           acc_score["country"][stand_country.lower()].append(method_used + country_explanation)
+                   else:
+                       acc_score["country"][stand_country.lower()] = [method_used + country_explanation]
+               else:
+                   if country.lower() in acc_score["country"]:
+                       if country_explanation:
+                           if len(method_used + country_explanation) > 0:
+                               acc_score["country"][country.lower()].append(method_used + country_explanation)
+                   else:
+                       if len(method_used + country_explanation) > 0:
+                           acc_score["country"][country.lower()] = [method_used + country_explanation]
+           if sample_type.lower() != "unknown":
+               if sample_type.lower() in acc_score["sample_type"]:
+                   if len(method_used + sample_type_explanation) > 0:
+                       acc_score["sample_type"][sample_type.lower()].append(method_used + sample_type_explanation)
+               else:
+                   if len(method_used + sample_type_explanation)> 0:
+                       acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation]
        end = time.time()
        total_cost_title += total_query_cost
        acc_score["query_cost"] = f"{total_cost_title:.6f}"
smart_fallback.py ADDED
@@ -0,0 +1,156 @@
+from Bio import Entrez, Medline
+import model
+import mtdna_classifier
+# Setup
+def fetch_ncbi(accession_number):
+    Entrez.email = "[email protected]" # Required by NCBI, REPLACE WITH YOUR EMAIL
+    handle = Entrez.efetch(db="nucleotide", id=str(accession_number), rettype="gb", retmode="xml")
+    record = Entrez.read(handle)
+    handle.close()
+    outputs = {"authors":"unknown",
+               "institution":"unknown",
+               "isolate":"unknown",
+               "definition":"unknown",
+               "title":"unknown",
+               "seq_comment":"unknown",
+               "collection_date":"unknown" } #'GBSeq_update-date': '25-OCT-2023', 'GBSeq_create-date'
+    gb_seq = None
+    try:
+        # Validate record structure: It should be a list with at least one element (a dict)
+        if isinstance(record, list) and len(record) > 0:
+            if isinstance(record[0], dict):
+                gb_seq = record[0]
+            else:
+                print(f"Warning: record[0] is not a dictionary for {accession_number}. Type: {type(record[0])}")
+            # extract collection date
+            if "GBSeq_create-date" in gb_seq and outputs["collection_date"]=="unknown":
+                outputs["collection_date"] = gb_seq["GBSeq_create-date"]
+            else:
+                if "GBSeq_update-date" in gb_seq and outputs["collection_date"]=="unknown":
+                    outputs["collection_date"] = gb_seq["GBSeq_update-date"]
+            # extract definition
+            if "GBSeq_definition" in gb_seq and outputs["definition"]=="unknown":
+                outputs["definition"] = gb_seq["GBSeq_definition"]
+            # extract related-reference things
+            if "GBSeq_references" in gb_seq:
+                for ref in gb_seq["GBSeq_references"]:
+                    # extract authors
+                    if "GBReference_authors" in ref and outputs["authors"]=="unknown":
+                        outputs["authors"] = "and ".join(ref["GBReference_authors"])
+                    # extract title
+                    if "GBReference_title" in ref and outputs["title"]=="unknown":
+                        outputs["title"] = ref["GBReference_title"]
+                    # extract submitted journal
+                    if 'GBReference_journal' in ref and outputs["institution"]=="unknown":
+                        outputs["institution"] = ref['GBReference_journal']
+            # extract seq_comment
+            if 'GBSeq_comment' in gb_seq and outputs["seq_comment"]=="unknown":
+                outputs["seq_comment"] = gb_seq["GBSeq_comment"]
+            # extract isolate
+            if "GBSeq_feature-table" in gb_seq:
+                if 'GBFeature_quals' in gb_seq["GBSeq_feature-table"][0]:
+                    for ref in gb_seq["GBSeq_feature-table"][0]["GBFeature_quals"]:
+                        if ref['GBQualifier_name'] == "isolate" and outputs["isolate"]=="unknown":
+                            outputs["isolate"] = ref["GBQualifier_value"]
+        else:
+            print(f"Warning: No valid record or empty record list from NCBI for {accession_number}.")
+
+        # If gb_seq is still None, return defaults
+        if gb_seq is None:
+            return {"authors":"unknown",
+                    "institution":"unknown",
+                    "isolate":"unknown",
+                    "definition":"unknown",
+                    "title":"unknown",
+                    "seq_comment":"unknown",
+                    "collection_date":"unknown" }
+        return outputs
+    except:
+        print("error in fetching ncbi data")
+        return {"authors":"unknown",
+                "institution":"unknown",
+                "isolate":"unknown",
+                "definition":"unknown",
+                "title":"unknown",
+                "seq_comment":"unknown",
+                "collection_date":"unknown" }
+# Method 1: Smarter Google
+def smart_google_queries(metadata: dict):
+    queries = []
+
+    # Extract useful fields
+    isolate = metadata.get("isolate")
+    author = metadata.get("authors")
+    institution = metadata.get("institution")
+    title = metadata.get("title")
+    print(title)
+    combined = []
+    # Construct queries
+    if isolate:
+        queries.append(f'"{isolate}" mitochondrial DNA')
+        queries.append(f'"{isolate}" site:ncbi.nlm.nih.gov')
+
+    if author:
+        try:
+            author_name = author.split(',')[0] # Use last name only
+        except:
+            author_name = author
+        queries.append(f'"{author_name}" mitochondrial DNA')
+        queries.append(f'"{author_name}" mtDNA site:researchgate.net')
+
+    if institution:
+        try:
+            short_inst = institution.split(',')[0] # Take first part of institution
+        except:
+            short_inst = institution
+        queries.append(f'"{short_inst}" mtDNA sequence')
+        queries.append(f'"{short_inst}" isolate site:nature.com')
+    queries.append(title)
+    return queries
+
+def filter_links_by_metadata(search_results):
+    TRUSTED_DOMAINS = [
+        "ncbi.nlm.nih.gov",
+        "pubmed.ncbi.nlm.nih.gov",
+        "pmc.ncbi.nlm.nih.gov",
+        "biorxiv.org",
+        "researchgate.net",
+        "nature.com",
+        "sciencedirect.com"
+    ]
+    def is_trusted_link(link):
+        for domain in TRUSTED_DOMAINS:
+            if domain in link:
+                return True
+        return False
+    def is_relevant_title_snippet(link):
+        keywords = ["mtDNA", "mitochondrial", "accession", "isolate", "Homo sapiens", "sequence"]
+        title_snippet = link.lower()
+        for keyword in keywords:
+            if keyword in title_snippet:
+                return True
+        return False
+
+    filtered = []
+    if len(search_results) > 0:
+        for link in search_results:
+            if is_trusted_link(link) and link not in filtered:
+                filtered.append(link)
+            if is_relevant_title_snippet(link) and link not in filtered:
+                filtered.append(link)
+    return filtered
+
+def smart_google_search(metadata):
+    queries = smart_google_queries(metadata)
+    links = []
+    for q in queries:
+        #print("\n🔍 Query:", q)
+        results = mtdna_classifier.search_google_custom(q,2)
+        for link in results:
+            #print(f"- {link}")
+            if link not in links:
+                links.append(link)
+    filter_links = filter_links_by_metadata(links)
+    return filter_links
+# Method 2: Prompt LLM better or better ai search api with all
+# the total information from even ncbi and all search
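
Usage sketch for the new module (not part of the commit; the accession is illustrative, and it assumes the Entrez email and the Google Custom Search credentials used by mtdna_classifier are configured):

    import smart_fallback

    meta = smart_fallback.fetch_ncbi("MN027141")          # authors/institution/isolate/title/etc. from GenBank
    queries = smart_fallback.smart_google_queries(meta)   # quoted queries built from that metadata
    links = smart_fallback.smart_google_search(meta)      # run the queries, keep trusted/relevant links
    print(links)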