Spaces:
Running
Running
Update smart_fallback.py
Browse files- smart_fallback.py +208 -204
smart_fallback.py
CHANGED
@@ -1,205 +1,209 @@
|
|
1 |
-
from Bio import Entrez, Medline
|
2 |
-
#import model
|
3 |
-
import mtdna_classifier
|
4 |
-
from NER.html import extractHTML
|
5 |
-
import data_preprocess
|
6 |
-
# Setup
|
7 |
-
def _default_ncbi_metadata():
    """Return a fresh metadata dict with every field set to "unknown"."""
    return {
        "authors": "unknown",
        "institution": "unknown",
        "isolate": "unknown",
        "definition": "unknown",
        "title": "unknown",
        "seq_comment": "unknown",
        "collection_date": "unknown",
    }


def _parse_genbank_record(gb_seq):
    """Copy the fields of interest out of one parsed GBSeq dict."""
    outputs = _default_ncbi_metadata()

    # NOTE: despite the key name, this stores the record's create date or,
    # failing that, its update date (e.g. '25-OCT-2023').
    for date_key in ("GBSeq_create-date", "GBSeq_update-date"):
        if date_key in gb_seq:
            outputs["collection_date"] = gb_seq[date_key]
            break

    if "GBSeq_definition" in gb_seq:
        outputs["definition"] = gb_seq["GBSeq_definition"]

    # The first reference that provides a field wins; later ones are ignored.
    for ref in gb_seq.get("GBSeq_references") or []:
        if "GBReference_authors" in ref and outputs["authors"] == "unknown":
            outputs["authors"] = "and ".join(ref["GBReference_authors"])
        if "GBReference_title" in ref and outputs["title"] == "unknown":
            outputs["title"] = ref["GBReference_title"]
        # The reference's journal field is what gets reported as "institution".
        if "GBReference_journal" in ref and outputs["institution"] == "unknown":
            outputs["institution"] = ref["GBReference_journal"]

    if "GBSeq_comment" in gb_seq:
        outputs["seq_comment"] = gb_seq["GBSeq_comment"]

    # Only the first feature is inspected. An empty table or a qualifier
    # without a name/value no longer discards the fields gathered above.
    features = gb_seq.get("GBSeq_feature-table") or []
    if features:
        for qual in features[0].get("GBFeature_quals") or []:
            if qual.get("GBQualifier_name") == "isolate" and outputs["isolate"] == "unknown":
                outputs["isolate"] = qual.get("GBQualifier_value", "unknown")

    return outputs


def fetch_ncbi(accession_number):
    """Fetch basic metadata for an accession from the NCBI nucleotide database.

    Args:
        accession_number: Accession identifier; it is converted with ``str()``
            before being sent to ``Entrez.efetch``.

    Returns:
        A dict with the keys ``authors``, ``institution``, ``isolate``,
        ``definition``, ``title``, ``seq_comment`` and ``collection_date``.
        Any field that could not be determined is the string ``"unknown"``.
        On any error (network, parsing, unexpected record shape) a dict with
        every field set to ``"unknown"`` is returned instead of raising.
    """
    try:
        Entrez.email = "[email protected]" # Required by NCBI, REPLACE WITH YOUR EMAIL
        handle = Entrez.efetch(db="nucleotide", id=str(accession_number), rettype="gb", retmode="xml")
        try:
            record = Entrez.read(handle)
        finally:
            # Close the handle even when parsing fails.
            handle.close()

        # Validate record structure: a non-empty list whose first item is a dict.
        if not (isinstance(record, list) and len(record) > 0):
            print(f"Warning: No valid record or empty record list from NCBI for {accession_number}.")
            return _default_ncbi_metadata()
        if not isinstance(record[0], dict):
            print(f"Warning: record[0] is not a dictionary for {accession_number}. Type: {type(record[0])}")
            return _default_ncbi_metadata()

        return _parse_genbank_record(record[0])
    except Exception:
        # Best-effort lookup: callers rely on always getting a dict back.
        print("error in fetching ncbi data")
        return _default_ncbi_metadata()
|
79 |
-
# Fallback if NCBI crashed or the accession cannot be found on NCBI
|
80 |
-
def google_accession_search(accession_id):
    """
    Collect candidate links for an accession ID via Google Custom Search.

    The bare accession is searched first, then the accession restricted to
    several biology-related sites, then the accession combined with mtDNA
    terms. Every query goes through ``mtdna_classifier.search_google_custom``
    with ``2`` as its second argument (presumably the result limit - confirm
    against that helper).

    Returns the links in first-seen order with duplicates removed.
    """
    query_suffixes = (
        "",
        " site:ncbi.nlm.nih.gov",
        " site:pubmed.ncbi.nlm.nih.gov",
        " site:europepmc.org",
        " site:researchgate.net",
        " mtDNA",
        " mitochondrial DNA",
    )

    collected = []
    for suffix in query_suffixes:
        hits = mtdna_classifier.search_google_custom(f"{accession_id}{suffix}", 2)
        for hit in hits:
            if hit in collected:
                continue
            collected.append(hit)
    return collected
|
102 |
-
|
103 |
-
# Method 1: Smarter Google
|
104 |
-
def _before_first_comma(value):
    """Return the text before the first comma, or *value* itself if it cannot be split."""
    try:
        return value.split(',')[0]
    except (AttributeError, TypeError):
        return value


def smart_google_queries(metadata: dict):
    """Build Google search queries from NCBI metadata fields.

    Args:
        metadata: Mapping that may contain ``isolate``, ``authors``,
            ``institution`` and ``title``. Missing, empty or ``"unknown"``
            values are skipped. ``None`` is treated as an empty mapping.

    Returns:
        A list of query strings ordered isolate, author, institution, title.
        Each usable isolate, author and institution adds two queries. The title
        is added verbatim unless it is ``"Direct Submission"``.
    """
    metadata = metadata or {}
    queries = []

    isolate = metadata.get("isolate")
    author = metadata.get("authors")
    institution = metadata.get("institution")
    title = metadata.get("title")

    if isolate and isolate != "unknown":
        queries.append(f'"{isolate}" mitochondrial DNA')
        queries.append(f'"{isolate}" site:ncbi.nlm.nih.gov')

    if author and author != "unknown":
        # Text before the first comma; presumably the first author's last
        # name when authors are formatted "Last,F." - confirm upstream.
        author_name = _before_first_comma(author)
        queries.append(f'"{author_name}" mitochondrial DNA')
        queries.append(f'"{author_name}" mtDNA site:researchgate.net')

    if institution and institution != "unknown":
        # Only the first comma-separated part keeps the quoted phrase short.
        short_inst = _before_first_comma(institution)
        queries.append(f'"{short_inst}" mtDNA sequence')
        queries.append(f'"{short_inst}" isolate site:nature.com')

    # "Direct Submission" is skipped rather than searched.
    if title and title != "unknown" and title != "Direct Submission":
        queries.append(title)

    return queries
|
137 |
-
|
138 |
-
def filter_links_by_metadata(search_results, saveLinkFolder, accession=None):
|
139 |
-
TRUSTED_DOMAINS = [
|
140 |
-
"ncbi.nlm.nih.gov",
|
141 |
-
"pubmed.ncbi.nlm.nih.gov",
|
142 |
-
"pmc.ncbi.nlm.nih.gov",
|
143 |
-
"biorxiv.org",
|
144 |
-
"researchgate.net",
|
145 |
-
"nature.com",
|
146 |
-
"sciencedirect.com"
|
147 |
-
]
|
148 |
-
def is_trusted_link(link):
|
149 |
-
for domain in TRUSTED_DOMAINS:
|
150 |
-
if domain in link:
|
151 |
-
return True
|
152 |
-
return False
|
153 |
-
def is_relevant_title_snippet(link, saveLinkFolder, accession=None):
|
154 |
-
output = []
|
155 |
-
keywords = ["mtDNA", "mitochondrial", "accession", "isolate", "Homo sapiens", "sequence"]
|
156 |
-
if accession:
|
157 |
-
keywords = [accession] + keywords
|
158 |
-
title_snippet = link.lower()
|
159 |
-
print("save link folder inside this filter function: ", saveLinkFolder)
|
160 |
-
article_text = data_preprocess.extract_text(link,saveLinkFolder)
|
161 |
-
print("article text")
|
162 |
-
print(article_text)
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
|
|
|
|
|
|
|
|
205 |
# the total information from even ncbi and all search
|
|
|
1 |
+
from Bio import Entrez, Medline
|
2 |
+
#import model
|
3 |
+
import mtdna_classifier
|
4 |
+
from NER.html import extractHTML
|
5 |
+
import data_preprocess
|
6 |
+
# Setup
|
7 |
+
def _default_ncbi_metadata():
    """Return a fresh metadata dict with every field set to "unknown"."""
    return {
        "authors": "unknown",
        "institution": "unknown",
        "isolate": "unknown",
        "definition": "unknown",
        "title": "unknown",
        "seq_comment": "unknown",
        "collection_date": "unknown",
    }


def _parse_genbank_record(gb_seq):
    """Copy the fields of interest out of one parsed GBSeq dict."""
    outputs = _default_ncbi_metadata()

    # NOTE: despite the key name, this stores the record's create date or,
    # failing that, its update date (e.g. '25-OCT-2023').
    for date_key in ("GBSeq_create-date", "GBSeq_update-date"):
        if date_key in gb_seq:
            outputs["collection_date"] = gb_seq[date_key]
            break

    if "GBSeq_definition" in gb_seq:
        outputs["definition"] = gb_seq["GBSeq_definition"]

    # The first reference that provides a field wins; later ones are ignored.
    for ref in gb_seq.get("GBSeq_references") or []:
        if "GBReference_authors" in ref and outputs["authors"] == "unknown":
            outputs["authors"] = "and ".join(ref["GBReference_authors"])
        if "GBReference_title" in ref and outputs["title"] == "unknown":
            outputs["title"] = ref["GBReference_title"]
        # The reference's journal field is what gets reported as "institution".
        if "GBReference_journal" in ref and outputs["institution"] == "unknown":
            outputs["institution"] = ref["GBReference_journal"]

    if "GBSeq_comment" in gb_seq:
        outputs["seq_comment"] = gb_seq["GBSeq_comment"]

    # Only the first feature is inspected. An empty table or a qualifier
    # without a name/value no longer discards the fields gathered above.
    features = gb_seq.get("GBSeq_feature-table") or []
    if features:
        for qual in features[0].get("GBFeature_quals") or []:
            if qual.get("GBQualifier_name") == "isolate" and outputs["isolate"] == "unknown":
                outputs["isolate"] = qual.get("GBQualifier_value", "unknown")

    return outputs


def fetch_ncbi(accession_number):
    """Fetch basic metadata for an accession from the NCBI nucleotide database.

    Args:
        accession_number: Accession identifier; it is converted with ``str()``
            before being sent to ``Entrez.efetch``.

    Returns:
        A dict with the keys ``authors``, ``institution``, ``isolate``,
        ``definition``, ``title``, ``seq_comment`` and ``collection_date``.
        Any field that could not be determined is the string ``"unknown"``.
        On any error (network, parsing, unexpected record shape) a dict with
        every field set to ``"unknown"`` is returned instead of raising.
    """
    try:
        Entrez.email = "[email protected]" # Required by NCBI, REPLACE WITH YOUR EMAIL
        handle = Entrez.efetch(db="nucleotide", id=str(accession_number), rettype="gb", retmode="xml")
        try:
            record = Entrez.read(handle)
        finally:
            # Close the handle even when parsing fails.
            handle.close()

        # Validate record structure: a non-empty list whose first item is a dict.
        if not (isinstance(record, list) and len(record) > 0):
            print(f"Warning: No valid record or empty record list from NCBI for {accession_number}.")
            return _default_ncbi_metadata()
        if not isinstance(record[0], dict):
            print(f"Warning: record[0] is not a dictionary for {accession_number}. Type: {type(record[0])}")
            return _default_ncbi_metadata()

        return _parse_genbank_record(record[0])
    except Exception:
        # Best-effort lookup: callers rely on always getting a dict back.
        print("error in fetching ncbi data")
        return _default_ncbi_metadata()
|
79 |
+
# Fallback if NCBI crashed or the accession cannot be found on NCBI
|
80 |
+
def google_accession_search(accession_id):
    """
    Collect candidate links for an accession ID via Google Custom Search.

    The bare accession is searched first, then the accession restricted to
    several biology-related sites, then the accession combined with mtDNA
    terms. Every query goes through ``mtdna_classifier.search_google_custom``
    with ``2`` as its second argument (presumably the result limit - confirm
    against that helper).

    Returns the links in first-seen order with duplicates removed.
    """
    query_suffixes = (
        "",
        " site:ncbi.nlm.nih.gov",
        " site:pubmed.ncbi.nlm.nih.gov",
        " site:europepmc.org",
        " site:researchgate.net",
        " mtDNA",
        " mitochondrial DNA",
    )

    collected = []
    for suffix in query_suffixes:
        hits = mtdna_classifier.search_google_custom(f"{accession_id}{suffix}", 2)
        for hit in hits:
            if hit in collected:
                continue
            collected.append(hit)
    return collected
|
102 |
+
|
103 |
+
# Method 1: Smarter Google
|
104 |
+
def _before_first_comma(value):
    """Return the text before the first comma, or *value* itself if it cannot be split."""
    try:
        return value.split(',')[0]
    except (AttributeError, TypeError):
        return value


def smart_google_queries(metadata: dict):
    """Build Google search queries from NCBI metadata fields.

    Args:
        metadata: Mapping that may contain ``isolate``, ``authors``,
            ``institution`` and ``title``. Missing, empty or ``"unknown"``
            values are skipped. ``None`` is treated as an empty mapping.

    Returns:
        A list of query strings ordered isolate, author, institution, title.
        Each usable isolate, author and institution adds two queries. The title
        is added verbatim unless it is ``"Direct Submission"``.
    """
    metadata = metadata or {}
    queries = []

    isolate = metadata.get("isolate")
    author = metadata.get("authors")
    institution = metadata.get("institution")
    title = metadata.get("title")

    if isolate and isolate != "unknown":
        queries.append(f'"{isolate}" mitochondrial DNA')
        queries.append(f'"{isolate}" site:ncbi.nlm.nih.gov')

    if author and author != "unknown":
        # Text before the first comma; presumably the first author's last
        # name when authors are formatted "Last,F." - confirm upstream.
        author_name = _before_first_comma(author)
        queries.append(f'"{author_name}" mitochondrial DNA')
        queries.append(f'"{author_name}" mtDNA site:researchgate.net')

    if institution and institution != "unknown":
        # Only the first comma-separated part keeps the quoted phrase short.
        short_inst = _before_first_comma(institution)
        queries.append(f'"{short_inst}" mtDNA sequence')
        queries.append(f'"{short_inst}" isolate site:nature.com')

    # "Direct Submission" is skipped rather than searched.
    if title and title != "unknown" and title != "Direct Submission":
        queries.append(title)

    return queries
|
137 |
+
|
138 |
+
def filter_links_by_metadata(search_results, saveLinkFolder, accession=None):
    """Keep only the search-result links that look relevant.

    A link whose host is one of the trusted domains (or a subdomain of one) is
    kept as-is. Any other link is downloaded via
    ``data_preprocess.extract_text`` and kept only if the accession or one of
    the generic keywords occurs in its text or in the URL itself. For
    non-document links, supplementary-material links reported by
    ``extractHTML.HTML(...).getSupMaterial()`` are also collected.

    Args:
        search_results: Iterable of link strings; ``None`` or empty gives ``[]``.
        saveLinkFolder: Folder passed through to ``data_preprocess.extract_text``.
        accession: Optional accession ID, checked before the generic keywords.

    Returns:
        The retained links in first-seen order, without duplicates.
    """
    from urllib.parse import urlparse

    TRUSTED_DOMAINS = [
        "ncbi.nlm.nih.gov",
        "pubmed.ncbi.nlm.nih.gov",
        "pmc.ncbi.nlm.nih.gov",
        "biorxiv.org",
        "researchgate.net",
        "nature.com",
        "sciencedirect.com"
    ]
    DOCUMENT_EXTENSIONS = ("pdf", "docx", "xlsx")

    def parse_link(link):
        # urlparse only fills in the host when a scheme or a leading "//" is
        # present, so scheme-less links are re-parsed as network locations.
        parsed = urlparse(link)
        if not parsed.netloc:
            parsed = urlparse("//" + link)
        return parsed

    def is_trusted_link(link):
        # Compare the host, not the whole URL: a substring test would also
        # accept "evil.example/?ref=nature.com" or "notnature.com".
        try:
            host = (parse_link(link).hostname or "").lower()
        except ValueError:
            return False
        return any(host == domain or host.endswith("." + domain) for domain in TRUSTED_DOMAINS)

    def is_relevant_title_snippet(link, saveLinkFolder, accession=None):
        output = []
        keywords = ["mtDNA", "mitochondrial", "accession", "isolate", "Homo sapiens", "sequence"]
        if accession:
            keywords = [accession] + keywords
        title_snippet = link.lower()
        print("save link folder inside this filter function: ", saveLinkFolder)
        try:
            article_text = data_preprocess.extract_text(link, saveLinkFolder)
        except Exception as exc:
            # One unreachable link must not abort filtering of the others.
            print(f"Warning: could not extract text from {link}: {exc}")
            article_text = ""
        if not isinstance(article_text, str):
            article_text = ""
        print("article text")
        print(article_text)

        # Best-effort: collect supplementary-material links for pages that
        # are not direct document downloads.
        try:
            ext = parse_link(link).path.rsplit(".", 1)[-1].lower()
            if ext not in DOCUMENT_EXTENSIONS:
                html = extractHTML.HTML("", link)
                jsonSM = html.getSupMaterial()
                if jsonSM:
                    output += sum((jsonSM[key] for key in jsonSM), [])
        except Exception as exc:
            print(f"Warning: could not collect supplementary material for {link}: {exc}")

        # The first keyword found in the article text or the URL makes the
        # link relevant; further keywords need not be checked.
        article_lower = article_text.lower()
        for keyword in keywords:
            needle = keyword.lower()
            if needle in article_lower or needle in title_snippet:
                if link not in output:
                    output.append(link)
                print("link and keyword: ", link, keyword)
                break
        return output

    filtered = []
    for link in search_results or []:
        if is_trusted_link(link):
            candidates = [link]
        else:
            candidates = is_relevant_title_snippet(link, saveLinkFolder, accession)
        for candidate in candidates:
            if candidate not in filtered:
                filtered.append(candidate)
    return filtered
|
195 |
+
|
196 |
+
def smart_google_search(metadata):
    """Run every metadata-derived query and return the distinct links found.

    Queries come from ``smart_google_queries(metadata)``; each is sent to
    ``mtdna_classifier.search_google_custom`` with ``2`` as its second
    argument. Links are returned in first-seen order without duplicates.
    No relevance filtering is applied in this function.
    """
    unique_links = []
    for query in smart_google_queries(metadata):
        for hit in mtdna_classifier.search_google_custom(query, 2):
            if hit in unique_links:
                continue
            unique_links.append(hit)
    return unique_links
|
208 |
+
# Method 2: Prompt LLM better or better ai search api with all
|
209 |
# the total information from even ncbi and all search
|