Spaces:
Running
Running
Update smart_fallback.py
Browse files- smart_fallback.py +341 -341
smart_fallback.py
CHANGED
@@ -1,342 +1,342 @@
|
|
1 |
-
from Bio import Entrez, Medline
|
2 |
-
#import model
|
3 |
-
import mtdna_classifier
|
4 |
-
from NER.html import extractHTML
|
5 |
-
import data_preprocess
|
6 |
-
import pipeline
|
7 |
-
# Setup
|
8 |
-
def _default_ncbi_metadata():
    """Return a fresh metadata dict with every field set to "unknown"."""
    return {
        "authors": "unknown",
        "institution": "unknown",
        "isolate": "unknown",
        "definition": "unknown",
        "title": "unknown",
        "seq_comment": "unknown",
        "collection_date": "unknown",
    }


def _parse_gbseq(gb_seq):
    """Copy the fields of interest out of one parsed GBSeq record (a dict)."""
    outputs = _default_ncbi_metadata()

    # Collection date: the create date wins, the update date is the fallback
    # (e.g. 'GBSeq_update-date': '25-OCT-2023').
    for date_key in ("GBSeq_create-date", "GBSeq_update-date"):
        if date_key in gb_seq:
            outputs["collection_date"] = gb_seq[date_key]
            break

    if "GBSeq_definition" in gb_seq:
        outputs["definition"] = gb_seq["GBSeq_definition"]

    # Reference-related fields: the first reference that provides a field wins.
    for ref in gb_seq.get("GBSeq_references", []):
        if "GBReference_authors" in ref and outputs["authors"] == "unknown":
            # NOTE(review): the separator has no leading space, so names run
            # together ("Smith,J.and Doe,A."). Kept as-is because callers may
            # already depend on this exact format.
            outputs["authors"] = "and ".join(ref["GBReference_authors"])
        if "GBReference_title" in ref and outputs["title"] == "unknown":
            outputs["title"] = ref["GBReference_title"]
        # The journal line of the reference is stored as the "institution".
        if "GBReference_journal" in ref and outputs["institution"] == "unknown":
            outputs["institution"] = ref["GBReference_journal"]

    if "GBSeq_comment" in gb_seq:
        outputs["seq_comment"] = gb_seq["GBSeq_comment"]

    # Isolate: only the first feature of the feature table is inspected.
    features = gb_seq.get("GBSeq_feature-table") or []
    if features and "GBFeature_quals" in features[0]:
        for qual in features[0]["GBFeature_quals"]:
            # .get() so that a qualifier without a name/value does not throw
            # away everything that was already extracted above.
            if (qual.get("GBQualifier_name") == "isolate"
                    and "GBQualifier_value" in qual
                    and outputs["isolate"] == "unknown"):
                outputs["isolate"] = qual["GBQualifier_value"]

    return outputs


def fetch_ncbi(accession_number):
    """Fetch GenBank metadata for an accession from the NCBI nucleotide database.

    Args:
        accession_number: accession identifier; it is converted with ``str()``
            before being sent to ``Entrez.efetch``.

    Returns:
        dict with the keys "authors", "institution", "isolate", "definition",
        "title", "seq_comment" and "collection_date". Every field that could
        not be determined is the string "unknown"; if the request or the
        parsing fails, all fields are "unknown".

    This function is best-effort: it never raises for ordinary errors, it
    prints a message and returns the defaults instead.
    """
    try:
        Entrez.email = "[email protected]"  # Required by NCBI, REPLACE WITH YOUR EMAIL
        handle = Entrez.efetch(db="nucleotide", id=str(accession_number), rettype="gb", retmode="xml")
        try:
            record = Entrez.read(handle)
        finally:
            # Close the handle even when parsing fails.
            handle.close()

        # Validate record structure: it should be a list with at least one
        # element, and that element should be a dict.
        if isinstance(record, list) and len(record) > 0:
            if isinstance(record[0], dict):
                return _parse_gbseq(record[0])
            print(f"Warning: record[0] is not a dictionary for {accession_number}. Type: {type(record[0])}")
        else:
            print(f"Warning: No valid record or empty record list from NCBI for {accession_number}.")
        return _default_ncbi_metadata()
    except Exception as exc:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        print(f"error in fetching ncbi data: {exc}")
        return _default_ncbi_metadata()
|
80 |
-
# Fallback if NCBI crashed or the accession cannot be found on NCBI
|
81 |
-
def google_accession_search(accession_id):
    """
    Look up an accession ID with Google Custom Search.

    The bare ID is searched first, then the ID restricted to a few
    biological databases/archives, then the ID combined with mtDNA terms.
    Two results are requested per query; the returned list holds every
    distinct link in the order it was first seen.
    """
    query_suffixes = (
        "",
        " site:ncbi.nlm.nih.gov",
        " site:pubmed.ncbi.nlm.nih.gov",
        " site:europepmc.org",
        " site:researchgate.net",
        " mtDNA",
        " mitochondrial DNA",
    )

    unique_links = []
    for suffix in query_suffixes:
        hits = mtdna_classifier.search_google_custom(f"{accession_id}{suffix}", 2)
        for hit in hits:
            if hit in unique_links:
                continue  # already collected from an earlier query
            unique_links.append(hit)
    return unique_links
|
103 |
-
|
104 |
-
# Method 1: Smarter Google
|
105 |
-
def smart_google_queries(metadata: dict):
    """Build Google query strings from NCBI metadata.

    Args:
        metadata: dict that may contain "isolate", "authors", "institution"
            and "title". Missing, empty, "unknown" and "Unpublished" values
            are ignored.

    Returns:
        list of query strings in a fixed order: isolate queries, author
        queries, institution query, then the title itself (unless the title
        is "Direct Submission").
    """

    def usable(value):
        # Placeholder values carry no information worth searching for.
        return bool(value) and value != "unknown" and value != "Unpublished"

    queries = []

    isolate = metadata.get("isolate")
    if usable(isolate):
        queries.append(f'"{isolate}" mitochondrial DNA')
        queries.append(f'"{isolate}" site:ncbi.nlm.nih.gov')

    author = metadata.get("authors")
    if usable(author):
        # Text before the first comma (the first author's last name);
        # a non-string value is used unchanged.
        author_name = author.split(",")[0] if isinstance(author, str) else author
        queries.append(f'"{author_name}" mitochondrial DNA')
        queries.append(f'"{author_name}" mtDNA site:researchgate.net')

    institution = metadata.get("institution")
    if usable(institution):
        # Keep only the first two comma-separated parts of the institution;
        # a non-string value is used unchanged.
        if isinstance(institution, str):
            short_inst = ",".join(institution.split(",")[:2])
        else:
            short_inst = institution
        queries.append(f'"{short_inst}" mtDNA sequence')

    title = metadata.get("title")
    # "Direct Submission" is skipped because it is not a searchable paper title.
    if usable(title) and title != "Direct Submission":
        queries.append(title)

    return queries
|
149 |
-
|
150 |
-
# def filter_links_by_metadata(search_results, saveLinkFolder, accession=None, stop_flag=None):
|
151 |
-
# TRUSTED_DOMAINS = [
|
152 |
-
# "ncbi.nlm.nih.gov",
|
153 |
-
# "pubmed.ncbi.nlm.nih.gov",
|
154 |
-
# "pmc.ncbi.nlm.nih.gov",
|
155 |
-
# "biorxiv.org",
|
156 |
-
# "researchgate.net",
|
157 |
-
# "nature.com",
|
158 |
-
# "sciencedirect.com"
|
159 |
-
# ]
|
160 |
-
# if stop_flag is not None and stop_flag.value:
|
161 |
-
# print(f"π Stop detected {accession}, aborting early...")
|
162 |
-
# return []
|
163 |
-
# def is_trusted_link(link):
|
164 |
-
# for domain in TRUSTED_DOMAINS:
|
165 |
-
# if domain in link:
|
166 |
-
# return True
|
167 |
-
# return False
|
168 |
-
# def is_relevant_title_snippet(link, saveLinkFolder, accession=None):
|
169 |
-
# output = []
|
170 |
-
# keywords = ["mtDNA", "mitochondrial", "accession", "isolate", "Homo sapiens", "sequence"]
|
171 |
-
# if accession:
|
172 |
-
# keywords = [accession] + keywords
|
173 |
-
# title_snippet = link.lower()
|
174 |
-
# print("save link folder inside this filter function: ", saveLinkFolder)
|
175 |
-
# success_process, output_process = pipeline.run_with_timeout(data_preprocess.extract_text,args=(link,saveLinkFolder),timeout=60)
|
176 |
-
# if stop_flag is not None and stop_flag.value:
|
177 |
-
# print(f"π Stop detected {accession}, aborting early...")
|
178 |
-
# return []
|
179 |
-
# if success_process:
|
180 |
-
# article_text = output_process
|
181 |
-
# print("yes succeed for getting article text")
|
182 |
-
# else:
|
183 |
-
# print("no suceed, fallback to no link")
|
184 |
-
# article_text = ""
|
185 |
-
# #article_text = data_preprocess.extract_text(link,saveLinkFolder)
|
186 |
-
# print("article text")
|
187 |
-
# #print(article_text)
|
188 |
-
# if stop_flag is not None and stop_flag.value:
|
189 |
-
# print(f"π Stop detected {accession}, aborting early...")
|
190 |
-
# return []
|
191 |
-
# try:
|
192 |
-
# ext = link.split(".")[-1].lower()
|
193 |
-
# if ext not in ["pdf", "docx", "xlsx"]:
|
194 |
-
# html = extractHTML.HTML("", link)
|
195 |
-
# if stop_flag is not None and stop_flag.value:
|
196 |
-
# print(f"π Stop detected {accession}, aborting early...")
|
197 |
-
# return []
|
198 |
-
# jsonSM = html.getSupMaterial()
|
199 |
-
# if jsonSM:
|
200 |
-
# output += sum((jsonSM[key] for key in jsonSM), [])
|
201 |
-
# except Exception:
|
202 |
-
# pass # continue silently
|
203 |
-
# for keyword in keywords:
|
204 |
-
# if keyword.lower() in article_text.lower():
|
205 |
-
# if link not in output:
|
206 |
-
# output.append([link,keyword.lower()])
|
207 |
-
# print("link and keyword for article text: ", link, keyword)
|
208 |
-
# return output
|
209 |
-
# if keyword.lower() in title_snippet.lower():
|
210 |
-
# if link not in output:
|
211 |
-
# output.append([link,keyword.lower()])
|
212 |
-
# print("link and keyword for title: ", link, keyword)
|
213 |
-
# return output
|
214 |
-
# return output
|
215 |
-
|
216 |
-
# filtered = []
|
217 |
-
# better_filter = []
|
218 |
-
# if len(search_results) > 0:
|
219 |
-
# for link in search_results:
|
220 |
-
# # if is_trusted_link(link):
|
221 |
-
# # if link not in filtered:
|
222 |
-
# # filtered.append(link)
|
223 |
-
# # else:
|
224 |
-
# print(link)
|
225 |
-
# if stop_flag is not None and stop_flag.value:
|
226 |
-
# print(f"π Stop detected {accession}, aborting early...")
|
227 |
-
# return []
|
228 |
-
# if link:
|
229 |
-
# output_link = is_relevant_title_snippet(link,saveLinkFolder, accession)
|
230 |
-
# print("output link: ")
|
231 |
-
# print(output_link)
|
232 |
-
# for out_link in output_link:
|
233 |
-
# if isinstance(out_link,list) and len(out_link) > 1:
|
234 |
-
# print(out_link)
|
235 |
-
# kw = out_link[1]
|
236 |
-
# print("kw and acc: ", kw, accession.lower())
|
237 |
-
# if accession and kw == accession.lower():
|
238 |
-
# better_filter.append(out_link[0])
|
239 |
-
# filtered.append(out_link[0])
|
240 |
-
# else: filtered.append(out_link)
|
241 |
-
# print("done with link and here is filter: ",filtered)
|
242 |
-
# if better_filter:
|
243 |
-
# filtered = better_filter
|
244 |
-
# return filtered
|
245 |
-
|
246 |
-
def filter_links_by_metadata(search_results, saveLinkFolder, accession=None):
    """Keep the search-result links that look relevant to the accession.

    For every non-empty link the article text is extracted with
    ``data_preprocess.extract_text`` and, for links that are not
    pdf/docx/xlsx, supplementary-material links are collected through
    ``extractHTML.HTML``. A link is kept when the accession or one of the
    keywords "mtDNA", "mitochondrial", "Homo sapiens" occurs in the article
    text or in the link itself (case-insensitive).

    Args:
        search_results: iterable of link strings; None or empty gives ``{}``.
        saveLinkFolder: folder passed through to ``extract_text``.
        accession: optional accession ID; it is checked before the keywords.

    Returns:
        dict mapping link -> article text ("" when the match came from the
        link itself, or for supplementary-material links). If at least one
        link matched the accession, only the accession matches are returned;
        otherwise all keyword matches and supplementary links are returned.
    """

    def is_relevant_title_snippet(link, saveLinkFolder, accession=None):
        """Return supplementary links plus at most one [link, keyword(, text)] hit."""
        output = []
        keywords = ["mtDNA", "mitochondrial", "Homo sapiens"]
        if accession:
            # The accession goes first so it wins over the generic keywords.
            keywords = [accession] + keywords
        title_snippet = link.lower()
        article_text = data_preprocess.extract_text(link, saveLinkFolder)
        print("article text done")
        try:
            ext = link.split(".")[-1].lower()
            if ext not in ["pdf", "docx", "xlsx"]:
                html = extractHTML.HTML("", link)
                jsonSM = html.getSupMaterial()
                if jsonSM:
                    output += sum((jsonSM[key] for key in jsonSM), [])
        except Exception as exc:
            # Supplementary material is best-effort; report and carry on.
            print(f"Warning: could not collect supplementary material for {link}: {exc}")

        # Lower-case the article once instead of once per keyword.
        article_lower = article_text.lower() if article_text else ""
        for keyword in keywords:
            needle = keyword.lower()
            if article_text and needle in article_lower:
                if link not in output:
                    output.append([link, needle, article_text])
                return output
            if needle in title_snippet:
                if link not in output:
                    output.append([link, needle])
                print("link and keyword for title: ", link, keyword)
                return output
        return output

    filtered = {}
    better_filter = {}
    if not search_results:
        return filtered

    print(search_results)
    for link in search_results:
        print(link)
        if not link:
            continue
        output_link = is_relevant_title_snippet(link, saveLinkFolder, accession)
        print("output link: ")
        print(output_link)
        for out_link in output_link:
            if isinstance(out_link, list) and len(out_link) > 1:
                print(out_link)
                if len(out_link) not in (2, 3):
                    continue
                kw = out_link[1]
                # A 3-item hit carries the article text; a 2-item hit does not.
                article = out_link[2] if len(out_link) == 3 else ""
                if accession and kw == accession.lower():
                    better_filter[out_link[0]] = article
                # Every keyword hit goes into the general result. Writing it
                # to better_filter (as before) made every hit count as an
                # accession match and dropped the supplementary links.
                filtered[out_link[0]] = article
            else:
                # Plain entries are supplementary-material links.
                filtered[out_link] = ""
        # Print only the links, not the stored article texts.
        print("done with link and here is filter: ", list(filtered))

    if better_filter:
        filtered = better_filter
    return filtered
|
328 |
-
|
329 |
-
def smart_google_search(metadata):
    """Run every query built by ``smart_google_queries`` and merge the hits.

    Two results are requested per query; the returned list holds each
    distinct link once, in the order it was first seen.
    """
    unique_links = []
    for query in smart_google_queries(metadata):
        for hit in mtdna_classifier.search_google_custom(query, 2):
            if hit in unique_links:
                continue  # already returned by an earlier query
            unique_links.append(hit)
    return unique_links
|
341 |
-
# Method 2: Prompt LLM better or better ai search api with all
|
342 |
# the total information from even ncbi and all search
|
|
|
1 |
+
from Bio import Entrez, Medline
|
2 |
+
#import model
|
3 |
+
import mtdna_classifier
|
4 |
+
from NER.html import extractHTML
|
5 |
+
import data_preprocess
|
6 |
+
import pipeline
|
7 |
+
# Setup
|
8 |
+
def _default_ncbi_metadata():
    """Return a fresh metadata dict with every field set to "unknown"."""
    return {
        "authors": "unknown",
        "institution": "unknown",
        "isolate": "unknown",
        "definition": "unknown",
        "title": "unknown",
        "seq_comment": "unknown",
        "collection_date": "unknown",
    }


def _parse_gbseq(gb_seq):
    """Copy the fields of interest out of one parsed GBSeq record (a dict)."""
    outputs = _default_ncbi_metadata()

    # Collection date: the create date wins, the update date is the fallback
    # (e.g. 'GBSeq_update-date': '25-OCT-2023').
    for date_key in ("GBSeq_create-date", "GBSeq_update-date"):
        if date_key in gb_seq:
            outputs["collection_date"] = gb_seq[date_key]
            break

    if "GBSeq_definition" in gb_seq:
        outputs["definition"] = gb_seq["GBSeq_definition"]

    # Reference-related fields: the first reference that provides a field wins.
    for ref in gb_seq.get("GBSeq_references", []):
        if "GBReference_authors" in ref and outputs["authors"] == "unknown":
            # NOTE(review): the separator has no leading space, so names run
            # together ("Smith,J.and Doe,A."). Kept as-is because callers may
            # already depend on this exact format.
            outputs["authors"] = "and ".join(ref["GBReference_authors"])
        if "GBReference_title" in ref and outputs["title"] == "unknown":
            outputs["title"] = ref["GBReference_title"]
        # The journal line of the reference is stored as the "institution".
        if "GBReference_journal" in ref and outputs["institution"] == "unknown":
            outputs["institution"] = ref["GBReference_journal"]

    if "GBSeq_comment" in gb_seq:
        outputs["seq_comment"] = gb_seq["GBSeq_comment"]

    # Isolate: only the first feature of the feature table is inspected.
    features = gb_seq.get("GBSeq_feature-table") or []
    if features and "GBFeature_quals" in features[0]:
        for qual in features[0]["GBFeature_quals"]:
            # .get() so that a qualifier without a name/value does not throw
            # away everything that was already extracted above.
            if (qual.get("GBQualifier_name") == "isolate"
                    and "GBQualifier_value" in qual
                    and outputs["isolate"] == "unknown"):
                outputs["isolate"] = qual["GBQualifier_value"]

    return outputs


def fetch_ncbi(accession_number):
    """Fetch GenBank metadata for an accession from the NCBI nucleotide database.

    Args:
        accession_number: accession identifier; it is converted with ``str()``
            before being sent to ``Entrez.efetch``.

    Returns:
        dict with the keys "authors", "institution", "isolate", "definition",
        "title", "seq_comment" and "collection_date". Every field that could
        not be determined is the string "unknown"; if the request or the
        parsing fails, all fields are "unknown".

    This function is best-effort: it never raises for ordinary errors, it
    prints a message and returns the defaults instead.
    """
    try:
        Entrez.email = "[email protected]"  # Required by NCBI, REPLACE WITH YOUR EMAIL
        handle = Entrez.efetch(db="nucleotide", id=str(accession_number), rettype="gb", retmode="xml")
        try:
            record = Entrez.read(handle)
        finally:
            # Close the handle even when parsing fails.
            handle.close()

        # Validate record structure: it should be a list with at least one
        # element, and that element should be a dict.
        if isinstance(record, list) and len(record) > 0:
            if isinstance(record[0], dict):
                return _parse_gbseq(record[0])
            print(f"Warning: record[0] is not a dictionary for {accession_number}. Type: {type(record[0])}")
        else:
            print(f"Warning: No valid record or empty record list from NCBI for {accession_number}.")
        return _default_ncbi_metadata()
    except Exception as exc:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        print(f"error in fetching ncbi data: {exc}")
        return _default_ncbi_metadata()
|
80 |
+
# Fallback if NCBI crashed or the accession cannot be found on NCBI
|
81 |
+
def google_accession_search(accession_id):
    """
    Look up an accession ID with Google Custom Search.

    The bare ID is searched first, then the ID restricted to a few
    biological databases/archives, then the ID combined with mtDNA terms.
    Two results are requested per query; the returned list holds every
    distinct link in the order it was first seen.
    """
    query_suffixes = (
        "",
        " site:ncbi.nlm.nih.gov",
        " site:pubmed.ncbi.nlm.nih.gov",
        " site:europepmc.org",
        " site:researchgate.net",
        " mtDNA",
        " mitochondrial DNA",
    )

    unique_links = []
    for suffix in query_suffixes:
        hits = mtdna_classifier.search_google_custom(f"{accession_id}{suffix}", 2)
        for hit in hits:
            if hit in unique_links:
                continue  # already collected from an earlier query
            unique_links.append(hit)
    return unique_links
|
103 |
+
|
104 |
+
# Method 1: Smarter Google
|
105 |
+
def smart_google_queries(metadata: dict):
    """Build Google query strings from NCBI metadata.

    Args:
        metadata: dict that may contain "isolate", "authors", "institution"
            and "title". Missing, empty, "unknown" and "Unpublished" values
            are ignored.

    Returns:
        list of query strings in a fixed order: isolate queries, author
        queries, institution query, then the title itself (unless the title
        is "Direct Submission").
    """

    def usable(value):
        # Placeholder values carry no information worth searching for.
        return bool(value) and value != "unknown" and value != "Unpublished"

    queries = []

    isolate = metadata.get("isolate")
    if usable(isolate):
        queries.append(f'"{isolate}" mitochondrial DNA')
        queries.append(f'"{isolate}" site:ncbi.nlm.nih.gov')

    author = metadata.get("authors")
    if usable(author):
        # Text before the first comma (the first author's last name);
        # a non-string value is used unchanged.
        author_name = author.split(",")[0] if isinstance(author, str) else author
        queries.append(f'"{author_name}" mitochondrial DNA')
        queries.append(f'"{author_name}" mtDNA site:researchgate.net')

    institution = metadata.get("institution")
    if usable(institution):
        # Keep only the first two comma-separated parts of the institution;
        # a non-string value is used unchanged.
        if isinstance(institution, str):
            short_inst = ",".join(institution.split(",")[:2])
        else:
            short_inst = institution
        queries.append(f'"{short_inst}" mtDNA sequence')

    title = metadata.get("title")
    # "Direct Submission" is skipped because it is not a searchable paper title.
    if usable(title) and title != "Direct Submission":
        queries.append(title)

    return queries
|
149 |
+
|
150 |
+
# def filter_links_by_metadata(search_results, saveLinkFolder, accession=None, stop_flag=None):
|
151 |
+
# TRUSTED_DOMAINS = [
|
152 |
+
# "ncbi.nlm.nih.gov",
|
153 |
+
# "pubmed.ncbi.nlm.nih.gov",
|
154 |
+
# "pmc.ncbi.nlm.nih.gov",
|
155 |
+
# "biorxiv.org",
|
156 |
+
# "researchgate.net",
|
157 |
+
# "nature.com",
|
158 |
+
# "sciencedirect.com"
|
159 |
+
# ]
|
160 |
+
# if stop_flag is not None and stop_flag.value:
|
161 |
+
# print(f"π Stop detected {accession}, aborting early...")
|
162 |
+
# return []
|
163 |
+
# def is_trusted_link(link):
|
164 |
+
# for domain in TRUSTED_DOMAINS:
|
165 |
+
# if domain in link:
|
166 |
+
# return True
|
167 |
+
# return False
|
168 |
+
# def is_relevant_title_snippet(link, saveLinkFolder, accession=None):
|
169 |
+
# output = []
|
170 |
+
# keywords = ["mtDNA", "mitochondrial", "accession", "isolate", "Homo sapiens", "sequence"]
|
171 |
+
# if accession:
|
172 |
+
# keywords = [accession] + keywords
|
173 |
+
# title_snippet = link.lower()
|
174 |
+
# print("save link folder inside this filter function: ", saveLinkFolder)
|
175 |
+
# success_process, output_process = pipeline.run_with_timeout(data_preprocess.extract_text,args=(link,saveLinkFolder),timeout=60)
|
176 |
+
# if stop_flag is not None and stop_flag.value:
|
177 |
+
# print(f"π Stop detected {accession}, aborting early...")
|
178 |
+
# return []
|
179 |
+
# if success_process:
|
180 |
+
# article_text = output_process
|
181 |
+
# print("yes succeed for getting article text")
|
182 |
+
# else:
|
183 |
+
# print("no suceed, fallback to no link")
|
184 |
+
# article_text = ""
|
185 |
+
# #article_text = data_preprocess.extract_text(link,saveLinkFolder)
|
186 |
+
# print("article text")
|
187 |
+
# #print(article_text)
|
188 |
+
# if stop_flag is not None and stop_flag.value:
|
189 |
+
# print(f"π Stop detected {accession}, aborting early...")
|
190 |
+
# return []
|
191 |
+
# try:
|
192 |
+
# ext = link.split(".")[-1].lower()
|
193 |
+
# if ext not in ["pdf", "docx", "xlsx"]:
|
194 |
+
# html = extractHTML.HTML("", link)
|
195 |
+
# if stop_flag is not None and stop_flag.value:
|
196 |
+
# print(f"π Stop detected {accession}, aborting early...")
|
197 |
+
# return []
|
198 |
+
# jsonSM = html.getSupMaterial()
|
199 |
+
# if jsonSM:
|
200 |
+
# output += sum((jsonSM[key] for key in jsonSM), [])
|
201 |
+
# except Exception:
|
202 |
+
# pass # continue silently
|
203 |
+
# for keyword in keywords:
|
204 |
+
# if keyword.lower() in article_text.lower():
|
205 |
+
# if link not in output:
|
206 |
+
# output.append([link,keyword.lower()])
|
207 |
+
# print("link and keyword for article text: ", link, keyword)
|
208 |
+
# return output
|
209 |
+
# if keyword.lower() in title_snippet.lower():
|
210 |
+
# if link not in output:
|
211 |
+
# output.append([link,keyword.lower()])
|
212 |
+
# print("link and keyword for title: ", link, keyword)
|
213 |
+
# return output
|
214 |
+
# return output
|
215 |
+
|
216 |
+
# filtered = []
|
217 |
+
# better_filter = []
|
218 |
+
# if len(search_results) > 0:
|
219 |
+
# for link in search_results:
|
220 |
+
# # if is_trusted_link(link):
|
221 |
+
# # if link not in filtered:
|
222 |
+
# # filtered.append(link)
|
223 |
+
# # else:
|
224 |
+
# print(link)
|
225 |
+
# if stop_flag is not None and stop_flag.value:
|
226 |
+
# print(f"π Stop detected {accession}, aborting early...")
|
227 |
+
# return []
|
228 |
+
# if link:
|
229 |
+
# output_link = is_relevant_title_snippet(link,saveLinkFolder, accession)
|
230 |
+
# print("output link: ")
|
231 |
+
# print(output_link)
|
232 |
+
# for out_link in output_link:
|
233 |
+
# if isinstance(out_link,list) and len(out_link) > 1:
|
234 |
+
# print(out_link)
|
235 |
+
# kw = out_link[1]
|
236 |
+
# print("kw and acc: ", kw, accession.lower())
|
237 |
+
# if accession and kw == accession.lower():
|
238 |
+
# better_filter.append(out_link[0])
|
239 |
+
# filtered.append(out_link[0])
|
240 |
+
# else: filtered.append(out_link)
|
241 |
+
# print("done with link and here is filter: ",filtered)
|
242 |
+
# if better_filter:
|
243 |
+
# filtered = better_filter
|
244 |
+
# return filtered
|
245 |
+
|
246 |
+
def filter_links_by_metadata(search_results, saveLinkFolder, accession=None):
    """Keep the search-result links that look relevant to the accession.

    For every non-empty link the article text is extracted with
    ``data_preprocess.extract_text`` and, for links that are not
    pdf/docx/xlsx, supplementary-material links are collected through
    ``extractHTML.HTML``. A link is kept when the accession or one of the
    keywords "mtDNA", "mitochondrial", "Homo sapiens" occurs in the article
    text or in the link itself (case-insensitive).

    Args:
        search_results: iterable of link strings; None or empty gives ``{}``.
        saveLinkFolder: folder passed through to ``extract_text``.
        accession: optional accession ID; it is checked before the keywords.

    Returns:
        dict mapping link -> article text ("" when the match came from the
        link itself, or for supplementary-material links). If at least one
        link matched the accession, only the accession matches are returned;
        otherwise all keyword matches and supplementary links are returned.
    """

    def is_relevant_title_snippet(link, saveLinkFolder, accession=None):
        """Return supplementary links plus at most one [link, keyword(, text)] hit."""
        output = []
        keywords = ["mtDNA", "mitochondrial", "Homo sapiens"]
        if accession:
            # The accession goes first so it wins over the generic keywords.
            keywords = [accession] + keywords
        title_snippet = link.lower()
        article_text = data_preprocess.extract_text(link, saveLinkFolder)
        print("article text done")
        try:
            ext = link.split(".")[-1].lower()
            if ext not in ["pdf", "docx", "xlsx"]:
                html = extractHTML.HTML("", link)
                jsonSM = html.getSupMaterial()
                if jsonSM:
                    output += sum((jsonSM[key] for key in jsonSM), [])
        except Exception as exc:
            # Supplementary material is best-effort; report and carry on.
            print(f"Warning: could not collect supplementary material for {link}: {exc}")

        # Lower-case the article once instead of once per keyword.
        article_lower = article_text.lower() if article_text else ""
        for keyword in keywords:
            needle = keyword.lower()
            if article_text and needle in article_lower:
                if link not in output:
                    output.append([link, needle, article_text])
                return output
            if needle in title_snippet:
                if link not in output:
                    output.append([link, needle])
                print("link and keyword for title: ", link, keyword)
                return output
        return output

    filtered = {}
    better_filter = {}
    if not search_results:
        return filtered

    print(search_results)
    for link in search_results:
        print(link)
        if not link:
            continue
        output_link = is_relevant_title_snippet(link, saveLinkFolder, accession)
        print("output link: ")
        print(output_link)
        for out_link in output_link:
            if isinstance(out_link, list) and len(out_link) > 1:
                print(out_link)
                if len(out_link) not in (2, 3):
                    continue
                kw = out_link[1]
                # A 3-item hit carries the article text; a 2-item hit does not.
                article = out_link[2] if len(out_link) == 3 else ""
                if accession and kw == accession.lower():
                    better_filter[out_link[0]] = article
                # Every keyword hit goes into the general result. Writing it
                # to better_filter (as before) made every hit count as an
                # accession match and dropped the supplementary links.
                filtered[out_link[0]] = article
            else:
                # Plain entries are supplementary-material links.
                filtered[out_link] = ""
        # Print only the links, not the stored article texts.
        print("done with link and here is filter: ", list(filtered))

    if better_filter:
        filtered = better_filter
    return filtered
|
328 |
+
|
329 |
+
def smart_google_search(metadata):
    """Run every query built by ``smart_google_queries`` and merge the hits.

    Two results are requested per query; the returned list holds each
    distinct link once, in the order it was first seen.
    """
    unique_links = []
    for query in smart_google_queries(metadata):
        for hit in mtdna_classifier.search_google_custom(query, 2):
            if hit in unique_links:
                continue  # already returned by an earlier query
            unique_links.append(hit)
    return unique_links
|
341 |
+
# Method 2: Prompt LLM better or better ai search api with all
|
342 |
# the total information from even ncbi and all search
|