VyLala committed
Commit ae45270 · verified · 1 Parent(s): 12e8e1b

Update smart_fallback.py

Files changed (1)
  1. smart_fallback.py +341 -341
smart_fallback.py CHANGED
@@ -1,342 +1,342 @@
from Bio import Entrez, Medline
#import model
import mtdna_classifier
from NER.html import extractHTML
import data_preprocess
import pipeline
# Setup
def fetch_ncbi(accession_number):
    try:
        Entrez.email = "[email protected]"  # Required by NCBI, REPLACE WITH YOUR EMAIL
        handle = Entrez.efetch(db="nucleotide", id=str(accession_number), rettype="gb", retmode="xml")
        record = Entrez.read(handle)
        handle.close()
        outputs = {"authors": "unknown",
                   "institution": "unknown",
                   "isolate": "unknown",
                   "definition": "unknown",
                   "title": "unknown",
                   "seq_comment": "unknown",
                   "collection_date": "unknown"}  # 'GBSeq_update-date': '25-OCT-2023', 'GBSeq_create-date'
        gb_seq = None
        # Validate record structure: it should be a list with at least one element (a dict)
        if isinstance(record, list) and len(record) > 0:
            if isinstance(record[0], dict):
                gb_seq = record[0]
            else:
                print(f"Warning: record[0] is not a dictionary for {accession_number}. Type: {type(record[0])}")
            # extract collection date
            if "GBSeq_create-date" in gb_seq and outputs["collection_date"] == "unknown":
                outputs["collection_date"] = gb_seq["GBSeq_create-date"]
            else:
                if "GBSeq_update-date" in gb_seq and outputs["collection_date"] == "unknown":
                    outputs["collection_date"] = gb_seq["GBSeq_update-date"]
            # extract definition
            if "GBSeq_definition" in gb_seq and outputs["definition"] == "unknown":
                outputs["definition"] = gb_seq["GBSeq_definition"]
            # extract related-reference things
            if "GBSeq_references" in gb_seq:
                for ref in gb_seq["GBSeq_references"]:
                    # extract authors
                    if "GBReference_authors" in ref and outputs["authors"] == "unknown":
                        outputs["authors"] = "and ".join(ref["GBReference_authors"])
                    # extract title
                    if "GBReference_title" in ref and outputs["title"] == "unknown":
                        outputs["title"] = ref["GBReference_title"]
                    # extract submitted journal
                    if 'GBReference_journal' in ref and outputs["institution"] == "unknown":
                        outputs["institution"] = ref['GBReference_journal']
            # extract seq_comment
            if 'GBSeq_comment' in gb_seq and outputs["seq_comment"] == "unknown":
                outputs["seq_comment"] = gb_seq["GBSeq_comment"]
            # extract isolate
            if "GBSeq_feature-table" in gb_seq:
                if 'GBFeature_quals' in gb_seq["GBSeq_feature-table"][0]:
                    for ref in gb_seq["GBSeq_feature-table"][0]["GBFeature_quals"]:
                        if ref['GBQualifier_name'] == "isolate" and outputs["isolate"] == "unknown":
                            outputs["isolate"] = ref["GBQualifier_value"]
        else:
            print(f"Warning: No valid record or empty record list from NCBI for {accession_number}.")

        # If gb_seq is still None, return defaults
        if gb_seq is None:
            return {"authors": "unknown",
                    "institution": "unknown",
                    "isolate": "unknown",
                    "definition": "unknown",
                    "title": "unknown",
                    "seq_comment": "unknown",
                    "collection_date": "unknown"}
        return outputs
    except:
        print("error in fetching ncbi data")
        return {"authors": "unknown",
                "institution": "unknown",
                "isolate": "unknown",
                "definition": "unknown",
                "title": "unknown",
                "seq_comment": "unknown",
                "collection_date": "unknown"}
# Fallback if NCBI crashed or cannot find the accession on NCBI
def google_accession_search(accession_id):
    """
    Search for metadata by accession ID using Google Custom Search.
    Falls back to known biological databases and archives.
    """
    queries = [
        f"{accession_id}",
        f"{accession_id} site:ncbi.nlm.nih.gov",
        f"{accession_id} site:pubmed.ncbi.nlm.nih.gov",
        f"{accession_id} site:europepmc.org",
        f"{accession_id} site:researchgate.net",
        f"{accession_id} mtDNA",
        f"{accession_id} mitochondrial DNA"
    ]

    links = []
    for query in queries:
        search_results = mtdna_classifier.search_google_custom(query, 2)
        for link in search_results:
            if link not in links:
                links.append(link)
    return links

# Method 1: Smarter Google
def smart_google_queries(metadata: dict):
    queries = []

    # Extract useful fields
    isolate = metadata.get("isolate")
    author = metadata.get("authors")
    institution = metadata.get("institution")
    title = metadata.get("title")
    combined = []
    # Construct queries
    if isolate and isolate != "unknown" and isolate != "Unpublished":
        queries.append(f'"{isolate}" mitochondrial DNA')
        queries.append(f'"{isolate}" site:ncbi.nlm.nih.gov')

    if author and author != "unknown" and author != "Unpublished":
        # try:
        #     author_name = ".".join(author.split(' ')[0].split(".")[:-1]) # Use last name only
        # except:
        #     try:
        #         author_name = author.split(',')[0] # Use last name only
        #     except:
        #         author_name = author
        try:
            author_name = author.split(',')[0]  # Use last name only
        except:
            author_name = author
        queries.append(f'"{author_name}" mitochondrial DNA')
        queries.append(f'"{author_name}" mtDNA site:researchgate.net')

    if institution and institution != "unknown" and institution != "Unpublished":
        try:
            short_inst = ",".join(institution.split(',')[:2])  # Take first part of institution
        except:
            try:
                short_inst = institution.split(',')[0]
            except:
                short_inst = institution
        queries.append(f'"{short_inst}" mtDNA sequence')
        #queries.append(f'"{short_inst}" isolate site:nature.com')
    if title and title != 'unknown' and title != "Unpublished":
        if title != "Direct Submission":
            queries.append(title)

    return queries

# def filter_links_by_metadata(search_results, saveLinkFolder, accession=None, stop_flag=None):
#     TRUSTED_DOMAINS = [
#         "ncbi.nlm.nih.gov",
#         "pubmed.ncbi.nlm.nih.gov",
#         "pmc.ncbi.nlm.nih.gov",
#         "biorxiv.org",
#         "researchgate.net",
#         "nature.com",
#         "sciencedirect.com"
#     ]
#     if stop_flag is not None and stop_flag.value:
#         print(f"🛑 Stop detected {accession}, aborting early...")
#         return []
#     def is_trusted_link(link):
#         for domain in TRUSTED_DOMAINS:
#             if domain in link:
#                 return True
#         return False
#     def is_relevant_title_snippet(link, saveLinkFolder, accession=None):
#         output = []
#         keywords = ["mtDNA", "mitochondrial", "accession", "isolate", "Homo sapiens", "sequence"]
#         if accession:
#             keywords = [accession] + keywords
#         title_snippet = link.lower()
#         print("save link folder inside this filter function: ", saveLinkFolder)
#         success_process, output_process = pipeline.run_with_timeout(data_preprocess.extract_text, args=(link, saveLinkFolder), timeout=60)
#         if stop_flag is not None and stop_flag.value:
#             print(f"🛑 Stop detected {accession}, aborting early...")
#             return []
#         if success_process:
#             article_text = output_process
#             print("succeeded in getting article text")
#         else:
#             print("did not succeed, fall back to no link")
#             article_text = ""
#         #article_text = data_preprocess.extract_text(link,saveLinkFolder)
#         print("article text")
#         #print(article_text)
#         if stop_flag is not None and stop_flag.value:
#             print(f"🛑 Stop detected {accession}, aborting early...")
#             return []
#         try:
#             ext = link.split(".")[-1].lower()
#             if ext not in ["pdf", "docx", "xlsx"]:
#                 html = extractHTML.HTML("", link)
#                 if stop_flag is not None and stop_flag.value:
#                     print(f"🛑 Stop detected {accession}, aborting early...")
#                     return []
#                 jsonSM = html.getSupMaterial()
#                 if jsonSM:
#                     output += sum((jsonSM[key] for key in jsonSM), [])
#         except Exception:
#             pass  # continue silently
#         for keyword in keywords:
#             if keyword.lower() in article_text.lower():
#                 if link not in output:
#                     output.append([link, keyword.lower()])
#                     print("link and keyword for article text: ", link, keyword)
#                     return output
#             if keyword.lower() in title_snippet.lower():
#                 if link not in output:
#                     output.append([link, keyword.lower()])
#                     print("link and keyword for title: ", link, keyword)
#                     return output
#         return output

#     filtered = []
#     better_filter = []
#     if len(search_results) > 0:
#         for link in search_results:
#             # if is_trusted_link(link):
#             #     if link not in filtered:
#             #         filtered.append(link)
#             # else:
#             print(link)
#             if stop_flag is not None and stop_flag.value:
#                 print(f"🛑 Stop detected {accession}, aborting early...")
#                 return []
#             if link:
#                 output_link = is_relevant_title_snippet(link, saveLinkFolder, accession)
#                 print("output link: ")
#                 print(output_link)
#                 for out_link in output_link:
#                     if isinstance(out_link, list) and len(out_link) > 1:
#                         print(out_link)
#                         kw = out_link[1]
#                         print("kw and acc: ", kw, accession.lower())
#                         if accession and kw == accession.lower():
#                             better_filter.append(out_link[0])
#                         filtered.append(out_link[0])
#                     else: filtered.append(out_link)
#         print("done with link and here is filter: ", filtered)
#     if better_filter:
#         filtered = better_filter
#     return filtered

def filter_links_by_metadata(search_results, saveLinkFolder, accession=None):
    TRUSTED_DOMAINS = [
        "ncbi.nlm.nih.gov",
        "pubmed.ncbi.nlm.nih.gov",
        "pmc.ncbi.nlm.nih.gov",
        "biorxiv.org",
        "researchgate.net",
        "nature.com",
        "sciencedirect.com"
    ]
    def is_trusted_link(link):
        for domain in TRUSTED_DOMAINS:
            if domain in link:
                return True
        return False
    def is_relevant_title_snippet(link, saveLinkFolder, accession=None):
        output = []
-       keywords = ["mtDNA", "mitochondrial", "accession", "isolate", "Homo sapiens", "sequence"]
+       keywords = ["mtDNA", "mitochondrial", "Homo sapiens"]
        #keywords = ["mtDNA", "mitochondrial"]
        if accession:
            keywords = [accession] + keywords
        title_snippet = link.lower()
        #print("save link folder inside this filter function: ", saveLinkFolder)
        article_text = data_preprocess.extract_text(link, saveLinkFolder)
        print("article text done")
        #print(article_text)
        try:
            ext = link.split(".")[-1].lower()
            if ext not in ["pdf", "docx", "xlsx"]:
                html = extractHTML.HTML("", link)
                jsonSM = html.getSupMaterial()
                if jsonSM:
                    output += sum((jsonSM[key] for key in jsonSM), [])
        except Exception:
            pass  # continue silently
        for keyword in keywords:
            if article_text:
                if keyword.lower() in article_text.lower():
                    if link not in output:
                        output.append([link, keyword.lower(), article_text])
                        return output
            if keyword.lower() in title_snippet.lower():
                if link not in output:
                    output.append([link, keyword.lower()])
                    print("link and keyword for title: ", link, keyword)
                    return output
        return output

    filtered = {}
    better_filter = {}
    if len(search_results) > 0:
        print(search_results)
        for link in search_results:
            # if is_trusted_link(link):
            #     if link not in filtered:
            #         filtered.append(link)
            # else:
            print(link)
            if link:
                output_link = is_relevant_title_snippet(link, saveLinkFolder, accession)
                print("output link: ")
                print(output_link)
                for out_link in output_link:
                    if isinstance(out_link, list) and len(out_link) > 1:
                        print(out_link)
                        kw = out_link[1]
                        if accession and kw == accession.lower():
                            if len(out_link) == 2:
                                better_filter[out_link[0]] = ""
                            elif len(out_link) == 3:
                                # save article
                                better_filter[out_link[0]] = out_link[2]
                        if len(out_link) == 2:
                            better_filter[out_link[0]] = ""
                        elif len(out_link) == 3:
                            # save article
                            better_filter[out_link[0]] = out_link[2]
                    else: filtered[out_link] = ""
        print("done with link and here is filter: ", filtered)
    if better_filter:
        filtered = better_filter
    return filtered

def smart_google_search(metadata):
    queries = smart_google_queries(metadata)
    links = []
    for q in queries:
        #print("\n🔍 Query:", q)
        results = mtdna_classifier.search_google_custom(q, 2)
        for link in results:
            #print(f"- {link}")
            if link not in links:
                links.append(link)
    #filter_links = filter_links_by_metadata(links)
    return links
# Method 2: prompt the LLM better, or use a better AI search API with
# the total information from both NCBI and all searches
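Note on the error path in fetch_ncbi: if record[0] is not a dict, gb_seq stays None, the later "in gb_seq" membership tests raise TypeError, and the bare except turns that into the all-"unknown" defaults. Callers therefore always get the same seven-key dict and can branch on its values. A minimal usage sketch of that contract (the accession number is an invented example, and importing this file as smart_fallback is an assumption):

# Sketch: decide whether the Google fallback is needed.
# "MN027328" is an arbitrary illustrative accession, not one from this repo.
from smart_fallback import fetch_ncbi, google_accession_search

meta = fetch_ncbi("MN027328")
if all(v == "unknown" for v in meta.values()):
    # NCBI returned nothing usable; fall back to Google Custom Search
    links = google_accession_search("MN027328")
    print("fallback links:", links)
else:
    print("NCBI metadata:", meta)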
 
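Both search helpers delegate to mtdna_classifier.search_google_custom(query, n), which is defined outside this file. From the call sites one can only infer its shape: it takes a query string and a result count and returns an iterable of result URLs. A hypothetical stand-in for local testing, using the public Google Custom Search JSON API (API_KEY and CSE_ID are placeholders, not values from this repo):

# Hypothetical stand-in for mtdna_classifier.search_google_custom,
# inferred from its call sites: (query: str, n: int) -> list[str].
import requests

API_KEY, CSE_ID = "YOUR_API_KEY", "YOUR_CSE_ID"  # placeholders

def search_google_custom(query, num_results):
    resp = requests.get(
        "https://www.googleapis.com/customsearch/v1",
        params={"key": API_KEY, "cx": CSE_ID, "q": query, "num": num_results},
        timeout=30,
    )
    items = resp.json().get("items", [])
    return [item["link"] for item in items[:num_results]]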
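To make the query construction concrete, here is what smart_google_queries yields for a made-up metadata dict (all values invented for illustration):

meta = {"isolate": "KSK-H7",
        "authors": "Smith,J. and Lee,K.",
        "institution": "Submitted (01-JAN-2020) Dept. of Biology, Example University, City",
        "title": "Direct Submission"}
smart_google_queries(meta)
# -> ['"KSK-H7" mitochondrial DNA',
#     '"KSK-H7" site:ncbi.nlm.nih.gov',
#     '"Smith" mitochondrial DNA',            # author split on "," keeps the surname
#     '"Smith" mtDNA site:researchgate.net',
#     '"Submitted (01-JAN-2020) Dept. of Biology, Example University" mtDNA sequence']
# "Direct Submission" titles are skipped, so no title query appears.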
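filter_links_by_metadata returns a dict mapping each kept link to the article text extracted for it (or "" when only the URL or title matched). A better_filter dict, apparently intended to prefer links that matched the accession itself, replaces the full result when non-empty; note that the duplicated len(out_link) checks currently route every keyword match into better_filter. A usage sketch (the cache folder path and accession are illustrative):

# Sketch: narrow raw search hits to links whose page text or URL
# mentions the accession or the mtDNA keywords.
links = google_accession_search("MN027328")  # invented accession
filtered = filter_links_by_metadata(links, "/tmp/link_cache", accession="MN027328")
for url, article_text in filtered.items():
    print(url, "->", len(article_text), "chars of extracted text")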
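Putting the pieces together, the intended fallback order appears to be: structured NCBI metadata first, then metadata-driven Google queries, then keyword filtering. A sketch of that chain, using only functions defined in this file (gather_links itself is a hypothetical wrapper, not part of the module):

def gather_links(accession, cache_folder):
    # 1) Structured metadata from GenBank ("unknown"-filled dict on failure).
    meta = fetch_ncbi(accession)
    # 2) Metadata-driven queries, plus the plain accession search as a backstop.
    links = smart_google_search(meta) + google_accession_search(accession)
    # 3) Keep links whose text or URL actually mentions the accession/mtDNA terms.
    unique = list(dict.fromkeys(links))  # dedupe while preserving order
    return filter_links_by_metadata(unique, cache_folder, accession=accession)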