VyLala committed on
Commit
26b8b6f
·
verified ·
1 Parent(s): 6b33000

Update smart_fallback.py

Browse files
Files changed (1) hide show
  1. smart_fallback.py +208 -204
smart_fallback.py CHANGED
@@ -1,205 +1,209 @@
1
- from Bio import Entrez, Medline
2
- #import model
3
- import mtdna_classifier
4
- from NER.html import extractHTML
5
- import data_preprocess
6
- # Setup
7
- def fetch_ncbi(accession_number):
8
- try:
9
- Entrez.email = "[email protected]" # Required by NCBI, REPLACE WITH YOUR EMAIL
10
- handle = Entrez.efetch(db="nucleotide", id=str(accession_number), rettype="gb", retmode="xml")
11
- record = Entrez.read(handle)
12
- handle.close()
13
- outputs = {"authors":"unknown",
14
- "institution":"unknown",
15
- "isolate":"unknown",
16
- "definition":"unknown",
17
- "title":"unknown",
18
- "seq_comment":"unknown",
19
- "collection_date":"unknown" } #'GBSeq_update-date': '25-OCT-2023', 'GBSeq_create-date'
20
- gb_seq = None
21
- # Validate record structure: It should be a list with at least one element (a dict)
22
- if isinstance(record, list) and len(record) > 0:
23
- if isinstance(record[0], dict):
24
- gb_seq = record[0]
25
- else:
26
- print(f"Warning: record[0] is not a dictionary for {accession_number}. Type: {type(record[0])}")
27
- # extract collection date
28
- if "GBSeq_create-date" in gb_seq and outputs["collection_date"]=="unknown":
29
- outputs["collection_date"] = gb_seq["GBSeq_create-date"]
30
- else:
31
- if "GBSeq_update-date" in gb_seq and outputs["collection_date"]=="unknown":
32
- outputs["collection_date"] = gb_seq["GBSeq_update-date"]
33
- # extract definition
34
- if "GBSeq_definition" in gb_seq and outputs["definition"]=="unknown":
35
- outputs["definition"] = gb_seq["GBSeq_definition"]
36
- # extract related-reference things
37
- if "GBSeq_references" in gb_seq:
38
- for ref in gb_seq["GBSeq_references"]:
39
- # extract authors
40
- if "GBReference_authors" in ref and outputs["authors"]=="unknown":
41
- outputs["authors"] = "and ".join(ref["GBReference_authors"])
42
- # extract title
43
- if "GBReference_title" in ref and outputs["title"]=="unknown":
44
- outputs["title"] = ref["GBReference_title"]
45
- # extract submitted journal
46
- if 'GBReference_journal' in ref and outputs["institution"]=="unknown":
47
- outputs["institution"] = ref['GBReference_journal']
48
- # extract seq_comment
49
- if 'GBSeq_comment'in gb_seq and outputs["seq_comment"]=="unknown":
50
- outputs["seq_comment"] = gb_seq["GBSeq_comment"]
51
- # extract isolate
52
- if "GBSeq_feature-table" in gb_seq:
53
- if 'GBFeature_quals' in gb_seq["GBSeq_feature-table"][0]:
54
- for ref in gb_seq["GBSeq_feature-table"][0]["GBFeature_quals"]:
55
- if ref['GBQualifier_name'] == "isolate" and outputs["isolate"]=="unknown":
56
- outputs["isolate"] = ref["GBQualifier_value"]
57
- else:
58
- print(f"Warning: No valid record or empty record list from NCBI for {accession_number}.")
59
-
60
- # If gb_seq is still None, return defaults
61
- if gb_seq is None:
62
- return {"authors":"unknown",
63
- "institution":"unknown",
64
- "isolate":"unknown",
65
- "definition":"unknown",
66
- "title":"unknown",
67
- "seq_comment":"unknown",
68
- "collection_date":"unknown" }
69
- return outputs
70
- except:
71
- print("error in fetching ncbi data")
72
- return {"authors":"unknown",
73
- "institution":"unknown",
74
- "isolate":"unknown",
75
- "definition":"unknown",
76
- "title":"unknown",
77
- "seq_comment":"unknown",
78
- "collection_date":"unknown" }
79
- # Fallback if NCBI crashed or cannot find accession on NBCI
80
- def google_accession_search(accession_id):
81
- """
82
- Search for metadata by accession ID using Google Custom Search.
83
- Falls back to known biological databases and archives.
84
- """
85
- queries = [
86
- f"{accession_id}",
87
- f"{accession_id} site:ncbi.nlm.nih.gov",
88
- f"{accession_id} site:pubmed.ncbi.nlm.nih.gov",
89
- f"{accession_id} site:europepmc.org",
90
- f"{accession_id} site:researchgate.net",
91
- f"{accession_id} mtDNA",
92
- f"{accession_id} mitochondrial DNA"
93
- ]
94
-
95
- links = []
96
- for query in queries:
97
- search_results = mtdna_classifier.search_google_custom(query, 2)
98
- for link in search_results:
99
- if link not in links:
100
- links.append(link)
101
- return links
102
-
103
- # Method 1: Smarter Google
104
- def smart_google_queries(metadata: dict):
105
- queries = []
106
-
107
- # Extract useful fields
108
- isolate = metadata.get("isolate")
109
- author = metadata.get("authors")
110
- institution = metadata.get("institution")
111
- title = metadata.get("title")
112
- combined = []
113
- # Construct queries
114
- if isolate and isolate!="unknown":
115
- queries.append(f'"{isolate}" mitochondrial DNA')
116
- queries.append(f'"{isolate}" site:ncbi.nlm.nih.gov')
117
-
118
- if author and author!="unknown":
119
- try:
120
- author_name = author.split(',')[0] # Use last name only
121
- except:
122
- author_name = author
123
- queries.append(f'"{author_name}" mitochondrial DNA')
124
- queries.append(f'"{author_name}" mtDNA site:researchgate.net')
125
-
126
- if institution and institution!="unknown":
127
- try:
128
- short_inst = institution.split(',')[0] # Take first part of institution
129
- except:
130
- short_inst = institution
131
- queries.append(f'"{short_inst}" mtDNA sequence')
132
- queries.append(f'"{short_inst}" isolate site:nature.com')
133
- if title and title!='unknown':
134
- if title!="Direct Submission":
135
- queries.append(title)
136
- return queries
137
-
138
- def filter_links_by_metadata(search_results, saveLinkFolder, accession=None):
139
- TRUSTED_DOMAINS = [
140
- "ncbi.nlm.nih.gov",
141
- "pubmed.ncbi.nlm.nih.gov",
142
- "pmc.ncbi.nlm.nih.gov",
143
- "biorxiv.org",
144
- "researchgate.net",
145
- "nature.com",
146
- "sciencedirect.com"
147
- ]
148
- def is_trusted_link(link):
149
- for domain in TRUSTED_DOMAINS:
150
- if domain in link:
151
- return True
152
- return False
153
- def is_relevant_title_snippet(link, saveLinkFolder, accession=None):
154
- output = []
155
- keywords = ["mtDNA", "mitochondrial", "accession", "isolate", "Homo sapiens", "sequence"]
156
- if accession:
157
- keywords = [accession] + keywords
158
- title_snippet = link.lower()
159
- print("save link folder inside this filter function: ", saveLinkFolder)
160
- article_text = data_preprocess.extract_text(link,saveLinkFolder)
161
- print("article text")
162
- print(article_text)
163
- if link.split(".")[-1].lower():
164
- if link.split(".")[-1].lower() != "pdf" and link.split(".")[-1].lower() not in "docx" and link.split(".")[-1].lower() not in "xlxs":
165
- html = extractHTML.HTML("",link)
166
- jsonSM = html.getSupMaterial()
167
- if jsonSM: output += sum((jsonSM[key] for key in jsonSM),[])
168
- for keyword in keywords:
169
- if keyword.lower() in article_text.lower():
170
- if link not in output:
171
- output.append(link)
172
- print("link and keyword: ", link, keyword)
173
- return output
174
- if keyword.lower() in title_snippet.lower():
175
- if link not in output:
176
- output.append(link)
177
- print("link and keyword: ", link, keyword)
178
- return output
179
- return output
180
-
181
- filtered = []
182
- if len(search_results) > 0:
183
- for link in search_results:
184
- if is_trusted_link(link):
185
- if link not in filtered:
186
- filtered.append(link)
187
- else:
188
- output_link = is_relevant_title_snippet(link,saveLinkFolder, accession)
189
- filtered += output_link
190
- return filtered
191
-
192
- def smart_google_search(metadata):
193
- queries = smart_google_queries(metadata)
194
- links = []
195
- for q in queries:
196
- #print("\n🔍 Query:", q)
197
- results = mtdna_classifier.search_google_custom(q,2)
198
- for link in results:
199
- #print(f"- {link}")
200
- if link not in links:
201
- links.append(link)
202
- #filter_links = filter_links_by_metadata(links)
203
- return links
204
- # Method 2: Prompt LLM better or better ai search api with all
 
 
 
 
205
  # the total information from even ncbi and all search
 
1
+ from Bio import Entrez, Medline
2
+ #import model
3
+ import mtdna_classifier
4
+ from NER.html import extractHTML
5
+ import data_preprocess
6
+ # Setup
7
def _unknown_ncbi_metadata():
    """Return a fresh metadata dict with every field set to "unknown"."""
    return {
        "authors": "unknown",
        "institution": "unknown",
        "isolate": "unknown",
        "definition": "unknown",
        "title": "unknown",
        "seq_comment": "unknown",
        "collection_date": "unknown",
    }


def _extract_genbank_fields(gb_seq):
    """Copy the fields of interest out of one parsed GBSeq mapping."""
    outputs = _unknown_ncbi_metadata()

    # NOTE(review): "collection_date" is filled from the record's create date
    # (falling back to its update date), e.g. '25-OCT-2023' -- this is the
    # record date, not necessarily the sample collection date.
    for date_key in ("GBSeq_create-date", "GBSeq_update-date"):
        if date_key in gb_seq:
            outputs["collection_date"] = gb_seq[date_key]
            break

    if "GBSeq_definition" in gb_seq:
        outputs["definition"] = gb_seq["GBSeq_definition"]

    # For each reference-derived field, the first reference carrying it wins.
    if "GBSeq_references" in gb_seq:
        for ref in gb_seq["GBSeq_references"]:
            if "GBReference_authors" in ref and outputs["authors"] == "unknown":
                # Separator kept exactly as before ("and " with no leading
                # space) because downstream code splits this string on ",".
                outputs["authors"] = "and ".join(ref["GBReference_authors"])
            if "GBReference_title" in ref and outputs["title"] == "unknown":
                outputs["title"] = ref["GBReference_title"]
            # The reference's journal string is what gets reported as
            # "institution".
            if "GBReference_journal" in ref and outputs["institution"] == "unknown":
                outputs["institution"] = ref["GBReference_journal"]

    if "GBSeq_comment" in gb_seq:
        outputs["seq_comment"] = gb_seq["GBSeq_comment"]

    # Only the first feature of the feature table is inspected for an
    # "isolate" qualifier. An empty table or a qualifier without a value is
    # skipped instead of raising and discarding everything gathered so far.
    feature_table = gb_seq["GBSeq_feature-table"] if "GBSeq_feature-table" in gb_seq else []
    if len(feature_table) > 0 and "GBFeature_quals" in feature_table[0]:
        for qual in feature_table[0]["GBFeature_quals"]:
            if (
                outputs["isolate"] == "unknown"
                and qual.get("GBQualifier_name") == "isolate"
                and "GBQualifier_value" in qual
            ):
                outputs["isolate"] = qual["GBQualifier_value"]

    return outputs


def fetch_ncbi(accession_number):
    """Fetch basic metadata for an accession from the NCBI nucleotide database.

    The record is requested through ``Entrez.efetch`` (``rettype="gb"``,
    ``retmode="xml"``) and parsed with ``Entrez.read``.

    Args:
        accession_number: Accession identifier; converted with ``str()``
            before being sent to NCBI.

    Returns:
        dict with the keys ``authors``, ``institution``, ``isolate``,
        ``definition``, ``title``, ``seq_comment`` and ``collection_date``.
        Any field that could not be determined is the string ``"unknown"``.
        The function never raises for ordinary errors: on any failure
        (network, parsing, unexpected record shape) every field is
        ``"unknown"``.
    """
    try:
        # Required by NCBI, REPLACE WITH YOUR EMAIL
        Entrez.email = "[email protected]"
        handle = Entrez.efetch(
            db="nucleotide",
            id=str(accession_number),
            rettype="gb",
            retmode="xml",
        )
        try:
            record = Entrez.read(handle)
        finally:
            # Close the handle even when parsing fails.
            handle.close()

        # A valid result is a non-empty list whose first element is a dict.
        if not (isinstance(record, list) and len(record) > 0):
            print(f"Warning: No valid record or empty record list from NCBI for {accession_number}.")
            return _unknown_ncbi_metadata()
        if not isinstance(record[0], dict):
            print(f"Warning: record[0] is not a dictionary for {accession_number}. Type: {type(record[0])}")
            return _unknown_ncbi_metadata()

        return _extract_genbank_fields(record[0])
    except Exception as exc:
        # Best-effort lookup: report the cause and fall back to defaults, but
        # let KeyboardInterrupt/SystemExit propagate (a bare except would not).
        print(f"error in fetching ncbi data: {exc}")
        return _unknown_ncbi_metadata()
79
+ # Fallback if NCBI crashed or the accession cannot be found on NCBI
80
def google_accession_search(accession_id):
    """
    Collect links for an accession ID via mtdna_classifier.search_google_custom.

    The bare accession is searched first, then the accession restricted to a
    few sites, then the accession combined with mtDNA terms. Links are
    returned in first-seen order without duplicates.
    """
    query_suffixes = (
        "",
        " site:ncbi.nlm.nih.gov",
        " site:pubmed.ncbi.nlm.nih.gov",
        " site:europepmc.org",
        " site:researchgate.net",
        " mtDNA",
        " mitochondrial DNA",
    )

    unique_links = []
    for suffix in query_suffixes:
        # NOTE(review): the second argument (2) is presumably the number of
        # results requested per query -- confirm against search_google_custom.
        hits = mtdna_classifier.search_google_custom(f"{accession_id}{suffix}", 2)
        for hit in hits:
            if hit in unique_links:
                continue
            unique_links.append(hit)
    return unique_links
102
+
103
+ # Method 1: Smarter Google
104
def smart_google_queries(metadata: dict):
    """Build Google query strings from a metadata dict.

    Args:
        metadata: Mapping that may contain the keys ``isolate``, ``authors``,
            ``institution`` and ``title``. A missing, empty or ``"unknown"``
            value contributes no queries.

    Returns:
        list of query strings in this order: two isolate queries, two
        first-author queries, two institution queries, then the title itself
        (unless the title is ``"Direct Submission"``).
    """

    def usable(value):
        # A field is usable when it is non-empty and not the "unknown" marker.
        return bool(value) and value != "unknown"

    def before_first_comma(value):
        # Non-string values are used unchanged; this replaces the bare
        # try/except the code used to wrap around ``split``.
        return value.split(",")[0] if isinstance(value, str) else value

    queries = []

    isolate = metadata.get("isolate")
    if usable(isolate):
        queries.append(f'"{isolate}" mitochondrial DNA')
        queries.append(f'"{isolate}" site:ncbi.nlm.nih.gov')

    author = metadata.get("authors")
    if usable(author):
        # Text before the first comma, i.e. the first author's last name when
        # authors look like "Last,F.".
        author_name = before_first_comma(author)
        queries.append(f'"{author_name}" mitochondrial DNA')
        queries.append(f'"{author_name}" mtDNA site:researchgate.net')

    institution = metadata.get("institution")
    if usable(institution):
        # Take first part of institution
        short_inst = before_first_comma(institution)
        queries.append(f'"{short_inst}" mtDNA sequence')
        queries.append(f'"{short_inst}" isolate site:nature.com')

    title = metadata.get("title")
    # "Direct Submission" is skipped as a title query.
    if usable(title) and title != "Direct Submission":
        queries.append(title)

    return queries
137
+
138
def filter_links_by_metadata(search_results, saveLinkFolder, accession=None):
    """Keep the search-result links that are trusted or look relevant.

    A link whose host is one of the trusted domains (or a subdomain of one) is
    kept as-is. For any other link, its text is extracted with
    ``data_preprocess.extract_text``, supplementary-material links are
    collected through ``extractHTML.HTML(...).getSupMaterial()`` (skipped for
    pdf/docx/xlsx links), and the link itself is kept when a keyword occurs in
    the extracted text or in the URL.

    Args:
        search_results: Iterable of URL strings; ``None`` or empty yields ``[]``.
        saveLinkFolder: Passed through to ``data_preprocess.extract_text``.
        accession: Optional accession string, checked before the fixed keywords.

    Returns:
        list of links in first-seen order, without duplicates.
    """
    import posixpath
    from urllib.parse import urlsplit

    TRUSTED_DOMAINS = (
        "ncbi.nlm.nih.gov",
        "pubmed.ncbi.nlm.nih.gov",
        "pmc.ncbi.nlm.nih.gov",
        "biorxiv.org",
        "researchgate.net",
        "nature.com",
        "sciencedirect.com",
    )
    DOCUMENT_EXTENSIONS = {"pdf", "docx", "xlsx"}

    keywords = ["mtDNA", "mitochondrial", "accession", "isolate", "Homo sapiens", "sequence"]
    if accession:
        keywords = [accession] + keywords

    def split_url(link):
        # Scheme-less links ("example.org/x") still need their host parsed.
        try:
            return urlsplit(link if "://" in link else "//" + link)
        except ValueError:
            return None

    def is_trusted_link(link):
        # Compare the host, not the whole URL: a substring test would also
        # trust "https://example.org/?ref=nature.com".
        parts = split_url(link)
        host = (parts.hostname or "") if parts is not None else ""
        return any(
            host == domain or host.endswith("." + domain)
            for domain in TRUSTED_DOMAINS
        )

    def relevant_links(link):
        found = []
        print("save link folder inside this filter function: ", saveLinkFolder)
        # One page that cannot be fetched must not abort the whole filter.
        try:
            article_text = data_preprocess.extract_text(link, saveLinkFolder)
        except Exception as exc:
            print(f"Warning: could not extract text from {link}: {exc}")
            article_text = ""
        if not isinstance(article_text, str):
            article_text = ""
        print("article text")
        print(article_text)

        # Take the extension from the URL path so that query strings
        # ("file.pdf?download=1") do not hide it.
        parts = split_url(link)
        ext = posixpath.splitext(parts.path)[1].lstrip(".").lower() if parts is not None else ""
        if ext not in DOCUMENT_EXTENSIONS:
            try:
                sup_material = extractHTML.HTML("", link).getSupMaterial()
                if sup_material:
                    # assumes a mapping whose values are lists of links -- TODO confirm
                    extra = []
                    for key in sup_material:
                        extra.extend(sup_material[key])
                    found.extend(extra)
            except Exception as exc:
                # Still best-effort, but no longer silent.
                print(f"Warning: could not read supplementary material from {link}: {exc}")

        text_lower = article_text.lower()
        url_lower = link.lower()
        for keyword in keywords:
            needle = keyword.lower()
            if needle in text_lower or needle in url_lower:
                if link not in found:
                    found.append(link)
                    print("link and keyword: ", link, keyword)
                break
        return found

    filtered = []
    for link in search_results or []:
        candidates = [link] if is_trusted_link(link) else relevant_links(link)
        for candidate in candidates:
            if candidate not in filtered:
                filtered.append(candidate)
    return filtered
195
+
196
def smart_google_search(metadata):
    """Run every query built by smart_google_queries and gather the links.

    Each query is sent to mtdna_classifier.search_google_custom with a second
    argument of 2; links are returned in first-seen order without duplicates.
    No relevance filtering is applied here.
    """
    gathered = []
    for query in smart_google_queries(metadata):
        for hit in mtdna_classifier.search_google_custom(query, 2):
            if hit in gathered:
                continue
            gathered.append(hit)
    return gathered
208
+ # Method 2: Prompt the LLM better, or use a better AI search API, with all
209
  # the information gathered from NCBI and from every search