Spaces:
Running
Running
Update smart_fallback.py
Browse files- smart_fallback.py +41 -20
smart_fallback.py
CHANGED
@@ -111,28 +111,35 @@ def smart_google_queries(metadata: dict):
|
|
111 |
title = metadata.get("title")
|
112 |
combined = []
|
113 |
# Construct queries
|
114 |
-
if isolate and isolate!="unknown":
|
115 |
queries.append(f'"{isolate}" mitochondrial DNA')
|
116 |
queries.append(f'"{isolate}" site:ncbi.nlm.nih.gov')
|
117 |
|
118 |
-
if author and author!="unknown":
|
119 |
try:
|
120 |
-
author_name = author.split('
|
121 |
except:
|
122 |
-
|
|
|
|
|
|
|
123 |
queries.append(f'"{author_name}" mitochondrial DNA')
|
124 |
queries.append(f'"{author_name}" mtDNA site:researchgate.net')
|
125 |
|
126 |
-
if institution and institution!="unknown":
|
127 |
try:
|
128 |
-
short_inst = institution.split(',')[
|
129 |
except:
|
130 |
-
|
|
|
|
|
|
|
131 |
queries.append(f'"{short_inst}" mtDNA sequence')
|
132 |
-
queries.append(f'"{short_inst}" isolate site:nature.com')
|
133 |
-
if title and title!='unknown':
|
134 |
if title!="Direct Submission":
|
135 |
-
queries.append(title)
|
|
|
136 |
return queries
|
137 |
|
138 |
def filter_links_by_metadata(search_results, saveLinkFolder, accession=None):
|
@@ -159,7 +166,7 @@ def filter_links_by_metadata(search_results, saveLinkFolder, accession=None):
|
|
159 |
print("save link folder inside this filter function: ", saveLinkFolder)
|
160 |
article_text = data_preprocess.extract_text(link,saveLinkFolder)
|
161 |
print("article text")
|
162 |
-
print(article_text)
|
163 |
try:
|
164 |
ext = link.split(".")[-1].lower()
|
165 |
if ext not in ["pdf", "docx", "xlsx"]:
|
@@ -172,25 +179,39 @@ def filter_links_by_metadata(search_results, saveLinkFolder, accession=None):
|
|
172 |
for keyword in keywords:
|
173 |
if keyword.lower() in article_text.lower():
|
174 |
if link not in output:
|
175 |
-
output.append(link)
|
176 |
-
print("link and keyword: ", link, keyword)
|
177 |
return output
|
178 |
if keyword.lower() in title_snippet.lower():
|
179 |
if link not in output:
|
180 |
-
output.append(link)
|
181 |
-
print("link and keyword: ", link, keyword)
|
182 |
return output
|
183 |
return output
|
184 |
|
185 |
filtered = []
|
186 |
if len(search_results) > 0:
|
187 |
for link in search_results:
|
188 |
-
if is_trusted_link(link):
|
189 |
-
|
190 |
-
|
191 |
-
else:
|
|
|
|
|
192 |
output_link = is_relevant_title_snippet(link,saveLinkFolder, accession)
|
193 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
194 |
return filtered
|
195 |
|
196 |
def smart_google_search(metadata):
|
|
|
111 |
title = metadata.get("title")
|
112 |
combined = []
|
113 |
# Construct queries
|
114 |
+
if isolate and isolate!="unknown" and isolate!="Unpublished":
|
115 |
queries.append(f'"{isolate}" mitochondrial DNA')
|
116 |
queries.append(f'"{isolate}" site:ncbi.nlm.nih.gov')
|
117 |
|
118 |
+
if author and author!="unknown" and author!="Unpublished":
|
119 |
try:
|
120 |
+
author_name = ".".join(author.split(' ')[0].split(".")[:-1]) # Use last name only
|
121 |
except:
|
122 |
+
try:
|
123 |
+
author_name = author.split(',')[0] # Use last name only
|
124 |
+
except:
|
125 |
+
author_name = author
|
126 |
queries.append(f'"{author_name}" mitochondrial DNA')
|
127 |
queries.append(f'"{author_name}" mtDNA site:researchgate.net')
|
128 |
|
129 |
+
if institution and institution!="unknown" and institution!="Unpublished":
|
130 |
try:
|
131 |
+
short_inst = ",".join(institution.split(',')[:2]) # Take first part of institution
|
132 |
except:
|
133 |
+
try:
|
134 |
+
short_inst = institution.split(',')[0]
|
135 |
+
except:
|
136 |
+
short_inst = institution
|
137 |
queries.append(f'"{short_inst}" mtDNA sequence')
|
138 |
+
#queries.append(f'"{short_inst}" isolate site:nature.com')
|
139 |
+
if title and title!='unknown' and title!="Unpublished":
|
140 |
if title!="Direct Submission":
|
141 |
+
queries.append(title)
|
142 |
+
|
143 |
return queries
|
144 |
|
145 |
def filter_links_by_metadata(search_results, saveLinkFolder, accession=None):
|
|
|
166 |
print("save link folder inside this filter function: ", saveLinkFolder)
|
167 |
article_text = data_preprocess.extract_text(link,saveLinkFolder)
|
168 |
print("article text")
|
169 |
+
#print(article_text)
|
170 |
try:
|
171 |
ext = link.split(".")[-1].lower()
|
172 |
if ext not in ["pdf", "docx", "xlsx"]:
|
|
|
179 |
for keyword in keywords:
|
180 |
if keyword.lower() in article_text.lower():
|
181 |
if link not in output:
|
182 |
+
output.append([link,keyword.lower()])
|
183 |
+
print("link and keyword for article text: ", link, keyword)
|
184 |
return output
|
185 |
if keyword.lower() in title_snippet.lower():
|
186 |
if link not in output:
|
187 |
+
output.append([link,keyword.lower()])
|
188 |
+
print("link and keyword for title: ", link, keyword)
|
189 |
return output
|
190 |
return output
|
191 |
|
192 |
filtered = []
|
193 |
if len(search_results) > 0:
|
194 |
for link in search_results:
|
195 |
+
# if is_trusted_link(link):
|
196 |
+
# if link not in filtered:
|
197 |
+
# filtered.append(link)
|
198 |
+
# else:
|
199 |
+
print(link)
|
200 |
+
if link:
|
201 |
output_link = is_relevant_title_snippet(link,saveLinkFolder, accession)
|
202 |
+
print("output link: ")
|
203 |
+
print(output_link)
|
204 |
+
for out_link in output_link:
|
205 |
+
if isinstance(out_link,list) and len(out_link) > 1:
|
206 |
+
print(out_link)
|
207 |
+
kw = out_link[1]
|
208 |
+
if accession and kw == accession.lower():
|
209 |
+
better_filter.append(out_link[0])
|
210 |
+
filtered.append(out_link[0])
|
211 |
+
else: filtered.append(out_link)
|
212 |
+
print("done with link and here is filter: ",filtered)
|
213 |
+
if better_filter:
|
214 |
+
filtered = better_filter
|
215 |
return filtered
|
216 |
|
217 |
def smart_google_search(metadata):
|