VyLala committed on
Commit
86d55b0
·
verified ·
1 Parent(s): c330994

Update smart_fallback.py

Browse files
Files changed (1) hide show
  1. smart_fallback.py +41 -20
smart_fallback.py CHANGED
@@ -111,28 +111,35 @@ def smart_google_queries(metadata: dict):
111
  title = metadata.get("title")
112
  combined = []
113
  # Construct queries
114
- if isolate and isolate!="unknown":
115
  queries.append(f'"{isolate}" mitochondrial DNA')
116
  queries.append(f'"{isolate}" site:ncbi.nlm.nih.gov')
117
 
118
- if author and author!="unknown":
119
  try:
120
- author_name = author.split(',')[0] # Use last name only
121
  except:
122
- author_name = author
 
 
 
123
  queries.append(f'"{author_name}" mitochondrial DNA')
124
  queries.append(f'"{author_name}" mtDNA site:researchgate.net')
125
 
126
- if institution and institution!="unknown":
127
  try:
128
- short_inst = institution.split(',')[0] # Take first part of institution
129
  except:
130
- short_inst = institution
 
 
 
131
  queries.append(f'"{short_inst}" mtDNA sequence')
132
- queries.append(f'"{short_inst}" isolate site:nature.com')
133
- if title and title!='unknown':
134
  if title!="Direct Submission":
135
- queries.append(title)
 
136
  return queries
137
 
138
  def filter_links_by_metadata(search_results, saveLinkFolder, accession=None):
@@ -159,7 +166,7 @@ def filter_links_by_metadata(search_results, saveLinkFolder, accession=None):
159
  print("save link folder inside this filter function: ", saveLinkFolder)
160
  article_text = data_preprocess.extract_text(link,saveLinkFolder)
161
  print("article text")
162
- print(article_text)
163
  try:
164
  ext = link.split(".")[-1].lower()
165
  if ext not in ["pdf", "docx", "xlsx"]:
@@ -172,25 +179,39 @@ def filter_links_by_metadata(search_results, saveLinkFolder, accession=None):
172
  for keyword in keywords:
173
  if keyword.lower() in article_text.lower():
174
  if link not in output:
175
- output.append(link)
176
- print("link and keyword: ", link, keyword)
177
  return output
178
  if keyword.lower() in title_snippet.lower():
179
  if link not in output:
180
- output.append(link)
181
- print("link and keyword: ", link, keyword)
182
  return output
183
  return output
184
 
185
  filtered = []
186
  if len(search_results) > 0:
187
  for link in search_results:
188
- if is_trusted_link(link):
189
- if link not in filtered:
190
- filtered.append(link)
191
- else:
 
 
192
  output_link = is_relevant_title_snippet(link,saveLinkFolder, accession)
193
- filtered += output_link
 
 
 
 
 
 
 
 
 
 
 
 
194
  return filtered
195
 
196
  def smart_google_search(metadata):
 
111
  title = metadata.get("title")
112
  combined = []
113
  # Construct queries
114
+ if isolate and isolate!="unknown" and isolate!="Unpublished":
115
  queries.append(f'"{isolate}" mitochondrial DNA')
116
  queries.append(f'"{isolate}" site:ncbi.nlm.nih.gov')
117
 
118
+ if author and author!="unknown" and author!="Unpublished":
119
  try:
120
+ author_name = ".".join(author.split(' ')[0].split(".")[:-1]) # Use last name only
121
  except:
122
+ try:
123
+ author_name = author.split(',')[0] # Use last name only
124
+ except:
125
+ author_name = author
126
  queries.append(f'"{author_name}" mitochondrial DNA')
127
  queries.append(f'"{author_name}" mtDNA site:researchgate.net')
128
 
129
+ if institution and institution!="unknown" and institution!="Unpublished":
130
  try:
131
+ short_inst = ",".join(institution.split(',')[:2]) # Take first part of institution
132
  except:
133
+ try:
134
+ short_inst = institution.split(',')[0]
135
+ except:
136
+ short_inst = institution
137
  queries.append(f'"{short_inst}" mtDNA sequence')
138
+ #queries.append(f'"{short_inst}" isolate site:nature.com')
139
+ if title and title!='unknown' and title!="Unpublished":
140
  if title!="Direct Submission":
141
+ queries.append(title)
142
+
143
  return queries
144
 
145
  def filter_links_by_metadata(search_results, saveLinkFolder, accession=None):
 
166
  print("save link folder inside this filter function: ", saveLinkFolder)
167
  article_text = data_preprocess.extract_text(link,saveLinkFolder)
168
  print("article text")
169
+ #print(article_text)
170
  try:
171
  ext = link.split(".")[-1].lower()
172
  if ext not in ["pdf", "docx", "xlsx"]:
 
179
  for keyword in keywords:
180
  if keyword.lower() in article_text.lower():
181
  if link not in output:
182
+ output.append([link,keyword.lower()])
183
+ print("link and keyword for article text: ", link, keyword)
184
  return output
185
  if keyword.lower() in title_snippet.lower():
186
  if link not in output:
187
+ output.append([link,keyword.lower()])
188
+ print("link and keyword for title: ", link, keyword)
189
  return output
190
  return output
191
 
192
  filtered = []
193
  if len(search_results) > 0:
194
  for link in search_results:
195
+ # if is_trusted_link(link):
196
+ # if link not in filtered:
197
+ # filtered.append(link)
198
+ # else:
199
+ print(link)
200
+ if link:
201
  output_link = is_relevant_title_snippet(link,saveLinkFolder, accession)
202
+ print("output link: ")
203
+ print(output_link)
204
+ for out_link in output_link:
205
+ if isinstance(out_link,list) and len(out_link) > 1:
206
+ print(out_link)
207
+ kw = out_link[1]
208
+ if accession and kw == accession.lower():
209
+ better_filter.append(out_link[0])
210
+ filtered.append(out_link[0])
211
+ else: filtered.append(out_link)
212
+ print("done with link and here is filter: ",filtered)
213
+ if better_filter:
214
+ filtered = better_filter
215
  return filtered
216
 
217
  def smart_google_search(metadata):