wzkariampuzha commited on
Commit
cdced10
1 Parent(s): aa32937

Upload extract_abs.py

Browse files
Files changed (1) hide show
  1. extract_abs.py +338 -0
extract_abs.py ADDED
@@ -0,0 +1,338 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import nltk
3
+ from nltk.corpus import stopwords
4
+ from nltk import tokenize
5
+ STOPWORDS = set(stopwords.words('english'))
6
+ import string
7
+ PUNCTUATION = set(char for char in string.punctuation)
8
+ import csv
9
+ import spacy
10
+ import re
11
+ from transformers import BertConfig, AutoModelForTokenClassification, BertTokenizer, pipeline
12
+ import numpy as np
13
+ import pandas as pd
14
+ import torch
15
+ import requests
16
+ import xml.etree.ElementTree as ET
17
+ import classify_abs
18
+ import json
19
+ import codecs
20
+ from unidecode import unidecode
21
+ from collections import OrderedDict
22
+ from typing import (
23
+ Dict,
24
+ List,
25
+ Tuple,
26
+ Set,
27
+ Optional,
28
+ Any,
29
+ Union,
30
+ )
31
+
32
+ ## Section: Dictionary Look-up for Disease Labeling
33
+ # This generates a dictionary of all GARD disease names. It is a dependency for get_diseases, autosearch, and all higher level functions that utilize those functions.
34
+ # GARD_dict, max_length = load_GARD_diseases()
35
def load_GARD_diseases(path: str = 'gard-id-name-synonyms.json') -> Tuple[Dict[str, str], int]:
    """Build a lookup of GARD disease names and synonyms to GARD IDs.

    Usage: GARD_dict, max_length = load_GARD_diseases()

    Args:
        path: JSON file holding a list of entries, each with the keys
            'name', 'gard_id' and (optionally) 'synonyms'. Defaults to the
            file name the original implementation hard-coded.

    Returns:
        A tuple ``(GARD_dict, max_length)``:
          * GARD_dict maps each lowercased, stripped name or synonym to the
            GARD ID of the first entry that introduced it.
          * max_length is the largest number of whitespace-separated words in
            any stored key (-1 when nothing was stored). get_diseases uses it
            to bound the length of the phrases it compares.
    """
    # 'utf-8-sig' transparently drops a leading BOM; the context manager makes
    # sure the file handle is closed (the original never closed it).
    with open(path, 'r', encoding='utf-8-sig') as handle:
        entries = json.load(handle)

    # Keys are disease names, values are GARD IDs: dict lookup is faster than
    # scanning a list.
    GARD_dict: Dict[str, str] = {}
    max_length = -1

    for entry in entries:
        gard_id = entry['gard_id']
        # Primary name first, then synonyms; 'synonyms' may be missing or null.
        candidates = [entry['name']] + list(entry.get('synonyms') or [])
        for raw_name in candidates:
            name = raw_name.lower().strip()
            # Duplicate check is done on the normalized key (the original
            # compared the raw name, so the check did not work as intended).
            # Stopwords and names of 5 characters or fewer are skipped.
            if name in GARD_dict or name in STOPWORDS or len(name) <= 5:
                continue
            GARD_dict[name] = gard_id
            max_length = max(max_length, len(name.split()))

    return GARD_dict, max_length
65
+
66
+ #Works much faster if broken down into sentences. Resulted in poorer testing when incorporating GARD_firstwd_dict.
67
+ #compares every phrase in a sentence to see if it matches anything in the GARD dictionary of diseases.
68
def get_diseases(sentence:str, GARD_dict:Dict[str,str], max_length:int) -> Tuple[List[str], List[str]]:
    """Find GARD disease names in one sentence by greedy longest-phrase matching.

    The sentence is tokenized with nltk.word_tokenize and every token is
    lowercased and stripped. At each token position the longest phrase (at
    most max_length tokens, joined by single spaces) is tried first and
    shortened one token at a time until it is a key of GARD_dict. On a match
    the scan resumes after the matched phrase.

    Returns:
        (diseases, ids): the matched phrases and, position by position, the
        GARD_dict value for each of them.
    """
    words = [token.lower().strip() for token in nltk.word_tokenize(sentence)]
    word_count = len(words)
    diseases = []
    ids = []

    start = 0
    while start < word_count:
        # Window can never be wider than max_length or run past the sentence end.
        widest = min(word_count - start, max_length)
        for width in range(widest, 0, -1):
            phrase = ' '.join(words[start:start + width])
            key = phrase.lower()
            if key in GARD_dict:
                diseases.append(phrase)
                ids.append(GARD_dict[key])
                # Jump to the last token of the match; the increment below
                # then moves past it.
                start += width - 1
                break
        start += 1

    return diseases, ids
92
+
93
+ ## Section: Prepare ML/DL Models
94
+ # This function prepares the model. Should be called before running in a notebook. -- The [Any] Type is a Huggingface Pipeline variable
95
+ # Default with typing from here: https://stackoverflow.com/questions/38727520/how-do-i-add-default-parameters-to-functions-when-using-type-hinting
96
def init_NER_pipeline(name_or_path_to_model_folder:str = "ncats/EpiExtract4GARD-v2") -> Tuple[Any, Set[str]]:
    """Load the token-classification model and wrap it in an NER pipeline.

    Usage: NER_pipeline, entities = init_NER_pipeline()

    Args:
        name_or_path_to_model_folder: model identifier or local folder passed
            to each ``from_pretrained`` call.

    Returns:
        (pipeline, labels): the 'ner' pipeline built with
        aggregation_strategy='simple', and the set of label names from the
        model config with the "O" label dropped and the one-character tag
        prefix (e.g. "B-", "I-") removed.
    """
    tokenizer = BertTokenizer.from_pretrained(name_or_path_to_model_folder)
    model = AutoModelForTokenClassification.from_pretrained(name_or_path_to_model_folder)
    ner_pipeline = pipeline('ner', model, tokenizer=tokenizer, aggregation_strategy='simple')

    config = BertConfig.from_pretrained(name_or_path_to_model_folder)
    # Anchor the pattern so only the leading "<char>-" tag prefix is removed.
    # The unanchored ".-" also ate any later "char + hyphen" pair inside a
    # hyphenated label name.
    labels = {re.sub(r"^.-", "", label) for label in config.label2id if label != "O"}
    return ner_pipeline, labels
104
+
105
+ ## Section: Information Acquisition
106
+ #moved PMID_getAb and search_getAbs to classify_abs.py
107
+
108
+ ## Section: Information Extraction
109
+ #Preprocessing function, turns abstracts into sentences
110
def str2sents(string:str) -> List[str]:
    """Clean an abstract and split it into sentences.

    Steps, in order: single-character ``<sup>x</sup>`` tags become ``^x``;
    other short tags (1-4 characters between the angle brackets) become a
    space; runs of spaces are collapsed; a handful of symbols are removed or
    spelled out; the text is transliterated with unidecode, stripped, and
    split with nltk's sent_tokenize.

    Returns:
        The list of sentences produced by tokenize.sent_tokenize.
    """
    # Each superscript keeps its OWN character. The original loop substituted
    # every <sup> tag with the first superscript's character on its first pass.
    string = re.sub(r'<sup>(.)</sup>', r'^\1', string)
    string = re.sub(r'<.{1,4}>', ' ', string)

    # Collapse runs of spaces. The pattern as found (" *") also matches the
    # empty string, which inserts a space between every pair of characters.
    string = re.sub(r' {2,}', ' ', string)

    # NOTE(review): the original replaced " " with " " (a no-op as written);
    # presumably the pattern was a non-breaking/thin space that got mangled --
    # confirm against the upstream file.
    string = re.sub('[\u00a0\u2009\u202f]', ' ', string)

    for symbol in ('™', '®', '•', '…'):
        string = string.replace(symbol, '')
    string = string.replace('♀', 'female').replace('♂', 'male')

    # strip() removes leading/trailing whitespace, which is what the original
    # "^ " and "$" substitutions were aiming at.
    string = unidecode(string).strip()
    return tokenize.sent_tokenize(string)
129
+
130
+ # Input: Sentences & Model Outputs
131
+ # Output: Dictionary with all entity types (dynamic to fit multiple models)
132
+ # model_outputs is list of NER_pipeline outputs
133
+ # labels are a set of all the possible entities (not including "O"). This is a misnomer. Was originally named "entities" but changed to not get confused with other code
134
def parse_info(sentences:List[str], model_outputs:List[List[Union[Dict[str,str],None]]], labels:Set, extract_diseases:bool, GARD_dict:Dict[str,str], max_length:int) -> Optional[Dict[str,Union[List[str],None]]]:
    """Group NER pipeline outputs by entity label and filter uninformative results.

    Args:
        sentences: the sentences the model was run on.
        model_outputs: one list of entity dicts per sentence; each dict is
            read for its 'word' and 'entity_group' keys.
        labels: all entity labels the model can emit (without "O"). Abstracted
            this way so models with different label sets can be used.
        extract_diseases: when True and 'DIS' is not one of ``labels``,
            diseases are looked up in GARD_dict via get_diseases and stored
            under 'DIS' (names) and 'IDS' (GARD IDs).
        GARD_dict: disease-name -> GARD ID lookup, forwarded to get_diseases.
        max_length: longest phrase length, forwarded to get_diseases.

    Returns:
        A dict mapping each label to its de-duplicated list of words (first
        occurrence order kept) or to None when nothing was found -- but only
        if 'EPI' is non-empty and at least one of 'STAT', 'LOC', 'DATE' is
        non-empty. Otherwise None. Note that 'DIS' and 'IDS' are de-duplicated
        independently, so they are not guaranteed to stay aligned by position.
    """
    # Do not use dict.fromkeys(labels, []) here: every key would share one list.
    # See: https://docs.python.org/3/library/stdtypes.html#dict.fromkeys
    output_dict = {label: [] for label in labels}
    for output in model_outputs:
        for entity_dict in output:
            if entity_dict is None:  # permitted by the type annotation
                continue
            group = entity_dict['entity_group']
            if group in output_dict:
                output_dict[group].append(entity_dict['word'])

    if extract_diseases and 'DIS' not in output_dict:
        output_dict['DIS'] = []
        output_dict['IDS'] = []
        for sentence in sentences:
            diseases, ids = get_diseases(sentence, GARD_dict, max_length)
            output_dict['DIS'] += diseases
            output_dict['IDS'] += ids

    # Clean up: empty -> None, otherwise drop duplicates while keeping order.
    for entity, found in output_dict.items():
        output_dict[entity] = list(OrderedDict.fromkeys(found)) if found else None

    # .get() keeps models without these labels from raising KeyError.
    has_context = any(output_dict.get(key) for key in ('STAT', 'LOC', 'DATE'))
    if output_dict.get('EPI') and has_context:
        return output_dict
    return None
164
+
165
+ #These are the three main functions that can be called in a notebook.
166
+ #Extracts Disease GARD ID, Disease Name, Location, Epidemiologic Identifier, Epidemiologic Statistic, etc. given a PubMed ID
167
+ #Dynamic dictionary output to fit multiple models
168
def PMID_extraction(pmid:Union[str,int], NER_pipeline:Any, labels:Union[Set[str],List[str]], GARD_dict:Dict[str,str], max_length:int, extract_diseases:bool = True) -> Dict[str,Union[str,List[str],None]]:
    """Fetch the abstract for a PubMed ID and extract entities from it.

    Usage: extraction = PMID_extraction(pmid, NER_pipeline, labels, GARD_dict, max_length)

    Args:
        pmid: PubMed ID handed to classify_abs.PMID_getAb.
        NER_pipeline: callable applied to each sentence.
        labels: entity labels the model can emit.
        GARD_dict, max_length: forwarded to parse_info for disease look-up.
        extract_diseases: forwarded to parse_info. The original call omitted
            this argument, which made parse_info raise TypeError; it is now a
            keyword with a default so existing callers keep working.

    Returns:
        A dict with one key per label plus 'ABSTRACT'.
          * Abstract missing/too short (5 characters or fewer): every label is
            "N/A" and 'ABSTRACT' is '*ABSTRACT NOT FOUND*'.
          * parse_info returned None: every label is None and 'ABSTRACT'
            holds the text.
          * Otherwise: parse_info's dict with 'ABSTRACT' added.
    """
    text = classify_abs.PMID_getAb(pmid)
    if not text or len(text) <= 5:
        output_dict = dict.fromkeys(['ABSTRACT'] + list(labels), "N/A")
        output_dict['ABSTRACT'] = '*ABSTRACT NOT FOUND*'
        return output_dict

    sentences = str2sents(text)
    model_outputs = [NER_pipeline(sent) for sent in sentences]
    output_dict = parse_info(sentences, model_outputs, labels, extract_diseases, GARD_dict, max_length)
    if output_dict is None:
        # parse_info yields None when its EPI + (STAT/LOC/DATE) filter fails;
        # still return a well-formed dict instead of crashing on item assignment.
        output_dict = dict.fromkeys(labels)
    output_dict['ABSTRACT'] = text
    return output_dict
182
+
183
+ #Can search by 7-digit GARD_ID, 12-digit "GARD:{GARD_ID}", matched search term, or arbitrary search term
184
+ #Returns list of terms to search by
185
+ # search_term_list = autosearch(search_term, GARD_dict)
186
def autosearch(searchterm:Union[str,int], GARD_dict:Dict[str,str], matching=2) -> List[str]:
    """Expand a search term into every GARD name/synonym for the same disease.

    Usage: search_term_list = autosearch(search_term, GARD_dict)

    Accepted forms of ``searchterm``:
      * 'GARD:0000001' (12 characters, case-insensitive prefix),
      * a GARD number as int or digit string of at most 7 digits
        (777, '777', '00777', '0000777'),
      * a disease name or synonym that is a key of GARD_dict,
      * anything else (returned unchanged as the only search term).

    Args:
        searchterm: the user's input.
        GARD_dict: disease-name -> GARD ID lookup.
        matching: number of matching attempts. After each failed attempt
            spaces are replaced by hyphens (an easily fixed input variation)
            before trying again.

    Returns:
        All GARD_dict keys sharing the matched GARD ID, or ``[searchterm]``
        (as a stripped string) when nothing matched.

    Raises:
        ValueError: the term consists only of digits and has more than 7 of them.
    """
    # Comparisons below only handle strings; this also allows int input.
    user_input = str(searchterm).strip()
    # GARD_dict keys are lowercase.
    candidate = user_input.lower()

    for _ in range(max(matching, 0)):
        gard_id = None
        if candidate.startswith('gard:') and len(candidate) == 12:
            gard_id = 'GARD:' + candidate[5:]
        elif candidate.isdigit():
            # Only an all-digit term is a GARD number. The original tested just
            # the first and last character, so names such as "1p36 ... 2" were
            # mangled into bogus IDs (and "" raised IndexError).
            if len(candidate) > 7:
                raise ValueError('GARD ID IS NOT VALID. RE-ENTER SEARCH TERM')
            gard_id = 'GARD:' + candidate.zfill(7)
        elif candidate in GARD_dict:
            # Treat the GARD ID as the lemma and the search term as one of its
            # forms: map the form to the ID, then collect every form of that ID.
            gard_id = GARD_dict[candidate]

        if gard_id is not None:
            matches = [name for name, known_id in GARD_dict.items() if known_id == gard_id]
            if matches:
                print("SEARCH TERM MATCHED TO GARD DICTIONARY. SEARCHING FOR: ", matches)
                return matches

        # This can be replaced with some other common, easily fixed input error.
        candidate = candidate.replace(' ', '-')

    print("SEARCH TERM DID NOT MATCH TO GARD DICTIONARY. SEARCHING BY USER INPUT")
    # Return what the user actually typed, not the lowercased/hyphenated probe.
    return [user_input]
228
+
229
+ #This ensures that there is a standardized ordering of df columns while ensuring dynamics with multiple models. This is used by search_term_extraction.
230
def order_labels(entity_classes:Union[Set[str],List[str]]) -> List[str]:
    """Return the entity labels in a standardized column order.

    Known labels come first, in the fixed order DIS, ABRV, EPI, STAT, LOC,
    DATE, SEX, ETHN. Any other labels (e.g. from yet-to-be-created models) are
    appended afterwards: in input order for ordered inputs, and sorted
    alphabetically when ``entity_classes`` is a set, so the result does not
    depend on set iteration order.
    """
    label_order = ['DIS','ABRV','EPI','STAT','LOC','DATE','SEX','ETHN']
    ordered_labels = [label for label in label_order if label in entity_classes]

    extras = [entity for entity in entity_classes if entity not in label_order]
    if isinstance(entity_classes, (set, frozenset)):
        # Set iteration order of strings can differ between runs; sort so the
        # resulting column order is reproducible.
        extras.sort()
    return ordered_labels + extras
239
+
240
+ #Given a search term and max results to return, this will acquire PubMed IDs and Title+Abstracts and Classify them as epidemiological.
241
+ #It then extracts Epidemiologic Information[Disease GARD ID, Disease Name, Location, Epidemiologic Identifier, Epidemiologic Statistic] for each abstract
242
+ # results = search_term_extraction(search_term, maxResults, filtering, NER_pipeline, entity_classes, extract_diseases, GARD_dict, max_length, classify_model_vars)
243
+ #Returns a Pandas dataframe
244
def search_term_extraction(search_term:Union[int,str], maxResults:int, filtering:str, #for abstract search
                           NER_pipeline:Any, entity_classes:Union[Set[str],List[str]], #for biobert extraction
                           extract_diseases:bool, GARD_dict:Dict[str,str], max_length:int, #for disease extraction
                           classify_model_vars:Tuple[Any,Any,Any,Any,Any]) -> Any: #for classification
    """Search for abstracts, keep those classified as epidemiological, and extract entities.

    Steps: expand the search term with autosearch, fetch {pmid: abstract} via
    classify_abs.search_getAbs, classify each abstract with
    classify_abs.getTextPredictions, and for positives run the NER pipeline
    sentence by sentence and collect the parse_info result.

    Returns:
        A pandas DataFrame with the columns PMID, ABSTRACT, EPI_PROB, IsEpi,
        (IDS, DIS when extract_diseases is True) and the ordered entity
        labels, sorted by EPI_PROB in descending order. Only abstracts for
        which parse_info returned a result get a row.
    """
    #Format of Output
    ordered_labels = order_labels(entity_classes)
    columns = ['PMID', 'ABSTRACT','EPI_PROB','IsEpi']
    if extract_diseases:
        columns += ['IDS','DIS']
    # De-duplicate while keeping order: 'DIS' would otherwise appear twice when
    # the model itself has a DIS label.
    columns = list(OrderedDict.fromkeys(columns + ordered_labels))

    ##Check to see if search term maps to anything in the GARD dictionary, if so it pulls up all synonyms for the search
    search_term_list = autosearch(search_term, GARD_dict)

    #Gather title+abstracts into a dictionary {pmid:abstract}
    pmid_abs = classify_abs.search_getAbs(search_term_list, maxResults,filtering)

    # Rows are collected in a list and turned into a DataFrame once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame on
    # every call.
    rows = []
    for pmid, abstract in pmid_abs.items():
        epi_prob, isEpi = classify_abs.getTextPredictions(abstract, classify_model_vars)
        if not isEpi:
            continue
        #Preprocessing Functions for Extraction
        sentences = str2sents(abstract)
        model_outputs = [NER_pipeline(sent) for sent in sentences]
        extraction = parse_info(sentences, model_outputs, entity_classes, extract_diseases, GARD_dict, max_length)
        if extraction:
            extraction.update({'PMID':pmid, 'ABSTRACT':abstract, 'EPI_PROB':epi_prob, 'IsEpi':isEpi})
            rows.append(extraction)

    results = pd.DataFrame(rows, columns=columns)
    print(len(results),'abstracts classified as epidemiological.')
    return results.sort_values('EPI_PROB', ascending=False)
278
+
279
+ #Identical to search_term_extraction, except it returns a JSON object instead of a df
280
def API_extraction(search_term:Union[int,str], maxResults:int, filtering:str, #for abstract search
                   NER_pipeline:Any, entity_classes:Union[Set[str],List[str]], #for biobert extraction
                   extract_diseases:bool, GARD_dict:Dict[str,str], max_length:int, #for disease extraction
                   classify_model_vars:Tuple[Any,Any,Any,Any,Any]) -> Any: #for classification
    """Same pipeline as search_term_extraction, but returns a JSON string.

    Returns:
        ``json.dumps({'entries': [...]})`` where each entry holds PMID,
        ABSTRACT, EPI_PROB (as a string), IsEpi, (IDS, DIS when
        extract_diseases is True) and the ordered entity labels. Entries are
        sorted by EPI_PROB in descending order; a key parse_info did not
        produce is emitted as null.
    """
    #Format of Output
    ordered_labels = order_labels(entity_classes)
    json_output = ['PMID', 'ABSTRACT','EPI_PROB','IsEpi']
    if extract_diseases:
        json_output += ['IDS','DIS']
    # De-duplicate while keeping order ('DIS' may also be a model label).
    json_output = list(OrderedDict.fromkeys(json_output + ordered_labels))

    results = {'entries':[]}

    ##Check to see if search term maps to anything in the GARD dictionary, if so it pulls up all synonyms for the search
    search_term_list = autosearch(search_term, GARD_dict)

    #Gather title+abstracts into a dictionary {pmid:abstract}
    pmid_abs = classify_abs.search_getAbs(search_term_list, maxResults,filtering)

    for pmid, abstract in pmid_abs.items():
        epi_prob, isEpi = classify_abs.getTextPredictions(abstract, classify_model_vars)
        if not isEpi:
            continue
        #Preprocessing Functions for Extraction
        sentences = str2sents(abstract)
        model_outputs = [NER_pipeline(sent) for sent in sentences]
        extraction = parse_info(sentences, model_outputs, entity_classes, extract_diseases, GARD_dict, max_length)
        if extraction:
            # bool() is a no-op for a Python bool.
            # NOTE(review): presumably isEpi can be a numpy bool, which json cannot serialize -- confirm.
            extraction.update({'PMID':pmid, 'ABSTRACT':abstract, 'EPI_PROB':epi_prob, 'IsEpi':bool(isEpi)})
            # .get(): 'IDS' is absent when the model supplies its own DIS label,
            # and indexing would raise KeyError.
            entry = OrderedDict((term, extraction.get(term)) for term in json_output)
            results['entries'].append(entry)

    #sort while EPI_PROB is still numeric
    results['entries'].sort(reverse=True, key=lambda entry: entry['EPI_PROB'])

    #The probability is not JSON serializable as returned, so convert every epi_prob to str
    for entry in results['entries']:
        entry['EPI_PROB'] = str(entry['EPI_PROB'])

    return json.dumps(results)
323
+
324
+ #Extract if you already have the text and you do not want epi_predictions (this makes things much faster)
325
+ #extraction = abstract_extraction(text, NER_pipeline, labels, GARD_dict, max_length)
326
def abstract_extraction(text:str, NER_pipeline:Any, entity_classes:Union[Set[str],List[str]], GARD_dict:Dict[str,str], max_length:int, extract_diseases:bool = True) -> Dict[str,Union[str,List[str],None]]:
    """Extract entities from text you already have, skipping epi classification.

    Usage: extraction = abstract_extraction(text, NER_pipeline, labels, GARD_dict, max_length)

    Args:
        text: the abstract text.
        NER_pipeline: callable applied to each sentence.
        entity_classes: entity labels the model can emit.
        GARD_dict, max_length: forwarded to parse_info for disease look-up.
        extract_diseases: forwarded to parse_info. The original call omitted
            this argument, which made parse_info raise TypeError; it is now a
            keyword with a default so existing callers keep working.

    Returns:
        A dict with one key per entity class plus 'ABSTRACT'.
          * Text missing/too short (5 characters or fewer): every class is
            "N/A" and 'ABSTRACT' is '*ABSTRACT NOT FOUND*'.
          * parse_info returned None: every class is None and 'ABSTRACT'
            holds the text.
          * Otherwise: parse_info's dict with 'ABSTRACT' added.
    """
    if not text or len(text) <= 5:
        output_dict = dict.fromkeys(['ABSTRACT'] + list(entity_classes), "N/A")
        output_dict['ABSTRACT'] = '*ABSTRACT NOT FOUND*'
        return output_dict

    sentences = str2sents(text)
    model_outputs = [NER_pipeline(sent) for sent in sentences]
    output_dict = parse_info(sentences, model_outputs, entity_classes, extract_diseases, GARD_dict, max_length)
    if output_dict is None:
        # parse_info yields None when its EPI + (STAT/LOC/DATE) filter fails;
        # still return a well-formed dict instead of crashing on item assignment.
        output_dict = dict.fromkeys(entity_classes)
    output_dict['ABSTRACT'] = text
    return output_dict