Spaces: Updated classification model and GARD_search data (#1)
opened by wzkariampuzha

epi_pipeline.py CHANGED (+52 -92)
@@ -221,69 +221,38 @@ def streamlit_getAbs(searchterm_list:Union[List[str],List[int],str], maxResults:
     return pmid_abs, (found, relevant)

 ## Section: LSTM RNN Epi Classification Model (EpiClassify4GARD)
-import os
-os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
-from tensorflow.keras.preprocessing.sequence import pad_sequences
-from tensorflow.keras.preprocessing.text import tokenizer_from_json
-import tensorflow as tf
-import numpy as np
-import spacy
-import json
+# Imports
+from transformers import AutoModelForSequenceClassification, BertTokenizer, BertConfig

 class Classify_Pipeline:
-    def __init__(self, ...
-        # ...
-        self....
-        self....
-        self....
-        ...
-            self.classify_tokenizer = tokenizer_from_json(json.load(f))
-        #OLD Code - used pickle which is unsafe
-        #with open(model+'/tokenizer.pickle', 'rb') as handle:
-        #    import pickle
-        #    self.classify_tokenizer = pickle.load(handle)
-        # Defaults to load my_model_orphanet_final, the most up-to-date version of the classification model,
-        # but can also be run on any other tf.keras model
-
-        # load the model
-        self.classify_model = tf.keras.models.load_model(model_name)
-        # for preprocessing
-        from nltk.corpus import stopwords
-        self.STOPWORDS = set(stopwords.words('english'))
-        # Modes
-        self.max_length = 300
-        self.trunc_type = 'post'
-        self.padding_type = 'post'
+    def __init__(self, name_or_path_to_model_folder:str = "ncats/EpiClassify4GARD"):
+        #Initialize tokenizer and model
+        self.config = BertConfig.from_pretrained(name_or_path_to_model_folder)
+        self.tokenizer = BertTokenizer.from_pretrained(self.config._name_or_path, model_max_length=self.config.max_position_embeddings)
+        self.model = AutoModelForSequenceClassification.from_pretrained(name_or_path_to_model_folder,config=self.config)

+    #Custom pipeline by WKariampuzha @NCATS (not Huggingface/Google/NVIDIA copyright)
     def __str__(self) -> str:
-        return "Instantiation: epi_classify = Classify_Pipeline(...
+        return "Instantiation: epi_classify = Classify_Pipeline(name_or_path_to_model_folder)" +"\n Calling: prob, isEpi = epi_classify(text) \n PubMed ID Predictions: abstracts, prob, isEpi = epi_classify.getPMIDPredictions(pmid)"

     def __call__(self, abstract:str) -> Tuple[float,bool]:
         return self.getTextPredictions(abstract)

     def getTextPredictions(self, abstract:str) -> Tuple[float,bool]:
         if len(abstract)>5:
-            # ...
-            ...
-            prob = y_pred1[0][1]
-            if y_pred == 1:
-                isEpi = True
-            else:
-                isEpi = False
+            # Tokenize: truncate/pad the abstract to the model's maximum input length
+            input_ids = self.tokenizer(text=abstract, max_length=self.config.max_position_embeddings,padding="max_length",truncation=True,return_tensors='pt')
+            if len(input_ids)>self.config.max_position_embeddings:
+                raise InputError(f"Token Embeddings of size {input_ids} exceed length for maximum model embedding input {self.config.max_position_embeddings}.")
+            #split into sentences?
+            # The model outputs logits for two classes: [[score_False, score_True]]
+            output = self.model(**input_ids)
+            # True = 1, False = 0
+            isEpi = bool(output.logits.argmax().item())
+            # softmax output is a Torch Tensor with two classes [[prob_is_False,prob_is_True]]
+            prob_tensor = output.logits.softmax(dim=-1)
+            # We only want to return the probability that it is true
+            prob = prob_tensor.data[0][1].item()
             return prob, isEpi
         else:
             return 0.0, False
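Reviewer note: a minimal end-to-end sketch of the new classification path, for anyone trying this branch locally. It assumes `torch` and `transformers` are installed and that the `ncats/EpiClassify4GARD` checkpoint is reachable; the sample abstract is made up.

```python
import torch
from transformers import AutoModelForSequenceClassification, BertTokenizer, BertConfig

config = BertConfig.from_pretrained("ncats/EpiClassify4GARD")
tokenizer = BertTokenizer.from_pretrained("ncats/EpiClassify4GARD",
                                          model_max_length=config.max_position_embeddings)
model = AutoModelForSequenceClassification.from_pretrained("ncats/EpiClassify4GARD",
                                                           config=config)

# Tokenize one made-up abstract the same way getTextPredictions does
inputs = tokenizer("We estimated the prevalence of this rare disease in Spain.",
                   max_length=config.max_position_embeddings,
                   padding="max_length", truncation=True, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits         # shape [1, 2]: [score_False, score_True]
prob = logits.softmax(dim=-1)[0][1].item()  # probability the text is epidemiological
isEpi = bool(logits.argmax().item())        # class 1 == True
print(prob, isEpi)
```

Since `truncation=True` already caps the encoding at `max_position_embeddings`, the `InputError` branch above looks unreachable in practice; worth confirming that is intended.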
@@ -292,36 +261,6 @@ class Classify_Pipeline:
         abstract = PMID_getAb(pmid)
         prob, isEpi = self.getTextPredictions(abstract)
         return abstract, prob, isEpi
-
-    # Standardize the abstract by replacing all named entities with their entity label.
-    # Eg. 3 patients reported at a clinic in England --> CARDINAL patients reported at a clinic in GPE
-    # expects the spaCy model en_core_web_lg as input
-    def standardizeAbstract(self, abstract:str) -> str:
-        doc = self.nlp(abstract)
-        newAbstract = abstract
-        for e in reversed(doc.ents):
-            if e.label_ in {'PERCENT','CARDINAL','GPE','LOC','DATE','TIME','QUANTITY','ORDINAL'}:
-                start = e.start_char
-                end = start + len(e.text)
-                newAbstract = newAbstract[:start] + e.label_ + newAbstract[end:]
-        return newAbstract
-
-    # Same as above but replaces biomedical named entities from scispaCy models
-    # Expects as input en_ner_bc5cdr_md and en_ner_bionlp13cg_md
-    def standardizeSciTerms(self, abstract:str) -> str:
-        doc = self.nlpSci(abstract)
-        newAbstract = abstract
-        for e in reversed(doc.ents):
-            start = e.start_char
-            end = start + len(e.text)
-            newAbstract = newAbstract[:start] + e.label_ + newAbstract[end:]
-
-        doc = self.nlpSci2(newAbstract)
-        for e in reversed(doc.ents):
-            start = e.start_char
-            end = start + len(e.text)
-            newAbstract = newAbstract[:start] + e.label_ + newAbstract[end:]
-        return newAbstract

 ## Section: GARD SEARCH
 # can identify rare diseases in text using the GARD dictionary from neo4j
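A note on the deleted `standardizeAbstract`/`standardizeSciTerms` helpers: they walked `doc.ents` in reverse so that splicing a label into the string never invalidated the character offsets of earlier entities. A small sketch of that trick, assuming spaCy and the `en_core_web_lg` model are installed:

```python
import spacy

nlp = spacy.load("en_core_web_lg")
text = "3 patients reported at a clinic in England"
doc = nlp(text)
# Replace later entities first so earlier start_char offsets stay valid
for e in reversed(doc.ents):
    if e.label_ in {'PERCENT','CARDINAL','GPE','LOC','DATE','TIME','QUANTITY','ORDINAL'}:
        text = text[:e.start_char] + e.label_ + text[e.start_char + len(e.text):]
print(text)  # -> "CARDINAL patients reported at a clinic in GPE"
```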
@@ -331,11 +270,13 @@ class GARD_Search:
     def __init__(self):
         import json, codecs
         #These are opened locally so that garbage collection removes them from memory
-        ...
-        ...
+        r = requests.get('https://raw.githubusercontent.com/ncats/epi4GARD/master/EpiExtract4GARD/gard-id-name-synonyms.json')
+        diseases = json.loads(r.content)
         from nltk.corpus import stopwords
         STOPWORDS = set(stopwords.words('english'))

+        #This should be a list of all GARD IDs for purposes like random choice
+        GARD_id_list = [entry['gard_id'] for entry in diseases]
         #keys are going to be disease names, values are going to be the GARD ID, set up this way bc dictionaries are faster lookup than lists
         GARD_dict = {}
         #Find out what the length of the longest disease name sequence is, of all names and synonyms. This is used by get_diseases
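To sanity-check the new remote data source, the JSON can be fetched and inspected directly. This sketch reuses the URL from the diff and assumes each entry carries the `gard_id` field that the new list comprehension expects:

```python
import json
import requests

r = requests.get('https://raw.githubusercontent.com/ncats/epi4GARD/master/'
                 'EpiExtract4GARD/gard-id-name-synonyms.json')
r.raise_for_status()
diseases = json.loads(r.content)
GARD_id_list = [entry['gard_id'] for entry in diseases]
print(len(GARD_id_list), GARD_id_list[:3])
```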
@@ -356,6 +297,7 @@ class GARD_Search:
                 GARD_dict[s] = entry['gard_id']
                 max_length = max(max_length,len(s.split()))

+        self.GARD_id_list = GARD_id_list
         self.GARD_dict = GARD_dict
         self.max_length = max_length

@@ -444,6 +386,12 @@ class GARD_Search:
             print("SEARCH TERM DID NOT MATCH TO GARD DICTIONARY. SEARCHING BY USER INPUT")
             return [searchterm]

+    # Return a random GARD_ID Search Term list
+    def random_disease(self) -> List[str]:
+        import random
+        gard_id = random.choice(self.GARD_id_list)
+        return self.autosearch(gard_id)
+
 ## Section: BioBERT-based epidemiology NER Model (EpiExtract4GARD)
 from nltk import tokenize as nltk_tokenize
 from dataclasses import dataclass
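Usage of the new helper, assuming `GARD_Search()` can reach GitHub at construction time:

```python
rd_search = GARD_Search()
terms = rd_search.random_disease()  # search terms for one randomly chosen GARD ID
print(terms)
```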
@@ -455,6 +403,7 @@ import re
 from transformers import BertConfig, AutoModelForTokenClassification, BertTokenizer, Trainer
 from unidecode import unidecode
 from collections import OrderedDict
+import json
 import pandas as pd
 from more_itertools import pairwise

@@ -855,6 +804,11 @@ def API_search_classification(search_term:Union[int,str], maxResults:int,

     return results

+def API_PMID_classification(pmid:Union[int,str], epi_classify:Classify_Pipeline) -> Dict[str,str]:
+    text = PMID_getAb(pmid)
+    epi_prob, isEpi = epi_classify(text)
+    return {'PMID':pmid,'ABSTRACT':text, 'EPI_PROB':str(epi_prob), 'IsEpi':isEpi}
+
 def API_text_classification(text:str,epi_classify:Classify_Pipeline) -> Dict[str,str]:
     epi_prob, isEpi = epi_classify(text)
     return {'ABSTRACT':text, 'EPI_PROB':str(epi_prob), 'IsEpi':isEpi}
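A quick call of the new endpoint helper, which composes `PMID_getAb` with the classifier; the PMID below is an arbitrary example:

```python
epi_classify = Classify_Pipeline()
result = API_PMID_classification(21078916, epi_classify)  # arbitrary example PMID
print(result['PMID'], result['EPI_PROB'], result['IsEpi'])
```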
@@ -900,7 +854,7 @@ def search_term_extraction(search_term:Union[int,str], maxResults:int, filtering
     print(len(results),'abstracts classified as epidemiological.')
     return results.sort_values('EPI_PROB', ascending=False)

-#Returns a Pandas dataframe
+#Returns a Pandas dataframe
 def streamlit_extraction(search_term:Union[int,str], maxResults:int, filtering:str, #for abstract search
                          epi_ner:NER_Pipeline, #for biobert extraction
                          GARD_Search:GARD_Search, extract_diseases:bool, #for disease extraction
@@ -1021,7 +975,7 @@ def API_text_extraction(text:str, #Text to be extracted
     else:
         json_output = ['ABSTRACT']+ordered_labels

-
+    extraction = dict()
     #Do the extraction
     if extract_diseases:
         extraction = epi_ner(text, GARD_Search)
@@ -1031,15 +985,17 @@ def API_text_extraction(text:str, #Text to be extracted
     if extraction:
         #Re-order the dictionary into desired JSON output
         extraction = OrderedDict([(term, extraction[term]) for term in json_output if term in extraction.keys()])
-
+    else:
+        #This may return JSONs of different length than above
+        extraction = OrderedDict([(term, []) for term in json_output])

-    return ...
+    return extraction

 def API_text_classification_extraction(text:str, #Text to be extracted
                                        epi_ner:NER_Pipeline, #for biobert extraction
                                        GARD_Search:GARD_Search, extract_diseases:bool, #for disease extraction
                                        epi_classify:Classify_Pipeline) -> Dict[str,str]:
-
+
     #Format of Output
     ordered_labels = order_labels(epi_ner.labels)
     if extract_diseases:
@@ -1061,7 +1017,11 @@ def API_text_classification_extraction(text:str, #Text to be extracted

         #Re-order the dictionary into desired JSON output
         output = OrderedDict([(term, extraction[term]) for term in json_output if term in extraction.keys()])
-
+    else:
+        #This may return JSONs of different length than above
+        output = OrderedDict([(term, []) for term in json_output])
+
+    return output

 ## Section: Deprecated Functions
 import requests
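The new `else` branches in `API_text_extraction` and `API_text_classification_extraction` guarantee callers always receive an object with the full key set, each key mapped to an empty list when nothing was extracted. A sketch of that fallback shape, with an illustrative label order (the real one comes from `order_labels(epi_ner.labels)`):

```python
from collections import OrderedDict

json_output = ['ABSTRACT', 'DIS', 'LOC', 'EPI', 'STAT']  # illustrative labels only
extraction = OrderedDict([(term, []) for term in json_output])
print(extraction)
# OrderedDict([('ABSTRACT', []), ('DIS', []), ('LOC', []), ('EPI', []), ('STAT', [])])
```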