updated GARD search and Classify_Pipeline

#2
Files changed (1) hide show
  1. epi_pipeline.py +61 -96
epi_pipeline.py CHANGED
@@ -6,7 +6,7 @@ from typing import List, Dict, Union, Optional, Set, Tuple
6
 
7
  ## This software/database is a "United States Government Work" under the terms of the United States Copyright Act. It was written as part of the author's official duties as United States Government employee and thus cannot be copyrighted. This software is freely available to the public for use. The National Center for Advancing Translational Science (NCATS) and the U.S. Government have not placed any restriction on its use or reproduction. Although all reasonable efforts have been taken to ensure the accuracy and reliability of the software and data, the NCATS and the U.S. Government do not and cannot warrant the performance or results that may be obtained by using this software or data. The NCATS and the U.S. Government disclaim all warranties, express or implied, including warranties of performance, merchantability or fitness for any particular purpose. Please cite the authors in any work or product based on this material.
8
 
9
- # Written by William Kariampuzha @ NIH/NCATS. Adapted from code written by Jennifer John, et al.
10
  # The transformer-based pipeline code has its own copyright notice under the Apache License.
11
  # The code was compiled into a single python file to make adding additional features and importing into other modules easy.
12
  # Each section has its own import statements to facilitate clean code reuse, except for typing which applies to all.
@@ -91,7 +91,7 @@ def search_getAbs(searchterm_list:Union[List[str],List[int],str], maxResults:int
91
  if pmid[0].isdigit():
92
  pmids.add(pmid)
93
 
94
- #Construct sets for filtering (right before adding abstract to pmid_abs
95
  # The purpose of this is to do a second check of the abstracts, filters out any abstracts unrelated to the search terms
96
  #if filtering is 'lenient' or default
97
  if filtering !='none' or filtering !='strict':
@@ -220,70 +220,39 @@ def streamlit_getAbs(searchterm_list:Union[List[str],List[int],str], maxResults:
220
 
221
  return pmid_abs, (found, relevant)
222
 
223
- ## Section: LSTM RNN Epi Classification Model (EpiClassify4GARD)
224
- import os
225
- os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
226
- from tensorflow.keras.preprocessing.sequence import pad_sequences
227
- from tensorflow.keras.preprocessing.text import tokenizer_from_json
228
- import tensorflow as tf
229
- import numpy as np
230
- import spacy
231
- import json
232
 
 
 
233
  class Classify_Pipeline:
234
- def __init__(self,model_name:str='LSTM_RNN_Model'):
235
- #Load spaCy models
236
- self.nlp = spacy.load('en_core_web_lg')
237
- self.nlpSci = spacy.load("en_ner_bc5cdr_md")
238
- self.nlpSci2 = spacy.load('en_ner_bionlp13cg_md')
239
- # load the tokenizer
240
- with open(model_name+'/tokenizer.json') as f:
241
- self.classify_tokenizer = tokenizer_from_json(json.load(f))
242
- #OLD Code - used pickle which is unsafe
243
- #with open(model+'/tokenizer.pickle', 'rb') as handle:
244
- # import pickle
245
- # self.classify_tokenizer = pickle.load(handle)
246
- # Defaults to load my_model_orphanet_final, the most up-to-date version of the classification model,
247
- # but can also be run on any other tf.keras model
248
-
249
- # load the model
250
- self.classify_model = tf.keras.models.load_model(model_name)
251
- # for preprocessing
252
- from nltk.corpus import stopwords
253
- self.STOPWORDS = set(stopwords.words('english'))
254
- # Modes
255
- self.max_length = 300
256
- self.trunc_type = 'post'
257
- self.padding_type = 'post'
258
-
259
  def __str__(self) -> str:
260
- return "Instantiation: epi_classify = Classify_Pipeline(path_to_model_folder)" +"\n Calling: prob, isEpi = epi_classify(text) \n PubMed ID Predictions: abstracts, prob, isEpi = epi_classify.getPMIDPredictions(pmid)"
261
 
262
  def __call__(self, abstract:str) -> Tuple[float,bool]:
263
  return self.getTextPredictions(abstract)
264
 
265
  def getTextPredictions(self, abstract:str) -> Tuple[float,bool]:
266
  if len(abstract)>5:
267
- # remove stopwords
268
- for word in self.STOPWORDS:
269
- token = ' ' + word + ' '
270
- abstract = abstract.replace(token, ' ')
271
- abstract = abstract.replace(' ', ' ')
272
-
273
- # preprocess abstract
274
- abstract_standard = [self.standardizeAbstract(self.standardizeSciTerms(abstract))]
275
- sequence = self.classify_tokenizer.texts_to_sequences(abstract_standard)
276
- padded = pad_sequences(sequence, maxlen=self.max_length, padding=self.padding_type, truncating=self.trunc_type)
277
-
278
- y_pred1 = self.classify_model.predict(padded) # generate prediction
279
- y_pred = np.argmax(y_pred1, axis=1) # get binary prediction
280
-
281
- prob = y_pred1[0][1]
282
- if y_pred == 1:
283
- isEpi = True
284
- else:
285
- isEpi = False
286
-
287
  return prob, isEpi
288
  else:
289
  return 0.0, False
@@ -292,36 +261,6 @@ class Classify_Pipeline:
292
  abstract = PMID_getAb(pmid)
293
  prob, isEpi = self.getTextPredictions(abstract)
294
  return abstract, prob, isEpi
295
-
296
- # Standardize the abstract by replacing all named entities with their entity label.
297
- # Eg. 3 patients reported at a clinic in England --> CARDINAL patients reported at a clinic in GPE
298
- # expects the spaCy model en_core_web_lg as input
299
- def standardizeAbstract(self, abstract:str) -> str:
300
- doc = self.nlp(abstract)
301
- newAbstract = abstract
302
- for e in reversed(doc.ents):
303
- if e.label_ in {'PERCENT','CARDINAL','GPE','LOC','DATE','TIME','QUANTITY','ORDINAL'}:
304
- start = e.start_char
305
- end = start + len(e.text)
306
- newAbstract = newAbstract[:start] + e.label_ + newAbstract[end:]
307
- return newAbstract
308
-
309
- # Same as above but replaces biomedical named entities from scispaCy models
310
- # Expects as input en_ner_bc5cdr_md and en_ner_bionlp13cg_md
311
- def standardizeSciTerms(self, abstract:str) -> str:
312
- doc = self.nlpSci(abstract)
313
- newAbstract = abstract
314
- for e in reversed(doc.ents):
315
- start = e.start_char
316
- end = start + len(e.text)
317
- newAbstract = newAbstract[:start] + e.label_ + newAbstract[end:]
318
-
319
- doc = self.nlpSci2(newAbstract)
320
- for e in reversed(doc.ents):
321
- start = e.start_char
322
- end = start + len(e.text)
323
- newAbstract = newAbstract[:start] + e.label_ + newAbstract[end:]
324
- return newAbstract
325
 
326
  ## Section: GARD SEARCH
327
  # can identify rare diseases in text using the GARD dictionary from neo4j
@@ -331,11 +270,18 @@ class GARD_Search:
331
  def __init__(self):
332
  import json, codecs
333
  #These are opened locally so that garbage collection removes them from memory
334
- with codecs.open('gard-id-name-synonyms.json', 'r', 'utf-8-sig') as f:
335
- diseases = json.load(f)
 
 
 
 
 
336
  from nltk.corpus import stopwords
337
  STOPWORDS = set(stopwords.words('english'))
338
 
 
 
339
  #keys are going to be disease names, values are going to be the GARD ID, set up this way bc dictionaries are faster lookup than lists
340
  GARD_dict = {}
341
  #Find out what the length of the longest disease name sequence is, of all names and synonyms. This is used by get_diseases
@@ -356,6 +302,7 @@ class GARD_Search:
356
  GARD_dict[s] = entry['gard_id']
357
  max_length = max(max_length,len(s.split()))
358
 
 
359
  self.GARD_dict = GARD_dict
360
  self.max_length = max_length
361
 
@@ -444,6 +391,12 @@ class GARD_Search:
444
  print("SEARCH TERM DID NOT MATCH TO GARD DICTIONARY. SEARCHING BY USER INPUT")
445
  return [searchterm]
446
 
 
 
 
 
 
 
447
  ## Section: BioBERT-based epidemiology NER Model (EpiExtract4GARD)
448
  from nltk import tokenize as nltk_tokenize
449
  from dataclasses import dataclass
@@ -455,6 +408,7 @@ import re
455
  from transformers import BertConfig, AutoModelForTokenClassification, BertTokenizer, Trainer
456
  from unidecode import unidecode
457
  from collections import OrderedDict
 
458
  import pandas as pd
459
  from more_itertools import pairwise
460
 
@@ -855,6 +809,11 @@ def API_search_classification(search_term:Union[int,str], maxResults:int,
855
 
856
  return results
857
 
 
 
 
 
 
858
  def API_text_classification(text:str,epi_classify:Classify_Pipeline) -> Dict[str,str]:
859
  epi_prob, isEpi = epi_classify(text)
860
  return {'ABSTRACT':text, 'EPI_PROB':str(epi_prob), 'IsEpi':isEpi}
@@ -900,7 +859,7 @@ def search_term_extraction(search_term:Union[int,str], maxResults:int, filtering
900
  print(len(results),'abstracts classified as epidemiological.')
901
  return results.sort_values('EPI_PROB', ascending=False)
902
 
903
- #Returns a Pandas dataframe
904
  def streamlit_extraction(search_term:Union[int,str], maxResults:int, filtering:str, #for abstract search
905
  epi_ner:NER_Pipeline, #for biobert extraction
906
  GARD_Search:GARD_Search, extract_diseases:bool, #for disease extraction
@@ -1021,7 +980,7 @@ def API_text_extraction(text:str, #Text to be extracted
1021
  else:
1022
  json_output = ['ABSTRACT']+ordered_labels
1023
 
1024
- results = {'entries':[]}
1025
  #Do the extraction
1026
  if extract_diseases:
1027
  extraction = epi_ner(text, GARD_Search)
@@ -1031,15 +990,17 @@ def API_text_extraction(text:str, #Text to be extracted
1031
  if extraction:
1032
  #Re-order the dictionary into desired JSON output
1033
  extraction = OrderedDict([(term, extraction[term]) for term in json_output if term in extraction.keys()])
1034
- results['entries'].append(extraction)
 
 
1035
 
1036
- return results
1037
 
1038
  def API_text_classification_extraction(text:str, #Text to be extracted
1039
  epi_ner:NER_Pipeline, #for biobert extraction
1040
  GARD_Search:GARD_Search, extract_diseases:bool, #for disease extraction
1041
  epi_classify:Classify_Pipeline) -> Dict[str,str]:
1042
-
1043
  #Format of Output
1044
  ordered_labels = order_labels(epi_ner.labels)
1045
  if extract_diseases:
@@ -1061,7 +1022,11 @@ def API_text_classification_extraction(text:str, #Text to be extracted
1061
 
1062
  #Re-order the dictionary into desired JSON output
1063
  output = OrderedDict([(term, extraction[term]) for term in json_output if term in extraction.keys()])
1064
- return output
 
 
 
 
1065
 
1066
  ## Section: Deprecated Functions
1067
  import requests
@@ -1148,4 +1113,4 @@ def search_EBI_API(searchterm_list:Union[List[str],str], maxResults:int) -> Dict
1148
  pmids_abs[pmid] = titles[0]+' '+abstracts[0]
1149
  i+=1
1150
 
1151
- return pmids_abs
 
6
 
7
  ## This software/database is a "United States Government Work" under the terms of the United States Copyright Act. It was written as part of the author's official duties as United States Government employee and thus cannot be copyrighted. This software is freely available to the public for use. The National Center for Advancing Translational Science (NCATS) and the U.S. Government have not placed any restriction on its use or reproduction. Although all reasonable efforts have been taken to ensure the accuracy and reliability of the software and data, the NCATS and the U.S. Government do not and cannot warrant the performance or results that may be obtained by using this software or data. The NCATS and the U.S. Government disclaim all warranties, express or implied, including warranties of performance, merchantability or fitness for any particular purpose. Please cite the authors in any work or product based on this material.
8
 
9
+ # Written by William Kariampuzha @ NIH/NCATS.
10
  # The transformer-based pipeline code has its own copyright notice under the Apache License.
11
  # The code was compiled into a single python file to make adding additional features and importing into other modules easy.
12
  # Each section has its own import statements to facilitate clean code reuse, except for typing which applies to all.
 
91
  if pmid[0].isdigit():
92
  pmids.add(pmid)
93
 
94
+ #Construct sets for filtering (right before adding abstract to pmid_abs)
95
  # The purpose of this is to do a second check of the abstracts, filters out any abstracts unrelated to the search terms
96
  #if filtering is 'lenient' or default
97
  if filtering !='none' or filtering !='strict':
 
220
 
221
  return pmid_abs, (found, relevant)
222
 
223
+ ## Section: Transformer-based Epi Classification Model (EpiClassify4GARD)
 
 
 
 
 
 
 
 
224
 
225
+ # Imports
226
+ from transformers import AutoModelForSequenceClassification, BertTokenizer, BertConfig
227
  class Classify_Pipeline:
228
def __init__(self, name_or_path_to_model_folder:str = "ncats/EpiClassify4GARD"):
    """Load the configuration, tokenizer and sequence-classification model.

    Args:
        name_or_path_to_model_folder: Identifier or folder handed to
            ``from_pretrained`` for both the config and the model.
    """
    model_source = name_or_path_to_model_folder

    # The config is loaded first because both the tokenizer and the model
    # are parameterised from it.
    config = BertConfig.from_pretrained(model_source)
    self.config = config

    # The tokenizer is loaded from the config's own ``_name_or_path`` rather
    # than from the argument, and is capped at the model's maximum number of
    # position embeddings.
    self.tokenizer = BertTokenizer.from_pretrained(
        config._name_or_path,
        model_max_length=config.max_position_embeddings,
    )

    self.model = AutoModelForSequenceClassification.from_pretrained(
        model_source,
        config=config,
    )
233
+
234
+ #Custom pipeline by WKariampuzha @NCATS (not Huggingface/Google/NVIDIA copyright)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
  def __str__(self) -> str:
236
+ return "Instantiation: epi_classify = Classify_Pipeline(name_or_path_to_model_folder)" +"\n Calling: prob, isEpi = epi_classify(text) \n PubMed ID Predictions: abstracts, prob, isEpi = epi_classify.getPMIDPredictions(pmid)"
237
 
238
  def __call__(self, abstract:str) -> Tuple[float,bool]:
239
  return self.getTextPredictions(abstract)
240
 
241
  def getTextPredictions(self, abstract:str) -> Tuple[float,bool]:
242
  if len(abstract)>5:
243
+ # input_ids
244
+ input_ids = self.tokenizer(text=abstract, max_length=self.config.max_position_embeddings,padding="max_length",truncation=True,return_tensors='pt')
245
+ if len(input_ids)>self.config.max_position_embeddings:
246
+ raise InputError(f"Token Embeddings of size {input_ids} exceed length for maximum model embedding input {self.config.max_position_embeddings}.")
247
+ #split into sentences?
248
+ # softmax output is a Torch Tensor with two classes [[vector_False_class,vector_True_class]]
249
+ output = self.model(**input_ids)
250
+ # True = 1, False = 0
251
+ isEpi = bool(output.logits.argmax().item())
252
+ # softmax output is a Torch Tensor with two classes [[prob_is_False,prob_is_True]]
253
+ prob_tensor = output.logits.softmax(dim=-1)
254
+ # We only want to return the probability that it is true
255
+ prob = prob_tensor.data[0][1].item()
 
 
 
 
 
 
 
256
  return prob, isEpi
257
  else:
258
  return 0.0, False
 
261
  abstract = PMID_getAb(pmid)
262
  prob, isEpi = self.getTextPredictions(abstract)
263
  return abstract, prob, isEpi
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
 
265
  ## Section: GARD SEARCH
266
  # can identify rare diseases in text using the GARD dictionary from neo4j
 
270
  def __init__(self):
271
  import json, codecs
272
  #These are opened locally so that garbage collection removes them from memory
273
+ try:
274
+ with codecs.open('gard-id-name-synonyms.json', 'r', 'utf-8-sig') as f:
275
+ diseases = json.load(f)
276
+ except:
277
+ r = requests.get('https://raw.githubusercontent.com/ncats/epi4GARD/master/EpiExtract4GARD/gard-id-name-synonyms.json')
278
+ diseases = json.loads(r.content)
279
+
280
  from nltk.corpus import stopwords
281
  STOPWORDS = set(stopwords.words('english'))
282
 
283
+ #This should be a list of all GARD IDs for purposes like random choice for testing
284
+ GARD_id_list = [entry['gard_id'] for entry in diseases]
285
  #keys are going to be disease names, values are going to be the GARD ID, set up this way bc dictionaries are faster lookup than lists
286
  GARD_dict = {}
287
  #Find out what the length of the longest disease name sequence is, of all names and synonyms. This is used by get_diseases
 
302
  GARD_dict[s] = entry['gard_id']
303
  max_length = max(max_length,len(s.split()))
304
 
305
+ self.GARD_id_list = GARD_id_list
306
  self.GARD_dict = GARD_dict
307
  self.max_length = max_length
308
 
 
391
  print("SEARCH TERM DID NOT MATCH TO GARD DICTIONARY. SEARCHING BY USER INPUT")
392
  return [searchterm]
393
 
394
+ # Return the search-term list for a randomly chosen GARD ID
395
def random_disease(self) -> List[str]:
    """Pick one GARD ID at random and return its ``autosearch`` result.

    Returns:
        The value of ``self.autosearch`` for an ID drawn from
        ``self.GARD_id_list``.
    """
    from random import choice

    picked_id = choice(self.GARD_id_list)
    search_terms = self.autosearch(picked_id)
    return search_terms
399
+
400
  ## Section: BioBERT-based epidemiology NER Model (EpiExtract4GARD)
401
  from nltk import tokenize as nltk_tokenize
402
  from dataclasses import dataclass
 
408
  from transformers import BertConfig, AutoModelForTokenClassification, BertTokenizer, Trainer
409
  from unidecode import unidecode
410
  from collections import OrderedDict
411
+ import json
412
  import pandas as pd
413
  from more_itertools import pairwise
414
 
 
809
 
810
  return results
811
 
812
def API_PMID_classification(pmid:Union[int,str], epi_classify:Classify_Pipeline) -> Dict[str,str]:
    """Fetch the abstract for a PubMed ID and classify it.

    Args:
        pmid: PubMed ID passed to ``PMID_getAb``.
        epi_classify: Callable pipeline returning ``(probability, isEpi)``.

    Returns:
        A dict with keys 'PMID', 'ABSTRACT', 'EPI_PROB' and 'IsEpi'. The
        probability is stringified; 'PMID' and 'IsEpi' are passed through
        as given/returned.
    """
    abstract_text = PMID_getAb(pmid)
    probability, is_epi = epi_classify(abstract_text)

    response = {'PMID':pmid}
    response['ABSTRACT'] = abstract_text
    response['EPI_PROB'] = str(probability)
    response['IsEpi'] = is_epi
    return response
816
+
817
  def API_text_classification(text:str,epi_classify:Classify_Pipeline) -> Dict[str,str]:
818
  epi_prob, isEpi = epi_classify(text)
819
  return {'ABSTRACT':text, 'EPI_PROB':str(epi_prob), 'IsEpi':isEpi}
 
859
  print(len(results),'abstracts classified as epidemiological.')
860
  return results.sort_values('EPI_PROB', ascending=False)
861
 
862
+ #Returns a Pandas dataframe
863
  def streamlit_extraction(search_term:Union[int,str], maxResults:int, filtering:str, #for abstract search
864
  epi_ner:NER_Pipeline, #for biobert extraction
865
  GARD_Search:GARD_Search, extract_diseases:bool, #for disease extraction
 
980
  else:
981
  json_output = ['ABSTRACT']+ordered_labels
982
 
983
+ extraction = dict()
984
  #Do the extraction
985
  if extract_diseases:
986
  extraction = epi_ner(text, GARD_Search)
 
990
  if extraction:
991
  #Re-order the dictionary into desired JSON output
992
  extraction = OrderedDict([(term, extraction[term]) for term in json_output if term in extraction.keys()])
993
+ else:
994
+ #This may return JSONs of different length than above
995
+ extraction = OrderedDict([(term, []) for term in json_output])
996
 
997
+ return extraction
998
 
999
  def API_text_classification_extraction(text:str, #Text to be extracted
1000
  epi_ner:NER_Pipeline, #for biobert extraction
1001
  GARD_Search:GARD_Search, extract_diseases:bool, #for disease extraction
1002
  epi_classify:Classify_Pipeline) -> Dict[str,str]:
1003
+
1004
  #Format of Output
1005
  ordered_labels = order_labels(epi_ner.labels)
1006
  if extract_diseases:
 
1022
 
1023
  #Re-order the dictionary into desired JSON output
1024
  output = OrderedDict([(term, extraction[term]) for term in json_output if term in extraction.keys()])
1025
+ else:
1026
+ #This may return JSONs of different length than above
1027
+ output = OrderedDict([(term, []) for term in json_output])
1028
+
1029
+ return output
1030
 
1031
  ## Section: Deprecated Functions
1032
  import requests
 
1113
  pmids_abs[pmid] = titles[0]+' '+abstracts[0]
1114
  i+=1
1115
 
1116
+ return pmids_abs