Spaces: wzkariampuzha committed · Commit 223e572 · 1 Parent: d5db2e6
Update epi_pipeline.py
Browse files: epi_pipeline.py +130 -27
epi_pipeline.py
CHANGED
@@ -1,30 +1,21 @@
 from typing import List, Dict, Union, Optional, Set, Tuple
 
 # coding=utf-8
-
-
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ALSO See NCATS LICENSE
+## PUBLIC DOMAIN NOTICE
+## National Center for Advancing Translational Sciences
 
-
+## This software/database is a "United States Government Work" under the terms of the United States Copyright Act. It was written as part of the author's official duties as United States Government employee and thus cannot be copyrighted. This software is freely available to the public for use. The National Center for Advancing Translational Science (NCATS) and the U.S. Government have not placed any restriction on its use or reproduction. Although all reasonable efforts have been taken to ensure the accuracy and reliability of the software and data, the NCATS and the U.S. Government do not and cannot warrant the performance or results that may be obtained by using this software or data. The NCATS and the U.S. Government disclaim all warranties, express or implied, including warranties of performance, merchantability or fitness for any particular purpose. Please cite the authors in any work or product based on this material.
 
+# Written by William Kariampuzha @ NIH/NCATS. Adapted from code written by Jennifer John, et al.
+# The transformer-based pipeline code has its own copyright notice under the Apache License.
+# The code was compiled into a single python file to make adding additional features and importing into other modules easy.
 # Each section has its own import statements to facilitate clean code reuse, except for typing which applies to all.
-# the `Any` type is used in place of the specific class variable, not necessarily to mean that any object type can go there...
 
 ## Section: GATHER ABSTRACTS FROM APIs
 import requests
 import xml.etree.ElementTree as ET
+import nltk
+nltk.data.path.extend(["/home/user/app/nltk_data","./nltk_data"])
 from nltk.corpus import stopwords
 STOPWORDS = set(stopwords.words('english'))
 from nltk import tokenize as nltk_tokenize
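The two `+` lines make NLTK look in bundled data folders before its defaults, so the Space can resolve corpora without downloading at runtime. A minimal staging sketch, assuming the `./nltk_data` folder is shipped with the app (`punkt` is assumed because `nltk_tokenize` is used for tokenization later in this file):

    import nltk

    # One-time setup: fetch the required resources into a local folder
    # that is committed alongside the app code.
    nltk.download('stopwords', download_dir='./nltk_data')  # backs STOPWORDS
    nltk.download('punkt', download_dir='./nltk_data')      # backs nltk_tokenize

    # At runtime, the commit adds these folders to NLTK's search path:
    nltk.data.path.extend(["/home/user/app/nltk_data", "./nltk_data"])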
@@ -464,6 +455,22 @@ import pandas as pd
 from more_itertools import pairwise
 
 # Subsection: Processing the abstracts into the correct data format
+
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 @dataclass
 class NERInput:
     """
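The hunk cuts off at the top of the `NERInput` docstring, but `abstract2NERinputs` below constructs it with three positional fields, so its shape is roughly the following sketch (the field names are guesses; only the order of guid, token list, and label list is visible in this diff):

    from dataclasses import dataclass
    from typing import List

    @dataclass
    class NERInput:
        """One sentence prepared for token classification."""
        guid: str           # sentence index within the abstract (a guess)
        words: List[str]    # tokens from nltk_tokenize.word_tokenize
        labels: List[str]   # BIO tags, all "O" at inference time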
@@ -546,7 +553,10 @@ class NerDataset(Dataset):
     def abstract2NERinputs(self, abstract:str) -> List[NERInput]:
         guid_index = 0
         sentences = self.str2sents(abstract)
-        ner_inputs = [NERInput(str(guid),
+        ner_inputs = [NERInput(str(guid),
+                               nltk_tokenize.word_tokenize(sent),
+                               ["O" for i in range(len(nltk_tokenize.word_tokenize(sent)))])
+                      for guid, sent in enumerate(sentences)]
         return ner_inputs
 
     def convert_NERinputs_to_features(self,
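The replacement comprehension builds one `NERInput` per sentence, with every token labeled "O" as a placeholder until the model predicts real tags. For a two-sentence abstract (sample text is illustrative) it yields:

    sentences = ["GM1 gangliosidosis is rare.", "Its birth prevalence is low."]
    ner_inputs = [NERInput(str(guid),
                           nltk_tokenize.word_tokenize(sent),
                           ["O" for i in range(len(nltk_tokenize.word_tokenize(sent)))])
                  for guid, sent in enumerate(sentences)]
    # ner_inputs[0] == NERInput('0', ['GM1', 'gangliosidosis', 'is', 'rare', '.'],
    #                           ['O', 'O', 'O', 'O', 'O'])

Note that each sentence is word-tokenized twice; calling `nltk_tokenize.word_tokenize(sent)` once per sentence into a local name would halve that work.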
@@ -662,6 +672,7 @@ class NerDataset(Dataset):
 class NER_Pipeline:
     def __init__(self, name_or_path_to_model_folder:str = "ncats/EpiExtract4GARD-v2"):
         self.bert_tokenizer = BertTokenizer.from_pretrained(name_or_path_to_model_folder)
+        #no need for model variable because trainer wraps model and has more functions
         #model = AutoModelForTokenClassification.from_pretrained(name_or_path_to_model_folder)
         self.config = BertConfig.from_pretrained(name_or_path_to_model_folder)
         self.labels = {re.sub(".-","",label) for label in self.config.label2id.keys() if label != "O"}
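In the `labels` set comprehension, `re.sub(".-","",label)` deletes any character followed by a hyphen, which strips the BIO prefixes "B-" and "I-" and collapses each tag pair in `label2id` into a single entity class. With illustrative tag names (the real ones come from the ncats/EpiExtract4GARD-v2 config):

    import re

    label2id = {"O": 0, "B-LOC": 1, "I-LOC": 2, "B-EPI": 3, "I-EPI": 4}
    labels = {re.sub(".-", "", label) for label in label2id.keys() if label != "O"}
    # labels == {'LOC', 'EPI'}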
@@ -670,6 +681,7 @@ class NER_Pipeline:
     def __str__(self):
         return "Instantiation: pipe = NER_Pipeline(name_or_path_to_model_folder)"+"\n Calling: output_dict = pipe(text)"
 
+    #Custom pipeline by WKariampuzha @NCATS (not Huggingface/Google/NVIDIA copyright)
     def __call__(self, text:str, rd_identify:Union[GARD_Search,None] = None):
         output_dict = {label:[] for label in self.labels}
 
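Usage follows the pattern that `__str__` advertises; a sketch with placeholder text and illustrative entity-class names:

    pipe = NER_Pipeline("ncats/EpiExtract4GARD-v2")
    output_dict = pipe("Estimates of GM1 gangliosidosis prevalence vary by region.")
    # output_dict maps each entity class in pipe.labels to the list of spans
    # found in the text, e.g. {'LOC': [...], 'EPI': [...], 'STAT': [...]}

    # Passing a GARD_Search instance additionally links rare-disease mentions:
    # output_dict = pipe(text, rd_identify=rd_identify)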
@@ -767,6 +779,8 @@ class NER_Pipeline:
         return bi, tag
 
 
+# Unattached function -- not a method
+# move this to the NER_pipeline as a method??
 #This ensures that there is a standardized ordering of df columns while ensuring dynamics with multiple models. This is used by search_term_extraction.
 def order_labels(entity_classes:Union[Set[str],List[str]]) -> List[str]:
     ordered_labels = []
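`order_labels` exists because `epi_ner.labels` is a set, so its iteration order can differ between runs and models, while the downstream DataFrame and JSON builders need stable columns. A sketch of the intended behavior (the ordering rule itself lives in the elided function body):

    entity_classes = {'STAT', 'LOC', 'EPI'}   # illustrative class names
    ordered = order_labels(entity_classes)
    # `ordered` is the same list on every run, so columns like
    # ['ABSTRACT', 'IDS', 'DIS'] + ordered line up across sessions.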
@@ -778,9 +792,72 @@ def order_labels(entity_classes:Union[Set[str],List[str]]) -> List[str]:
         ordered_labels.append(entity)
     return ordered_labels
 
+## SECTION: PIPELINES
+## This section combines all of the previous code into pipelines so that usage of these models and search functions are easy to implement in apps.
+
+# Given a search term and max results to return, this will acquire PubMed IDs and Title+Abstracts and Classify them as epidemiological.
+# results = search_term_classification(search_term, maxResults, filtering, rd_identify, epi_classify)
+#Returns a Pandas dataframe
+def search_term_classification(search_term:Union[int,str], maxResults:int,
+                               filtering:str, rd_identify:GARD_Search, #for abstract search & filtering
+                               epi_classify:Classify_Pipeline) -> pd.DataFrame: #for classification
+
+    results = pd.DataFrame(columns=['PMID', 'ABSTRACT','EPI_PROB','IsEpi'])
+
+    ##Check to see if search term maps to anything in the GARD dictionary, if so it pulls up all synonyms for the search
+    search_term_list = rd_identify.autosearch(search_term)
+
+    #Gather title+abstracts into a dictionary {pmid:abstract}
+    pmid_abs = search_getAbs(search_term_list, maxResults, filtering)
+
+    for pmid, abstract in pmid_abs.items():
+        epi_prob, isEpi = epi_classify(abstract)
+        result = {'PMID':pmid, 'ABSTRACT':abstract, 'EPI_PROB':epi_prob, 'IsEpi':isEpi}
+        #Slow dataframe update
+        results = results.append(result, ignore_index=True)
+
+    return results.sort_values('EPI_PROB', ascending=False)
+
+#Identical to search_term_classification, except it returns a JSON-compatible dictionary instead of a df
+def API_search_classification(search_term:Union[int,str], maxResults:int,
+                              filtering:str, GARD_Search:GARD_Search, #for abstract search & filtering
+                              epi_classify:Classify_Pipeline) -> Dict[str,str]: #for classification
+
+    #Format of Output
+    results = {'entries':[]}
+
+    ##Check to see if search term maps to anything in the GARD dictionary, if so it pulls up all synonyms for the search
+    print('Inside `API_search_classification`. this is `search_term`:',search_term,type(search_term))
+    search_term_list = GARD_Search.autosearch(search_term)
+
+    #Gather title+abstracts into a dictionary {pmid:abstract}
+    pmid_abs = search_getAbs(search_term_list, maxResults, filtering)
+
+    for pmid, abstract in pmid_abs.items():
+        epi_prob, isEpi = epi_classify(abstract)
+        result = {'PMID':pmid, 'ABSTRACT':abstract, 'EPI_PROB':epi_prob, 'IsEpi':isEpi}
+        results['entries'].append(result)
+
+    #sort
+    results['entries'].sort(reverse=True, key=lambda x:x['EPI_PROB'])
+
+    # float is not JSON serializable, so must convert all epi_probs to str
+    # This returns a map object, which is not JSON serializable
+    # results['entries'] = map(lambda entry:str(entry['EPI_PROB']),results['entries'])
+    # so must convert floats to str the boring and slow way
+
+    for entry in results['entries']:
+        entry['EPI_PROB'] = str(entry['EPI_PROB'])
+
+    return results
+
+def API_text_classification(text:str,epi_classify:Classify_Pipeline) -> Dict[str,str]:
+    epi_prob, isEpi = epi_classify(text)
+    return {'ABSTRACT':text, 'EPI_PROB':str(epi_prob), 'IsEpi':isEpi}
+
 # Given a search term and max results to return, this will acquire PubMed IDs and Title+Abstracts and Classify them as epidemiological.
 # It then extracts Epidemiologic Information[Disease GARD ID, Disease Name, Location, Epidemiologic Identifier, Epidemiologic Statistic] for each abstract
-# results = search_term_extraction(search_term, maxResults, filering, NER_pipeline,
+# results = search_term_extraction(search_term, maxResults, filtering, NER_pipeline, extract_diseases, GARD_Search, Classify_Pipeline)
 #Returns a Pandas dataframe
 def search_term_extraction(search_term:Union[int,str], maxResults:int, filtering:str, #for abstract search
                            epi_ner:NER_Pipeline, #for biobert extraction
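One caveat on the new `search_term_classification`: the line the author flags as "#Slow dataframe update" uses `DataFrame.append`, which was deprecated in pandas 1.4 and removed in 2.0. A forward-compatible sketch of the same loop, accumulating rows in a list and building the frame once:

    import pandas as pd

    rows = []
    for pmid, abstract in pmid_abs.items():
        epi_prob, isEpi = epi_classify(abstract)
        rows.append({'PMID': pmid, 'ABSTRACT': abstract,
                     'EPI_PROB': epi_prob, 'IsEpi': isEpi})
    results = pd.DataFrame(rows, columns=['PMID', 'ABSTRACT', 'EPI_PROB', 'IsEpi'])
    results = results.sort_values('EPI_PROB', ascending=False)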
@@ -882,11 +959,11 @@ def streamlit_extraction(search_term:Union[int,str], maxResults:int, filtering:str,
 
     return results.sort_values('PROB_OF_EPI', ascending=False), sankey_data, disease_gardID
 
-#Identical to search_term_extraction, except it returns a JSON
+#Identical to search_term_extraction, except it returns a JSON-compatible dictionary instead of a df
 def API_search_extraction(search_term:Union[int,str], maxResults:int, filtering:str, #for abstract search
                           epi_ner:NER_Pipeline, #for biobert extraction
                           GARD_Search:GARD_Search, extract_diseases:bool, #for disease extraction
-                          epi_classify:Classify_Pipeline) ->
+                          epi_classify:Classify_Pipeline) -> Dict[str,str]: #for classification
 
     #Format of Output
     ordered_labels = order_labels(epi_ner.labels)
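With the completed signature, a call looks like the sketch below; the argument values are placeholders, and the pipeline objects are assumed to be instantiated as elsewhere in this file. The `Dict[str,str]` annotation is loose: the function actually returns a dictionary whose 'entries' key holds a list of per-abstract dictionaries.

    results = API_search_extraction(
        search_term="GM1 gangliosidosis", maxResults=50, filtering="strict",
        epi_ner=epi_ner, GARD_Search=rd_identify, extract_diseases=True,
        epi_classify=epi_classify)
    # results == {'entries': [{'ABSTRACT': ..., 'EPI_PROB': '0.98', ...}, ...]}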
@@ -926,20 +1003,19 @@ def API_search_extraction(search_term:Union[int,str], maxResults:int, filtering:str,
         entry['EPI_PROB'] = str(entry['EPI_PROB'])
 
     return results
-    #return json.dumps(results)
 
-#Identical to search_term_extraction, except it returns a JSON
+#Identical to search_term_extraction, except it returns a JSON-compatible dictionary instead of a df
 def API_text_extraction(text:str, #Text to be extracted
                         epi_ner:NER_Pipeline, #for biobert extraction
                         GARD_Search:GARD_Search, extract_diseases:bool, #for disease extraction
-                        ) ->
-
+                        ) -> Dict[str,str]:
+
     #Format of Output
     ordered_labels = order_labels(epi_ner.labels)
     if extract_diseases:
         json_output = ['ABSTRACT','IDS','DIS']+ordered_labels
     else:
-        json_output = ['ABSTRACT'
+        json_output = ['ABSTRACT']+ordered_labels
 
     results = {'entries':[]}
     #Do the extraction
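The else-branch fix (`['ABSTRACT']+ordered_labels`) keeps the output keys parallel to the disease-extraction branch. A call sketch with placeholder text:

    out = API_text_extraction(
        "GM1 gangliosidosis affects roughly 1 in 100,000 to 200,000 newborns.",
        epi_ner=epi_ner, GARD_Search=rd_identify, extract_diseases=False)
    # out == {'entries': [...]}; each entry is keyed by 'ABSTRACT' plus the
    # ordered entity labels.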
@@ -954,7 +1030,34 @@ def API_text_extraction(text:str, #Text to be extracted
         results['entries'].append(extraction)
 
     return results
-
+
+def API_text_classification_extraction(text:str, #Text to be extracted
+                                       epi_ner:NER_Pipeline, #for biobert extraction
+                                       GARD_Search:GARD_Search, extract_diseases:bool, #for disease extraction
+                                       epi_classify:Classify_Pipeline) -> Dict[str,str]:
+
+    #Format of Output
+    ordered_labels = order_labels(epi_ner.labels)
+    if extract_diseases:
+        json_output = ['ABSTRACT','IsEpi','EPI_PROB','IDS','DIS']+ordered_labels
+    else:
+        json_output = ['ABSTRACT','IsEpi','EPI_PROB']+ordered_labels
+
+    #Do the extraction
+    if extract_diseases:
+        extraction = epi_ner(text, GARD_Search)
+    else:
+        extraction = epi_ner(text)
+
+    if extraction:
+        #Add the epidemiology probability and result
+        #Does not matter which order these are done in but doing classification after may save some time if there is no valid extraction
+        epi_prob, isEpi = epi_classify(text)
+        extraction.update({'EPI_PROB':str(epi_prob),'IsEpi':isEpi})
+
+        #Re-order the dictionary into desired JSON output
+        output = OrderedDict([(term, extraction[term]) for term in json_output if term in extraction.keys()])
+        return output
 
 ## Section: Deprecated Functions
 import requests
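The new `API_text_classification_extraction` extracts first and classifies only afterwards, which, as the inline comment notes, can skip a classifier call when nothing is extracted. Because the `return` sits inside `if extraction:`, the function implicitly returns None on empty extractions, so callers should guard for that. A usage sketch:

    output = API_text_classification_extraction(
        text=abstract_text, epi_ner=epi_ner,
        GARD_Search=rd_identify, extract_diseases=True,
        epi_classify=epi_classify)
    if output:
        print(output['IsEpi'], output['EPI_PROB'])
    else:
        print('No entities were extracted, so no classification was returned.')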