Spaces:

ncats
/

EpiPipeline4RD

Running

File size: 17,788 Bytes

import argparse
import requests
import xml.etree.ElementTree as ET
import pickle
import re
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
import numpy as np
import streamlit as st
from tensorflow.keras.preprocessing.sequence import pad_sequences
STOPWORDS = set(stopwords.words('english'))
max_length = 300
trunc_type = 'post'
padding_type = 'post'

from typing import (
    Dict,
    List,
    Tuple,
    Set,
    Optional,
    Any,
    Union,
)

# Standardize the abstract by replacing all named entities with their entity label.
# Eg. 3 patients reported at a clinic in England --> CARDINAL patients reported at a clinic in GPE
# expects the spaCy model en_core_web_lg as input
def standardizeAbstract(abstract:str, nlp:Any) -> str:
    doc = nlp(abstract)
    newAbstract = abstract
    for e in reversed(doc.ents):
        if e.label_ in {'PERCENT','CARDINAL','GPE','LOC','DATE','TIME','QUANTITY','ORDINAL'}:
            start = e.start_char
            end = start + len(e.text)
            newAbstract = newAbstract[:start] + e.label_ + newAbstract[end:]
    return newAbstract

# Same as above but replaces biomedical named entities from scispaCy models
# Expects as input en_ner_bc5cdr_md and en_ner_bionlp13cg_md
def standardizeSciTerms(abstract:str, nlpSci:Any, nlpSci2:Any) -> str:
    doc = nlpSci(abstract)
    newAbstract = abstract
    for e in reversed(doc.ents):
        start = e.start_char
        end = start + len(e.text)
        newAbstract = newAbstract[:start] + e.label_ + newAbstract[end:]
        
    doc = nlpSci2(newAbstract)
    for e in reversed(doc.ents):
        start = e.start_char
        end = start + len(e.text)
        newAbstract = newAbstract[:start] + e.label_ + newAbstract[end:]
    return newAbstract

# Prepare model
#nlp, nlpSci, nlpSci2, classify_model, classify_tokenizer= init_classify_model()
def init_classify_model(model:str='LSTM_RNN_Model') -> Tuple[Any,Any,Any,Any,Any]:
    #Load spaCy models
    nlp = spacy.load('en_core_web_lg')
    nlpSci = spacy.load("en_ner_bc5cdr_md")
    nlpSci2 = spacy.load('en_ner_bionlp13cg_md')
    
    # load the tokenizer
    with open('tokenizer.pickle', 'rb') as handle:
        classify_tokenizer = pickle.load(handle)
    
    # load the model
    classify_model = tf.keras.models.load_model(model) 
    
    return (nlp, nlpSci, nlpSci2, classify_model, classify_tokenizer)

#Gets abstract and title (concatenated) from EBI API
def PMID_getAb(PMID:Union[int,str]) -> str: 
    url = 'https://www.ebi.ac.uk/europepmc/webservices/rest/search?query=EXT_ID:'+str(PMID)+'&resulttype=core'
    r = requests.get(url)
    root = ET.fromstring(r.content)
    titles = [title.text for title in root.iter('title')]
    abstracts = [abstract.text for abstract in root.iter('abstractText')]
    if len(abstracts) > 0 and len(abstracts[0])>5:
        return titles[0]+' '+abstracts[0]
    else:
        return ''

def search_Pubmed_API(searchterm_list:Union[List[str],str], maxResults:int) -> Dict[str,str]: #returns a dictionary of {pmids:abstracts} 
    print('search_Pubmed_API is DEPRECATED. UTILIZE search_NCBI_API for NCBI ENTREZ API results. Utilize search_getAbs for most comprehensive results.')
    return search_NCBI_API(searchterm_list, maxResults)
    
## DEPRECATED, use search_getAbs for more comprehensive results
def search_NCBI_API(searchterm_list:Union[List[str],str], maxResults:int) -> Dict[str,str]: #returns a dictionary of {pmids:abstracts} 
    print('search_NCBI_API is DEPRECATED. Utilize search_getAbs for most comprehensive results.')
    pmid_to_abs = {}
    i = 0
    
    #type validation, allows string or list input
    if type(searchterm_list)!=list:
        if type(searchterm_list)==str:
            searchterm_list = [searchterm_list]
        else:
            searchterm_list = list(searchterm_list)
    
    #gathers pmids into a set first
    for dz in searchterm_list:
        # get results from searching for disease name through PubMed API
        term = ''
        dz_words = dz.split()
        for word in dz_words:
            term += word + '%20'
        query = term[:-3]
        url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term='+query
        r = requests.get(url)
        root = ET.fromstring(r.content)    

        # loop over resulting articles
        for result in root.iter('IdList'):
            pmids = [pmid.text for pmid in result.iter('Id')]
            if i >= maxResults:
                break
            for pmid in pmids:
                if pmid not in pmid_to_abs.keys():
                    abstract = PMID_getAb(pmid)
                    if len(abstract)>5:
                        pmid_to_abs[pmid]=abstract
                        i+=1
                    
    return pmid_to_abs

## DEPRECATED, use search_getAbs for more comprehensive results
# get results from searching for disease name through EBI API
def search_EBI_API(searchterm_list:Union[List[str],str], maxResults:int) -> Dict[str,str]: #returns a dictionary of {pmids:abstracts}    
    print('DEPRECATED. Utilize search_getAbs for most comprehensive results.')
    pmids_abs = {}
    i = 0
    
    #type validation, allows string or list input
    if type(searchterm_list)!=list:
        if type(searchterm_list)==str:
            searchterm_list = [searchterm_list]
        else:
            searchterm_list = list(searchterm_list)
    
    #gathers pmids into a set first
    for dz in searchterm_list:
        if i >= maxResults:
            break
        term = ''
        dz_words = dz.split()
        for word in dz_words:
            term += word + '%20'
        query = term[:-3]
        url = 'https://www.ebi.ac.uk/europepmc/webservices/rest/search?query='+query+'&resulttype=core'
        r = requests.get(url)
        root = ET.fromstring(r.content)

        # loop over resulting articles
        for result in root.iter('result'):
            if i >= maxResults:
                break
            pmids = [pmid.text for pmid in result.iter('id')]
            if len(pmids) > 0:
                pmid = pmids[0]
                if pmid[0].isdigit():
                    abstracts = [abstract.text for abstract in result.iter('abstractText')]
                    titles = [title.text for title in result.iter('title')]
                    if len(abstracts) > 0:# and len(abstracts[0])>5:
                        pmids_abs[pmid] = titles[0]+' '+abstracts[0]
                        i+=1
    
    return pmids_abs

## This is the main, most comprehensive search_term function, it can take in a search term or a list of search terms and output a dictionary of {pmids:abstracts}
## Gets results from searching through both PubMed and EBI search term APIs, also makes use of the EBI API for PMIDs. 
## EBI API and PubMed API give different results
# This makes n+2 API calls where n<=maxResults, which is slow 
# There is a way to optimize by gathering abstracts from the EBI API when also getting pmids but did not pursue due to time constraints
# Filtering can be 
#   'strict' - must have some exact match to at leastone of search terms/phrases in text)
#   'lenient' - part of the abstract must match at least one word in the search term phrases.
#   'none'
def search_getAbs(searchterm_list:Union[List[str],List[int],str], maxResults:int, filtering:str) -> Dict[str,str]:
    #set of all pmids
    pmids = set()
    
    #dictionary {pmid:abstract}
    pmid_abs = {}
    
    #type validation, allows string or list input
    if type(searchterm_list)!=list:
        if type(searchterm_list)==str:
            searchterm_list = [searchterm_list]
        else:
            searchterm_list = list(searchterm_list)
    
    #gathers pmids into a set first
    for dz in searchterm_list:
        term = ''
        dz_words = dz.split()
        for word in dz_words:
            term += word + '%20'
        query = term[:-3]

        ## get pmid results from searching for disease name through PubMed API
        url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term='+query
        r = requests.get(url)
        root = ET.fromstring(r.content)

        # loop over resulting articles
        for result in root.iter('IdList'):
            if len(pmids) >= maxResults:
                break
            pmidlist = [pmid.text for pmid in result.iter('Id')]
            pmids.update(pmidlist)

        ## get results from searching for disease name through EBI API
        url = 'https://www.ebi.ac.uk/europepmc/webservices/rest/search?query='+query+'&resulttype=core'
        r = requests.get(url)
        root = ET.fromstring(r.content)

        # loop over resulting articles
        for result in root.iter('result'):
            if len(pmids) >= maxResults:
                break
            pmidlist = [pmid.text for pmid in result.iter('id')]
            #can also gather abstract and title here but for some reason did not work as intended the first time. Optimize in future versions to reduce latency.
            if len(pmidlist) > 0:
                pmid = pmidlist[0]
                if pmid[0].isdigit():
                    pmids.add(pmid)
    
    #Construct sets for filtering (right before adding abstract to pmid_abs
    # The purpose of this is to do a second check of the abstracts, filters out any abstracts unrelated to the search terms
    #if filtering is 'lenient' or default
    if filtering !='none' or filtering !='strict':
        filter_terms = set(searchterm_list).union(set(str(re.sub(',','',' '.join(searchterm_list))).split()).difference(STOPWORDS))
        '''
        # The above is equivalent to this but uses less memory and may be faster:
        #create a single string of the terms within the searchterm_list
        joined = ' '.join(searchterm_list)
        #remove commas
        comma_gone = re.sub(',','',joined)
        #split the string into list of words and convert list into a Pythonic set
        split = set(comma_gone.split())
        #remove the STOPWORDS from the set of key words
        key_words = split.difference(STOPWORDS)
        #create a new set of the list members in searchterm_list
        search_set = set(searchterm_list)
        #join the two sets
        terms = search_set.union(key_words)
        #if any word(s) in the abstract intersect with any of these terms then the abstract is good to go.
        '''
    
    ## get abstracts from EBI PMID API and output a dictionary
    for pmid in pmids:
        abstract = PMID_getAb(pmid)
        if len(abstract)>5:
            #do filtering here
            if filtering == 'strict':
                uncased_ab = abstract.lower()
                for term in searchterm_list:
                    if term.lower() in uncased_ab:
                        pmid_abs[pmid] = abstract
                        break
            elif filtering =='none':
                pmid_abs[pmid] = abstract
            
            #Default filtering is 'lenient'.
            else:
                #Else and if are separated for readability and to better understand logical flow.
                if set(filter_terms).intersection(set(word_tokenize(abstract))):
                    pmid_abs[pmid] = abstract
                
                    
    print('Found',len(pmids),'PMIDs. Gathered',len(pmid_abs),'Relevant Abstracts.')
    
    return pmid_abs

#This is a streamlit version of search_getAbs. Refer to search_getAbs for documentation
def streamlit_getAbs(searchterm_list:Union[List[str],List[int],str], maxResults:int, filtering:str) -> Dict[str,str]:
    pmids = set()
    
    pmid_abs = {}
    
    if type(searchterm_list)!=list:
        if type(searchterm_list)==str:
            searchterm_list = [searchterm_list]
        else:
            searchterm_list = list(searchterm_list)
    #maxResults is multiplied by a little bit because sometimes the results returned is more than maxResults
    percent_by_step = 1/(maxResults*1.05) 
    PMIDs_bar = st.progress(0)
    for dz in searchterm_list:
        term = ''
        dz_words = dz.split()
        for word in dz_words:
            term += word + '%20'
        query = term[:-3]

        url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term='+query
        r = requests.get(url)
        root = ET.fromstring(r.content)

        for result in root.iter('IdList'):
            if len(pmids) >= maxResults:
                break
            pmidlist = [pmid.text for pmid in result.iter('Id')]
            pmids.update(pmidlist)
            PMIDs_bar.progress(round(len(pmids)*percent_by_step,1))

        url = 'https://www.ebi.ac.uk/europepmc/webservices/rest/search?query='+query+'&resulttype=core'
        r = requests.get(url)
        root = ET.fromstring(r.content)

        for result in root.iter('result'):
            if len(pmids) >= maxResults:
                break
            pmidlist = [pmid.text for pmid in result.iter('id')]
            if len(pmidlist) > 0:
                pmid = pmidlist[0]
                if pmid[0].isdigit():
                    pmids.add(pmid)
                    PMIDs_bar.progress(round(len(pmids)*percent_by_step,1))
    PMIDs_bar.empty()
    
    with st.success('Found '+str(len(pmids))+' PMIDs. Gathering Abstracts and Filtering...'):
        abstracts_bar = st.progress(0)
        percent_by_step = 1/(maxResults)
        if filtering !='none' or filtering !='strict':
            filter_terms = set(searchterm_list).union(set(str(re.sub(',','',' '.join(searchterm_list))).split()).difference(STOPWORDS))

        for pmid in pmids:
            abstract = PMID_getAb(pmid)
            if len(abstract)>5:
                #do filtering here
                if filtering == 'strict':
                    uncased_ab = abstract.lower()
                    for term in searchterm_list:
                        if term.lower() in uncased_ab:
                            pmid_abs[pmid] = abstract
                            abstracts_bar.progress(round(len(pmid_abs)*percent_by_step,1))
                            break
                elif filtering =='none':
                    pmid_abs[pmid] = abstract
                    abstracts_bar.progress(round(len(pmid_abs)*percent_by_step,1))
                
                #Default filtering is 'lenient'.
                else:
                    #Else and if are separated for readability and to better understand logical flow.
                    if set(filter_terms).intersection(set(word_tokenize(abstract))):
                        pmid_abs[pmid] = abstract
                        abstracts_bar.progress(round(len(pmid_abs)*percent_by_step,1))
        abstracts_bar.empty()
    
    st.success('Found '+str(len(pmids))+' PMIDs. Gathered '+str(len(pmid_abs))+' Relevant Abstracts.')
    
    return pmid_abs

# Generate predictions for a PubMed Id
# nlp: en_core_web_lg
# nlpSci: en_ner_bc5cdr_md
# nlpSci2: en_ner_bionlp13cg_md
# Defaults to load my_model_orphanet_final, the most up-to-date version of the classification model,
# but can also be run on any other tf.keras model
#This was originally getPredictions
def getPMIDPredictions(pmid:Union[str,int], classify_model_vars:Tuple[Any,Any,Any,Any,Any]) -> Tuple[str,float,bool]:
    nlp, nlpSci, nlpSci2, classify_model, classify_tokenizer = classify_model_vars
    abstract = PMID_getAb(pmid)
    
    if len(abstract)>5:
        # remove stopwords
        for word in STOPWORDS:
            token = ' ' + word + ' '
            abstract = abstract.replace(token, ' ')
            abstract = abstract.replace(' ', ' ')

        # preprocess abstract
        abstract_standard = [standardizeAbstract(standardizeSciTerms(abstract, nlpSci, nlpSci2), nlp)]
        sequence = classify_tokenizer.texts_to_sequences(abstract_standard)
        padded = pad_sequences(sequence, maxlen=max_length, padding=padding_type, truncating=trunc_type)

        y_pred1 = classify_model.predict(padded) # generate prediction
        y_pred = np.argmax(y_pred1, axis=1) # get binary prediction

        prob = y_pred1[0][1]
        if y_pred == 1:
            isEpi = True
        else:
            isEpi = False

        return abstract, prob, isEpi
    
    else:
        return abstract, 0.0, False


def getTextPredictions(abstract:str, classify_model_vars:Tuple[Any,Any,Any,Any,Any]) -> Tuple[float,bool]:
    
    nlp, nlpSci, nlpSci2, classify_model, classify_tokenizer = classify_model_vars
    
    if len(abstract)>5:
        # remove stopwords
        for word in STOPWORDS:
            token = ' ' + word + ' '
            abstract = abstract.replace(token, ' ')
            abstract = abstract.replace(' ', ' ')

        # preprocess abstract
        abstract_standard = [standardizeAbstract(standardizeSciTerms(abstract, nlpSci, nlpSci2), nlp)]
        sequence = classify_tokenizer.texts_to_sequences(abstract_standard)
        padded = pad_sequences(sequence, maxlen=max_length, padding=padding_type, truncating=trunc_type)

        y_pred1 = classify_model.predict(padded) # generate prediction
        y_pred = np.argmax(y_pred1, axis=1) # get binary prediction

        prob = y_pred1[0][1]
        if y_pred == 1:
            isEpi = True
        else:
            isEpi = False

        return prob, isEpi
    
    else:
        return 0.0, False

if __name__ == '__main__':
    print('Loading 5 NLP models...')
    classify_model_vars= init_classify_model()
    print('All models loaded.')
    pmid = input('\nEnter PubMed PMID (or DONE): ')
    while pmid != 'DONE':
        abstract, prob, isEpi = getPredictions(pmid, classify_model_vars)
        print(abstract, prob, isEpi)
        pmid = input('\nEnter PubMed PMID (or DONE): ')