Spaces:

ncats
/

EpiPipeline4RD

Running

App Files Files Community

wzkariampuzha committed on Mar 24, 2022

Commit

d5406d4

1 Parent(s): 31ca6c1

Update classify_abs.py

Browse files

Files changed (1) hide show

classify_abs.py +100 -0

classify_abs.py CHANGED Viewed

@@ -277,6 +277,106 @@ def search_getAbs(searchterm_list:Union[List[str],List[int],str], maxResults:int
     return pmid_abs
 # Generate predictions for a PubMed Id
 # nlp: en_core_web_lg
 # nlpSci: en_ner_bc5cdr_md

     return pmid_abs
+def streamlist_getAbs(searchterm_list:Union[List[str],List[int],str], maxResults:int, filtering:str) -> Dict[str,str]:
+    #set of all pmids
+    pmids = set()
+    #dictionary {pmid:abstract}
+    pmid_abs = {}
+    #type validation, allows string or list input
+    if type(searchterm_list)!=list:
+        if type(searchterm_list)==str:
+            searchterm_list = [searchterm_list]
+        else:
+            searchterm_list = list(searchterm_list)
+    my_bar = st.progress(0)
+    percent_by_step = 100/maxResults
+    #gathers pmids into a set first
+    for dz in searchterm_list:
+        term = ''
+        dz_words = dz.split()
+        for word in dz_words:
+            term += word + '%20'
+        query = term[:-3]
+        ## get pmid results from searching for disease name through PubMed API
+        url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term='+query
+        r = requests.get(url)
+        root = ET.fromstring(r.content)
+        # loop over resulting articles
+        for result in root.iter('IdList'):
+            if len(pmids) >= maxResults:
+                break
+            pmidlist = [pmid.text for pmid in result.iter('Id')]
+            pmids.update(pmidlist)
+        ## get results from searching for disease name through EBI API
+        url = 'https://www.ebi.ac.uk/europepmc/webservices/rest/search?query='+query+'&resulttype=core'
+        r = requests.get(url)
+        root = ET.fromstring(r.content)
+        # loop over resulting articles
+        for result in root.iter('result'):
+            if len(pmids) >= maxResults:
+                break
+            pmidlist = [pmid.text for pmid in result.iter('id')]
+            #can also gather abstract and title here but for some reason did not work as intended the first time. Optimize in future versions to reduce latency.
+            if len(pmidlist) > 0:
+                pmid = pmidlist[0]
+                if pmid[0].isdigit():
+                    pmids.add(pmid)
+    #Construct sets for filtering (right before adding abstract to pmid_abs
+    # The purpose of this is to do a second check of the abstracts, filters out any abstracts unrelated to the search terms
+    #if filtering is 'lenient' or default
+    if filtering !='none' or filtering !='strict':
+        filter_terms = set(searchterm_list).union(set(str(re.sub(',','',' '.join(searchterm_list))).split()).difference(STOPWORDS))
+        '''
+        # The above is equivalent to this but uses less memory and may be faster:
+        #create a single string of the terms within the searchterm_list
+        joined = ' '.join(searchterm_list)
+        #remove commas
+        comma_gone = re.sub(',','',joined)
+        #split the string into list of words and convert list into a Pythonic set
+        split = set(comma_gone.split())
+        #remove the STOPWORDS from the set of key words
+        key_words = split.difference(STOPWORDS)
+        #create a new set of the list members in searchterm_list
+        search_set = set(searchterm_list)
+        #join the two sets
+        terms = search_set.union(key_words)
+        #if any word(s) in the abstract intersect with any of these terms then the abstract is good to go.
+        '''
+    ## get abstracts from EBI PMID API and output a dictionary
+    for pmid in pmids:
+        abstract = PMID_getAb(pmid)
+        if len(abstract)>5:
+            #do filtering here
+            if filtering == 'strict':
+                uncased_ab = abstract.lower()
+                for term in searchterm_list:
+                    if term.lower() in uncased_ab:
+                        pmid_abs[pmid] = abstract
+                        break
+            elif filtering =='none':
+                pmid_abs[pmid] = abstract
+            #Default filtering is 'lenient'.
+            else:
+                #Else and if are separated for readability and to better understand logical flow.
+                if set(filter_terms).intersection(set(word_tokenize(abstract))):
+                    pmid_abs[pmid] = abstract
+    print('Found',len(pmids),'PMIDs. Gathered',len(pmid_abs),'Relevant Abstracts.')
+    return pmid_abs
 # Generate predictions for a PubMed Id
 # nlp: en_core_web_lg
 # nlpSci: en_ner_bc5cdr_md