Spaces:

ncats
/

EpiPipeline4RD

Sleeping

App Files Files Community

wzkariampuzha committed on Mar 24, 2022

Commit

e517955

1 Parent(s): d5406d4

Update classify_abs.py

Browse files

Files changed (1) hide show

classify_abs.py +61 -80

classify_abs.py CHANGED Viewed

@@ -277,103 +277,84 @@ def search_getAbs(searchterm_list:Union[List[str],List[int],str], maxResults:int
     return pmid_abs
 def streamlist_getAbs(searchterm_list:Union[List[str],List[int],str], maxResults:int, filtering:str) -> Dict[str,str]:
-    #set of all pmids
     pmids = set()
-    #dictionary {pmid:abstract}
     pmid_abs = {}
-    #type validation, allows string or list input
     if type(searchterm_list)!=list:
         if type(searchterm_list)==str:
             searchterm_list = [searchterm_list]
         else:
             searchterm_list = list(searchterm_list)
-    my_bar = st.progress(0)
-    percent_by_step = 100/maxResults
-    #gathers pmids into a set first
-    for dz in searchterm_list:
-        term = ''
-        dz_words = dz.split()
-        for word in dz_words:
-            term += word + '%20'
-        query = term[:-3]
-        ## get pmid results from searching for disease name through PubMed API
-        url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term='+query
-        r = requests.get(url)
-        root = ET.fromstring(r.content)
-        # loop over resulting articles
-        for result in root.iter('IdList'):
-            if len(pmids) >= maxResults:
-                break
-            pmidlist = [pmid.text for pmid in result.iter('Id')]
-            pmids.update(pmidlist)
-        ## get results from searching for disease name through EBI API
-        url = 'https://www.ebi.ac.uk/europepmc/webservices/rest/search?query='+query+'&resulttype=core'
-        r = requests.get(url)
-        root = ET.fromstring(r.content)
-        # loop over resulting articles
-        for result in root.iter('result'):
-            if len(pmids) >= maxResults:
-                break
-            pmidlist = [pmid.text for pmid in result.iter('id')]
-            #can also gather abstract and title here but for some reason did not work as intended the first time. Optimize in future versions to reduce latency.
-            if len(pmidlist) > 0:
-                pmid = pmidlist[0]
-                if pmid[0].isdigit():
-                    pmids.add(pmid)
-    #Construct sets for filtering (right before adding abstract to pmid_abs
-    # The purpose of this is to do a second check of the abstracts, filters out any abstracts unrelated to the search terms
-    #if filtering is 'lenient' or default
-    if filtering !='none' or filtering !='strict':
-        filter_terms = set(searchterm_list).union(set(str(re.sub(',','',' '.join(searchterm_list))).split()).difference(STOPWORDS))
-        '''
-        # The above is equivalent to this but uses less memory and may be faster:
-        #create a single string of the terms within the searchterm_list
-        joined = ' '.join(searchterm_list)
-        #remove commas
-        comma_gone = re.sub(',','',joined)
-        #split the string into list of words and convert list into a Pythonic set
-        split = set(comma_gone.split())
-        #remove the STOPWORDS from the set of key words
-        key_words = split.difference(STOPWORDS)
-        #create a new set of the list members in searchterm_list
-        search_set = set(searchterm_list)
-        #join the two sets
-        terms = search_set.union(key_words)
-        #if any word(s) in the abstract intersect with any of these terms then the abstract is good to go.
-        '''
-    ## get abstracts from EBI PMID API and output a dictionary
-    for pmid in pmids:
-        abstract = PMID_getAb(pmid)
-        if len(abstract)>5:
-            #do filtering here
-            if filtering == 'strict':
-                uncased_ab = abstract.lower()
-                for term in searchterm_list:
-                    if term.lower() in uncased_ab:
-                        pmid_abs[pmid] = abstract
-                        break
-            elif filtering =='none':
-                pmid_abs[pmid] = abstract
-            #Default filtering is 'lenient'.
-            else:
-                #Else and if are separated for readability and to better understand logical flow.
-                if set(filter_terms).intersection(set(word_tokenize(abstract))):
                     pmid_abs[pmid] = abstract
-    print('Found',len(pmids),'PMIDs. Gathered',len(pmid_abs),'Relevant Abstracts.')
     return pmid_abs

     return pmid_abs
# This is a Streamlit version of search_getAbs. Refer to search_getAbs for documentation.
def streamlist_getAbs(searchterm_list:Union[List[str],List[int],str], maxResults:int, filtering:str) -> Dict[str,str]:
    """Search for PMIDs, fetch their abstracts and report progress in Streamlit.

    For every search term the NCBI esearch endpoint and the Europe PMC search
    endpoint are queried and the returned PMIDs are pooled in a set. Each PMID
    is then resolved to an abstract with ``PMID_getAb`` and kept or dropped
    according to ``filtering``. Two ``st.progress`` bars and two ``st.success``
    messages show the state of the PMID search and of the abstract gathering.

    Args:
        searchterm_list: One search term, or an iterable of search terms.
            Non-string terms are converted with ``str``.
        maxResults: Soft cap on the number of PMIDs collected. The cap is
            checked before each batch is added, so the final count can
            exceed it.
        filtering: ``'strict'`` keeps an abstract only if it contains one of
            the full search terms (case-insensitive substring match);
            ``'none'`` keeps every abstract; any other value is treated as
            ``'lenient'`` and keeps an abstract if any of its tokens matches a
            search term or a non-stopword word taken from the search terms.

    Returns:
        Dictionary mapping PMID to abstract for every abstract that is longer
        than 5 characters and passes the filter.
    """
    # Local import so the block does not depend on a file-level import.
    from urllib.parse import quote

    def _bar_value(count, per_item):
        # st.progress only accepts floats in [0.0, 1.0]; the counts can
        # overshoot maxResults, so the fraction is clamped.
        return min(1.0, round(count * per_item, 1))

    # Type validation: allow a single string or any iterable of terms.
    if isinstance(searchterm_list, str):
        searchterm_list = [searchterm_list]
    searchterm_list = [str(term) for term in searchterm_list]

    pmids = set()
    pmid_abs = {}

    # Avoid ZeroDivisionError for maxResults <= 0 (the loops below then
    # collect nothing and an empty dictionary is returned).
    step_base = max(maxResults, 1)

    # maxResults is padded a little because the number of results returned
    # is sometimes larger than maxResults.
    percent_by_step = 1/(step_base*1.25)
    PMIDs_bar = st.progress(0)
    for dz in searchterm_list:
        # Collapse whitespace and percent-encode the whole term so that
        # characters such as '&' or '#' cannot break the query string.
        query = quote(' '.join(dz.split()), safe='')

        # PMIDs from the PubMed (NCBI esearch) API.
        url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term='+query
        r = requests.get(url)
        root = ET.fromstring(r.content)
        for result in root.iter('IdList'):
            if len(pmids) >= maxResults:
                break
            pmids.update(node.text for node in result.iter('Id') if node.text)
            PMIDs_bar.progress(_bar_value(len(pmids), percent_by_step))

        # PMIDs from the Europe PMC (EBI) API.
        url = 'https://www.ebi.ac.uk/europepmc/webservices/rest/search?query='+query+'&resulttype=core'
        r = requests.get(url)
        root = ET.fromstring(r.content)
        for result in root.iter('result'):
            if len(pmids) >= maxResults:
                break
            pmidlist = [node.text for node in result.iter('id')]
            if len(pmidlist) > 0:
                pmid = pmidlist[0]
                # Only ids that start with a digit are kept; empty or
                # missing id text is skipped.
                if pmid and pmid[0].isdigit():
                    pmids.add(pmid)
                    PMIDs_bar.progress(_bar_value(len(pmids), percent_by_step))
    st.success(f'Found {len(pmids)} PMIDs. Gathering Abstracts and Filtering...')

    abstracts_bar = st.progress(0)
    percent_by_step = 1/step_base

    # The filter terms are only needed for the default 'lenient' mode: the
    # full search terms plus every individual non-stopword word in them.
    lenient = filtering not in ('none', 'strict')
    if lenient:
        words = set(' '.join(searchterm_list).replace(',', '').split())
        filter_terms = set(searchterm_list).union(words.difference(STOPWORDS))
    lowered_terms = [term.lower() for term in searchterm_list]

    for pmid in pmids:
        abstract = PMID_getAb(pmid)
        if len(abstract) <= 5:
            continue
        if filtering == 'strict':
            uncased_ab = abstract.lower()
            keep = any(term in uncased_ab for term in lowered_terms)
        elif filtering == 'none':
            keep = True
        else:
            # Default filtering is 'lenient'.
            keep = not filter_terms.isdisjoint(word_tokenize(abstract))
        if keep:
            pmid_abs[pmid] = abstract
            abstracts_bar.progress(_bar_value(len(pmid_abs), percent_by_step))

    st.success(f'Found {len(pmids)} PMIDs. Gathered {len(pmid_abs)} Relevant Abstracts.')
    return pmid_abs