Spaces:
Running
Running
wzkariampuzha
committed on
Commit
·
b892550
1
Parent(s):
32ef4d9
Update epi_pipeline.py
Browse files- epi_pipeline.py +24 -1
epi_pipeline.py
CHANGED
@@ -11,6 +11,24 @@ from typing import List, Dict, Union, Optional, Set, Tuple
|
|
11 |
# The code was compiled into a single python file to make adding additional features and importing into other modules easy.
|
12 |
# Each section has its own import statements to facilitate clean code reuse, except for typing which applies to all.
|
13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
## Section: GATHER ABSTRACTS FROM APIs
|
15 |
import requests
|
16 |
import xml.etree.ElementTree as ET
|
@@ -41,6 +59,7 @@ def PMID_getAb(PMID:Union[int,str]) -> str:
|
|
41 |
# 'strict' - the text must contain an exact match to at least one of the search terms/phrases.
|
42 |
# 'lenient' - part of the abstract must match at least one word in the search term phrases.
|
43 |
# 'none'
|
|
|
44 |
def search_getAbs(searchterm_list:Union[List[str],List[int],str], maxResults:int, filtering:str) -> Dict[str,str]:
|
45 |
#set of all pmids
|
46 |
pmids = set()
|
@@ -141,6 +160,7 @@ def search_getAbs(searchterm_list:Union[List[str],List[int],str], maxResults:int
|
|
141 |
|
142 |
#This is a streamlit version of search_getAbs. Refer to search_getAbs for documentation
|
143 |
import streamlit as st
|
|
|
144 |
def streamlit_getAbs(searchterm_list:Union[List[str],List[int],str], maxResults:int, filtering:str) -> Dict[str,str]:
|
145 |
pmids = set()
|
146 |
|
@@ -237,7 +257,7 @@ class Classify_Pipeline:
|
|
237 |
|
238 |
def __call__(self, abstract:str) -> Tuple[float,bool]:
|
239 |
return self.getTextPredictions(abstract)
|
240 |
-
|
241 |
def getTextPredictions(self, abstract:str) -> Tuple[float,bool]:
|
242 |
if len(abstract)>5:
|
243 |
# input_ids
|
@@ -318,6 +338,7 @@ class GARD_Search:
|
|
318 |
|
319 |
#Works much faster if broken down into sentences.
|
320 |
#compares every phrase in a sentence to see if it matches anything in the GARD dictionary of diseases.
|
|
|
321 |
def get_diseases(self, sentence:str) -> Tuple[List[str], List[str]]:
|
322 |
tokens = [s.lower().strip() for s in nltk_tokenize.word_tokenize(sentence)]
|
323 |
diseases = []
|
@@ -641,6 +662,7 @@ class NER_Pipeline:
|
|
641 |
return "Instantiation: pipe = NER_Pipeline(name_or_path_to_model_folder)"+"\n Calling: output_dict = pipe(text)"
|
642 |
|
643 |
#Custom pipeline by WKariampuzha @NCATS (not Huggingface/Google/NVIDIA copyright)
|
|
|
644 |
def __call__(self, text:str, rd_identify:Union[GARD_Search,None] = None):
|
645 |
output_dict = {label:[] for label in self.labels}
|
646 |
|
@@ -896,6 +918,7 @@ def streamlit_extraction(search_term:Union[int,str], maxResults:int, filtering:s
|
|
896 |
percent_at_step = 100/len(pmid_abs)
|
897 |
for pmid, abstract in pmid_abs.items():
|
898 |
epi_prob, isEpi = epi_classify(abstract)
|
|
|
899 |
if isEpi:
|
900 |
if extract_diseases:
|
901 |
extraction = epi_ner(abstract, GARD_Search)
|
|
|
11 |
# The code was compiled into a single python file to make adding additional features and importing into other modules easy.
|
12 |
# Each section has its own import statements to facilitate clean code reuse, except for typing which applies to all.
|
13 |
|
14 |
+
## SECTION: PERFORMANCE (Adding a timer decorator for functions)
|
15 |
+
# Place the @timeit decorator directly above the definition of any function or class method you want to time
|
16 |
+
# https://dev.to/kcdchennai/python-decorator-to-measure-execution-time-54hk
|
17 |
+
from functools import wraps
|
18 |
+
import time
|
19 |
+
|
20 |
+
def timeit(func):
    """Wrap *func* so that every call prints how long it took.

    The printed message has the form
    ``Function <name><args> took <seconds> seconds to execute``. When the
    call includes keyword arguments they are appended after the positional
    ones. The message is printed even if *func* raises; the exception then
    propagates to the caller unchanged.

    Args:
        func: The function or method to time.

    Returns:
        A wrapper that carries *func*'s metadata (via ``functools.wraps``)
        and returns whatever *func* returns.
    """
    @wraps(func)
    def timeit_wrapper(*args, **kwargs):
        start_time = time.perf_counter()
        try:
            return func(*args, **kwargs)
        finally:
            # Runs on normal return and on exception, so failed calls are timed too.
            total_time = time.perf_counter() - start_time
            # When a method is decorated, the first item in args (args[0]) is `self`.
            call_repr = f'{args} {kwargs}' if kwargs else f'{args}'
            print(f'Function {func.__name__}{call_repr} took {total_time:.4f} seconds to execute')
    return timeit_wrapper
|
31 |
+
|
32 |
## Section: GATHER ABSTRACTS FROM APIs
|
33 |
import requests
|
34 |
import xml.etree.ElementTree as ET
|
|
|
59 |
# 'strict' - the text must contain an exact match to at least one of the search terms/phrases.
|
60 |
# 'lenient' - part of the abstract must match at least one word in the search term phrases.
|
61 |
# 'none'
|
62 |
+
@timeit
|
63 |
def search_getAbs(searchterm_list:Union[List[str],List[int],str], maxResults:int, filtering:str) -> Dict[str,str]:
|
64 |
#set of all pmids
|
65 |
pmids = set()
|
|
|
160 |
|
161 |
#This is a streamlit version of search_getAbs. Refer to search_getAbs for documentation
|
162 |
import streamlit as st
|
163 |
+
@timeit
|
164 |
def streamlit_getAbs(searchterm_list:Union[List[str],List[int],str], maxResults:int, filtering:str) -> Dict[str,str]:
|
165 |
pmids = set()
|
166 |
|
|
|
257 |
|
258 |
def __call__(self, abstract:str) -> Tuple[float,bool]:
|
259 |
return self.getTextPredictions(abstract)
|
260 |
+
@timeit
|
261 |
def getTextPredictions(self, abstract:str) -> Tuple[float,bool]:
|
262 |
if len(abstract)>5:
|
263 |
# input_ids
|
|
|
338 |
|
339 |
#Works much faster if broken down into sentences.
|
340 |
#compares every phrase in a sentence to see if it matches anything in the GARD dictionary of diseases.
|
341 |
+
@timeit
|
342 |
def get_diseases(self, sentence:str) -> Tuple[List[str], List[str]]:
|
343 |
tokens = [s.lower().strip() for s in nltk_tokenize.word_tokenize(sentence)]
|
344 |
diseases = []
|
|
|
662 |
return "Instantiation: pipe = NER_Pipeline(name_or_path_to_model_folder)"+"\n Calling: output_dict = pipe(text)"
|
663 |
|
664 |
#Custom pipeline by WKariampuzha @NCATS (not Huggingface/Google/NVIDIA copyright)
|
665 |
+
@timeit
|
666 |
def __call__(self, text:str, rd_identify:Union[GARD_Search,None] = None):
|
667 |
output_dict = {label:[] for label in self.labels}
|
668 |
|
|
|
918 |
percent_at_step = 100/len(pmid_abs)
|
919 |
for pmid, abstract in pmid_abs.items():
|
920 |
epi_prob, isEpi = epi_classify(abstract)
|
921 |
+
print(f"Abstract with PMID: {pmid} was classified as {isEpi}")
|
922 |
if isEpi:
|
923 |
if extract_diseases:
|
924 |
extraction = epi_ner(abstract, GARD_Search)
|