mvy committed on
Commit
c1db962
·
1 Parent(s): 545b919
Files changed (5) hide show
  1. .gitignore +1 -0
  2. README.md +2 -0
  3. app.py +32 -0
  4. ner.py +78 -0
  5. requirements.txt +2 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ __pycache__
README.md CHANGED
@@ -8,6 +8,8 @@ sdk_version: 4.14.0
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
 
 
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
+ models:
12
+ - knowledgator/UTC-DeBERTa-small
13
  ---
14
 
15
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr

from ner import NER

# Example inputs shown below the Gradio UI: one (labels, text) pair.
# The labels string is a comma-separated list of entity classes.
examples = [
    [
        'scientist, university, city',
        ('Dr. Paul Hammond, a renowned neurologist at'
         ' Johns Hopkins University, has recently published'
         ' a paper in the prestigious journal "Nature Neuroscience".'
         ' His research focuses on a rare genetic mutation, found'
         ' in less than 0.01% of the population, that appears to'
         ' prevent the development of Alzheimer\'s disease.'
         ' Collaborating with researchers at the University'
         ' of California, San Francisco, the team is now working'
         ' to understand the mechanism by which this mutation'
         ' confers its protective effect.\n'
         'Funded by the National Institutes of Health, their'
         ' research could potentially open new avenues for'
         ' Alzheimer\'s treatment.')
    ],
]

# Inputs map positionally onto NER.ner(labels, text, treshold):
#   1. plain text box for the comma-separated entity classes,
#   2. text box for the document to analyse,
#   3. minimum confidence score for an entity to be shown.
# Fix: the score label was misspelled "treshold" in the original.
gradio_app = gr.Interface(
    NER.ner,
    inputs=['text', gr.Textbox(placeholder="Enter sentence here..."), gr.Number(value=0.0, label="threshold")],
    outputs=[gr.HighlightedText()],
    examples=examples,
)

if __name__ == "__main__":
    gradio_app.launch()
ner.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
2
+ import spacy
3
+ import torch
4
+
5
# spaCy is used purely for sentence segmentation, so the heavy pipeline
# components are disabled and only the rule-based sentencizer is added.
nlp = spacy.load(
    'en_core_web_sm',
    disable=['lemmatizer', 'parser', 'tagger', 'ner'],
)
nlp.add_pipe('sentencizer')

# Prefer the first CUDA device when available, otherwise fall back to CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
11
class NER:
    """Prompt-based zero-shot NER with a UTC-DeBERTa token-classification model.

    For every requested entity class, the class name is substituted into
    ``prompt`` and prepended to each chunk of the input text; entity spans
    returned by the HF ``ner`` pipeline are then shifted back into the
    character coordinates of the original text.
    """

    model_name = 'knowledgator/UTC-DeBERTa-small'
    # `{}` is replaced with a single entity class name in `ner()`.
    prompt = """
Identify entities in the text having the following classes:
{}

Text:
"""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name)
    ner_pipeline = pipeline(
        "ner",
        model=model,
        tokenizer=tokenizer,
        aggregation_strategy='first',  # merge sub-tokens into word-level spans
        batch_size=12,
        device=device,
    )

    @classmethod
    def chunkanize(cls, text, prompt_='', n_sents=10):
        """Split *text* into chunks of up to *n_sents* sentences each.

        Every chunk is prefixed with *prompt_*.  Returns ``(chunks, starts)``
        where ``starts[i]`` is the character offset in *text* at which the
        i-th chunk's payload (excluding the prompt) begins.
        """
        doc = nlp(text)
        chunks = []
        starts = []
        start = 0
        end = 0
        open_chunk = False  # True while a chunk is being accumulated
        for idx, sent in enumerate(doc.sents, start=1):
            if not open_chunk:
                start = sent[0].idx
                starts.append(start)
                open_chunk = True
            end = sent[-1].idx + len(sent[-1].text)
            if idx % n_sents == 0:
                chunks.append(prompt_ + text[start:end])
                open_chunk = False
        if open_chunk:
            # Flush the trailing chunk of fewer than n_sents sentences.
            chunks.append(prompt_ + text[start:end])
        return chunks, starts

    @classmethod
    def ner(cls, labels, text, treshold=0.):
        """Extract entities of the comma-separated *labels* classes from *text*.

        ``treshold`` (sic — parameter name kept for interface compatibility)
        drops entities whose score is not strictly greater than the value.
        Returns a ``{"text": ..., "entities": [...]}`` dict consumable by
        ``gr.HighlightedText``.
        """
        chunks, starts, classes = [], [], []
        label2prompt_len = {}
        # Robust label parsing: accept both "a, b" and "a,b" (the original
        # split only on ", ") and ignore empty segments.
        for label in (part.strip() for part in labels.split(',') if part.strip()):
            prompt_ = cls.prompt.format(label)
            label2prompt_len[label] = len(prompt_)
            curr_chunks, curr_starts = cls.chunkanize(text, prompt_)
            chunks += curr_chunks
            starts += curr_starts
            classes += [label] * len(curr_chunks)
        outputs = []
        for idx, output in enumerate(cls.ner_pipeline(chunks)):
            label = classes[idx]
            # Pipeline offsets are relative to (prompt + chunk); shift them
            # back into the coordinates of the full input text.
            offset = starts[idx] - label2prompt_len[label]
            for ent in output:
                if ent['score'] > treshold:
                    ent['start'] += offset
                    ent['end'] += offset
                    ent['entity'] = label
                    outputs.append(ent)
        return {"text": text, "entities": outputs}
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ spacy
2
+ https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl
3
+ transformers
4
+ torch