mvy committed on
Commit
c1db962
·
1 Parent(s): 545b919
Files changed (5) hide show
  1. .gitignore +1 -0
  2. README.md +2 -0
  3. app.py +32 -0
  4. ner.py +78 -0
  5. requirements.txt +2 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ __pycache__
README.md CHANGED
@@ -8,6 +8,8 @@ sdk_version: 4.14.0
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
 
 
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
+ models:
12
+ - knowledgator/UTC-DeBERTa-small
13
  ---
14
 
15
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr

from ner import NER

# Example inputs shown below the Gradio UI: one (labels, text) pair.
# The labels string is a comma-separated list of entity classes.
examples = [
    [
        'scientist, university, city',
        ('Dr. Paul Hammond, a renowned neurologist at'
         ' Johns Hopkins University, has recently published'
         ' a paper in the prestigious journal "Nature Neuroscience".'
         ' His research focuses on a rare genetic mutation, found'
         ' in less than 0.01% of the population, that appears to'
         ' prevent the development of Alzheimer\'s disease.'
         ' Collaborating with researchers at the University'
         ' of California, San Francisco, the team is now working'
         ' to understand the mechanism by which this mutation'
         ' confers its protective effect.\n'
         'Funded by the National Institutes of Health, their'
         ' research could potentially open new avenues for'
         ' Alzheimer\'s treatment.')
    ],
]

# Inputs map positionally onto NER.ner(labels, text, treshold):
#   1. plain text box for the comma-separated entity classes,
#   2. text box for the document to analyse,
#   3. minimum confidence score for an entity to be shown.
# Fix: the score label was misspelled "treshold" in the original.
gradio_app = gr.Interface(
    NER.ner,
    inputs=['text', gr.Textbox(placeholder="Enter sentence here..."), gr.Number(value=0.0, label="threshold")],
    outputs=[gr.HighlightedText()],
    examples=examples,
)

if __name__ == "__main__":
    gradio_app.launch()
ner.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
2
+ import spacy
3
+ import torch
4
+
5
# spaCy is used purely for sentence segmentation, so the heavy pipeline
# components are disabled and only the rule-based sentencizer is added.
nlp = spacy.load(
    'en_core_web_sm',
    disable=['lemmatizer', 'parser', 'tagger', 'ner'],
)
nlp.add_pipe('sentencizer')

# Prefer the first CUDA device when available, otherwise fall back to CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
11
class NER:
    """Prompt-based zero-shot NER with a UTC-DeBERTa token-classification model.

    For every requested entity class, the class name is substituted into
    ``prompt`` and prepended to each chunk of the input text; entity spans
    returned by the HF ``ner`` pipeline are then shifted back into the
    character coordinates of the original text.
    """

    model_name = 'knowledgator/UTC-DeBERTa-small'
    # `{}` is replaced with a single entity class name in `ner()`.
    prompt = """
Identify entities in the text having the following classes:
{}

Text:
"""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name)
    ner_pipeline = pipeline(
        "ner",
        model=model,
        tokenizer=tokenizer,
        aggregation_strategy='first',  # merge sub-tokens into word-level spans
        batch_size=12,
        device=device,
    )

    @classmethod
    def chunkanize(cls, text, prompt_='', n_sents=10):
        """Split *text* into chunks of up to *n_sents* sentences each.

        Every chunk is prefixed with *prompt_*.  Returns ``(chunks, starts)``
        where ``starts[i]`` is the character offset in *text* at which the
        i-th chunk's payload (excluding the prompt) begins.
        """
        doc = nlp(text)
        chunks = []
        starts = []
        start = 0
        end = 0
        open_chunk = False  # True while a chunk is being accumulated
        for idx, sent in enumerate(doc.sents, start=1):
            if not open_chunk:
                start = sent[0].idx
                starts.append(start)
                open_chunk = True
            end = sent[-1].idx + len(sent[-1].text)
            if idx % n_sents == 0:
                chunks.append(prompt_ + text[start:end])
                open_chunk = False
        if open_chunk:
            # Flush the trailing chunk of fewer than n_sents sentences.
            chunks.append(prompt_ + text[start:end])
        return chunks, starts

    @classmethod
    def ner(cls, labels, text, treshold=0.):
        """Extract entities of the comma-separated *labels* classes from *text*.

        ``treshold`` (sic — parameter name kept for interface compatibility)
        drops entities whose score is not strictly greater than the value.
        Returns a ``{"text": ..., "entities": [...]}`` dict consumable by
        ``gr.HighlightedText``.
        """
        chunks, starts, classes = [], [], []
        label2prompt_len = {}
        # Robust label parsing: accept both "a, b" and "a,b" (the original
        # split only on ", ") and ignore empty segments.
        for label in (part.strip() for part in labels.split(',') if part.strip()):
            prompt_ = cls.prompt.format(label)
            label2prompt_len[label] = len(prompt_)
            curr_chunks, curr_starts = cls.chunkanize(text, prompt_)
            chunks += curr_chunks
            starts += curr_starts
            classes += [label] * len(curr_chunks)
        outputs = []
        for idx, output in enumerate(cls.ner_pipeline(chunks)):
            label = classes[idx]
            # Pipeline offsets are relative to (prompt + chunk); shift them
            # back into the coordinates of the full input text.
            offset = starts[idx] - label2prompt_len[label]
            for ent in output:
                if ent['score'] > treshold:
                    ent['start'] += offset
                    ent['end'] += offset
                    ent['entity'] = label
                    outputs.append(ent)
        return {"text": text, "entities": outputs}
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ spacy
2
+ https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl
3
+ transformers
4
+ torch