File size: 4,078 Bytes
0fd6af9
 
 
58ee780
 
0fd6af9
44562bb
939362b
44562bb
f1d4807
 
44562bb
f1d4807
 
 
 
 
 
 
 
 
 
 
 
 
 
 
939362b
 
f1d4807
 
58ee780
f1d4807
 
 
 
 
 
 
 
 
58ee780
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f1d4807
58ee780
 
 
f1d4807
 
 
 
58ee780
 
f1d4807
58ee780
 
 
 
f1d4807
 
 
 
 
939362b
 
 
 
58ee780
 
 
 
 
 
 
 
 
939362b
f1d4807
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# -*- coding: utf-8 -*-
import os

# os.system("pip3 install torch==1.10.1+cpu torchvision==0.11.2+cpu torchaudio==0.10.1+cpu -f "
#           "https://download.pytorch.org/whl/cpu/torch_stable.html")

import gradio as gr
from transformers import pipeline

import spacy
from spacy import displacy

# Numeric class id -> BIO tag name. postprocess() looks up each token's
# 'entity' value here; id 0 (mapped to the string '0') marks tokens that are
# not part of any entity and is skipped there.
ner_map = {
    0: '0',
    1: 'B-OSOBA',
    2: 'I-OSOBA',
    3: 'B-ORGANIZÁCIA',
    4: 'I-ORGANIZÁCIA',
    5: 'B-LOKALITA',
    6: 'I-LOKALITA',
}

# Options handed to displacy.render(): the entity labels to draw and the
# background colour used for each of them.
options = {
    "ents": ["OSOBA", "ORGANIZÁCIA", "LOKALITA"],
    "colors": {
        "OSOBA": "lightblue",
        "ORGANIZÁCIA": "lightcoral",
        "LOKALITA": "lightgreen",
    },
}

# Token-classification pipeline that produces the per-token predictions.
ner_pipeline = pipeline(task='ner', model="crabz/slovakbert-ner")
# Blank Slovak spaCy pipeline; only used to build Doc objects for displaCy.
nlp = spacy.blank("sk")


def _merge_touching_entities(entities):
    """Fuse same-label spans that touch each other into single spans.

    Two spans "touch" when they are different tuples, share a label, and one
    ends exactly where the other starts. Touching is applied transitively, so
    a chain A|B|C collapses into one span instead of two overlapping ones.

    Returns the untouched spans in their original order followed by the merged
    spans (ordered by the position of their first member).
    """
    count = len(entities)
    # Disjoint-set forest over entity indices; a root is always the smallest
    # index of its group, which keeps the output order deterministic.
    parent = list(range(count))

    def find(node):
        while parent[node] != node:
            parent[node] = parent[parent[node]]  # path halving
            node = parent[node]
        return node

    for i in range(count):
        for j in range(i + 1, count):
            first, second = entities[i], entities[j]
            touching = first[2] == second[1] or first[1] == second[2]
            if first != second and first[0] == second[0] and touching:
                root_i, root_j = find(i), find(j)
                if root_i != root_j:
                    parent[max(root_i, root_j)] = min(root_i, root_j)

    groups = {}
    for index in range(count):
        groups.setdefault(find(index), []).append(index)

    untouched = []
    merged = []
    for members in groups.values():
        if len(members) == 1:
            untouched.append(entities[members[0]])
            continue
        label = entities[members[0]][0]
        start = min(entities[m][1] for m in members)
        end = max(entities[m][2] for m in members)
        merged.append((label, start, end))
    return untouched + merged


def postprocess(classifications):
    """Turn per-token predictions into ``(label, start, end)`` entity spans.

    Args:
        classifications: sequence of dicts, each with an ``'entity'`` key
            (a numeric id looked up in ``ner_map``; ``0`` means "no entity")
            and ``'start'`` / ``'end'`` character offsets.

    Returns:
        A list of ``(label, start, end)`` tuples, where ``label`` is the part
        of the tag after ``'B-'``. A span opens at every ``B-`` token and
        extends over the ``I-`` tokens that immediately follow it. Same-label
        spans that touch (one ends where the next starts) are merged and placed
        after the spans that needed no merging.

    Note:
        Chains of three or more touching spans are merged into one span.
        Previously the shared middle span was queued for removal twice, which
        raised ``ValueError`` from ``list.remove``.
    """
    entities = []
    total = len(classifications)
    for index, token in enumerate(classifications):
        if token['entity'] == 0:
            continue
        tag = ner_map[token['entity']]
        if tag[0] != 'B':
            continue
        # Extend the span over every directly following I- token.
        last = index
        while last + 1 < total and ner_map[classifications[last + 1]['entity']][0] == 'I':
            last += 1
        entities.append((tag.split('-')[1], token['start'], classifications[last]['end']))
    return _merge_touching_entities(entities)


def set_entities(sentence, entities):
    """Build a spaCy ``Doc`` for ``sentence`` with the given entity spans set.

    Args:
        sentence: the raw text to tokenize with the module-level ``nlp``.
        entities: iterable of ``(label, start_char, end_char)`` tuples.

    Returns:
        The ``Doc`` whose ``ents`` are the spans that could be mapped onto
        its tokens.
    """
    doc = nlp(sentence)
    spans = []
    for entity in entities:
        label, start, end = entity[0], entity[1], entity[2]
        span = doc.char_span(start, end, label)
        # char_span yields None when the character offsets do not line up
        # with token boundaries; assigning None to doc.ents would raise, so
        # such spans are skipped instead of failing the whole request.
        if span is not None:
            spans.append(span)
    doc.ents = spans
    return doc


def apply_ner(sentence: str):
    """Run the NER pipeline on ``sentence`` and render the result with displaCy.

    The token predictions from ``ner_pipeline`` are condensed into entity
    spans by ``postprocess``, attached to a spaCy ``Doc`` by ``set_entities``,
    and passed to ``displacy.render`` in ``"ent"`` style with the module-level
    ``options``. The rendered markup is returned.
    """
    spans = postprocess(ner_pipeline(sentence))
    annotated_doc = set_entities(sentence, spans)
    return displacy.render(annotated_doc, style="ent", options=options)


# Gradio UI: one text box in, displaCy HTML out, with a few Slovak example
# sentences the visitor can click on.
intf = gr.Interface(
    fn=apply_ner,
    inputs="text",
    outputs="html",
    title='Slovak Named Entity Recognition',
    description="Named-entity recognition (NER) labels named-entities in unstructured text. This "
                "implementation supports three labels: person (OSOBA), organization (ORGANIZÁCIA) and "
                "location (LOKALITA). You can try out one of the examples below or type your own "
                "sentence. Don't forget to use double quotes (\" \") instead of curved quotes („ “)",
    article="",
    examples=[
        ["Laboratóriá Úradu verejného zdravotníctva sekvenovaním potvrdili výskyt ďalších "
         "štyroch prípadov variantu omikron na Slovensku."],
        ["Čaputová opakovane tvrdí, že \"spravodlivosť na Slovensku neplatí vždy pre všetkých "
         "rovnako\"."],
        ["Minister financií a líder mandátovo najsilnejšieho hnutia OĽaNO Igor Matovič "
         "upozorňuje, že následky tretej vlny budú na Slovensku veľmi veľké."],
        ["Začiatkom roka sa objavili nezhody medzi Richardom Sulíkom a šéfom hnutia OĽANO "
         "Igorom Matovičom, ktoré v istej miere pretrvávajú aj dodnes."],
    ],
    allow_flagging=False,
)
# Start the web server for the demo.
intf.launch()