File size: 5,055 Bytes
0fd6af9 dad5c8f 0fd6af9 44562bb 939362b 44562bb f1d4807 44562bb f1d4807 939362b f1d4807 58ee780 f1d4807 6cef16b 58ee780 6cef16b 58ee780 f1d4807 58ee780 f1d4807 d12ceca f1d4807 58ee780 f1d4807 d12ceca 58ee780 d12ceca f1d4807 939362b 58ee780 6cef16b 58ee780 6cef16b 58ee780 d12ceca f1d4807 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 |
# -*- coding: utf-8 -*-
import os
# NOTE(review): installing torch at import time via os.system is a Hugging
# Face Spaces workaround; prefer pinning these in requirements.txt and using
# subprocess.run with an argument list — confirm the Space still needs this.
os.system("pip3 install torch==1.10.1+cpu torchvision==0.11.2+cpu torchaudio==0.10.1+cpu -f "
"https://download.pytorch.org/whl/cpu/torch_stable.html")
import gradio as gr
from transformers import pipeline
import spacy
from spacy import displacy
# Model class id -> BIO tag for the three supported entity types
# (person, organization, location). Id 0 is the "outside" class.
ner_map = {0: '0',
1: 'B-OSOBA',
2: 'I-OSOBA',
3: 'B-ORGANIZÁCIA',
4: 'I-ORGANIZÁCIA',
5: 'B-LOKALITA',
6: 'I-LOKALITA'}
# displaCy render options: which entity labels to show and their colors.
options = {"ents": ["OSOBA",
"ORGANIZÁCIA",
"LOKALITA"],
"colors": {"OSOBA": "lightblue",
"ORGANIZÁCIA": "lightcoral",
"LOKALITA": "lightgreen"}}
# Token-classification pipeline for the fine-tuned SlovakBERT NER model.
ner_pipeline = pipeline(task='ner', model="crabz/slovakbert-ner")
# Blank Slovak pipeline used only for tokenization when building Docs.
nlp = spacy.blank("sk")
def postprocess(classifications, label_map=None):
    """Convert token-level NER pipeline output into merged character spans.

    Args:
        classifications: list of dicts as produced by a transformers ``ner``
            pipeline; each dict has at least ``entity`` (int class id),
            ``start`` and ``end`` (character offsets).
        label_map: optional mapping from class id to BIO tag string
            (e.g. ``{1: 'B-OSOBA', 2: 'I-OSOBA'}``). Defaults to the
            module-level ``ner_map``.

    Returns:
        List of ``(label, start, end)`` tuples, sorted by position, with
        touching same-label spans merged into one.
    """
    if label_map is None:
        label_map = ner_map

    entities = []
    n = len(classifications)
    for i, token in enumerate(classifications):
        if token['entity'] == 0:
            continue
        tag = label_map[token['entity']]
        if tag[0] != 'B':
            # 'I-' tokens are absorbed by the inner scan below; a stray
            # 'I-' without a preceding 'B-' is ignored, as before.
            continue
        # Extend the span across the following 'I-' continuation tokens.
        j = i + 1
        while j < n and label_map[classifications[j]['entity']][0] == 'I':
            j += 1
        entities.append((tag.split('-')[1], token['start'],
                         classifications[j - 1]['end']))

    # Merge same-label spans that touch (one span's end == the next span's
    # start).  Sorting by start lets a single linear pass replace the
    # original quadratic scan-and-restart loop with remove-by-value.
    entities.sort(key=lambda ent: (ent[1], ent[2]))
    merged = []
    for label, start, end in entities:
        if merged and merged[-1][0] == label and merged[-1][2] == start:
            merged[-1] = (label, merged[-1][1], end)
        else:
            merged.append((label, start, end))
    return merged
def set_entities(sentence, entities):
    """Build a spaCy Doc for *sentence* with *entities* attached.

    Args:
        sentence: raw input text.
        entities: iterable of ``(label, start, end)`` character spans.

    Returns:
        A ``Doc`` whose ``ents`` are the spans that align with token
        boundaries.
    """
    doc = nlp(sentence)
    ents = []
    for label, start, end in entities:
        span = doc.char_span(start, end, label=label)
        # Doc.char_span returns None when (start, end) does not align with
        # token boundaries; skip those instead of letting a None entry make
        # the ``doc.ents`` assignment below raise.
        if span is not None:
            ents.append(span)
    doc.ents = ents
    return doc
def apply_ner(Sentence: str):
    """Run NER over *Sentence* and return displaCy entity-highlight HTML."""
    token_classifications = ner_pipeline(Sentence)
    spans = postprocess(token_classifications)
    annotated_doc = set_entities(Sentence, spans)
    return displacy.render(annotated_doc, style="ent", options=options)
# Gradio UI: one text input -> rendered displaCy HTML, with canned examples.
# NOTE(review): newer Gradio releases expect allow_flagging="never" rather
# than False; the boolean form works only on the legacy version — confirm.
intf = gr.Interface(fn=apply_ner, inputs="text", outputs="html", title='Slovak Named Entity Recognition',
allow_flagging=False,
examples=[["Laboratóriá Úradu verejného zdravotníctva sekvenovaním potvrdili výskyt ďalších "
"štyroch prípadov variantu omikron na Slovensku."],
["Čaputová opakovane tvrdí, že \"spravodlivosť na Slovensku neplatí vždy pre všetkých "
"rovnako\"."],
["Informácie o týchto veľkolepých plánoch prišli týždeň po tom, ako sa japonský "
"miliardár Jusaku Maezawa vrátil z 12-dňového pobytu na Medzinárodnej vesmírnej stanici "
"(ISS), čím sa stal prvým vesmírnym turistom, ktorý cestoval na ISS za viac ako desať "
"rokov."],
["Minister financií a líder mandátovo najsilnejšieho hnutia OĽaNO Igor Matovič "
"upozorňuje, že následky tretej vlny budú na Slovensku veľmi veľké."],
["Začiatkom roka 2021 sa objavili nezhody medzi Richardom Sulíkom a šéfom hnutia OĽANO "
"Igorom Matovičom, ktoré v istej miere pretrvávajú aj dodnes."]],
description="Named-entity recognition (NER) labels named-entities in unstructured text. This "
"implementation supports three labels: person (OSOBA), organization (ORGANIZÁCIA) and "
"location (LOKALITA). You can try out one of the examples below or type your own "
"sentence. Don't forget to use double quotes (\" \") instead of curved quotes („ “).",
article="This model is a fine-tuned version of [gerulata/slovakbert]"
"(https://huggingface.co/gerulata/slovakbert) on the Slovak wikiann dataset. It achieves "
"F1 score of 0.9398 on the evaluation set. The quote requirements comes from pretrained "
"SlovakBERT and was not introduced by me.")
# Start the Gradio server (blocks until the app is stopped).
intf.launch()
|