import logging
import os
from typing import List, Tuple
import gradio as gr
import pandas as pd
import spacy
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer
# Load the Portuguese spaCy pipeline used for pre-tokenization; download it on first run.
try:
    nlp = spacy.load("pt_core_news_sm")
except Exception:
    os.system("python -m spacy download pt_core_news_sm")
    nlp = spacy.load("pt_core_news_sm")
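
# Token-classification checkpoint and matching tokenizer; judging by the model id,
# this is a news-domain Portuguese POS tagger (an assumption based on the name only).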
model = AutoModelForTokenClassification.from_pretrained("Emanuel/porttagger-news-base")
tokenizer = AutoTokenizer.from_pretrained("Emanuel/porttagger-news-base")
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
def predict(text, nlp, logger=None) -> Tuple[List[str], List[str], List[str]]:
    doc = nlp(text)
    tokens = [token.text for token in doc]

    if logger is not None:
        logger.info("Starting predictions for sentence: {}".format(text))

    input_tokens = tokenizer(
        tokens,
        return_tensors="pt",
        is_split_into_words=True,
        return_offsets_mapping=True,
        return_special_tokens_mask=True,
    )
    output = model(input_tokens["input_ids"])

    i_token = 0
    labels = []
    scores = []
    for off, is_special_token, pred in zip(
        input_tokens["offset_mapping"][0],
        input_tokens["special_tokens_mask"][0],
        output.logits[0],
    ):
        # Skip special tokens and subword continuations: only the first piece
        # of each word carries that word's label.
        if is_special_token or off[0] > 0:
            continue
        label = model.config.id2label[int(pred.argmax(dim=-1))]
        if logger is not None:
            logger.info("{}, {}, {}".format(off, tokens[i_token], label))
        labels.append(label)
        scores.append(
            "{:.2f}".format(100 * float(torch.softmax(pred, dim=-1).detach().max()))
        )
        i_token += 1
    return tokens, labels, scores


def text_analysis(text):
    tokens, labels, scores = predict(text, nlp, logger)
    pos_count = pd.DataFrame(
        {
            "token": tokens,
            "etiqueta": labels,
            "confiança": scores,
        }
    )
    pos_tokens = []
    for token, label in zip(tokens, labels):
        pos_tokens.extend([(token, label), (" ", None)])

    return {
        output_highlighted: output_highlighted.update(visible=True, value=pos_tokens),
        output_df: output_df.update(visible=True, value=pos_count),
    }
css = open("style.css").read()
top_html = open("top.html").read()
bottom_html = open("bottom.html").read()
with gr.Blocks(css=css) as demo:
    gr.HTML(top_html)
    text = gr.Textbox(placeholder="Enter your text here...", label="Input")
    examples = gr.Examples(
        examples=[
            [
                "A população não poderia ter acesso a relatórios que explicassem, por exemplo, os motivos exatos de atrasos em obras de linhas e estações."
            ],
            ["Filme 'Star Wars : Os Últimos Jedi' ganha trailer definitivo; assista."],
        ],
        inputs=[text],
        label="Select an example",
    )
    output_highlighted = gr.HighlightedText(label="Colorful output", visible=False)
    output_df = gr.Dataframe(label="Tabular output", visible=False)
    submit_btn = gr.Button("Send")
    submit_btn.click(
        fn=text_analysis, inputs=text, outputs=[output_highlighted, output_df]
    )
    gr.HTML(bottom_html)

demo.launch(debug=True)
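
# A minimal sketch of using the tagger without the web UI (illustrative sentence;
# assumes the model, tokenizer, and spaCy pipeline loaded above):
#
#     tokens, labels, scores = predict("O rato roeu a roupa do rei de Roma.", nlp)
#     for token, label, score in zip(tokens, labels, scores):
#         print(f"{token}\t{label}\t{score}%")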