File size: 3,291 Bytes
19d4726
 
 
 
 
 
 
b29b5d8
19d4726
 
 
 
 
 
 
 
6795b3b
 
19d4726
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b29b5d8
19d4726
 
 
 
 
 
 
 
 
 
 
b29b5d8
 
 
19d4726
 
b29b5d8
19d4726
 
 
b29b5d8
19d4726
 
 
 
b29b5d8
19d4726
 
 
 
 
 
6795b3b
 
 
 
 
 
 
19d4726
 
 
 
 
 
 
 
6795b3b
19d4726
 
 
 
 
 
 
 
6795b3b
 
 
 
 
 
 
19d4726
 
 
 
6795b3b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import logging
import os
import subprocess
import sys
from pathlib import Path
from typing import List, Tuple

import gradio as gr
import pandas as pd
import spacy
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer

# Load the Portuguese spaCy pipeline used for tokenization. spacy.load raises
# OSError when the model package is not installed, so catch exactly that,
# download the model with the *current* interpreter, and retry once.
try:
    nlp = spacy.load("pt_core_news_sm")
except OSError:
    # sys.executable guarantees the same Python environment; check=True makes
    # a failed download fail loudly instead of crashing later on load.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "pt_core_news_sm"],
        check=True,
    )
    nlp = spacy.load("pt_core_news_sm")

# Pre-trained Portuguese POS tagger (news domain) and its matching tokenizer.
model = AutoModelForTokenClassification.from_pretrained("Emanuel/porttagger-news-base")
tokenizer = AutoTokenizer.from_pretrained("Emanuel/porttagger-news-base")
# Root logger at DEBUG — passed explicitly into predict() by text_analysis().
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)


def predict(text, nlp, logger=None) -> Tuple[List[str], List[str], List[str]]:
    """Tag a sentence with part-of-speech labels.

    Args:
        text: The sentence to tag.
        nlp: A spaCy pipeline; only its tokenization is used here.
        logger: Optional logger; when given, per-token predictions are
            logged at INFO level.

    Returns:
        A 3-tuple ``(tokens, labels, scores)``: the spaCy token texts, one
        predicted tag per token, and the model's confidence for each tag
        formatted as a percentage string with two decimals.
    """
    doc = nlp(text)
    tokens = [token.text for token in doc]

    # Guard: logger defaults to None, so never call it unconditionally.
    if logger is not None:
        logger.info("Starting predictions for sentence: %s", text)

    input_tokens = tokenizer(
        tokens,
        return_tensors="pt",
        is_split_into_words=True,
        return_offsets_mapping=True,
        return_special_tokens_mask=True,
    )
    # Inference only — disable autograd to save memory and time.
    with torch.no_grad():
        output = model(input_tokens["input_ids"])

    i_token = 0
    labels = []
    scores = []
    for off, is_special_token, pred in zip(
        input_tokens["offset_mapping"][0],
        input_tokens["special_tokens_mask"][0],
        output.logits[0],
    ):
        # Skip special tokens ([CLS]/[SEP]) and non-initial sub-word pieces
        # (offset start > 0) so exactly one prediction survives per token.
        if is_special_token or off[0] > 0:
            continue
        # Use the public config attribute rather than poking __dict__.
        label = model.config.id2label[int(pred.argmax(axis=-1))]
        if logger is not None:
            logger.info("%s, %s, %s", off, tokens[i_token], label)
        labels.append(label)
        scores.append(
            "{:.2f}".format(100 * float(torch.softmax(pred, dim=-1).max()))
        )
        i_token += 1

    return tokens, labels, scores


def text_analysis(text):
    """Tag *text* and build the two Gradio output updates.

    Returns a mapping from the output components to updates that make them
    visible and fill in the highlighted tokens and the confidence table.
    """
    tokens, labels, scores = predict(text, nlp, logger)
    pos_count = pd.DataFrame(
        {
            "token": tokens,
            "etiqueta": labels,
            "confiança": scores,
        }
    )
    # Interleave an unlabeled space after each (token, label) pair so the
    # HighlightedText component renders tokens separated by whitespace.
    pos_tokens = []
    for token, label in zip(tokens, labels):
        pos_tokens.extend([(token, label), (" ", None)])

    # NOTE: the original also called component.update(visible=True) here and
    # discarded the result — a no-op, removed. Visibility is set in the
    # returned updates below.
    return {
        output_highlighted: output_highlighted.update(visible=True, value=pos_tokens),
        output_df: output_df.update(visible=True, value=pos_count),
    }


# Static page assets, read once at startup. Path.read_text closes the file
# handle immediately (the original open(...).read() leaked it) and pins the
# encoding so non-ASCII content renders the same on every platform.
css = Path("style.css").read_text(encoding="utf-8")
top_html = Path("top.html").read_text(encoding="utf-8")
bottom_html = Path("bottom.html").read_text(encoding="utf-8")

# Assemble the demo page. Component creation order inside the Blocks context
# determines on-page layout, so the statement order below is significant.
with gr.Blocks(css=css) as demo:
    gr.HTML(top_html)
    # Free-text input that feeds text_analysis on submit.
    text = gr.Textbox(placeholder="Enter your text here...", label="Input")
    # Clickable example sentences that populate the textbox above.
    examples = gr.Examples(
        examples=[
            [
                "A população não poderia ter acesso a relatórios que explicassem, por exemplo, os motivos exatos de atrasos em obras de linhas e estações."
            ],
            ["Filme 'Star Wars : Os Últimos Jedi' ganha trailer definitivo; assista."],
        ],
        inputs=[text],
        label="Select an example",
    )
    # Both outputs start hidden; text_analysis returns updates that reveal
    # them together with their values.
    output_highlighted = gr.HighlightedText(label="Colorful output", visible=False)
    output_df = gr.Dataframe(label="Tabular output", visible=False)
    submit_btn = gr.Button("Send")
    submit_btn.click(
        fn=text_analysis, inputs=text, outputs=[output_highlighted, output_df]
    )
    gr.HTML(bottom_html)


demo.launch(debug=True)