Spaces:
Running
Running
import logging | |
import os | |
import tempfile | |
from pathlib import Path | |
from typing import List, Tuple | |
import gradio as gr | |
import pandas as pd | |
import spacy | |
import torch | |
from transformers import AutoModelForTokenClassification, AutoTokenizer | |
from preprocessing import expand_contractions | |
# Load the Portuguese spaCy pipeline, installing it on first run if needed.
try:
    nlp = spacy.load("pt_core_news_sm")
except OSError:
    # spaCy reports a missing pipeline package as OSError. Use spaCy's own
    # downloader rather than shelling out to "python": it installs into the
    # interpreter that is running this script and does not silently ignore
    # a failed install.
    from spacy.cli import download as spacy_download

    spacy_download("pt_core_news_sm")
    nlp = spacy.load("pt_core_news_sm")

# Token-classification model and the tokenizer published alongside it.
model = AutoModelForTokenClassification.from_pretrained("Emanuel/porttagger-news-base")
tokenizer = AutoTokenizer.from_pretrained("Emanuel/porttagger-news-base")

# The root logger is used (and lowered to DEBUG) so the per-token messages
# emitted by predict() are not filtered out by the default WARNING level.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
def predict(text, nlp, logger=None) -> Tuple[List[str], List[str], List[str]]:
    """Tag every token of *text* with the label predicted by the model.

    Args:
        text: Raw text to tag.
        nlp: Callable that turns ``text`` into an iterable of token objects
            exposing ``.text`` and ``.is_space`` (the spaCy pipeline in this
            file).
        logger: Optional logger. When ``None`` nothing is logged.

    Returns:
        A ``(tokens, labels, scores)`` tuple of parallel lists: the token
        strings, the predicted label for each token, and the highest softmax
        probability for that token as a percentage formatted with two
        decimals (e.g. ``"98.73"``).
    """
    doc = nlp(text)
    # Whitespace-only tokens are dropped: sub-word tokenizers typically emit
    # no pieces for them, which would leave `tokens` longer than `labels`.
    # NOTE(review): confirm this holds for the tokenizer loaded above.
    tokens = [token.text for token in doc if not token.is_space]

    if logger is not None:
        logger.info("Starting predictions for sentence: %s", text)

    if not tokens:
        # Nothing to tag; skip the model call entirely.
        return [], [], []

    input_tokens = tokenizer(
        tokens,
        return_tensors="pt",
        is_split_into_words=True,
        return_offsets_mapping=True,
        return_special_tokens_mask=True,
    )

    # Inference only: no gradients are needed.
    with torch.no_grad():
        output = model(input_tokens["input_ids"])

    logits = output.logits[0]
    # One softmax over all positions instead of one per loop iteration.
    probabilities = torch.softmax(logits, dim=-1)
    id2label = model.config.id2label

    labels = []
    scores = []
    i_token = 0
    for off, is_special_token, pred, probs in zip(
        input_tokens["offset_mapping"][0],
        input_tokens["special_tokens_mask"][0],
        logits,
        probabilities,
    ):
        # Skip special tokens and every piece whose start offset is not 0,
        # so each input word is labelled from its first piece only.
        if is_special_token or off[0] > 0:
            continue

        label = id2label[int(pred.argmax(dim=-1))]
        if logger is not None:
            logger.info("%s, %s, %s", off, tokens[i_token], label)
        labels.append(label)
        scores.append("{:.2f}".format(100 * float(probs.max())))
        i_token += 1

    return tokens, labels, scores
def text_analysis(text):
    """Tag a single text and build the values for both output widgets.

    Args:
        text: Text typed into the input box.

    Returns:
        A dict mapping the ``output_highlighted`` and ``output_df``
        components to updates that make them visible and set their values:
        ``(token, label)`` pairs separated by unlabelled spaces for the
        highlighted view, and a token / label / confidence table for the
        dataframe.
    """
    text = expand_contractions(text)
    tokens, labels, scores = predict(text, nlp, logger)

    tagged_table = pd.DataFrame(
        {
            "token": tokens,
            "etiqueta": labels,
            "confiança": scores,
        }
    )

    # An unlabelled space follows every token so the highlighted widget
    # renders the words apart from each other.
    pos_tokens = []
    for token, label in zip(tokens, labels):
        pos_tokens.extend(((token, label), (" ", None)))

    # The visibility change is carried by the returned updates; the former
    # stand-alone `.update(visible=True)` calls discarded their result.
    return {
        output_highlighted: output_highlighted.update(visible=True, value=(pos_tokens)),
        output_df: output_df.update(visible=True, value=tagged_table),
    }
def batch_analysis(input_file):
    """Tag every sentence of an uploaded text file and write a CoNLL-U file.

    Each line of the file is split into sentences with the spaCy pipeline;
    every non-empty sentence is contraction-expanded, tagged with
    ``predict`` and written as one CoNLL-U sentence block.

    Args:
        input_file: Uploaded file object; only its ``.name`` (a path on
            disk) is used. The file is read as UTF-8.

    Returns:
        A dict mapping the ``output_file`` component to an update that makes
        it visible and points it at the generated ``output.conllu``.
    """
    source_path = Path(input_file.name)
    name = source_path.stem
    # read_text closes the handle, unlike a bare open(...).read().
    raw_text = source_path.read_text(encoding="utf-8")

    sents = []
    for line in raw_text.split("\n"):
        for sub_sent in nlp(line).sents:
            stripped = str(sub_sent).strip()
            # Skip sentences that are empty once stripped so no token-less
            # block is written.
            if stripped:
                sents.append(stripped)

    conllu_output = []
    for i, sent in enumerate(sents, start=1):
        sent = expand_contractions(sent)
        conllu_output.append("# sent_id = {}-{}\n".format(name, i))
        conllu_output.append("# text = {}\n".format(sent))
        tokens, labels, _ = predict(sent, nlp, logger)
        for j, (token, label) in enumerate(zip(tokens, labels), start=1):
            # CoNLL-U lines have 10 tab-separated fields: ID, FORM, LEMMA,
            # UPOS, then XPOS, FEATS, HEAD, DEPREL, DEPS and MISC, which are
            # left unspecified ("_"). The previous output had only 9.
            fields = [str(j), token, "_", label] + ["_"] * 6
            conllu_output.append("\t".join(fields) + "\n")
        conllu_output.append("\n")

    # Write into a fresh temporary directory so simultaneous requests do not
    # overwrite each other's result; the file name itself is unchanged.
    # NOTE: the directory is not removed afterwards.
    output_path = Path(tempfile.mkdtemp()) / "output.conllu"
    with open(output_path, "w", encoding="utf-8") as out_f:
        out_f.writelines(conllu_output)

    return {output_file: output_file.update(visible=True, value=str(output_path))}
# Static assets for the page. Path.read_text closes each file after reading
# and the encoding is made explicit instead of relying on the platform default.
css = Path("style.css").read_text(encoding="utf-8")
top_html = Path("top.html").read_text(encoding="utf-8")
bottom_html = Path("bottom.html").read_text(encoding="utf-8")

with gr.Blocks(css=css) as demo:
    gr.HTML(top_html)

    # Tab 1: tag one piece of text typed by the user.
    with gr.Tab("Single sentence"):
        text = gr.Textbox(placeholder="Enter your text here...", label="Input")
        examples = gr.Examples(
            examples=[
                [
                    "A população não poderia ter acesso a relatórios que explicassem, por exemplo, os motivos exatos de atrasos em obras de linhas e estações."
                ],
                [
                    "Filme 'Star Wars : Os Últimos Jedi' ganha trailer definitivo; assista."
                ],
            ],
            inputs=[text],
            label="Select an example",
        )
        # Both outputs start hidden; text_analysis makes them visible.
        output_highlighted = gr.HighlightedText(label="Colorful output", visible=False)
        output_df = gr.Dataframe(label="Tabular output", visible=False)
        submit_btn = gr.Button("Send")
        submit_btn.click(
            fn=text_analysis, inputs=text, outputs=[output_highlighted, output_df]
        )

    # Tab 2: tag an uploaded file and offer the result as a download.
    with gr.Tab("Multiple sentences"):
        gr.HTML(
            """
<p>Upload file with raw sentences in it. Below is an example of what we expect the contents of the file to look like.
Sentences are automatically splitted by Spacy's sentencizer.
To force an explicit division, manually separate the sentences on different lines.</p>
"""
        )
        gr.Markdown(
            """
```
Então ele hesitou, quase como se estivesse surpreso com as próprias palavras, e recitou:
– Vá e não tornes a pecar!
Baley, sorrindo de repente, pegou no cotovelo de R. Daneel e eles saíram juntos pela porta.
```
"""
        )
        input_file = gr.File(label="Upload your input file here...")
        # Hidden until batch_analysis produces a file.
        output_file = gr.File(visible=False)
        submit_btn_batch = gr.Button("Send")
        submit_btn_batch.click(
            fn=batch_analysis, inputs=input_file, outputs=output_file
        )

    gr.HTML(bottom_html)

demo.launch(debug=True)