import json

import gradio as gr
import pandas as pd
from transformers import RobertaTokenizer

# Load the pretrained RoBERTa tokenizer (vocabulary and merges are downloaded on first use).
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

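# Rough sketch of what the two tokenizer calls used below return (the 'Ġ' prefix
# marks a leading space in RoBERTa's byte-level BPE; exact ids depend on the
# roberta-base vocabulary):
#
#   tokenizer.tokenize("Hello world")  -> ['Hello', 'Ġworld']
#   tokenizer.encode("Hello world")    -> [<s> id, id('Hello'), id('Ġworld'), </s> id]
#
# This is why slicing the encoded ids with [1:-1] removes the special tokens.
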
def process_text(text, include_special_tokens=False, show_attention_mask=False):
    """Tokenize the input text and return a token table, summary stats, and JSON."""
    # Full encoding (with special tokens), used below for the attention mask.
    encoding = tokenizer(text, return_tensors="np", padding=True, truncation=True)

    token_ids = tokenizer.encode(text)  # includes the <s> and </s> special tokens
    if include_special_tokens:
        tokens = tokenizer.convert_ids_to_tokens(token_ids)
    else:
        tokens = tokenizer.tokenize(text)  # plain tokens, no special tokens
        token_ids = token_ids[1:-1]  # drop <s> and </s> ids so they line up with tokens

    token_info = []
    for position, (token, token_id) in enumerate(zip(tokens, token_ids)):
        info = {
            "Token": token,
            "ID": token_id,
        }

        if show_attention_mask:
            # When the leading <s> token was stripped above, row 0 of the table
            # corresponds to position 1 of the full encoding.
            mask_position = position if include_special_tokens else position + 1
            info["Attention Mask"] = int(encoding["attention_mask"][0][mask_position])

        token_info.append(info)

    df = pd.DataFrame(token_info)

    char_count = max(len(text), 1)  # avoid division by zero on empty input
    stats = f"""
Number of tokens: {len(tokens)}
Input text length: {len(text)}
Tokens/character ratio: {len(tokens) / char_count:.2f}
Vocabulary size: {tokenizer.vocab_size}
"""

    json_output = json.dumps(
        {
            "input_ids": token_ids,
            "tokens": tokens,
        },
        indent=2,
    )

    return df, stats, json_output

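# A minimal sanity check of process_text outside the UI (the sample sentence and
# the direct call here are illustrative only, not part of the Gradio app):
#
#   df, stats, js = process_text("Transformers are great!", show_attention_mask=True)
#   print(df)      # one row per token with its id (and attention mask)
#   print(stats)   # token count, text length, tokens/char ratio, vocab size
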
iface = gr.Interface(
    fn=process_text,
    inputs=[
        gr.Textbox(
            lines=5, placeholder="Enter text to tokenize...", label="Input Text"
        ),
        gr.Checkbox(label="Include Special Tokens", value=False),
        gr.Checkbox(label="Show Attention Mask", value=False),
    ],
    outputs=[
        gr.Dataframe(
            headers=["Token", "ID", "Attention Mask"], label="Tokenization Results"
        ),
        gr.Textbox(label="Statistics", lines=4),
        gr.JSON(label="JSON Output"),
    ],
    title="RoBERTa Tokenizer Playground",
    description="An interactive demonstration of the RoBERTa tokenizer.",
    theme="default",
)

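# Note: share=True asks Gradio to expose a temporary public share link in addition
# to the local server; set it to False to keep the demo local-only.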
if __name__ == "__main__":
    iface.launch(share=True)