import json

import gradio as gr
import pandas as pd
from transformers import AutoTokenizer


def process_text(model_name, text, include_special_tokens=False, show_attention_mask=False):
    """
    Tokenize the input text with the selected Hugging Face tokenizer and return
    a token table, summary statistics, and a JSON view of the result.
    """
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    except Exception as e:
        return (
            pd.DataFrame(
                [
                    {
                        "Error": (
                            f"Could not load tokenizer for '{model_name}': {e}. "
                            "Please ensure the model name is correct and accessible "
                            "(e.g., on the Hugging Face Hub or via a local path)."
                        )
                    }
                ]
            ),
            "",
            None,  # gr.JSON cannot parse an empty string, so return None on error
        )
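
    # Tokenize once with numpy tensors so the attention mask can be indexed,
    # and derive the token strings from the ids so tokens and ids stay aligned
    # whether or not special tokens are included.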
    encoding = tokenizer(
        text,
        return_tensors="np",
        padding=True,
        truncation=True,
        add_special_tokens=include_special_tokens,
    )
    token_ids = tokenizer.encode(text, add_special_tokens=include_special_tokens)
    tokens = tokenizer.convert_ids_to_tokens(token_ids)

    token_info = []
    for i, (token, token_id) in enumerate(zip(tokens, token_ids)):
        info = {
            "Token": token,
            "ID": token_id,
        }
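        # The attention mask can be shorter than the id list if the encoding
        # was truncated, so guard the index before reading it.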
        if show_attention_mask and encoding["attention_mask"].shape[1] > i:
            info["Attention Mask"] = encoding["attention_mask"][0][i]
        token_info.append(info)

    df = pd.DataFrame(token_info)

    # Guard against empty input so the ratio does not divide by zero.
    ratio = len(tokens) / len(text) if text else 0.0
    stats = (
        f"Number of tokens: {len(tokens)}\n"
        f"Input text length: {len(text)}\n"
        f"Tokens/character ratio: {ratio:.2f}\n"
        f"Vocabulary size: {tokenizer.vocab_size}"
    )

    json_output = json.dumps(
        {
            "input_ids": token_ids,
            "tokens": tokens,
        },
        indent=2,
        ensure_ascii=False,
    )

    return df, stats, json_output
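

# Example of calling process_text directly (assumes the model can be
# downloaded from the Hugging Face Hub or is already cached locally):
#   df, stats, json_output = process_text("roberta-base", "Hello world!")
#   print(stats)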

model_choices = [
    "roberta-base",
    "klue/roberta-large",
    "distilbert/distilbert-base-uncased",
    "BAAI/bge-m3-retromae",
    "DTAI-KULeuven/robbert-2023-dutch-base",
    "DTAI-KULeuven/robbert-2023-dutch-large",
]
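# Any tokenizer id on the Hugging Face Hub (or a local directory with tokenizer
# files) that AutoTokenizer.from_pretrained can load may be added here.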

iface = gr.Interface(
    fn=process_text,
    inputs=[
        gr.Dropdown(
            choices=model_choices,
            value="roberta-base",
            label="Select Model",
        ),
        gr.Textbox(
            lines=5, placeholder="Enter text to tokenize...", label="Input Text"
        ),
        gr.Checkbox(label="Include Special Tokens", value=False),
        gr.Checkbox(label="Show Attention Mask", value=False),
    ],
    outputs=[
        gr.Dataframe(
            headers=["Token", "ID", "Attention Mask"], label="Tokenization Results"
        ),
        gr.Textbox(label="Statistics", lines=4),
        gr.JSON(label="JSON Output"),
    ],
    title="Hugging Face Tokenizer Playground",
    description=(
        "An interactive demonstration of various Hugging Face tokenizers. "
        "Select a model from the dropdown to see how it tokenizes your input text."
    ),
    theme="default",
)

if __name__ == "__main__":
    iface.launch(share=True)
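# share=True asks Gradio for a temporary public share link in addition to the
# local server; drop it (or pass share=False) to keep the app local-only.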