import gradio as gr
from transformers import AutoTokenizer
import pandas as pd
import json
def process_text(model_name, text, include_special_tokens=False, show_attention_mask=False):
"""
Processes text using a specified Hugging Face tokenizer model.
"""
try:
# Dynamically load the tokenizer based on the selected model name
tokenizer = AutoTokenizer.from_pretrained(model_name)
    except Exception as e:
        return (
            pd.DataFrame(
                [{"Error": f"Could not load tokenizer for '{model_name}': {e}. "
                           "Check that the model name is a valid Hugging Face Hub ID or a local path."}]
            ),
            "",
            None,  # gr.JSON renders None cleanly; an empty string may fail to parse
        )
    # Full encoding (special tokens included); only used for the attention mask below.
    encoding = tokenizer(text, return_tensors="np", padding=True, truncation=True)
    # Encode once and derive the token strings from the IDs so the two lists stay
    # aligned (tokenizer.tokenize() alone omits special tokens, while encode() adds them).
    token_ids = tokenizer.encode(text)
    tokens = tokenizer.convert_ids_to_tokens(token_ids)
    # Strip special tokens on request by re-encoding with add_special_tokens=False.
    # This is more robust than decoding and re-encoding the text, which can alter
    # whitespace or tokenization for some tokenizers.
    if not include_special_tokens:
        token_ids = tokenizer.encode(text, add_special_tokens=False)
        tokens = tokenizer.convert_ids_to_tokens(token_ids)
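    # Illustrative example (roberta-base; values are indicative only, not used by the app):
    #   with special tokens    -> roughly ['<s>', 'Hello', 'Ġworld', '</s>']
    #   without special tokens -> roughly ['Hello', 'Ġworld']
    # The 'Ġ' prefix marks a leading space in RoBERTa's byte-level BPE vocabulary.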
    token_info = []
    # tokens and token_ids are aligned by construction (both derive from token_ids).
    for i, (token, token_id) in enumerate(zip(tokens, token_ids)):
info = {
"Token": token,
"ID": token_id,
}
        # The attention mask comes from the full encoding (special tokens included),
        # so index it defensively in case its length differs from token_ids.
        if show_attention_mask and encoding["attention_mask"].shape[1] > i:
            info["Attention Mask"] = int(encoding["attention_mask"][0][i])
token_info.append(info)
df = pd.DataFrame(token_info)
stats = f"""
Number of tokens: {len(tokens)}
Input text length: {len(text)}
Tokens/character ratio: {len(tokens)/len(text):.2f}
Vocabulary size: {tokenizer.vocab_size}
"""
json_output = json.dumps(
{
"input_ids": token_ids,
"tokens": tokens,
},
indent=2,
        ensure_ascii=False,  # keep non-ASCII characters (e.g. Korean text, Dutch diacritics) unescaped
)
return df, stats, json_output
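
# Minimal sanity check for the tokenization logic outside the Gradio UI.
# This is only a convenience sketch: it assumes the chosen model can be fetched
# from the Hugging Face Hub (or a local cache) and is never called by the app.
def _debug_example(model_name="roberta-base", text="Hello, world!"):
    df, stats, json_output = process_text(
        model_name, text, include_special_tokens=True, show_attention_mask=True
    )
    print(df.to_string(index=False))
    print(stats)
    print(json_output)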
# Models offered in the dropdown (Hugging Face Hub model IDs)
model_choices = [
"roberta-base",
"klue/roberta-large",
"distilbert/distilbert-base-uncased",
"BAAI/bge-m3-retromae",
"DTAI-KULeuven/robbert-2023-dutch-base",
"DTAI-KULeuven/robbert-2023-dutch-large",
]
iface = gr.Interface(
fn=process_text,
inputs=[
gr.Dropdown(
choices=model_choices,
value="roberta-base",
label="Select Model",
),
gr.Textbox(
lines=5, placeholder="Enter text to tokenize...", label="Input Text"
),
gr.Checkbox(label="Include Special Tokens", value=False),
gr.Checkbox(label="Show Attention Mask", value=False),
],
outputs=[
gr.Dataframe(
headers=["Token", "ID", "Attention Mask"], label="Tokenization Results"
),
gr.Textbox(label="Statistics", lines=4),
gr.JSON(label="JSON Output"),
],
title="Hugging Face Tokenizer Playground",
description="""
An interactive demonstration of various Hugging Face tokenizers.
Select a model from the dropdown to see how it tokenizes your input text.
""",
theme="default",
)
if __name__ == "__main__":
iface.launch(share=True)