File size: 1,012 Bytes
2024883
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import functools

import gradio as gr
from transformers import AutoTokenizer

# Checkpoint names offered as choices in the UI; each selected name is
# handed to AutoTokenizer.from_pretrained to load the matching tokenizer.
tokenizers = [
    "bert-base-uncased", "gpt2", "roberta-base",
    "distilbert-base-uncased", "xlnet-base-cased",
]

# Bounded so that arbitrary names supplied by callers cannot grow the cache
# without limit; failed loads raise and are therefore never cached.
@functools.lru_cache(maxsize=16)
def _load_tokenizer(tokenizer_name):
    """Load the tokenizer for *tokenizer_name*, reusing earlier loads."""
    return AutoTokenizer.from_pretrained(tokenizer_name)


def tokenize_text(text: str, tokenizer_name: str) -> str:
    """Tokenize *text* with the named tokenizer and return one string.

    Args:
        text: The string to tokenize.
        tokenizer_name: Identifier passed to
            ``AutoTokenizer.from_pretrained``.

    Returns:
        The tokens produced by ``tokenizer.tokenize(text)``, joined with
        single spaces.
    """
    # Loading is the expensive step; going through the cached loader means
    # repeated requests for the same name reuse the already-built tokenizer
    # instead of calling from_pretrained again on every submission.
    tokenizer = _load_tokenizer(tokenizer_name)
    return " ".join(tokenizer.tokenize(text))

def compare_tokenizers(text, selected_tokenizers):
    """Tokenize *text* once for every name in *selected_tokenizers*.

    Returns a dict that maps each tokenizer name to the string
    ``tokenize_text`` produces for it; an empty selection yields ``{}``.
    """
    return {name: tokenize_text(text, name) for name in selected_tokenizers}

# Build the UI components first, in the order the interface uses them.
text_input = gr.Textbox(label="Enter text to tokenize")
tokenizer_picker = gr.CheckboxGroup(choices=tokenizers, label="Select tokenizers")
results_view = gr.JSON(label="Tokenization Results")

# Wire the components to compare_tokenizers: the textbox and checkbox group
# supply its two arguments, and the dict it returns is shown in the JSON view.
iface = gr.Interface(
    fn=compare_tokenizers,
    inputs=[text_input, tokenizer_picker],
    outputs=results_view,
    title="Tokenizer Comparison",
    description="Compare tokenization results from different tokenizers.",
)

# Start serving the app as soon as this module is executed.
iface.launch()