import gradio as gr
from transformers import AutoTokenizer
import pandas as pd
import json

def process_text(model_name, text, include_special_tokens=False, show_attention_mask=False):
    """
    Processes text using a specified Hugging Face tokenizer model.
    """
    try:
        # Dynamically load the tokenizer based on the selected model name
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    except Exception as e:
        return (
            pd.DataFrame([{"Error": f"Could not load tokenizer for '{model_name}': {e}. Please ensure the model name is correct and accessible (e.g., through Hugging Face Hub or a local path)."}]),
            "",
            "",
        )

    # Encode once, honoring the "Include Special Tokens" checkbox, so the
    # token strings, token IDs, and attention mask all stay aligned.
    encoding = tokenizer(text, truncation=True, add_special_tokens=include_special_tokens)
    token_ids = encoding["input_ids"]
    attention_mask = encoding["attention_mask"]

    # Derive the token strings from the encoded IDs. convert_ids_to_tokens
    # guarantees tokens and token_ids have the same length, unlike calling
    # tokenizer.tokenize() and tokenizer.encode() separately (encode adds
    # special tokens by default, tokenize does not).
    tokens = tokenizer.convert_ids_to_tokens(token_ids)

    # Build one table row per token; tokens and token_ids come from the same
    # encoding, so they can be zipped directly.
    token_info = []
    for i, (token, token_id) in enumerate(zip(tokens, token_ids)):
        info = {
            "Token": token,
            "ID": token_id,
        }
        if show_attention_mask and i < len(attention_mask):
            info["Attention Mask"] = attention_mask[i]
        token_info.append(info)

    df = pd.DataFrame(token_info)

    stats = f"""
    Number of tokens: {len(tokens)}
    Input text length: {len(text)}
    Tokens/character ratio: {len(tokens)/len(text):.2f}
    Vocabulary size: {tokenizer.vocab_size}
    """

    json_output = json.dumps(
        {
            "input_ids": token_ids,
            "tokens": tokens,
        },
        indent=2,
        ensure_ascii=False # Ensure non-ASCII characters are not escaped in JSON
    )

    return df, stats, json_output

# Tokenizer models available in the dropdown (Hugging Face Hub model IDs)
model_choices = [
    "roberta-base",
    "klue/roberta-large",
    "distilbert/distilbert-base-uncased",
    "BAAI/bge-m3-retromae",
    "DTAI-KULeuven/robbert-2023-dutch-base",
    "DTAI-KULeuven/robbert-2023-dutch-large",
]

iface = gr.Interface(
    fn=process_text,
    inputs=[
        gr.Dropdown(
            choices=model_choices,
            value="roberta-base",
            label="Select Model",
        ),
        gr.Textbox(
            lines=5, placeholder="Enter text to tokenize...", label="Input Text"
        ),
        gr.Checkbox(label="Include Special Tokens", value=False),
        gr.Checkbox(label="Show Attention Mask", value=False),
    ],
    outputs=[
        gr.Dataframe(
            headers=["Token", "ID", "Attention Mask"], label="Tokenization Results"
        ),
        gr.Textbox(label="Statistics", lines=4),
        gr.JSON(label="JSON Output"),
    ],
    title="Hugging Face Tokenizer Playground",
    description="""
    An interactive demonstration of various Hugging Face tokenizers.
    Select a model from the dropdown to see how it tokenizes your input text.
    """,
    theme="default",
)

if __name__ == "__main__":
    iface.launch(share=True)
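
# A minimal, optional sanity check: calling process_text directly, outside the
# Gradio UI, e.g. from a REPL or a quick test. The sample sentence below is an
# illustrative assumption, not a fixture of this app; "roberta-base" is one of
# the models listed in model_choices above. Uncomment to run.
#
# df, stats, json_output = process_text(
#     "roberta-base",
#     "Hello, tokenizer playground!",
#     include_special_tokens=True,
#     show_attention_mask=True,
# )
# print(df)
# print(stats)
# print(json_output)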