gpantaz committed
Commit 33f7995 · 1 Parent(s): f34a973

Add application file

LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 Athens NLP Summer School
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,12 @@
+ ---
+ title: Tokenization Playground
+ emoji: 📝
+ colorFrom: indigo
+ colorTo: purple
+ sdk: gradio
+ pinned: false
+ short_description: Compare different tokenizers
+ ---
+
+ # tokenization_playground
+ Link to source code: https://github.com/athnlp/tokenization_playground
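
Local usage is not documented in the README itself, but based on the other files in this commit, the Space can presumably also be run locally by installing the packages from requirements.txt and running `python app.py`, which serves the Gradio interface defined in playground_app.py.
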
app.py ADDED
@@ -0,0 +1,24 @@
+ import os
+
+ import gradio as gr
+ from huggingface_hub import login
+
+ from playground_app import demo as playground_tab
+
+ auth_token = os.environ.get("HF_TOKEN", None)
+ if auth_token:
+     login(token=auth_token)
+
+
+ title = """
+ <div align="center">
+     <span>Tokenization Playground</span>
+ </div>
+ """
+
+ with gr.Blocks() as demo:
+     gr.HTML(f"<h1 style='text-align: center; margin-bottom: 1rem'>{title}</h1>")
+     playground_tab.render()
+
+ if __name__ == "__main__":
+     demo.launch(share=True)
playground_app.py ADDED
@@ -0,0 +1,190 @@
+ import json
+ from functools import lru_cache
+ from typing import Any
+
+ import gradio as gr
+ import pandas as pd
+ from loguru import logger
+
+ from playground_examples import (
+     default_tokenizer_name_1,
+     default_tokenizer_name_2,
+     default_user_input,
+     examples,
+ )
+ from playground_tokenizers import TokenizerFactory
+
+
+ @lru_cache
+ def run_tokenization(
+     text: str, tokenizer_name: str, color_num: int = 5, add_special_token: bool = False
+ ) -> tuple[list[tuple[str, str]], int, pd.DataFrame]:
+     """Tokenize an input text and return the tokens with their positions."""
+     logger.info(
+         "param="
+         + json.dumps(
+             {"text": text, "tokenizer_type": tokenizer_name}, ensure_ascii=False
+         )
+     )
+     pos_tokens = []
+     tokenizer = TokenizerFactory().get_tokenizer(tokenizer_name)
+     # Note: add_special_token is accepted but not forwarded to the tokenizer.
+     encoding = tokenizer.encode(text)
+     table = []
+
+     for idx, token_id in enumerate(encoding):
+         decoded_text = tokenizer.decode([token_id])
+         # Replace spaces with ⋅ so token boundaries stay visible in the highlighted view.
+         decoded_text = decoded_text.replace(" ", "⋅")
+         pos_tokens.append((decoded_text, str(idx % color_num)))
+
+         try:
+             token = tokenizer.decode([token_id])
+         except Exception:
+             # Fall back to a reverse vocabulary lookup if decoding a single id fails.
+             token = {v: k for k, v in tokenizer.get_vocab().items()}[token_id]
+
+         if isinstance(token, bytes):
+             try:
+                 token_str = token.decode("utf-8")
+             except UnicodeDecodeError:
+                 token_str = token.decode("utf-8", errors="ignore")
+                 logger.error(
+                     f"{idx}: decode_error: {tokenizer_name}, {token} {token_str}"
+                 )
+         elif isinstance(token, str):
+             token_str = token
+         else:
+             logger.error(
+                 f"{idx}: wrong type for token {token_id} {type(token)} "
+                 + json.dumps(
+                     {"text": text, "tokenizer_type": tokenizer_name}, ensure_ascii=False
+                 )
+             )
+             token_str = token
+
+         table.append({"TokenID": token_id, "Text": decoded_text})
+
+     table_df = pd.DataFrame(table)
+     logger.info(f"tokenizer_type={tokenizer_name}, Tokens={table[:4]}")
+     return pos_tokens, len(encoding), table_df
+
+
+ def tokenize(
+     text: str, tokenizer_name: str, color_num: int = 5
+ ) -> tuple[dict[Any, Any], pd.DataFrame]:
+     """Tokenize an input text and update one highlighted-text/table pair."""
+     pos_tokens, num_tokens, table_df = run_tokenization(text, tokenizer_name, color_num)
+     return gr.update(value=pos_tokens, label=f"Tokens: {num_tokens}"), table_df
+
+
+ def tokenize_pair(
+     text: str, tokenizer_name_1: str, tokenizer_name_2: str, color_num: int = 5
+ ):
+     """Tokenize the same input with both selected tokenizers (user_input.change handler)."""
+     pos_tokens_1, table_df_1 = tokenize(
+         text=text, tokenizer_name=tokenizer_name_1, color_num=color_num
+     )
+     pos_tokens_2, table_df_2 = tokenize(
+         text=text, tokenizer_name=tokenizer_name_2, color_num=color_num
+     )
+     return pos_tokens_1, table_df_1, pos_tokens_2, table_df_2
+
+
+ def on_load(url_params: str, request: gr.Request | None = None) -> tuple[str, str, str]:
+     """Populate the default text and tokenizer selections on page load."""
+     text = default_user_input
+     tokenizer_type_1 = default_tokenizer_name_1
+     tokenizer_type_2 = default_tokenizer_name_2
+     return text, tokenizer_type_1, tokenizer_type_2
+
+
+ get_window_url_params = """
+ function(url_params) {
+     const params = new URLSearchParams(window.location.search);
+     url_params = JSON.stringify(Object.fromEntries(params));
+     return url_params;
+ }
+ """
+
+ all_tokenizer_name = [
+     (config.name_display, config.name_or_path)
+     for config in TokenizerFactory().all_tokenizer_configs
+ ]
+
+ with gr.Blocks() as demo:
+     with gr.Row():
+         gr.Markdown("## Input Text")
+         dropdown_examples = gr.Dropdown(
+             sorted(examples.keys()),
+             value="Examples",
+             type="index",
+             allow_custom_value=True,
+             show_label=False,
+             container=False,
+             scale=0,
+             elem_classes="example-style",
+         )
+     user_input = gr.Textbox(
+         label="Input Text",
+         lines=5,
+         show_label=False,
+     )
+
+     with gr.Row():
+         with gr.Column(scale=6), gr.Group():
+             tokenizer_name_1 = gr.Dropdown(all_tokenizer_name, label="Tokenizer 1")
+
+         with gr.Column(scale=6), gr.Group():
+             tokenizer_name_2 = gr.Dropdown(all_tokenizer_name, label="Tokenizer 2")
+
+     with gr.Row():
+         with gr.Column():
+             output_text_1 = gr.HighlightedText(
+                 show_legend=False, show_inline_category=False
+             )
+         with gr.Column():
+             output_text_2 = gr.HighlightedText(
+                 show_legend=False, show_inline_category=False
+             )
+
+     with gr.Row():
+         output_table_1 = gr.Dataframe()
+         output_table_2 = gr.Dataframe()
+
+     tokenizer_name_1.change(
+         tokenize, [user_input, tokenizer_name_1], [output_text_1, output_table_1]
+     )
+
+     tokenizer_name_2.change(
+         tokenize, [user_input, tokenizer_name_2], [output_text_2, output_table_2]
+     )
+
+     user_input.change(
+         tokenize_pair,
+         [user_input, tokenizer_name_1, tokenizer_name_2],
+         [output_text_1, output_table_1, output_text_2, output_table_2],
+         show_api=False,
+     )
+
+     dropdown_examples.change(
+         lambda example_idx: (
+             examples[sorted(examples.keys())[example_idx]]["text"],
+             examples[sorted(examples.keys())[example_idx]]["tokenizer_1"],
+             examples[sorted(examples.keys())[example_idx]]["tokenizer_2"],
+         ),
+         dropdown_examples,
+         [user_input, tokenizer_name_1, tokenizer_name_2],
+         show_api=False,
+     )
+
+     demo.load(
+         fn=on_load,
+         inputs=[user_input],
+         outputs=[user_input, tokenizer_name_1, tokenizer_name_2],
+         js=get_window_url_params,
+         show_api=False,
+     )
+
+ if __name__ == "__main__":
+     demo.launch(share=True)
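
A minimal sketch of calling the helper above outside of the Gradio UI (a usage illustration, not part of the commit; the tokenizer name is one of those registered in playground_tokenizers.py):

    from playground_app import run_tokenization

    # Tokenize a sample string with the GPT-4o tokenizer and inspect the results.
    pos_tokens, num_tokens, table_df = run_tokenization(
        "Tokenization is fun!", "openai/gpt-4o"
    )
    print(num_tokens)       # total number of tokens
    print(table_df.head())  # TokenID / Text table, as rendered in the UI
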
playground_examples.py ADDED
@@ -0,0 +1,42 @@
+ default_user_input = """Replace this text in the input field to see how tokenization works."""
+ default_tokenizer_name_1 = "openai/gpt-4o"
+ default_tokenizer_name_2 = "Qwen/Qwen2.5-72B"
+
+
+ number_example = """127+677=804
+ 127 + 677 = 804\n
+ 1275+6773 = 8041
+ 1275 + 6773 = 8048"""
+
+ code_example = """for i in range(1, 101):
+     if i % 3 == 0 and i % 5 == 0:
+         print("FizzBuzz")
+     elif i % 3 == 0:
+         print("Fizz")
+     elif i % 5 == 0:
+         print("Buzz")
+     else:
+         print(i)
+ """
+
+ spelling_example = """How do you spell "accommodate"?
+ How many letters are in the word "accommodate"?
+ How many r's are in the word strawberry?"""
+
+ examples = {
+     "number": {
+         "text": number_example,
+         "tokenizer_1": default_tokenizer_name_1,
+         "tokenizer_2": default_tokenizer_name_2,
+     },
+     "code": {
+         "text": code_example,
+         "tokenizer_1": default_tokenizer_name_1,
+         "tokenizer_2": default_tokenizer_name_2,
+     },
+     "spelling": {
+         "text": spelling_example,
+         "tokenizer_1": default_tokenizer_name_1,
+         "tokenizer_2": default_tokenizer_name_2,
+     },
+ }
playground_tokenizers.py ADDED
@@ -0,0 +1,175 @@
+ from dataclasses import dataclass, field
+ from enum import Enum, auto
+ from typing import Any
+
+ import tiktoken
+ from loguru import logger
+ from transformers import AutoTokenizer
+
+
+ class TokenizerImpl(Enum):
+     BertTokenizer = "wordpiece.BertTokenizer"
+     ByteLevelBPETokenizer = "byte_level_bpe"
+     SentencePieceBPETokenizer = "sentencepiece_bpe"
+
+     SentencePiece = auto()
+     byte_level_bpe = auto()
+
+     TikToken = auto()
+
+
+ @dataclass
+ class TokenizerConfig:
+     """Tokenizer configuration."""
+
+     name_or_path: str
+     name_display: str | None = None
+     impl: TokenizerImpl | None = None
+     org: str | None = None
+     link: str | None = None
+     desc: str | None = None
+     meta: str | None = None
+     level: str | None = None
+     lang: str | None = None
+     init_kwargs: dict[str, Any] = field(default_factory=dict)
+
+     def __post_init__(self):
+         if self.link is None:
+             self.link = "https://huggingface.co/" + self.name_or_path
+         if self.name_display is None:
+             self.name_display = self.name_or_path
+
+     @classmethod
+     def init_from_json_file(cls, json_filepath: str) -> "TokenizerConfig":
+         raise NotImplementedError
+
+     def __eq__(self, other):
+         if isinstance(other, self.__class__):
+             return self.__dict__ == other.__dict__
+         return False
+
+     def __hash__(self):
+         return hash(self.name_or_path)
+
+
+ tokenizer_configs = [
+     TokenizerConfig(
+         "google-bert/bert-base-uncased",
+         impl=TokenizerImpl.BertTokenizer,
+         org="Google",
+         desc="First adds whitespace around any CJK character, then performs WordPiece tokenization.",
+     ),
+     TokenizerConfig(
+         "google-bert/bert-base-multilingual-uncased",
+         impl=TokenizerImpl.BertTokenizer,
+         org="Google",
+     ),
+     TokenizerConfig(
+         "openai-community/gpt2", impl=TokenizerImpl.SentencePiece, org="OpenAI"
+     ),
+     TokenizerConfig(
+         "EleutherAI/gpt-neox-20b", impl=TokenizerImpl.SentencePiece, org="EleutherAI"
+     ),
+     TokenizerConfig(
+         "Qwen/Qwen1.5-14B", impl=TokenizerImpl.SentencePiece, org="Alibaba"
+     ),
+     TokenizerConfig(
+         "Qwen/Qwen2.5-72B", impl=TokenizerImpl.SentencePiece, org="Alibaba"
+     ),
+     TokenizerConfig(
+         "google-t5/t5-large",
+         name_display="google-t5/t5",
+         impl=TokenizerImpl.SentencePiece,
+         org="Google",
+     ),
+     TokenizerConfig("CohereForAI/aya-101", org="Cohere For AI"),
+     TokenizerConfig(
+         "meta-llama/Llama-3.2-3B-Instruct", impl=TokenizerImpl.SentencePiece, org="Meta"
+     ),
+     TokenizerConfig(
+         "openai/gpt-4o",
+         impl=TokenizerImpl.TikToken,
+         org="OpenAI",
+         link="https://github.com/openai/tiktoken",
+     ),
+     TokenizerConfig("google/mt5-large", org="Google"),
+     TokenizerConfig("deepseek-ai/deepseek-coder-33b-instruct", org="DeepSeek"),
+     TokenizerConfig("deepseek-ai/DeepSeek-V3", org="DeepSeek"),
+ ]
+
+ # Display names, repo paths, and repo basenames must all be unique so that any of
+ # them can be used to look up a configuration.
+ assert len({config.name_display for config in tokenizer_configs}) == len(
+     tokenizer_configs
+ )
+ assert len({config.name_or_path for config in tokenizer_configs}) == len(
+     tokenizer_configs
+ )
+ assert len(
+     {config.name_or_path.split("/")[-1] for config in tokenizer_configs}
+ ) == len(tokenizer_configs)
+
+
+ class TokenizerFactory:
+     def __init__(self):
+         self.all_tokenizer_configs = sorted(
+             tokenizer_configs, key=lambda k: k.name_display
+         )
+         self.all_tokenizer_names = [
+             config.name_or_path for config in self.all_tokenizer_configs
+         ]
+         # Allow lookups by full repo path, display name, or repo basename.
+         self.name_to_config_list = [
+             {config.name_or_path: config for config in self.all_tokenizer_configs},
+             {config.name_display: config for config in self.all_tokenizer_configs},
+             {
+                 config.name_display.split("/")[-1]: config
+                 for config in self.all_tokenizer_configs
+             },
+         ]
+         self.tokenizer_cache = {}
+
+     def get_tokenizer_config(self, tokenizer_name: str) -> TokenizerConfig | None:
+         for name_to_config in self.name_to_config_list:
+             if tokenizer_name in name_to_config:
+                 return name_to_config[tokenizer_name]
+         return None
+
+     def get_tokenizer(self, tokenizer_name: str) -> AutoTokenizer:
+         """Get the tokenizer by its name, loading it if not already cached."""
+         tokenizer_config = self.get_tokenizer_config(tokenizer_name)
+
+         if tokenizer_config in self.tokenizer_cache:
+             return self.tokenizer_cache[tokenizer_config]
+
+         tokenizer = self.load_tokenizer(tokenizer_config)
+
+         self.tokenizer_cache[tokenizer_config] = tokenizer
+         return tokenizer
+
+     def get_name_with_hyperlink(self, tokenizer_name: str) -> str:
+         def model_hyperlink(link, model_name):
+             return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+
+         tokenizer_config = self.get_tokenizer_config(tokenizer_name)
+         return model_hyperlink(
+             tokenizer_config.link, tokenizer_config.name_display.split("/")[-1]
+         )
+
+     def load_tokenizer(self, tokenizer_config):
+         if tokenizer_config is None:
+             raise ValueError("Unknown tokenizer: no matching TokenizerConfig was found.")
+         logger.info(f"loading tokenizer {tokenizer_config.name_or_path}")
+         if (
+             tokenizer_config.impl == TokenizerImpl.TikToken
+             and "openai" in tokenizer_config.name_or_path
+         ):
+             # OpenAI models are loaded through tiktoken rather than transformers.
+             tokenizer = tiktoken.encoding_for_model(
+                 tokenizer_config.name_or_path.replace("openai/", "")
+             )
+         else:
+             tokenizer = AutoTokenizer.from_pretrained(
+                 tokenizer_config.name_or_path,
+                 trust_remote_code=True,
+                 **tokenizer_config.init_kwargs,
+             )
+         return tokenizer
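
A minimal sketch of using TokenizerFactory directly to compare token counts for the two default tokenizers (a usage illustration, not part of the commit; both model names are registered in tokenizer_configs above, and the Hugging Face ones may require authentication to download):

    from playground_tokenizers import TokenizerFactory

    factory = TokenizerFactory()
    text = "Compare how two tokenizers split the same sentence."

    for name in ["openai/gpt-4o", "Qwen/Qwen2.5-72B"]:
        tokenizer = factory.get_tokenizer(name)  # cached after the first load
        ids = tokenizer.encode(text)             # list of token ids for both backends
        print(f"{name}: {len(ids)} tokens")
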
requirements.txt ADDED
@@ -0,0 +1,13 @@
+ gradio>=4.38.1
+ transformers>4.40.0
+ sentencepiece
+ tiktoken
+ icetk
+ torch
+ nltk
+ boto3
+ protobuf==4.25.3
+ ai2-olmo
+ ipadic
+ fugashi
+ datasets
utils/__pycache__/i18n_util.cpython-311.pyc ADDED
Binary file (1.61 kB).
 
utils/__pycache__/lang_util.cpython-311.pyc ADDED
Binary file (3.24 kB).
 
utils/__pycache__/log_util.cpython-311.pyc ADDED
Binary file (633 Bytes).
 
utils/__pycache__/text_util.cpython-311.pyc ADDED
Binary file (2.21 kB).