Spaces:
Running
Running
gpantaz
committed on
Commit
·
33f7995
1
Parent(s):
f34a973
Add application file
Browse files- LICENSE +21 -0
- README.md +12 -0
- app.py +24 -0
- playground_app.py +190 -0
- playground_examples.py +42 -0
- playground_tokenizers.py +175 -0
- requirements.txt +13 -0
- utils/__pycache__/i18n_util.cpython-311.pyc +0 -0
- utils/__pycache__/lang_util.cpython-311.pyc +0 -0
- utils/__pycache__/log_util.cpython-311.pyc +0 -0
- utils/__pycache__/text_util.cpython-311.pyc +0 -0
LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2025 Athens NLP Summer School
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
README.md
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Tokenization Playground
|
3 |
+
emoji: 📝
|
4 |
+
colorFrom: indigo
|
5 |
+
colorTo: purple
|
6 |
+
sdk: gradio
|
7 |
+
pinned: false
|
8 |
+
short_description: Compare different tokenizers
|
9 |
+
---
|
10 |
+
|
11 |
+
# tokenization_playground
|
12 |
+
Link to source code: https://github.com/athnlp/tokenization_playground
|
app.py
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os

import gradio as gr
from huggingface_hub import login

from playground_app import demo as playground_tab

# Log in to the Hugging Face Hub when a token is provided via the
# HF_TOKEN environment variable (needed for gated tokenizers).
auth_token = os.environ.get("HF_TOKEN", None)
if auth_token:
    login(token=auth_token)


# Page header markup rendered at the top of the app.
title = """
<div align="center">
<span>Tokenization Playground</span>
</div>
"""

with gr.Blocks() as demo:
    gr.HTML(f"<h1 style='text-align: center; margin-bottom: 1rem'>{title}</h1>")
    playground_tab.render()

if __name__ == "__main__":
    demo.launch(share=True)
playground_app.py
ADDED
@@ -0,0 +1,190 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
from functools import lru_cache
|
3 |
+
from typing import Any
|
4 |
+
|
5 |
+
import gradio as gr
|
6 |
+
import pandas as pd
|
7 |
+
from loguru import logger
|
8 |
+
|
9 |
+
from playground_examples import (
|
10 |
+
default_tokenizer_name_1,
|
11 |
+
default_tokenizer_name_2,
|
12 |
+
default_user_input,
|
13 |
+
examples,
|
14 |
+
)
|
15 |
+
from playground_tokenizers import TokenizerFactory
|
16 |
+
|
17 |
+
|
18 |
+
@lru_cache
def run_tokenization(
    text: str, tokenizer_name: str, color_num: int = 5, add_special_token: bool = False
) -> tuple[list[tuple[str, str]], int, pd.DataFrame]:
    """Tokenize ``text`` and return highlight pairs, token count, and a table.

    Args:
        text: Input string to tokenize.
        tokenizer_name: Name understood by ``TokenizerFactory.get_tokenizer``.
        color_num: Number of alternating highlight color classes.
        add_special_token: Whether to request special tokens from the
            tokenizer. Only honored by tokenizers whose ``encode`` accepts
            ``add_special_tokens`` (HF tokenizers); tiktoken encoders do not.

    Returns:
        ``(pos_tokens, num_tokens, table_df)`` where ``pos_tokens`` is a list
        of ``(token_text, color_class)`` pairs for ``gr.HighlightedText`` and
        ``table_df`` has ``TokenID``/``Text`` columns.
    """
    logger.info(
        "param="
        + json.dumps(
            {"text": text, "tokenizer_type": tokenizer_name}, ensure_ascii=False
        )
    )
    pos_tokens = []
    tokenizer = TokenizerFactory().get_tokenizer(tokenizer_name)
    # BUG FIX: the original ternary called plain ``encode(text)`` in BOTH
    # branches, so ``add_special_token`` was silently ignored.  Pass the flag
    # through when supported; tiktoken's ``encode`` rejects the keyword, so
    # fall back to a plain call there.
    try:
        encoding = tokenizer.encode(text, add_special_tokens=add_special_token)
    except TypeError:
        encoding = tokenizer.encode(text)
    table = []

    for idx, token_id in enumerate(encoding):
        decoded_text = tokenizer.decode([token_id])
        # Replace spaces with "⋅" so whitespace tokens stay visible.
        decoded_text = decoded_text.replace(" ", "⋅")
        pos_tokens.append((decoded_text, str(idx % color_num)))

        try:
            token = tokenizer.decode([token_id])[0]
        except Exception:  # was a bare ``except:``; narrowed, behavior kept
            # Fall back to a reverse-vocabulary lookup for the raw token.
            token = {v: k for k, v in tokenizer.get_vocab().items()}[token_id]

        if isinstance(token, bytes):
            try:
                token_str = token.decode("utf-8")
            except UnicodeDecodeError:  # was a bare ``except:``
                token_str = token.decode("utf-8", errors="ignore")
                logger.error(
                    f"{idx}: decode_error: {tokenizer_name}, {token} {token_str}"
                )
        elif isinstance(token, str):
            token_str = token
        else:
            logger.error(
                f"{idx}: wrong type for token {token_id} {type(token)} "
                + json.dumps(
                    {"text": text, "tokenizer_type": tokenizer_name}, ensure_ascii=False
                )
            )
            token_str = token

        table.append({"TokenID": token_id, "Text": decoded_text})

    table_df = pd.DataFrame(table)
    logger.info(f"tokenizer_type={tokenizer_name}, Tokens={table[:4]}")
    return pos_tokens, len(encoding), table_df
def tokenize(
    text: str, tokenizer_name: str, color_num: int = 5
) -> tuple[dict[Any, Any], pd.DataFrame]:
    """Tokenize ``text`` and package the result for the Gradio outputs."""
    highlighted, count, frame = run_tokenization(text, tokenizer_name, color_num)
    label_update = gr.update(value=highlighted, label=f"Tokens: {count}")
    return label_update, frame
def tokenize_pair(
    text: str, tokenizer_name_1: str, tokenizer_name_2: str, color_num: int = 5
):
    """Run ``tokenize`` for both selected tokenizers (input_text.change handler)."""
    left = tokenize(text=text, tokenizer_name=tokenizer_name_1, color_num=color_num)
    right = tokenize(text=text, tokenizer_name=tokenizer_name_2, color_num=color_num)
    return (*left, *right)
def on_load(url_params: str, request: gr.Request | None = None) -> tuple[str, str, str]:
    """Page-load hook: return the default text and tokenizer selections.

    ``url_params`` and ``request`` are accepted to satisfy Gradio's calling
    convention but are currently unused.
    """
    return (
        default_user_input,
        default_tokenizer_name_1,
        default_tokenizer_name_2,
    )
# JS hook executed on page load: serializes the window's query string
# into a JSON object handed to the Python ``on_load`` callback.
get_window_url_params = """
function(url_params) {
    const params = new URLSearchParams(window.location.search);
    url_params = JSON.stringify(Object.fromEntries(params));
    return url_params;
}
"""

# (display name, name_or_path) choice pairs for the tokenizer dropdowns.
all_tokenizer_name = [
    (config.name_display, config.name_or_path)
    for config in TokenizerFactory().all_tokenizer_configs
]


def _apply_example(example_idx: int) -> tuple[str, str, str]:
    """Resolve a dropdown index into (text, tokenizer_1, tokenizer_2)."""
    example = examples[sorted(examples.keys())[example_idx]]
    return example["text"], example["tokenizer_1"], example["tokenizer_2"]


with gr.Blocks() as demo:
    with gr.Row():
        gr.Markdown("## Input Text")
        dropdown_examples = gr.Dropdown(
            sorted(examples.keys()),
            value="Examples",
            type="index",
            allow_custom_value=True,
            show_label=False,
            container=False,
            scale=0,
            elem_classes="example-style",
        )
    user_input = gr.Textbox(
        label="Input Text",
        lines=5,
        show_label=False,
    )

    with gr.Row():
        with gr.Column(scale=6), gr.Group():
            tokenizer_name_1 = gr.Dropdown(all_tokenizer_name, label="Tokenizer 1")

        with gr.Column(scale=6), gr.Group():
            tokenizer_name_2 = gr.Dropdown(all_tokenizer_name, label="Tokenizer 2")

    with gr.Row():
        with gr.Column():
            output_text_1 = gr.Highlightedtext(
                show_legend=False, show_inline_category=False
            )
        with gr.Column():
            output_text_2 = gr.Highlightedtext(
                show_legend=False, show_inline_category=False
            )

    with gr.Row():
        output_table_1 = gr.Dataframe()
        output_table_2 = gr.Dataframe()

    # Re-tokenize a side whenever its tokenizer selection changes.
    tokenizer_name_1.change(
        tokenize, [user_input, tokenizer_name_1], [output_text_1, output_table_1]
    )

    tokenizer_name_2.change(
        tokenize, [user_input, tokenizer_name_2], [output_text_2, output_table_2]
    )

    # Re-tokenize both sides whenever the input text changes.
    user_input.change(
        tokenize_pair,
        [user_input, tokenizer_name_1, tokenizer_name_2],
        [output_text_1, output_table_1, output_text_2, output_table_2],
        show_api=False,
    )

    dropdown_examples.change(
        _apply_example,
        dropdown_examples,
        [user_input, tokenizer_name_1, tokenizer_name_2],
        show_api=False,
    )

    demo.load(
        fn=on_load,
        inputs=[user_input],
        outputs=[user_input, tokenizer_name_1, tokenizer_name_2],
        js=get_window_url_params,
        show_api=False,
    )

if __name__ == "__main__":
    demo.launch(share=True)
playground_examples.py
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Defaults shown when the playground first loads.
default_user_input = """Replace this text in the input field to see how tokenization works."""
default_tokenizer_name_1 = "openai/gpt-4o"
default_tokenizer_name_2 = "Qwen/Qwen2.5-72B"


# Arithmetic strings: illustrates how digit grouping differs between tokenizers.
number_example = """127+677=804
127 + 677 = 804\n
1275+6773 = 8041
1275 + 6773 = 8048"""

# A FizzBuzz snippet: illustrates tokenization of code, indentation, punctuation.
code_example = """for i in range(1, 101):
    if i % 3 == 0 and i % 5 == 0:
        print("FizzBuzz")
    elif i % 3 == 0:
        print("Fizz")
    elif i % 5 == 0:
        print("Buzz")
    else:
        print(i)
"""

# Spelling questions: illustrates that tokens, not letters, are the model's unit.
spelling_example = """How do you spell "accommodate"?
How many letters are in the word "accommodate"?
How many r's are in the word strawberry?"""

# One entry per dropdown example; each pairs a text with the two default tokenizers.
examples = {
    name: {
        "text": text,
        "tokenizer_1": default_tokenizer_name_1,
        "tokenizer_2": default_tokenizer_name_2,
    }
    for name, text in (
        ("number", number_example),
        ("code", code_example),
        ("spelling", spelling_example),
    )
}
playground_tokenizers.py
ADDED
@@ -0,0 +1,175 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dataclasses import dataclass, field
|
2 |
+
from enum import Enum, auto
|
3 |
+
from typing import Any
|
4 |
+
|
5 |
+
import tiktoken
|
6 |
+
from loguru import logger
|
7 |
+
from transformers import AutoTokenizer
|
8 |
+
|
9 |
+
|
10 |
+
class TokenizerImpl(Enum):
    """Known tokenizer implementation families.

    The original class mixed explicit string values with ``auto()``;
    calling ``auto()`` after non-integer values relies on deprecated enum
    behavior, so the integers that ``auto()`` produced (1, 2, 3) are now
    spelled out explicitly.  Member identities and values are unchanged.
    """

    BertTokenizer = "wordpiece.BertTokenizer"
    ByteLevelBPETokenizer = "byte_level_bpe"
    SentencePieceBPETokenizer = "sentencepiece_bpe"

    SentencePiece = 1
    byte_level_bpe = 2

    TikToken = 3
21 |
+
@dataclass
class TokenizerConfig:
    """Static metadata describing one tokenizer offered in the playground."""

    name_or_path: str  # HF hub id (or "openai/..." for tiktoken models)
    name_display: str | None = None  # dropdown label; defaults to name_or_path
    impl: TokenizerImpl | None = None  # implementation family, if known
    org: str | None = None  # organization that released the tokenizer
    link: str | None = None  # info URL; defaults to the HF model page
    desc: str | None = None  # free-form description shown to users
    meta: str | None = None
    level: str | None = None
    lang: str | None = None
    init_kwargs: dict[str, Any] = field(default_factory=dict)  # extra AutoTokenizer kwargs

    def __post_init__(self):
        # Fill derived defaults that depend on name_or_path.
        if self.link is None:
            self.link = "https://huggingface.co/" + self.name_or_path
        if self.name_display is None:
            self.name_display = self.name_or_path

    @classmethod
    def init_from_json_file(cls, json_filepath: str) -> "TokenizerConfig":
        """Load a config from a JSON file — not implemented yet.

        Fix: the original stub was ``pass`` and silently returned ``None``
        despite its return annotation; fail loudly instead.

        Raises:
            NotImplementedError: always, until this loader is written.
        """
        raise NotImplementedError(
            "loading TokenizerConfig from JSON is not implemented"
        )

    def __eq__(self, other):
        """Field-wise equality against other TokenizerConfig instances only."""
        if isinstance(other, self.__class__):
            return self.__dict__ == other.__dict__
        else:
            return False

    def __hash__(self):
        # Hash only the identifier so instances can key TokenizerFactory's
        # tokenizer cache (equal configs share a name_or_path, so this is
        # consistent with __eq__).
        return hash(self.name_or_path)
+
# Registry of tokenizers selectable in the UI.  Display names, hub ids, and
# their basenames must each be unique, because TokenizerFactory builds one
# lookup table keyed on each of the three spellings.
tokenizer_configs = [
    TokenizerConfig(
        "google-bert/bert-base-uncased",
        impl=TokenizerImpl.BertTokenizer,
        org="Google",
        desc="first add whitespace around any CJK character, then perform wordpiece tokenization.",
    ),
    TokenizerConfig(
        "google-bert/bert-base-multilingual-uncased",
        impl=TokenizerImpl.BertTokenizer,
        org="Google",
    ),
    TokenizerConfig(
        "openai-community/gpt2", impl=TokenizerImpl.SentencePiece, org="OpenAI"
    ),
    TokenizerConfig(
        "EleutherAI/gpt-neox-20b", impl=TokenizerImpl.SentencePiece, org="EleutherAI"
    ),
    TokenizerConfig(
        "Qwen/Qwen1.5-14B", impl=TokenizerImpl.SentencePiece, org="Alibaba"
    ),
    TokenizerConfig(
        "Qwen/Qwen2.5-72B", impl=TokenizerImpl.SentencePiece, org="Alibaba"
    ),
    TokenizerConfig(
        "google-t5/t5-large",
        name_display="google-t5/t5",
        impl=TokenizerImpl.SentencePiece,
        org="Google",
    ),
    TokenizerConfig("CohereForAI/aya-101", org="Cohere For AI"),
    TokenizerConfig(
        "meta-llama/Llama-3.2-3B-Instruct", impl=TokenizerImpl.SentencePiece, org="Meta"
    ),
    TokenizerConfig(
        "openai/gpt-4o",
        impl=TokenizerImpl.TikToken,
        org="OpenAI",
        link="https://github.com/openai/tiktoken",
    ),
    TokenizerConfig("google/mt5-large", org="Google"),
    TokenizerConfig("deepseek-ai/deepseek-coder-33b-instruct", org="DeepSeek"),
    TokenizerConfig("deepseek-ai/DeepSeek-V3", org="DeepSeek"),
]


def _validate_tokenizer_configs() -> None:
    """Ensure display names, hub ids, and basenames are all unique.

    Fix: the original used module-level ``assert`` statements, which are
    silently stripped when Python runs with ``-O``; raise explicitly instead.
    """
    for label, key_fn in (
        ("name_display", lambda c: c.name_display),
        ("name_or_path", lambda c: c.name_or_path),
        ("basename", lambda c: c.name_or_path.split("/")[-1]),
    ):
        keys = [key_fn(config) for config in tokenizer_configs]
        if len(set(keys)) != len(keys):
            raise ValueError(f"duplicate tokenizer {label} values: {keys}")


_validate_tokenizer_configs()
+
|
112 |
+
class TokenizerFactory:
    """Resolves tokenizer names to loaded tokenizer instances, with caching."""

    def __init__(self):
        # Dropdown ordering: sort configs by their display name.
        self.all_tokenizer_configs = sorted(
            tokenizer_configs, key=lambda k: k.name_display
        )
        self.all_tokenizer_names = [
            config.name_or_path for config in self.all_tokenizer_configs
        ]
        # A name may arrive as hub id, display name, or bare basename;
        # keep one lookup table per spelling, probed in that order.
        self.name_to_config_list = [
            {config.name_or_path: config for config in self.all_tokenizer_configs},
            {config.name_display: config for config in self.all_tokenizer_configs},
            {
                config.name_display.split("/")[-1]: config
                for config in self.all_tokenizer_configs
            },
        ]
        # Loaded tokenizers keyed by TokenizerConfig (hashable via name_or_path).
        self.tokenizer_cache = {}

    def get_tokenizer_config(self, tokenizer_name: str) -> TokenizerConfig | None:
        """Return the config matching ``tokenizer_name`` under any known spelling."""
        for name_to_config in self.name_to_config_list:
            if tokenizer_name in name_to_config:
                return name_to_config[tokenizer_name]
        return None

    def get_tokenizer(self, tokenizer_name: str) -> AutoTokenizer:
        """Get the tokenizer by its name, loading it if not already cached."""
        tokenizer_config = self.get_tokenizer_config(tokenizer_name)

        if tokenizer_config in self.tokenizer_cache:
            return self.tokenizer_cache[tokenizer_config]

        tokenizer = self.load_tokenizer(tokenizer_config)

        self.tokenizer_cache[tokenizer_config] = tokenizer
        return tokenizer

    def get_name_with_hyperlink(self, tokenizer_name: str) -> str:
        """Return an HTML anchor linking the tokenizer's basename to its page."""

        def model_hyperlink(link, model_name):
            return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'

        tokenizer_config = self.get_tokenizer_config(tokenizer_name)
        return model_hyperlink(
            tokenizer_config.link, tokenizer_config.name_display.split("/")[-1]
        )

    def load_tokenizer(self, tokenizer_config):
        """Instantiate the tokenizer described by ``tokenizer_config``.

        Raises:
            ValueError: if ``tokenizer_config`` is ``None`` (unknown name).
        """
        # Fix: the original had `if tokenizer_config == None: print("dd")` —
        # a leftover debug line that fell through to an AttributeError below.
        # Fail with a clear error instead.
        if tokenizer_config is None:
            raise ValueError("unknown tokenizer: no matching TokenizerConfig found")
        logger.info(f"loading tokenizer {tokenizer_config.name_or_path}")
        if (
            tokenizer_config.impl == TokenizerImpl.TikToken
            and "openai" in tokenizer_config.name_or_path
        ):
            # OpenAI models are not on the HF hub; use tiktoken directly.
            tokenizer = tiktoken.encoding_for_model(
                tokenizer_config.name_or_path.replace("openai/", "")
            )
        else:
            tokenizer = AutoTokenizer.from_pretrained(
                tokenizer_config.name_or_path,
                trust_remote_code=True,
                **tokenizer_config.init_kwargs,
            )
        return tokenizer
requirements.txt
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio>=4.38.1
|
2 |
+
transformers>4.40.0
|
3 |
+
sentencepiece
|
4 |
+
tiktoken
|
5 |
+
icetk
|
6 |
+
torch
|
7 |
+
nltk
|
8 |
+
boto3
|
9 |
+
protobuf==4.25.3
|
10 |
+
ai2-olmo
|
11 |
+
ipadic
|
12 |
+
fugashi
|
13 |
+
datasets
|
utils/__pycache__/i18n_util.cpython-311.pyc
ADDED
Binary file (1.61 kB). View file
|
|
utils/__pycache__/lang_util.cpython-311.pyc
ADDED
Binary file (3.24 kB). View file
|
|
utils/__pycache__/log_util.cpython-311.pyc
ADDED
Binary file (633 Bytes). View file
|
|
utils/__pycache__/text_util.cpython-311.pyc
ADDED
Binary file (2.21 kB). View file
|
|