George Pantazopoulos committed
Commit: f34a973 · 1 Parent(s): 2a67d66

chore: cleanup
Files changed:
- .gitattributes +0 -35
- LICENSE +0 -21
- README.md +0 -12
- app.py +0 -25
- character_util.py +0 -178
- playground_app.py +0 -91
- playground_examples.py +0 -42
- playground_util.py +0 -107
- requirements.txt +0 -13
- utils/__pycache__/i18n_util.cpython-311.pyc +0 -0
- utils/__pycache__/lang_util.cpython-311.pyc +0 -0
- utils/__pycache__/log_util.cpython-311.pyc +0 -0
- utils/__pycache__/text_util.cpython-311.pyc +0 -0
- utils/i18n_util.py +0 -26
- utils/lang_util.py +0 -89
- utils/log_util.py +0 -10
- utils/oov_util.py +0 -122
- utils/text_util.py +0 -47
- vocab.py +0 -754
.gitattributes
DELETED
@@ -1,35 +0,0 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

LICENSE
DELETED
@@ -1,21 +0,0 @@
-MIT License
-
-Copyright (c) 2025 Athens NLP Summer School
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.

README.md
DELETED
@@ -1,12 +0,0 @@
----
-title: Tokenization Playground
-emoji: 📝
-colorFrom: indigo
-colorTo: purple
-sdk: gradio
-pinned: false
-short_description: Compare different tokenizers
----
-
-# tokenization_playground
-Link to source code: https://github.com/athnlp/tokenization_playground

app.py
DELETED
@@ -1,25 +0,0 @@
-import os
-
-import gradio as gr
-from huggingface_hub import login
-
-from playground_app import demo as playground_tab
-
-auth_token = os.environ.get("HF_TOKEN", None)
-if auth_token:
-    login(token=auth_token)
-
-
-title = """
-<div align="center">
-<span>Tokenization Playground</span>
-</div>
-"""
-
-with gr.Blocks() as demo:
-    _ = gr.HTML(f"<h1 style='text-align: center; margin-bottom: 1rem'>{title}</h1>")
-    _ = playground_tab.render()
-
-if __name__ == "__main__":
-    demo.launch()
-    # demo.launch(share=True)

character_util.py
DELETED
@@ -1,178 +0,0 @@
-import json
-import os
-from pathlib import Path
-from typing import Literal
-
-import numpy as np
-import pandas as pd
-from utils.lang_util import detect_language_by_unicode, language_ranges
-from utils.log_util import logger
-from utils.text_util import contains_digit, get_space_count
-from vocab import tokenizer_factory
-
-CURRENT_DIR = Path.parent(Path.resolve(__file__))
-
-cache = {}
-default_columns = ["digit", "zh"]
-
-
-def text_to_unicode(text: str) -> str:
-    """Convert text to unicode representation."""
-    return "".join(rf"\u{ord(character):04X}" for character in text)
-
-
-def calculate_dist(token_lens: list[int]) -> str:
-    """Calculate the distribution of token lengths."""
-    if not token_lens:
-        return "-"
-    return f"{min(token_lens)},{round(np.median(token_lens))},{max(token_lens)}"
-
-
-def iter_vocab(
-    tokenizer_name: str,
-    from_cache: bool = True,
-    cache_dir: str = "stats",
-) -> pd.DataFrame | dict:
-    """:param tokenizer_name:
-    :param from_cache:
-    :param cache_dir:
-    :return:
-    """
-    tokenizer_config = tokenizer_factory.get_tokenizer_config(tokenizer_name)
-
-    cache_dir = os.path.join(CURRENT_DIR, cache_dir)
-    os.makedirs(cache_dir, exist_ok=True)
-
-    # load from cache
-    cache_path = os.path.join(cache_dir, "character_stats.json")
-    if not cache and os.path.exists(cache_path):
-        with open(cache_path, encoding="utf-8") as f_tmp:
-            cache.update(json.load(f_tmp))
-    if from_cache and tokenizer_name in cache:
-        # logger.info(f"load {tokenizer_config.name_or_path} from cache")
-        return cache[tokenizer_name]
-
-    tokenizer = tokenizer_factory.get_tokenizer(tokenizer_name)
-
-    tokens_by_lang = {lang[1]: [] for lang in language_ranges}
-    digit_tokens = []
-    space_tokens = []
-    byte_tokens = []
-
-    buffer = []
-    for token_id in range(tokenizer.vocab_size):
-        # for token_id in tokenizer.get_vocab():
-        # for token_id in range(len(tokenizer)):
-        decode_str = tokenizer.decode([token_id], skip_special_tokens=False)
-        token = tokenizer.convert_ids_to_tokens([token_id], skip_special_tokens=False)[0]
-        tags = []
-        if token is None:  # some vocabularies have empty (non-contiguous) ids
-            continue
-        if isinstance(token, bytes):
-            token = token.decode("utf-8", errors="ignore")
-
-        if hasattr(tokenizer, "sp_model") and tokenizer.sp_model.is_byte(token_id):
-            tags.append("is_byte")
-            byte_tokens.append(token)
-
-        language_tags = detect_language_by_unicode(decode_str)
-        for language in language_tags:
-            tokens_by_lang[language[1]].append(decode_str)
-
-        if contains_digit(decode_str):
-            tags.append("digit")
-            digit_tokens.append(decode_str)
-
-        space_count = get_space_count(decode_str)
-        if space_count > 0:
-            space_tokens.append(decode_str)
-
-        buffer.append(
-            json.dumps(
-                {
-                    "id": token_id,
-                    "token": token,
-                    "token_decode": decode_str,
-                    "token_dumps": json.dumps(token),
-                    "token_unicode": text_to_unicode(token),
-                    "token_len": len(decode_str),
-                },
-                ensure_ascii=False,
-            )
-            + "\n"
-        )
-
-    result = {
-        "tokenizer": tokenizer_factory.get_name_with_hyperlink(tokenizer_name),
-        "organization": tokenizer_config.org,
-        "vocab_size": len(tokenizer),
-        "num(digit)": len(digit_tokens),
-        "len(digit)": calculate_dist([len(token) for token in digit_tokens]),
-        "num(space)": len(space_tokens),
-        "len(space)": calculate_dist([len(token) for token in space_tokens]),
-    }
-
-    for lang, tokens in tokens_by_lang.items():
-        result[f"num({lang})"] = len(tokens)
-        result["len(" + lang + ")"] = calculate_dist([len(token) for token in tokens])
-
-    out_path = os.path.join(
-        cache_dir, f"iter_vocab/{tokenizer_name.replace('/', '_')}.vocab.jsonl"
-    )
-    with open(out_path, "w", encoding="utf-8") as f_out:
-        for line in buffer:
-            f_out.write(line)
-    len_before = len(cache)
-    cache[tokenizer_name] = result
-    len_after = len(cache)
-    logger.info(f"saving {tokenizer_name} to memory and file cache: {len_before}->{len_after}")
-    with open(cache_path, "w", encoding="utf-8") as f_out:
-        f_out.write(json.dumps(cache, ensure_ascii=False, indent=2))
-    return result
-
-
-def to_dataframe(stats: dict[str, Any], columns: list[str]) -> pd.DataFrame:
-    table = []
-    for stat in stats.values():
-        filtered_stat = {}
-        for k, v in stat.items():
-            if not k.startswith("num") and not k.startswith("len"):
-                filtered_stat[k] = v
-            if any(column in k for column in columns):
-                k = k.replace("ja-kana", "kana")
-                filtered_stat[k] = v
-        table.append(filtered_stat)
-    return pd.DataFrame(table)
-
-
-def get_character_table(
-    tokenizer_filter: str | None = None,
-    columns: list | None = None,
-    return_type: Literal["dict", "dataframe"] | None = "dataframe",
-) -> pd.DataFrame | dict:
-    logger.info(f"columns: {columns}, tokenizer_filter: {tokenizer_filter}")
-    stats = {}
-    if columns is None:
-        columns = default_columns
-    if tokenizer_filter is not None:
-        tokenizer_names = [
-            tokenizer_config.name_or_path
-            for tokenizer_config in tokenizer_factory.all_tokenizer_configs
-            if tokenizer_filter.lower() in tokenizer_config.name_or_path.lower()
-        ]
-    else:
-        tokenizer_names = tokenizer_factory.all_tokenizer_names
-
-    for tokenizer_name in tokenizer_names:
-        stat = iter_vocab(tokenizer_name)
-        stats[tokenizer_name] = stat
-
-    if return_type == "dataframe":
-        stats = to_dataframe(stats, columns)
-    return stats
-
-
-if __name__ == "__main__":
-    # aa = get_character_table(tokenizer_filter="baichuan")
-    df = get_character_table()
-    logger.info(f"\n{df.to_markdown(index=False)}")

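To make the per-script statistic computed by the deleted character_util.py concrete, here is a small self-contained sketch of the same idea for one script only. It is illustrative, not the deleted code: the model name and the restriction to the CJK range are assumptions for the example.

# Stand-alone sketch of a per-script vocabulary statistic (illustrative only).
import re

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
# CJK Unified Ideographs range, as used in the deleted utils/lang_util.py
cjk_pattern = re.compile(r"[\u4e00-\u9fff]")

cjk_token_count = 0
for token_id in range(tokenizer.vocab_size):
    # Decode each id individually and check whether the text contains a CJK character.
    decoded = tokenizer.decode([token_id], skip_special_tokens=False)
    if cjk_pattern.search(decoded):
        cjk_token_count += 1

print(f"tokens containing CJK characters: {cjk_token_count} / {tokenizer.vocab_size}")
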
playground_app.py
DELETED
@@ -1,91 +0,0 @@
-import gradio as gr
-from playground_examples import examples
-from playground_util import on_load, tokenize, tokenize_pair
-from vocab import tokenizer_factory
-
-get_window_url_params = """
-function(url_params) {
-    const params = new URLSearchParams(window.location.search);
-    url_params = JSON.stringify(Object.fromEntries(params));
-    return url_params;
-}
-"""
-
-all_tokenizer_name = [
-    (config.name_display, config.name_or_path)
-    for config in tokenizer_factory.all_tokenizer_configs
-]
-
-with gr.Blocks() as demo:
-    with gr.Row():
-        gr.Markdown("## Input Text")
-        dropdown_examples = gr.Dropdown(
-            sorted(examples.keys()),
-            value="Examples",
-            type="index",
-            allow_custom_value=True,
-            show_label=False,
-            container=False,
-            scale=0,
-            elem_classes="example-style",
-        )
-    user_input = gr.Textbox(
-        label="Input Text",
-        lines=5,
-        show_label=False,
-    )
-
-    with gr.Row():
-        with gr.Column(scale=6), gr.Group():
-            tokenizer_name_1 = gr.Dropdown(all_tokenizer_name, label="Tokenizer 1")
-
-        with gr.Column(scale=6), gr.Group():
-            tokenizer_name_2 = gr.Dropdown(all_tokenizer_name, label="Tokenizer 2")
-
-    with gr.Row():
-        # dynamic change label
-        with gr.Column():
-            output_text_1 = gr.Highlightedtext(show_legend=False, show_inline_category=False)
-        with gr.Column():
-            output_text_2 = gr.Highlightedtext(show_legend=False, show_inline_category=False)
-
-    with gr.Row():
-        output_table_1 = gr.Dataframe()
-        output_table_2 = gr.Dataframe()
-
-    tokenizer_name_1.change(
-        tokenize, [user_input, tokenizer_name_1], [output_text_1, output_table_1]
-    )
-
-    tokenizer_name_2.change(
-        tokenize, [user_input, tokenizer_name_2], [output_text_2, output_table_2]
-    )
-
-    user_input.change(
-        tokenize_pair,
-        [user_input, tokenizer_name_1, tokenizer_name_2],
-        [output_text_1, output_table_1, output_text_2, output_table_2],
-        show_api=False,
-    )
-
-    dropdown_examples.change(
-        lambda example_idx: (
-            examples[sorted(examples.keys())[example_idx]]["text"],
-            examples[sorted(examples.keys())[example_idx]]["tokenizer_1"],
-            examples[sorted(examples.keys())[example_idx]]["tokenizer_2"],
-        ),
-        dropdown_examples,
-        [user_input, tokenizer_name_1, tokenizer_name_2],
-        show_api=False,
-    )
-
-    demo.load(
-        fn=on_load,
-        inputs=[user_input],
-        outputs=[user_input, tokenizer_name_1, tokenizer_name_2],
-        js=get_window_url_params,
-        show_api=False,
-    )
-
-if __name__ == "__main__":
-    demo.launch(share=True)

playground_examples.py
DELETED
@@ -1,42 +0,0 @@
-default_user_input = """Replace this text in the input field to see how tokenization works."""
-default_tokenizer_name_1 = "openai/gpt-4o"
-default_tokenizer_name_2 = "Qwen/Qwen2.5-72B"
-
-
-number_example = """127+677=804
-127 + 677 = 804\n
-1275+6773 = 8041
-1275 + 6773 = 8048"""
-
-code_example = """for i in range(1, 101):
-    if i % 3 == 0 and i % 5 == 0:
-        print("FizzBuzz")
-    elif i % 3 == 0:
-        print("Fizz")
-    elif i % 5 == 0:
-        print("Buzz")
-    else:
-        print(i)
-"""
-
-spelling_example = """How do you spell "accommodate"?
-How many letters are in the word "accommodate"?
-How many r's are in the word strawberry?"""
-
-examples = {
-    "number": {
-        "text": number_example,
-        "tokenizer_1": default_tokenizer_name_1,
-        "tokenizer_2": default_tokenizer_name_2,
-    },
-    "code": {
-        "text": code_example,
-        "tokenizer_1": default_tokenizer_name_1,
-        "tokenizer_2": default_tokenizer_name_2,
-    },
-    "spelling": {
-        "text": spelling_example,
-        "tokenizer_1": default_tokenizer_name_1,
-        "tokenizer_2": default_tokenizer_name_2,
-    },
-}

playground_util.py
DELETED
@@ -1,107 +0,0 @@
-import json
-from functools import lru_cache
-from typing import Any
-
-import gradio as gr
-import pandas as pd
-from playground_examples import (
-    default_tokenizer_name_1,
-    default_tokenizer_name_2,
-    default_user_input,
-)
-from utils.i18n_util import get_lang
-from utils.log_util import logger
-from vocab import tokenizer_factory
-
-
-@lru_cache
-def _tokenize(text: str, tokenizer_name: str, color_num: int = 5, add_special_token: bool = False):
-    logger.info(
-        "param=" + json.dumps({"text": text, "tokenizer_type": tokenizer_name}, ensure_ascii=False)
-    )
-    pos_tokens = []
-    tokenizer = tokenizer_factory.get_tokenizer(tokenizer_name)
-    encoding = tokenizer.encode(text) if add_special_token else tokenizer.encode(text)
-    table = []
-
-    for idx, token_id in enumerate(encoding):
-        decoded_text = tokenizer.decode([token_id])
-        decoded_text = decoded_text.replace(
-            " ", "⋅"
-        )  # replace space with ⋅ for better visualization
-        pos_tokens.extend([(decoded_text, str(idx % color_num))])
-
-        try:
-            token = tokenizer.decode([token_id])[0]
-        except:
-            token = {v: k for k, v in tokenizer.get_vocab().items()}[token_id]
-
-        if isinstance(token, bytes):
-            try:
-                token_str = token.decode("utf-8")
-            except:
-                token_str = token.decode("utf-8", errors="ignore")
-                logger.error(
-                    f"{idx}: decode_error: "
-                    + json.dumps(  # gpt_35_turbo often has tokens that fail to decode; log them here
-                        {
-                            "tokenizer_type": tokenizer_name,
-                            "token": str(token),
-                            "token_str": token_str,
-                        },
-                        ensure_ascii=False,
-                    )
-                )
-
-            # json_dumps = json.dumps(token_str)
-        elif isinstance(token, str):
-            token_str = token
-        else:
-            logger.error(
-                f"{idx}: wrong type for token {token_id} {type(token)} "
-                + json.dumps({"text": text, "tokenizer_type": tokenizer_name}, ensure_ascii=False)
-            )
-            token_str = token
-
-        table.append({"TokenID": token_id, "Text": decoded_text})
-
-    table_df = pd.DataFrame(table)
-    logger.info(f"tokenizer_type={tokenizer_name}, Tokens={table[:4]}")
-    return pos_tokens, len(encoding), table_df
-
-
-def tokenize(
-    text: str, tokenizer_name: str, color_num: int = 5
-) -> tuple[dict[Any, Any], pd.DataFrame]:
-    """Tokenize an input text."""
-    pos_tokens, num_tokens, table_df = _tokenize(text, tokenizer_name, color_num)
-    return gr.update(value=pos_tokens, label=f"Tokens: {num_tokens}"), table_df
-
-
-def tokenize_pair(text, tokenizer_type_1, tokenizer_type_2, color_num: int = 5):
-    """input_text.change."""
-    pos_tokens_1, table_df_1 = tokenize(text, tokenizer_type_1, color_num)
-    pos_tokens_2, table_df_2 = tokenize(text, tokenizer_type_2, color_num)
-    return pos_tokens_1, table_df_1, pos_tokens_2, table_df_2
-
-
-def on_load(url_params: str, request: gr.Request = None) -> tuple[str, str, str]:
-    """Function triggered on page load to get URL parameters."""
-    text = default_user_input
-    tokenizer_type_1 = default_tokenizer_name_1
-    tokenizer_type_2 = default_tokenizer_name_2
-    try:
-        url_params_dict = json.loads(url_params)
-    except json.JSONDecodeError:
-        url_params_dict = {}
-
-    if request:
-        lang, _ = get_lang(request)
-        logger.info(str(request.headers))
-        client_ip = request.client.host
-
-        tokenizer_type_1 = url_params_dict.get("tokenizer1", default_tokenizer_name_1)
-        tokenizer_type_2 = url_params_dict.get("tokenizer2", default_tokenizer_name_2)
-        text = url_params_dict.get("text", default_user_input)
-        logger.info(f"client_ip: {client_ip}; lang: {lang} params: {url_params}")
-    return text, tokenizer_type_1, tokenizer_type_2

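The deleted playground_app.py / playground_util.py pair wired the side-by-side comparison UI: the input text is encoded by two tokenizers and the colored tokens and a per-token table are rendered. As a rough, self-contained approximation of that flow (not the deleted code; the model names and the simplified text output are assumptions), a minimal Gradio sketch could look like this:

# Minimal tokenizer-comparison sketch (illustrative only, not the deleted app).
# Assumes `gradio` and `transformers` are installed; the model names are examples.
import gradio as gr
from transformers import AutoTokenizer


def compare(text: str, name_1: str, name_2: str) -> str:
    # Encode the same text with both tokenizers and report the token counts.
    tok_1 = AutoTokenizer.from_pretrained(name_1)
    tok_2 = AutoTokenizer.from_pretrained(name_2)
    ids_1 = tok_1.encode(text, add_special_tokens=False)
    ids_2 = tok_2.encode(text, add_special_tokens=False)
    return f"{name_1}: {len(ids_1)} tokens | {name_2}: {len(ids_2)} tokens"


with gr.Blocks() as demo:
    text = gr.Textbox(label="Input Text", lines=3)
    name_1 = gr.Textbox(label="Tokenizer 1", value="openai-community/gpt2")
    name_2 = gr.Textbox(label="Tokenizer 2", value="google-bert/bert-base-uncased")
    output = gr.Textbox(label="Token counts")
    # Re-tokenize whenever the input text changes, mirroring the user_input.change wiring above.
    text.change(compare, [text, name_1, name_2], output)

if __name__ == "__main__":
    demo.launch()

Unlike the deleted module, this sketch reloads the tokenizers on every change; caching them (as the original did via a tokenizer factory and lru_cache) would be the obvious next step.
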
requirements.txt
DELETED
@@ -1,13 +0,0 @@
-gradio>=4.38.1
-transformers>4.40.0
-sentencepiece
-tiktoken
-icetk
-torch
-nltk
-boto3
-protobuf==4.25.3
-ai2-olmo
-ipadic
-fugashi
-datasets

utils/__pycache__/i18n_util.cpython-311.pyc
DELETED
Binary file (1.61 kB)
utils/__pycache__/lang_util.cpython-311.pyc
DELETED
Binary file (3.24 kB)
utils/__pycache__/log_util.cpython-311.pyc
DELETED
Binary file (633 Bytes)
utils/__pycache__/text_util.cpython-311.pyc
DELETED
Binary file (2.21 kB)
utils/i18n_util.py
DELETED
@@ -1,26 +0,0 @@
-import gradio as gr
-
-
-def get_lang(request: gr.Request):
-    """
-    'accept-language', b'zh,en;q=0.9,zh-CN;q=0.8')
-    """
-    accept_language = None
-    langs = []
-    try:
-        accept_language = request.headers["Accept-Language"]
-        for lang in accept_language.split(",")[:5]:
-            lang = lang.lower()
-            if lang.startswith("en"):
-                langs.append("en")
-            elif lang.startswith("es"):
-                langs.append("es")
-            elif lang.startswith("zh"):
-                langs.append("zh")
-            elif lang.startswith("fr"):
-                langs.append("fr")
-            elif lang.startswith("de"):
-                langs.append("de")
-    except Exception as e:
-        print(e)
-    return accept_language, langs

utils/lang_util.py
DELETED
@@ -1,89 +0,0 @@
-"""
-This detect_language function defines Unicode ranges for a set of language scripts and uses regular expressions to check whether
-an input string contains characters in those ranges, thereby guessing which language(s) the string may use. It returns a list of all matching language names; if no characters in the defined ranges are detected, it returns ['Unknown'].
-
-Note that some languages (such as Chinese and Japanese) share parts of their character ranges, so a string may be identified as several languages.
-In addition, the Latin range is very broad and covers the basic letters of almost all Western languages, so finer-grained logic may be needed to tell specific Latin-script languages apart.
-
-
-Some Latin-script languages can be distinguished by checking for specific letters and accent marks.
-However, the accuracy of this approach is limited by how comprehensive and distinctive the chosen language features are.
-For example, English detection is limited to the basic A-Z letters, which can overlap with other languages using the same alphabet.
-Also, some languages (such as French and Spanish) may share particular accented characters, which can cause a string to be wrongly identified as several languages.
-
-## common language
-English | 简体中文 | 繁體中文 | 한국어 | Español | 日本語 | हिन्दी | Русский | Рortuguês | తెలుగు | Français | Deutsch | Tiếng Việt |
-"""
-
-import re
-from typing import List
-
-# Since most tokens are 'latin', it is not counted.
-common_lang = ["Chinese", "Japanese-Kana", "Korean", "Arabic", "number"]
-
-# Unicode range of different language
-language_ranges = {
-    (
-        "Arabic",
-        "ar",
-    ): r"[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]",
-    # 'CJK' https://en.wikipedia.org/wiki/CJK_Unified_Ideographs
-    ("Chinese", "zh"): r"[\u4e00-\u9fff]",
-    ("Japanese", "ja"): r"[\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF]",
-    # https://stackoverflow.com/questions/19899554/unicode-range-for-japanese
-    # Kana type refers to Japanese hiragana and katakana characters that represent phonetic sounds in the Japanese language.
-    (
-        "Japanese-Kana",
-        "ja-kana",
-    ): r"[\u3040-\u309F\u30A0-\u30FF]",  # Hiragana & Katakana
-    ("Korean", "ko"): r"[\uac00-\ud7a3]",
-    # Latin-script languages
-    # ('Latin', 'la'): r'[\u0000-\u007F\u0080-\u00FF]',
-    # ('English', 'en'): r'[A-Za-z]',  # may overlap with other languages that use the basic Latin alphabet
-    # ('French', 'fr'): r'[\u00C0-\u00FF]',
-    # ('German', 'de'): r'[\u00C4\u00D6\u00DC\u00E4\u00F6\u00FC\u00DF]',
-    # ('Spanish-specific'): r'[\u00C1\u00E1\u00C9\u00E9\u00CD\u00ED\u00D3\u00F3\u00DA\u00FA\u00D1\u00F1\u00FC]',  # characters specific to Spanish
-    # Slavic scripts
-    # ('Cyrillic', ''): r'[\u0400-\u04FF\u0500-\u052F\u2DE0-\u2DFF\uA640-\uA69F]',
-    #
-    # 'Greek': r'[\u0370-\u03FF\u1F00-\u1FFF]',  # Greek alphabet
-    # 'Hebrew': r'[\u0590-\u05FF\uFB1D-\uFB4F]',  # Hebrew
-}
-
-
-def detect_language_by_unicode(text: str) -> List:
-    """
-    :param text:
-    :return:
-    """
-    detected_languages = []
-    for language, pattern in language_ranges.items():
-        if re.search(pattern, text):
-            detected_languages.append(language)
-
-    return detected_languages
-
-
-if __name__ == "__main__":
-    # test the function
-    test_strings = {
-        # Latin scripts
-        "Hello, world!": "English/Latin",
-        "Hola": "Spanish",
-        "Bonjour": "French",
-        "Guten Tag": "German",
-        "Empieza donde estás. ": "Spanish",
-        # CJK
-        "你好": "Chinese",
-        "こんにちは": "Japanese",
-        "안녕하세요": "Korean",
-        # Others
-        "Привет": "Russian/Cyrillic",
-        "مرحبا": "Arabic",
-    }
-
-    for s, expected in test_strings.items():
-        # print(f"'{s}' === Detected lang: {detect_language(s)} === Expected: {expected}")
-        print(
-            f"'{s}'\nDetected lang: {detect_language_by_unicode(s)}\nExpected lang: {expected}"
-        )

utils/log_util.py
DELETED
@@ -1,10 +0,0 @@
-import logging
-
-logging.basicConfig(
-    format="[%(asctime)s] [%(levelname)s] [%(process)d:%(thread)d] [%(filename)s:%(lineno)d:%(funcName)s] %(message)s",
-    level=logging.INFO,
-    datefmt="%Y-%m-%d %H:%M:%S",
-)
-
-logger = logging.getLogger(__name__)
-logger.setLevel(logging.INFO)

utils/oov_util.py
DELETED
@@ -1,122 +0,0 @@
-import json
-
-from vocab import TokenizerImpl, all_tokenizer_config, load_tokenizer
-
-text = (
-    "hello; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속;"
-    " 確実に春が近づいてること; a közoktatással? _ Belföld;"
-    " pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ;"
-    " निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:;"
-    " « અમરેલીનાં મહિલા વિકાસ; 🦙❤❥웃유♋☮✊;"
-    "װיקיװערטערבוך "
-)
-whitespace = "\t \n\n\r "
-bytes = b"\x00\x01\x02\x03\x04".decode("utf-8")
-
-text += whitespace
-
-
-def get_unk(tokenizer_config):
-    tokenizer = load_tokenizer(tokenizer_config)
-    if hasattr(tokenizer, "unk_token"):
-        return f"{tokenizer.unk_token}, {tokenizer.unk_token_id}"
-    else:
-        return "unk_token not found"
-
-
-# def infer_tokenizer_impl(tokenizer_config):
-def infer_tokenizer_type(tokenizer_config):
-    tokenizer = load_tokenizer(tokenizer_config)
-    if tokenizer_config.impl == TokenizerImpl.TikToken:
-        return "tiktoken"
-    if hasattr(tokenizer, "backend_tokenizer"):
-        return str(
-            type(tokenizer.backend_tokenizer.model)
-        )  # type(tokenizer._tokenizer.model))
-    # orion: sp_model.Load(vocab_file), inherits from PreTrainedTokenizer
-    elif hasattr(tokenizer, "sp_model"):  # based on the sentencepiece package
-        # for i in range(tokenizer.sp_model.piece_size()):
-        #     if tokenizer.sp_model.is_byte(i):
-        #         print("")
-        return f"sp_model, byte_num: {sum([tokenizer.sp_model.is_byte(i) for i in range(tokenizer.sp_model.piece_size())])}"
-
-    # sp.Load(model_path), and includes an image_tokenizer
-    elif "glm-" in tokenizer_config.name_or_path:
-        return f"byte_num: {sum([tokenizer.sp_tokenizer.text_tokenizer.sp.is_byte(i) for i in range(tokenizer.sp_tokenizer.text_tokenizer.sp.piece_size())])}"
-    # sp.Load(model_path), without an image_tokenizer
-    elif (
-        "glm2-" in tokenizer_config.name_or_path
-        or "glm3-" in tokenizer_config.name_or_path
-        or "CharacterGLM-6B" in tokenizer_config.name_or_path
-    ):
-        return f"byte_num: {sum([tokenizer.tokenizer.sp_model.is_byte(i) for i in range(tokenizer.tokenizer.sp_model.piece_size())])}"
-    elif (
-        "abeja/gpt-neox-japanese-2.7b" == tokenizer_config.name_or_path
-    ):  # supports byte-level fallback, avoiding the OOV problem
-        return "japanese-bpe: https://github.com/tanreinama/Japanese-BPEEncoder_V2"
-    # bert-base-japanese: special because of "word_tokenizer_type": "mecab", see https://huggingface.co/tohoku-nlp/bert-base-japanese/blob/main/tokenizer_config.json
-    elif "bert-base-japanese" in tokenizer_config.name_or_path:
-        return (
-            "wordpiece.MecabTokenizer, supports byte-level https://taku910.github.io/mecab/"
-        )
-    elif "moss" in tokenizer_config.name_or_path:
-        return "probably sentencepiece.byte_bpe, to be confirmed"
-    elif "byt5" in tokenizer_config.name_or_path:
-        return "unknown, to be determined"
-    else:
-        print("catch", tokenizer_config.name_or_path)
-        raise "error"
-
-
-def test_lossless(tokenizer_config):
-    """
-    xlm-roberta-base: why does it have so few OOVs? Is it because it has byte tokens?
-    :param tokenizer_config:
-    :return:
-    """
-    tokenizer = load_tokenizer(tokenizer_config)
-    encoding = tokenizer.encode(text, add_special_tokens=False)
-    decoding = tokenizer.decode(encoding)
-
-    if text in decoding:
-        # print(tokenizer_config.name, tokenizer_config.impl, "lossless: true")
-        pass
-    else:
-        unk_count = sum(
-            [1 for token_id in encoding if token_id == tokenizer.unk_token_id]
-        )
-        oov_tokens = []
-        # if tokenizer_config.impl == TokenizerImpl.SentencePiece:
-        #     print(sum([tokenizer.is_byte(i) for i in range(tokenizer.piece_size())]))
-
-        print("#######" * 5)
-        print(
-            f"{tokenizer_config.name_or_path}, {infer_tokenizer_type(tokenizer_config)}\n"
-            f"lossless: false; unk_token: {get_unk(tokenizer_config)},"
-            f" unk_ratio: {unk_count/len(encoding):.4f}; oov: []"
-        )
-        for i in range(len(text)):
-            if text[i] != decoding[i]:
-                # print(f"text[{i}] = {str(bytes(text[i:], 'utf-8'))}\n"
-                #       f"decoding[{i}] = {str(bytes(decoding[i:], 'utf-8'))}")
-                print(
-                    f"text[{i}] = {json.dumps(text[i:], ensure_ascii=False)}, \n"
-                    f"decoding[{i}] = {json.dumps(decoding[i:], ensure_ascii=False)}"
-                )
-
-                break
-
-
-for config in all_tokenizer_config:
-    # if "xlm-roberta-base" in config.name:
-    # if "xlm-roberta-base" in config.name:
-    # if "chatglm3-6b" in config.name:
-    # if "bert-base-japanese" in config.name:
-    # if "moss" in config.name:
-    # if "byt5" in config.name:
-    if "baichuan" in config.name_or_path:
-        # if "CharacterGLM-6B" in config.name:
-        # if "fastchat-t5" in config.name:  # raises pyo3_runtime.PanicException: AddedVocabulary bad split
-        # if True:
-        # test_unk(config)
-        test_lossless(config)

utils/text_util.py
DELETED
@@ -1,47 +0,0 @@
-"""
-char_
-"""
-
-
-def detect_lang_from_unicode():
-    pass
-
-
-def is_digit_char(uchar):
-    return uchar in "0123456789"
-
-
-def contains_digit(text):
-    return any(is_digit_char(ch) for ch in text)
-
-
-def get_digit_count(text):
-    pass
-
-
-def is_all_digit(text):
-    return all(is_digit_char(char) for char in text)
-
-
-def get_digit_count(text):
-    digit_count = 0
-    for char in text:
-        if char in "0123456789":
-            digit_count += 1
-    return digit_count
-
-
-def has_space(text):
-    pass
-
-
-def is_all_space(text):
-    pass
-
-
-def get_space_count(text):
-    space_count = 0
-    for char in text:
-        if len(char.strip()) == 0:
-            space_count += 1
-    return space_count

vocab.py
DELETED
@@ -1,754 +0,0 @@
-from dataclasses import dataclass, field
-from enum import Enum, auto
-from typing import Any, Dict
-
-import tiktoken
-from transformers import AutoTokenizer
-from utils.log_util import logger
-
-"""Interface:
-# https://github.com/huggingface/transformers/blob/main/src/transformers/tokenization_utils_base.py
-
-tokenizer.encode -> List[int]: Converts a string to a sequence of ids (integer)
-tokenizer.decode
-tokenizer.convert_tokens_to_string  # gpt4 does not have this method
-tokenizer.convert_ids_to_tokens
-tokenizer.tokenize -> List[str]: Converts a string into a sequence of tokens ->
-
-
-tokenizer.parent = ""
-tokenizer.vocab_size
-tokenizer.get_vocab()  # gpt-neox-20b, llama
-tokenizer.type = TokenizerType.ByteBPE.name
-tokenizer.implementation = TokenizerImpl.SentencePiece.name  # https://github.com/facebookresearch/llama/blob/main/llama/tokenizer.py
-"HFGPT2Tokenizer", "HFTokenizer", "GPT2BPETokenizer", "CharLevelTokenizer", "TiktokenTokenizer", "SPMTokenizer", https://github.com/EleutherAI/gpt-neox/blob/main/tools/preprocess_data.py
-
-
-tokenizer.comments = "split all numbers into individual digits, " \
-    "and fallback to bytes to decompose unknown UTF-8 characters"
-
-tokenizer.all_special_tokens  # baichuan
-tokenizer.special_tokens_set  # gpt3.5_turbo
-tokenizer.special_tokens_map
-"""
-
-
-class TokenizerImpl(Enum):
-    """
-    - https://github.com/huggingface/tokenizers/blob/main/bindings/python/py_src/tokenizers/implementations/__init__.py
-    - https://huggingface.co/docs/transformers/tokenizer_summary
-    - https://github.com/EleutherAI/gpt-neox/blob/main/megatron/tokenizer/tokenizer.py
-
-    ## google/BertTokenizer
-    - https://github.com/huggingface/tokenizers/blob/main/bindings/python/py_src/tokenizers/implementations/bert_wordpiece.py
-    - Characteristics
-        - Algorithm: BERT's encoder is BPE-WordPiece, which splits words into minimal subword units marked with prefix symbols (e.g. ## in BERT)
-        - Vocabulary: tokens starting with ## denote subwords
-        - Chinese is tokenized at character granularity
-        - English uses WordPiece
-
-
-
-
-    ## google/sentencepiece
-    - https://github.com/google/sentencepiece/blob/3863f7648e5d8edb571ac592f3ac4f5f0695275a/src/sentencepiece_model.proto#L48
-    - supports sentencepiece and wordpiece
-    - does sentencepiece have byte-bpe?
-        - UNIGRAM = 1;  // Unigram language model with dynamic algorithm
-        - BPE = 2;  // Byte Pair Encoding
-        - WORD = 3;  // Delimitered by whitespace.
-        - CHAR = 4;  // tokenizes into character sequence
-    - wordpiece
-    - Characteristics:
-        - Training: spm_train --model_type unigram/bpe/char/word
-        - Special symbol: Ġ
-        - Files: *.sp_model or *.model (optional .vocab file), "spm" for short (other formats such as tokenizer.json exist for hf_tokenizer compatibility)
-    - Implementation:
-        - Dependency: protobuf
-        - Training: `import sentencepiece as spm; spm.SentencePieceTrainer.train` or `spm_train`
-        - Loading: `import sentencepiece as spm; spm.SentencePieceProcessor().Load(vocab_file)`
-        - Methods: a SentencePieceProcessor instance; sp_model.id_to_piece; has tokenizer.json tokenizer.model,
-    - Tokenization:
-        - pre_tokenizers.ByteLevel(add_prefix_space=True, use_regex=False)
-    - Vocabulary: entries contain ▁ (U+2581), which denotes a space or the start of a sentence.
-    - Examples: google-t5, llama, baichuan, orion,
-        - llama: tokenizer.json (contains model.vocab, model.merges), tokenizer.model
-        - grok: originally a .model file, later converted to tokenizer.json
-        - google-t5: tokenizer.json, spiece.model
-        - Skywork-13B-Math: tokenizer.model
-        - xlm_roberta: sentencepiece.bpe.model
-    - GPT2Tokenizer
-        - tokenizer.json, vocab.json, merges.txt (https://huggingface.co/openai-community/gpt2)
-        - vocab.bpe, encoder.json, dict.txt (fairseq variant, rarely used, can be ignored)
-
-
-
-    ## thu/icetk
-    - icetk: a fork of sentencepiece that adds an image_tokenizer.
-    - glm, chatglm1, chatglm2
-
-    ## huggingface/tokenizers
-    - https://github.com/huggingface/tokenizers
-    - VS sentencepiece
-        - supports sentencepiece
-            - converts .model into (merges.txt + vocab.json) or tokenizer.json
-            - https://github.com/huggingface/tokenizers/blob/main/bindings/python/scripts/sentencepiece_extractor.py
-            - loads merges.txt, vocab.json
-            - SentencePieceBPETokenizer https://github.com/huggingface/tokenizers/blob/v0.19.1/bindings/python/py_src/tokenizers/implementations/sentencepiece_bpe.py#L10
-        - on top of sentencepiece, hf_tokenizer supports pre-tokenization regexes, handles tabs and newlines better, and supports special tokens
-    - Types: supports BBPE, WordPiece or Unigram
-    - Characteristics:
-        - Files: tokenizer.json (contains the content of the other two files), merges.txt, vocab.json
-        - added_tokens do not necessarily exist in the vocab.
-    - Implementation:
-        - Training: `from tokenizers.trainers import BpeTrainer, UnigramTrainer, WordLevelTrainer, WordPieceTrainer`
-        - Loading:
-        - Methods: .model.from_file .model.save .model.token_to_id .model.tokenize
-        - .model is of type tokenizer.models.BPE
-        - vocabulary entries start with Ġ "\u0120"
-    - Advantages
-        -
-    - Examples: gpt2, gpt_neox_20b, moss, bloom, qwen2
-    - Advantages: compared to sentencepiece,
-        - ss
-
-    ## openai/tiktoken
-    - Characteristic: a space is just a space,
-    - Examples: gpt3.5 gpt4, qwen,
-    """
-
-    """ Algorithm taxonomy https://www.huaxiaozhuan.com/%E5%B7%A5%E5%85%B7/huggingface_transformer/chapters/1_tokenizer.html
-    - word-base tokenizer:
-    - char-base tokenizer:
-    - subword-based Tokenizer
-        - BPE
-            - byte-bpe: the base vocabulary size is 256
-        - WordPiece:
-            - compared to BPE, WordPiece only stores the final vocabulary, not the learned merge rules
-        - Unigram
-        - SentencePiece
-
-    """
-
-    # Taxonomy: https://github.com/huggingface/tokenizers/blob/main/bindings/python/py_src/tokenizers/implementations/
-    BertTokenizer = "wordpiece.BertTokenizer"
-    JapaneseTokenizer = (
-        "wordpiece.MecabTokenizer",
-        "https://github.com/polm/fugashi",
-    )  # common Japanese packages: ipadic, fugashi,
-    ByteLevelBPETokenizer = "byte_level_bpe"  # BBPE
-    SentencePieceBPETokenizer = "sentencepiece_bpe"
-
-    # Taxonomy
-
-    # SentencePiece(BPE)
-    SentencePiece = auto()  # sentencepiece.bpe, sentencepiece.unigram, sentencepiece.char, sentencepiece.word,
-    byte_level_bpe = auto()
-    # HFTokenizer = auto()  # , supports
-    TikToken = auto()
-    # subword-nmt
-    # WordPiece
-
-
-# load_vocab_with_SPECIAL_TOKEN = True  # if special tokens are not included, the vocabulary size is computed incorrectly and overlap_token counts become inconsistent.
-
-
-@dataclass
-class TokenizerConfig:
-    """
-    https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/blob/main/src/leaderboard/read_evals.py
-    """
-
-    name_or_path: str  # org/model (path on hub), as unique id
-    name_display: str = None  #
-    impl: TokenizerImpl = None  # implementation, tokenizer_class/type
-    org: str = None
-    link: str = None  # http://**
-    desc: str = None  # description
-    meta: str = None
-    level: str = None  # char-level, word-level, byte-level
-    lang: str = None
-    init_kwargs: Dict[str, Any] = field(
-        default_factory=dict,
-    )
-
-    def __post_init__(self):
-        if self.link is None:
-            self.link = "https://huggingface.co/" + self.name_or_path  # TODO + revision
-        if self.name_display is None:
-            self.name_display = self.name_or_path
-
-    @classmethod
-    def init_from_json_file(cls, json_filepath: str) -> "TokenizerConfig":
-        pass
-
-    def __eq__(self, other):
-        if isinstance(other, self.__class__):
-            return self.__dict__ == other.__dict__
-        else:
-            return False
-
-    def __hash__(self):
-        return hash(self.name_or_path)
-
-
-# TODO: append link and description to the end of dropdown button.
-#  Add tokenizer_class/type, comments
-_all_tokenizer_config = [
-    # bert style tokenizers
-    TokenizerConfig(
-        "google-bert/bert-base-cased",
-        impl=TokenizerImpl.BertTokenizer,
-        org="Google",
-        desc="first add whitespace around any CJK character, then perform wordpiece tokenization.",
-    ),
-    TokenizerConfig(
-        "google-bert/bert-base-uncased",
-        impl=TokenizerImpl.BertTokenizer,
-        org="Google",
-        desc="first add whitespace around any CJK character, then perform wordpiece tokenization.",
-    ),
-    TokenizerConfig(
-        "google-bert/bert-base-chinese",
-        impl=TokenizerImpl.BertTokenizer,
-        org="Google",
-        desc="first add whitespace around any CJK character, then perform wordpiece tokenization.",
-    ),
-    TokenizerConfig(
-        "google-bert/bert-base-german-cased",
-        impl=TokenizerImpl.BertTokenizer,
-        org="Google",
-    ),
-    TokenizerConfig(
-        "dbmdz/bert-base-german-uncased", impl=TokenizerImpl.BertTokenizer, org="dbmdz"
-    ),
-    TokenizerConfig(
-        "asafaya/bert-base-arabic", impl=TokenizerImpl.BertTokenizer, org="-"
-    ),
-    TokenizerConfig(
-        "google-bert/bert-base-multilingual-uncased",
-        impl=TokenizerImpl.BertTokenizer,
-        org="Google",
-    ),
-    TokenizerConfig(
-        "google-bert/bert-base-multilingual-cased",
-        impl=TokenizerImpl.BertTokenizer,
-        org="Google",
-    ),
-    TokenizerConfig(
-        "tohoku-nlp/bert-base-japanese",
-        impl=TokenizerImpl.BertTokenizer,
-        org="Tohoku",
-        desc="The texts are first tokenized by MeCab morphological parser with the IPA dictionary, "
-        "then split into subwords by the WordPiece algorithm.",
-    ),
-    TokenizerConfig(
-        "clue/roberta_chinese_clue_tiny",
-        name_display="clue/roberta-chinese-clue",
-        impl=TokenizerImpl.BertTokenizer,
-        org="CLUE",
-        init_kwargs={"revision": "refs/pr/1"},
-        desc="",
-        meta="removed traditional Chinese characters, https://github.com/CLUEbenchmark/CLUEPretrainedModels/blob/master/README.md",
-    ),
-    TokenizerConfig(
-        "eson/kplug-base-encoder",
-        name_display="eson/kplug",
-        impl=TokenizerImpl.BertTokenizer,
-        org="JD",
-    ),
-    TokenizerConfig(
-        "ckiplab/gpt2-base-chinese", impl=TokenizerImpl.BertTokenizer, org="SINICA"
-    ),  # Academia Sinica (Taiwan)
-    # WoBERT https://kexue.fm/archives/7758
-    # WoBERT Plus https://github.com/ZhuiyiTechnology/WoBERT
-    # gpt2 style tokenizers
-    TokenizerConfig(
-        "openai-community/gpt2", impl=TokenizerImpl.SentencePiece, org="OpenAI"
-    ),
-    # byte-level BPE; no byte tokens — is it unicode-level?
-    TokenizerConfig(
-        "ClassCat/gpt2-base-french", impl=TokenizerImpl.SentencePiece, org="ClassCat"
-    ),
-    TokenizerConfig(
-        "ClassCat/gpt2-base-spanish", impl=TokenizerImpl.SentencePiece, org="ClassCat"
-    ),
-    TokenizerConfig(
-        "fnlp/moss-moon-003-sft",
-        impl=TokenizerImpl.SentencePiece,
-        init_kwargs={"revision": "refs/pr/6"},
-        org="Fudan",
-        desc="This tokenizer has been trained to treat spaces like parts of the tokens "
-        "(a bit like sentencepiece) so a word will be encoded differently whether "
-        "it is at the beginning of the sentence (without space) or not",
-        meta="extends the gpt2 vocabulary with 50k Chinese tokens",
-    ),
-    TokenizerConfig(
-        "bigscience/bloom",
-        impl=TokenizerImpl.SentencePiece,
-        org="BigScience",
-        meta="better Chinese support than the gpt_neox vocabulary.",
-    ),
-    # ("bloomz_6b4_zh",
-    # ("BelleGroup/BELLE-7B-2M",  # model and vocabulary are both based on bloom
-    #
-    TokenizerConfig(
-        "EleutherAI/gpt-neox-20b", impl=TokenizerImpl.SentencePiece, org="EleutherAI"
-    ),  # 50k
-    TokenizerConfig(
-        "cyberagent/open-calm-7b", impl=TokenizerImpl.SentencePiece, org="CyberAgent"
-    ),  # GPTNeoXTokenizer
-    TokenizerConfig(
-        "abeja/gpt-neox-japanese-2.7b", impl=TokenizerImpl.SentencePiece, org="ABEJA"
-    ),
-    TokenizerConfig(
-        "rinna/bilingual-gpt-neox-4b",
-        impl=TokenizerImpl.SentencePiece,
-        org="ABEJA",
-        lang="en/ja",
-    ),
-    TokenizerConfig(
-        "Qwen/Qwen1.5-14B", impl=TokenizerImpl.SentencePiece, org="Alibaba"
-    ),  # 150k, somewhat slow
-    TokenizerConfig(
-        "Qwen/Qwen1.5-110B", impl=TokenizerImpl.SentencePiece, org="Alibaba"
-    ),
-    TokenizerConfig(
-        "Qwen/Qwen1.5-1.8B", impl=TokenizerImpl.SentencePiece, org="Alibaba"
-    ),
-    TokenizerConfig("Qwen/Qwen2-0.5B", impl=TokenizerImpl.SentencePiece, org="Alibaba"),
-    TokenizerConfig("Qwen/Qwen2-72B", impl=TokenizerImpl.SentencePiece, org="Alibaba"),
-    TokenizerConfig(
-        "Qwen/Qwen2.5-0.5B", impl=TokenizerImpl.SentencePiece, org="Alibaba"
-    ),
-    TokenizerConfig(
-        "Qwen/Qwen2.5-72B", impl=TokenizerImpl.SentencePiece, org="Alibaba"
-    ),
-    TokenizerConfig(
-        "HuggingFaceH4/starchat-alpha", impl=TokenizerImpl.SentencePiece, org="-"
-    ),
-    ####### google/sentencepiece tokenizer:
-    # T5 llama internlm
-    TokenizerConfig(
-        "google-t5/t5-large",
-        name_display="google-t5/t5",
-        impl=TokenizerImpl.SentencePiece,
-        org="Google",
-    ),
-    # t5_small, t5_base, t5_large, flan_t5_base,
-    # ("t5_base", "", "sentencepiece"),
-    # TokenizerConfig("google/flan-t5-base", impl=TokenizerImpl.SentencePiece, ),
-    TokenizerConfig(
-        "lmsys/fastchat-t5-3b-v1.0",
-        impl=TokenizerImpl.SentencePiece,
-        org="LMSYS",
-        init_kwargs={
-            "use_fast": False
-        },  # works around pyo3_runtime.PanicException: AddedVocabulary bad split
-    ),
-    TokenizerConfig(
-        "CohereForAI/aya-101", org="Cohere For AI"
-    ),  # "tokenizer_class": "T5Tokenizer",
-    TokenizerConfig(
-        "ClueAI/ChatYuan-large-v2", impl=TokenizerImpl.SentencePiece, org="CLUE"
-    ),
-    TokenizerConfig(
-        "ClueAI/PromptCLUE-base", impl=TokenizerImpl.SentencePiece, org="CLUE"
-    ),
-    # byte-level BPE
-    # 'Chinese single-character tokens': 700, 'Chinese multi-character tokens': 0  meta-llama/Meta-Llama-3.1-405B
-    #
-    TokenizerConfig(
-        "meta-llama/Llama-3.2-1B-Instruct", impl=TokenizerImpl.SentencePiece, org="Meta"
-    ),
-    TokenizerConfig(
-        "meta-llama/Llama-3.2-3B-Instruct", impl=TokenizerImpl.SentencePiece, org="Meta"
-    ),
-    # TokenizerConfig("meta-llama/Llama-3.3-70B-Instruct", impl=TokenizerImpl.SentencePiece,
-    #                 org="Meta"),
-    TokenizerConfig(
-        "meta-llama/Meta-Llama-3.1-405B", impl=TokenizerImpl.SentencePiece, org="Meta"
-    ),
-    TokenizerConfig(
-        "NousResearch/Hermes-3-Llama-3.1-405B",
-        impl=TokenizerImpl.SentencePiece,
-        org="NousResearch",
-    ),
-    TokenizerConfig(
-        "gradientai/Llama-3-8B-Instruct-Gradient-1048k",
-        name_display="Meta/llama3",
-        impl=TokenizerImpl.SentencePiece,
-        org="Meta",
-        desc="llama split all numbers into individual digits, and fallback to bytes to decompose unknown UTF-8 characters",
-    ),
-    TokenizerConfig(
-        "NousResearch/Llama-2-7b-chat-hf",
-        name_display="Meta/llama2",
-        impl=TokenizerImpl.SentencePiece,
-        org="Meta",
-    ),
-    TokenizerConfig(
-        "huggyllama/llama-7b",
-        name_display="Meta/llama",
-        impl=TokenizerImpl.SentencePiece,
-        org="Meta",
-    ),
-    TokenizerConfig(
-        "hpcai-tech/grok-1",
-        name_display="xai-org/grok-1",
-        impl=TokenizerImpl.SentencePiece,
-        org="xAI",
-    ),
-    # converted from the .model file
-    TokenizerConfig(
-        "hfl/chinese-llama-lora-7b",
-        impl=TokenizerImpl.SentencePiece,
-        org="-",
-        meta="adds 20k Chinese tokens to the original LLaMA vocabulary, extending the Chinese vocabulary and improving Chinese encoding/decoding efficiency",
-    ),
-    #
-    TokenizerConfig(
-        "hfl/chinese-llama-2-7b",
-        impl=TokenizerImpl.SentencePiece,
-        org="-",
-        meta="redesigned vocabulary (size: 55296), further improving coverage of Chinese characters and words",
-    ),  #
-    TokenizerConfig(
-        "hfl/llama-3-chinese-8b", impl=TokenizerImpl.SentencePiece, org="-"
-    ),
-    TokenizerConfig(
-        "hfl/chinese-alpaca-lora-7b", impl=TokenizerImpl.SentencePiece, org="-"
-    ),
-    # The Chinese Alpaca model further instruction-tunes the Chinese LLaMA model above. "Has one more `[PAD]` than the chinese_llama vocabulary; do not mix them up"
-    #
-    # ("belle_llama_ext_7b",
-    # ("alpaca_7b",
-    TokenizerConfig(
-        "baichuan-inc/Baichuan-7B",
-        name_display="baichuan-inc/baichuan",
-        impl=TokenizerImpl.SentencePiece,
-        level="byte-level",
-        org="Baichuan",
-    ),
-    TokenizerConfig(
-        "baichuan-inc/Baichuan2-7B-Chat",
-        name_display="baichuan-inc/baichuan2",
-        impl=TokenizerImpl.SentencePiece,
-        org="Baichuan",
-        desc="expand the vocabulary size from 64000 in Baichuan1 to 125696",
-    ),
-    TokenizerConfig(
-        "internlm/internlm-chat-7b",
-        impl=TokenizerImpl.SentencePiece,
-        org="Shanghai AI Lab",
-    ),
-    # Shanghai AI Lab + SenseTime
-    TokenizerConfig(
-        "internlm/internlm2-chat-7b",
-        impl=TokenizerImpl.SentencePiece,
-        org="Shanghai AI Lab",
-    ),
-    TokenizerConfig(
-        "internlm/internlm2-math-7b",
-        impl=TokenizerImpl.SentencePiece,
-        org="Shanghai AI Lab",
-    ),
-    TokenizerConfig(
-        "internlm/internlm-xcomposer-7b",
-        impl=TokenizerImpl.SentencePiece,
-        org="Shanghai AI Lab",
-    ),
-    TokenizerConfig("tiiuae/falcon-7b", impl=TokenizerImpl.SentencePiece, org="TII"),
-    TokenizerConfig("tiiuae/falcon-180b", impl=TokenizerImpl.SentencePiece, org="TII"),
-    TokenizerConfig(
-        "Skywork/Skywork-13B-base", impl=TokenizerImpl.SentencePiece, org="Kunlun"
-    ),
-    TokenizerConfig(
-        "Skywork/Skywork-13B-Math", impl=TokenizerImpl.SentencePiece, org="Kunlun"
-    ),  # file: tokenizer.model
-    TokenizerConfig(
-        "FacebookAI/xlm-roberta-base", impl=TokenizerImpl.SentencePiece, org="Facebook"
-    ),
-    # why does this tokenizer.json have no merges? why does the vocab contain probability values?
-    # "goat",
-    # ##### glm series
-    # "glm_chinese",),
-    TokenizerConfig(
-        "THUDM/chatglm-6b",
-        impl=TokenizerImpl.SentencePiece,
-        org="Tsinghua",
-        meta=f"num_image_tokens: {12}; num_image_tokens: {34} ",
-        init_kwargs={"revision": "refs/pr/100"},
-    ),
-    TokenizerConfig(
-        "THUDM/chatglm2-6b",
-        impl=TokenizerImpl.SentencePiece,
-        org="Tsinghua",
-    ),
-    TokenizerConfig(
-        "THUDM/chatglm3-6b",
-        impl=TokenizerImpl.SentencePiece,
-        org="Tsinghua",
-    ),
-    TokenizerConfig(
-        "thu-coai/CharacterGLM-6B",
-        impl=TokenizerImpl.SentencePiece,
-        org="Tsinghua",
-    ),
-    # tiktoken series
-    TokenizerConfig(
-        "openai/text-davinci-003",
-        impl=TokenizerImpl.TikToken,
-        org="OpenAI",
-        link="https://github.com/openai/tiktoken",
-    ),
-    #
-    TokenizerConfig(
-        "openai/code-davinci-002",
-        impl=TokenizerImpl.TikToken,
-        org="OpenAI",
-        link="https://github.com/openai/tiktoken",
-    ),
-    TokenizerConfig(
-        "openai/gpt-3.5-turbo",
-        impl=TokenizerImpl.TikToken,
-        org="OpenAI",
-        link="https://github.com/openai/tiktoken",
-        desc="tiktoken is a fast BPE tokeniser for use with OpenAI's models. There are 16 tokens KeyError",
-    ),
-    TokenizerConfig(
-        "openai/gpt-4",
-        impl=TokenizerImpl.TikToken,
-        org="OpenAI",
-        link="https://github.com/openai/tiktoken",
-    ),
-    TokenizerConfig(
-        "openai/gpt-4o",
-        impl=TokenizerImpl.TikToken,
-        org="OpenAI",
-        link="https://github.com/openai/tiktoken",
-    ),
-    TokenizerConfig(
-        "Qwen/Qwen-7B-Chat",
-        name_display="Qwen/Qwen",
-        impl=TokenizerImpl.TikToken,
-        org="Alibaba",
-        init_kwargs={"revision": "refs/pr/56"},
-        meta="based on the gpt4 vocabulary: removed 100 multi-digit tokens, added 10000 Chinese word tokens, and improved special_token segmentation",
-    ),
-    # https://huggingface.co/Qwen/Qwen-7B-Chat#%E6%A8%A1%E5%9E%8B%E7%BB%86%E8%8A%82%EF%BC%88model%EF%BC%89
-    # This vocabulary is based on the cl100k_base BPE vocabulary used by GPT-4, optimized for Chinese and multilingual text; on top of efficient encoding of Chinese, English and code,
-    # it is friendlier to several other languages, so they can be handled without extending the vocabulary. Numbers are split into individual digits.
-    # TokenizerConfig("Qwen/Qwen-72B-Chat", impl=TokenizerImpl.TikToken),
-    # uncategorized
-    # ("amber", ""),
-    TokenizerConfig("LLM360/CrystalCoder", org="MBZUAI"),
-    TokenizerConfig("apple/DCLM-7B", org="Apple"),
-    TokenizerConfig("mistralai/Mistral-7B-v0.1", org="Mistral"),
-    TokenizerConfig("mistralai/Mixtral-8x7B-v0.1", org="Mistral"),
-    TokenizerConfig("mistralai/Mistral-Large-Instruct-2407", org="Mistral"),
-    TokenizerConfig("mistralai/Mistral-Nemo-Instruct-2407", org="Mistral"),
-    TokenizerConfig("paust/pko-t5-large", org="PAUST"),
-    TokenizerConfig("01-ai/Yi-6B", org="Yi"),
-    TokenizerConfig("01-ai/Yi-34B", org="Yi"),
-    TokenizerConfig("01-ai/Yi-VL-34B", org="Yi"),
-    TokenizerConfig("01-ai/Yi-1.5-34B", org="Yi"),
-    TokenizerConfig("OrionStarAI/Orion-14B-Chat", org="OrionStar"),
-    TokenizerConfig("microsoft/phi-1", org="Microsoft"),
-    TokenizerConfig("microsoft/phi-2", org="Microsoft"),
-    TokenizerConfig(
-        "microsoft/Phi-3-mini-4k-instruct", org="Microsoft", meta="i.e. the llama vocab"
-    ),
-    TokenizerConfig("Upstage/SOLAR-10.7B-v1.0", org="-"),
-    TokenizerConfig("google/mobilebert-uncased", org="Google"),
-    # ("google/mobilenet_v2_1.0_224",),  # error
-    TokenizerConfig("google/switch-c-2048", org="Google"),
-    TokenizerConfig("google/byt5-small", org="Google"),
-    TokenizerConfig("google/mt5-large", org="Google"),
-    TokenizerConfig("WizardLM/WizardCoder-Python-7B-V1.0", org="Microsoft"),
-    TokenizerConfig("WizardLM/WizardCoder-15B-V1.0", org="Microsoft"),
-    TokenizerConfig("WizardLM/WizardLM-7B-V1.0", org="Microsoft"),
-    TokenizerConfig("WizardLM/WizardMath-70B-V1.0", org="Microsoft"),
-    TokenizerConfig("TigerResearch/tigerbot-70b-chat-v4-4k", org="Tigerobo"),
-    TokenizerConfig("TigerResearch/tigerbot-13b-chat-v2", org="Tigerobo"),
-    TokenizerConfig("deepseek-ai/deepseek-coder-33b-instruct", org="DeepSeek"),
-    TokenizerConfig("deepseek-ai/deepseek-llm-7b-base", org="DeepSeek"),
-    TokenizerConfig("deepseek-ai/DeepSeek-V2", org="DeepSeek"),
-    TokenizerConfig("deepseek-ai/DeepSeek-V3", org="DeepSeek"),
-    TokenizerConfig(
-        "deepseek-ai/DeepSeek-R1", org="DeepSeek"
-    ),  # based on the llama3 vocabulary, with some Chinese tokens added and some tokens removed
-    TokenizerConfig("deepseek-ai/DeepSeek-R1-Zero", org="DeepSeek"),
-    TokenizerConfig("deepseek-ai/DeepSeek-R1-Distill-Llama-70B", org="DeepSeek"),
-    TokenizerConfig("google/gemma-7b", org="Google"),
-    TokenizerConfig("google/gemma-2-9b", org="Google"),
-    TokenizerConfig("allenai/OLMo-7B-hf", org="Allen AI"),
-    TokenizerConfig("HuggingFaceH4/zephyr-7b-beta", org="HuggingFace"),
-    TokenizerConfig("ai21labs/Jamba-v0.1", org="AI21"),
-    TokenizerConfig("databricks/dbrx-instruct", org="Databricks"),
-    TokenizerConfig("MiniMaxAI/MiniMax-Text-01", org="MiniMax"),
-    # TokenizerConfig("nvidia/Nemotron-4-340B-Instruct", org="Nvidia"),
-    # ("claude",),
-    # https://github.com/Duxiaoman-DI/XuanYuan
-    # https://huggingface.co/apple/OpenELM-3B-Instruct https://huggingface.co/apple/OpenELM-3B
-]
-
-assert len(set([config.name_display for config in _all_tokenizer_config])) == len(
|
597 |
-
_all_tokenizer_config
|
598 |
-
)
|
599 |
-
assert len(set([config.name_or_path for config in _all_tokenizer_config])) == len(
|
600 |
-
_all_tokenizer_config
|
601 |
-
)
|
602 |
-
assert len(
|
603 |
-
set([config.name_or_path.split("/")[-1] for config in _all_tokenizer_config])
|
604 |
-
) == len(_all_tokenizer_config)
|
605 |
-
|
606 |
-
|
607 |
-
class TokenizerFactory:
|
608 |
-
def __init__(self):
|
609 |
-
# self.all_tokenizer_configs = sorted(_all_tokenizer_config, key=lambda k: k.name_or_path)
|
610 |
-
self.all_tokenizer_configs = sorted(
|
611 |
-
_all_tokenizer_config, key=lambda k: k.name_display
|
612 |
-
)
|
613 |
-
self.all_tokenizer_names = [
|
614 |
-
config.name_or_path for config in self.all_tokenizer_configs
|
615 |
-
]
|
616 |
-
self.name_to_config_list = [
|
617 |
-
{config.name_or_path: config for config in self.all_tokenizer_configs},
|
618 |
-
{config.name_display: config for config in self.all_tokenizer_configs},
|
619 |
-
{
|
620 |
-
config.name_display.split("/")[-1]: config
|
621 |
-
for config in self.all_tokenizer_configs
|
622 |
-
},
|
623 |
-
]
|
624 |
-
self.tokenizer_cache = {}
|
625 |
-
|
626 |
-
def get_tokenizer_config(self, tokenizer_name: str) -> TokenizerConfig:
|
627 |
-
for name_to_config in self.name_to_config_list:
|
628 |
-
if tokenizer_name in name_to_config:
|
629 |
-
return name_to_config[tokenizer_name]
|
630 |
-
return None
|
631 |
-
|
632 |
-
def get_tokenizer(self, tokenizer_name: str):
|
633 |
-
"""
|
634 |
-
:param tokenizer_name:
|
635 |
-
:return:
|
636 |
-
"""
|
637 |
-
tokenizer_config = self.get_tokenizer_config(tokenizer_name)
|
638 |
-
|
639 |
-
# 1. load from cache
|
640 |
-
if tokenizer_config in self.tokenizer_cache:
|
641 |
-
return self.tokenizer_cache[tokenizer_config]
|
642 |
-
|
643 |
-
# 2. load tokenizer
|
644 |
-
tokenizer = self.load_tokenizer(tokenizer_config)
|
645 |
-
|
646 |
-
self.tokenizer_cache[tokenizer_config] = tokenizer
|
647 |
-
return tokenizer
|
648 |
-
|
649 |
-
def get_name_with_hyperlink(self, tokenizer_name: str) -> str:
|
650 |
-
def model_hyperlink(link, model_name):
|
651 |
-
model_name = model_name
|
652 |
-
return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
653 |
-
|
654 |
-
tokenizer_config = self.get_tokenizer_config(tokenizer_name)
|
655 |
-
return model_hyperlink(
|
656 |
-
tokenizer_config.link, tokenizer_config.name_display.split("/")[-1]
|
657 |
-
)
|
658 |
-
|
659 |
-
def load_tokenizer(self, tokenizer_config):
|
660 |
-
if tokenizer_config == None:
|
661 |
-
print("dd")
|
662 |
-
logger.info(f"loading tokenizer {tokenizer_config.name_or_path}")
|
663 |
-
if (
|
664 |
-
tokenizer_config.impl == TokenizerImpl.TikToken
|
665 |
-
and "openai" in tokenizer_config.name_or_path
|
666 |
-
):
|
667 |
-
tokenizer = tiktoken.encoding_for_model(
|
668 |
-
tokenizer_config.name_or_path.replace("openai/", "")
|
669 |
-
)
|
670 |
-
else:
|
671 |
-
tokenizer = AutoTokenizer.from_pretrained(
|
672 |
-
tokenizer_config.name_or_path,
|
673 |
-
trust_remote_code=True,
|
674 |
-
**tokenizer_config.init_kwargs,
|
675 |
-
)
|
676 |
-
return tokenizer
|
677 |
-
|
678 |
-
def add_config(
|
679 |
-
self,
|
680 |
-
):
|
681 |
-
pass
|
682 |
-
|
683 |
-
def add_tokenizer(self, tokenizer_name):
|
684 |
-
pass
|
685 |
-
|
686 |
-
|
687 |
-
tokenizer_factory = TokenizerFactory()
|
688 |
-
|
689 |
-
|
690 |
-
def add_tokenizer(tokenizer_name: str):
|
691 |
-
"""
|
692 |
-
:param tokenizer_name:
|
693 |
-
:return:
|
694 |
-
"""
|
695 |
-
if tokenizer_name in []:
|
696 |
-
logger.info(f"{tokenizer_name} already exits")
|
697 |
-
else:
|
698 |
-
# add to config
|
699 |
-
tokenizer_config = TokenizerConfig(tokenizer_name, org="-")
|
700 |
-
|
701 |
-
# add to tokenizer
|
702 |
-
tokenizer = tokenizer_factory.load_tokenizer(tokenizer_config)
|
703 |
-
|
704 |
-
# refresh cache
|
705 |
-
|
706 |
-
try:
|
707 |
-
tokenizer = AutoTokenizer.from_pretrained(
|
708 |
-
tokenizer_name, trust_remote_code=True, **tokenizer_config.init_kwargs
|
709 |
-
)
|
710 |
-
tokenizer_factory.all_tokenizer_configs.append(
|
711 |
-
"",
|
712 |
-
)
|
713 |
-
tokenizer_factory
|
714 |
-
|
715 |
-
except Exception as e:
|
716 |
-
logger.error(e)
|
717 |
-
|
718 |
-
pass
|
719 |
-
|
720 |
-
|
721 |
-
# class TokenizerType(Enum):
|
722 |
-
#
|
723 |
-
# # BERTTokenizer
|
724 |
-
# # 依赖一个txt文件
|
725 |
-
#
|
726 |
-
#
|
727 |
-
# # https://github.com/EleutherAI/gpt-neox/blob/v2.0/megatron/tokenizer/tokenizer.py#L231
|
728 |
-
# # 依赖一个json文件,Tokenizer.from_file(vocab_file)
|
729 |
-
# # 案例:gpt-neox-20B
|
730 |
-
# HFTokenizer = auto()
|
731 |
-
#
|
732 |
-
# # 依赖: model_file, sentencepiece.SentencePieceProcessor(model_file)
|
733 |
-
# # 案例:
|
734 |
-
# SentencePieceTokenizer = auto()
|
735 |
-
#
|
736 |
-
#
|
737 |
-
# # 依赖: 3个json文件:vocab.json, merges.txt, special_tokens.txt
|
738 |
-
# # 源码:
|
739 |
-
# # - https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/tokenizer/gpt2_tokenization.py#L92
|
740 |
-
# # Byte-level BPE
|
741 |
-
# GPT2BPETokenizer = auto()
|
742 |
-
|
743 |
-
|
744 |
-
if __name__ == "__main__":
|
745 |
-
for tokenizer_config in tokenizer_factory.all_tokenizer_configs:
|
746 |
-
if True:
|
747 |
-
# if "t5" in tokenizer_config.name_or_path:
|
748 |
-
tokenizer1 = tokenizer_factory.get_tokenizer(tokenizer_config.name_or_path)
|
749 |
-
tokenizer2 = tokenizer_factory.get_tokenizer(tokenizer_config.name_display)
|
750 |
-
tokenizer3 = tokenizer_factory.get_tokenizer(
|
751 |
-
tokenizer_config.name_display.split("/")[-1]
|
752 |
-
)
|
753 |
-
assert tokenizer1 == tokenizer2 == tokenizer3
|
754 |
-
print(tokenizer_config.name_or_path, len(tokenizer1))
|