gpantaz committed
Commit · b4dc5cb
1 Parent(s): 05a8ffa

Add application file

Files changed:
- LICENSE +21 -0
- README.md +1 -12
- __pycache__/character_util.cpython-311.pyc +0 -0
- __pycache__/playground_app.cpython-311.pyc +0 -0
- __pycache__/playground_examples.cpython-311.pyc +0 -0
- __pycache__/playground_util.cpython-311.pyc +0 -0
- __pycache__/vocab.cpython-311.pyc +0 -0
- app.py +24 -0
- character_util.py +178 -0
- playground_app.py +91 -0
- playground_examples.py +42 -0
- playground_util.py +107 -0
- requirements.txt +13 -0
- utils/__pycache__/i18n_util.cpython-311.pyc +0 -0
- utils/__pycache__/lang_util.cpython-311.pyc +0 -0
- utils/__pycache__/log_util.cpython-311.pyc +0 -0
- utils/__pycache__/text_util.cpython-311.pyc +0 -0
- utils/i18n_util.py +26 -0
- utils/lang_util.py +89 -0
- utils/log_util.py +10 -0
- utils/oov_util.py +122 -0
- utils/text_util.py +47 -0
- vocab.py +754 -0
LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 Athens NLP Summer School

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
README.md
CHANGED
@@ -1,12 +1 @@
- ---
- title: Test
- emoji: 📉
- colorFrom: blue
- colorTo: red
- sdk: gradio
- sdk_version: 5.34.2
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # tokenization_playground
__pycache__/character_util.cpython-311.pyc
ADDED
Binary file (11.9 kB)
__pycache__/playground_app.cpython-311.pyc
ADDED
Binary file (5.94 kB)
__pycache__/playground_examples.cpython-311.pyc
ADDED
Binary file (1.04 kB)
__pycache__/playground_util.cpython-311.pyc
ADDED
Binary file (5.98 kB)
__pycache__/vocab.cpython-311.pyc
ADDED
Binary file (27.5 kB)
app.py
ADDED
@@ -0,0 +1,24 @@
import os

import gradio as gr
from huggingface_hub import login
from playground_app import demo as playground_tab

auth_token = os.environ.get("HF_TOKEN", None)
if auth_token:
    login(token=auth_token)


title = """
<div align="center">
    <span>Tokenization Playground</span>
</div>
"""

with gr.Blocks() as demo:
    _ = gr.HTML(f"<h1 style='text-align: center; margin-bottom: 1rem'>{title}</h1>")
    _ = playground_tab.render()

if __name__ == "__main__":
    # demo.launch()
    demo.launch(share=True)
character_util.py
ADDED
@@ -0,0 +1,178 @@
import json
import os
from pathlib import Path
from typing import Any, Literal

import numpy as np
import pandas as pd
from utils.lang_util import detect_language_by_unicode, language_ranges
from utils.log_util import logger
from utils.text_util import contains_digit, get_space_count
from vocab import tokenizer_factory

CURRENT_DIR = Path(__file__).resolve().parent

cache = {}
default_columns = ["digit", "zh"]


def text_to_unicode(text: str) -> str:
    """Convert text to unicode representation."""
    return "".join(rf"\u{ord(character):04X}" for character in text)


def calculate_dist(token_lens: list[int]) -> str:
    """Calculate the distribution of token lengths."""
    if not token_lens:
        return "-"
    return f"{min(token_lens)},{round(np.median(token_lens))},{max(token_lens)}"


def iter_vocab(
    tokenizer_name: str,
    from_cache: bool = True,
    cache_dir: str = "stats",
) -> pd.DataFrame | dict:
    """Iterate over the vocabulary of `tokenizer_name` and collect character-level statistics.

    :param tokenizer_name: tokenizer identifier (org/model)
    :param from_cache: reuse cached statistics if available
    :param cache_dir: directory for the statistics cache
    :return: a dict of statistics for this tokenizer
    """
    tokenizer_config = tokenizer_factory.get_tokenizer_config(tokenizer_name)

    cache_dir = os.path.join(CURRENT_DIR, cache_dir)
    os.makedirs(cache_dir, exist_ok=True)

    # load from cache
    cache_path = os.path.join(cache_dir, "character_stats.json")
    if not cache and os.path.exists(cache_path):
        with open(cache_path, encoding="utf-8") as f_tmp:
            cache.update(json.load(f_tmp))
    if from_cache and tokenizer_name in cache:
        # logger.info(f"load {tokenizer_config.name_or_path} from cache")
        return cache[tokenizer_name]

    tokenizer = tokenizer_factory.get_tokenizer(tokenizer_name)

    tokens_by_lang = {lang[1]: [] for lang in language_ranges}
    digit_tokens = []
    space_tokens = []
    byte_tokens = []

    buffer = []
    for token_id in range(tokenizer.vocab_size):
        # for token_id in tokenizer.get_vocab():
        # for token_id in range(len(tokenizer)):
        decode_str = tokenizer.decode([token_id], skip_special_tokens=False)
        token = tokenizer.convert_ids_to_tokens([token_id], skip_special_tokens=False)[0]
        tags = []
        if token is None:  # some vocabularies have empty (non-contiguous) ids
            continue
        if isinstance(token, bytes):
            token = token.decode("utf-8", errors="ignore")

        if hasattr(tokenizer, "sp_model") and tokenizer.sp_model.is_byte(token_id):
            tags.append("is_byte")
            byte_tokens.append(token)

        language_tags = detect_language_by_unicode(decode_str)
        for language in language_tags:
            tokens_by_lang[language[1]].append(decode_str)

        if contains_digit(decode_str):
            tags.append("digit")
            digit_tokens.append(decode_str)

        space_count = get_space_count(decode_str)
        if space_count > 0:
            space_tokens.append(decode_str)

        buffer.append(
            json.dumps(
                {
                    "id": token_id,
                    "token": token,
                    "token_decode": decode_str,
                    "token_dumps": json.dumps(token),
                    "token_unicode": text_to_unicode(token),
                    "token_len": len(decode_str),
                },
                ensure_ascii=False,
            )
            + "\n"
        )

    result = {
        "tokenizer": tokenizer_factory.get_name_with_hyperlink(tokenizer_name),
        "organization": tokenizer_config.org,
        "vocab_size": len(tokenizer),
        "num(digit)": len(digit_tokens),
        "len(digit)": calculate_dist([len(token) for token in digit_tokens]),
        "num(space)": len(space_tokens),
        "len(space)": calculate_dist([len(token) for token in space_tokens]),
    }

    for lang, tokens in tokens_by_lang.items():
        result[f"num({lang})"] = len(tokens)
        result["len(" + lang + ")"] = calculate_dist([len(token) for token in tokens])

    out_path = os.path.join(
        cache_dir, f"iter_vocab/{tokenizer_name.replace('/', '_')}.vocab.jsonl"
    )
    os.makedirs(os.path.dirname(out_path), exist_ok=True)  # make sure the iter_vocab/ subdirectory exists
    with open(out_path, "w", encoding="utf-8") as f_out:
        for line in buffer:
            f_out.write(line)
    len_before = len(cache)
    cache[tokenizer_name] = result
    len_after = len(cache)
    logger.info(f"saving {tokenizer_name} to memory and file cache: {len_before}->{len_after}")
    with open(cache_path, "w", encoding="utf-8") as f_out:
        f_out.write(json.dumps(cache, ensure_ascii=False, indent=2))
    return result


def to_dataframe(stats: dict[str, Any], columns: list[str]) -> pd.DataFrame:
    table = []
    for stat in stats.values():
        filtered_stat = {}
        for k, v in stat.items():
            if not k.startswith("num") and not k.startswith("len"):
                filtered_stat[k] = v
            if any(column in k for column in columns):
                k = k.replace("ja-kana", "kana")
                filtered_stat[k] = v
        table.append(filtered_stat)
    return pd.DataFrame(table)


def get_character_table(
    tokenizer_filter: str | None = None,
    columns: list | None = None,
    return_type: Literal["dict", "dataframe"] | None = "dataframe",
) -> pd.DataFrame | dict:
    logger.info(f"columns: {columns}, tokenizer_filter: {tokenizer_filter}")
    stats = {}
    if columns is None:
        columns = default_columns
    if tokenizer_filter is not None:
        tokenizer_names = [
            tokenizer_config.name_or_path
            for tokenizer_config in tokenizer_factory.all_tokenizer_configs
            if tokenizer_filter.lower() in tokenizer_config.name_or_path.lower()
        ]
    else:
        tokenizer_names = tokenizer_factory.all_tokenizer_names

    for tokenizer_name in tokenizer_names:
        stat = iter_vocab(tokenizer_name)
        stats[tokenizer_name] = stat

    if return_type == "dataframe":
        stats = to_dataframe(stats, columns)
    return stats


if __name__ == "__main__":
    # aa = get_character_table(tokenizer_filter="baichuan")
    df = get_character_table()
    logger.info(f"\n{df.to_markdown(index=False)}")
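For reference, a minimal standalone sketch of the same statistics-gathering idea as iter_vocab(), assuming only transformers and numpy are installed and using the public openai-community/gpt2 checkpoint instead of this Space's tokenizer_factory (not part of this commit):

# Sketch: count digit-containing and Chinese-character tokens in a vocabulary
# and summarize their lengths as "min,median,max", like calculate_dist().
import re

import numpy as np
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")

digit_tokens, zh_tokens = [], []
for token_id in range(tokenizer.vocab_size):
    decoded = tokenizer.decode([token_id])
    if any(ch.isdigit() for ch in decoded):
        digit_tokens.append(decoded)
    if re.search(r"[\u4e00-\u9fff]", decoded):  # same CJK range as utils/lang_util.py
        zh_tokens.append(decoded)

def dist(lens):
    return f"{min(lens)},{round(np.median(lens))},{max(lens)}" if lens else "-"

print("num(digit):", len(digit_tokens), "len(digit):", dist([len(t) for t in digit_tokens]))
print("num(zh):", len(zh_tokens), "len(zh):", dist([len(t) for t in zh_tokens]))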
playground_app.py
ADDED
@@ -0,0 +1,91 @@
import gradio as gr
from playground_examples import examples
from playground_util import on_load, tokenize, tokenize_pair
from vocab import tokenizer_factory

get_window_url_params = """
function(url_params) {
    const params = new URLSearchParams(window.location.search);
    url_params = JSON.stringify(Object.fromEntries(params));
    return url_params;
}
"""

all_tokenizer_name = [
    (config.name_display, config.name_or_path)
    for config in tokenizer_factory.all_tokenizer_configs
]

with gr.Blocks() as demo:
    with gr.Row():
        gr.Markdown("## Input Text")
        dropdown_examples = gr.Dropdown(
            sorted(examples.keys()),
            value="Examples",
            type="index",
            allow_custom_value=True,
            show_label=False,
            container=False,
            scale=0,
            elem_classes="example-style",
        )
    user_input = gr.Textbox(
        label="Input Text",
        lines=5,
        show_label=False,
    )

    with gr.Row():
        with gr.Column(scale=6), gr.Group():
            tokenizer_name_1 = gr.Dropdown(all_tokenizer_name, label="Tokenizer 1")

        with gr.Column(scale=6), gr.Group():
            tokenizer_name_2 = gr.Dropdown(all_tokenizer_name, label="Tokenizer 2")

    with gr.Row():
        # dynamic change label
        with gr.Column():
            output_text_1 = gr.HighlightedText(show_legend=False, show_inline_category=False)
        with gr.Column():
            output_text_2 = gr.HighlightedText(show_legend=False, show_inline_category=False)

    with gr.Row():
        output_table_1 = gr.Dataframe()
        output_table_2 = gr.Dataframe()

    tokenizer_name_1.change(
        tokenize, [user_input, tokenizer_name_1], [output_text_1, output_table_1]
    )

    tokenizer_name_2.change(
        tokenize, [user_input, tokenizer_name_2], [output_text_2, output_table_2]
    )

    user_input.change(
        tokenize_pair,
        [user_input, tokenizer_name_1, tokenizer_name_2],
        [output_text_1, output_table_1, output_text_2, output_table_2],
        show_api=False,
    )

    dropdown_examples.change(
        lambda example_idx: (
            examples[sorted(examples.keys())[example_idx]]["text"],
            examples[sorted(examples.keys())[example_idx]]["tokenizer_1"],
            examples[sorted(examples.keys())[example_idx]]["tokenizer_2"],
        ),
        dropdown_examples,
        [user_input, tokenizer_name_1, tokenizer_name_2],
        show_api=False,
    )

    demo.load(
        fn=on_load,
        inputs=[user_input],
        outputs=[user_input, tokenizer_name_1, tokenizer_name_2],
        js=get_window_url_params,
        show_api=False,
    )

if __name__ == "__main__":
    demo.launch(share=True)
playground_examples.py
ADDED
@@ -0,0 +1,42 @@
default_user_input = """Replace this text in the input field to see how tokenization works."""
default_tokenizer_name_1 = "openai/gpt-4o"
default_tokenizer_name_2 = "Qwen/Qwen2.5-72B"


number_example = """127+677=804
127 + 677 = 804\n
1275+6773 = 8041
1275 + 6773 = 8048"""

code_example = """for i in range(1, 101):
    if i % 3 == 0 and i % 5 == 0:
        print("FizzBuzz")
    elif i % 3 == 0:
        print("Fizz")
    elif i % 5 == 0:
        print("Buzz")
    else:
        print(i)
"""

spelling_example = """How do you spell "accommodate"?
How many letters are in the word "accommodate"?
How many r's are in the word strawberry?"""

examples = {
    "number": {
        "text": number_example,
        "tokenizer_1": default_tokenizer_name_1,
        "tokenizer_2": default_tokenizer_name_2,
    },
    "code": {
        "text": code_example,
        "tokenizer_1": default_tokenizer_name_1,
        "tokenizer_2": default_tokenizer_name_2,
    },
    "spelling": {
        "text": spelling_example,
        "tokenizer_1": default_tokenizer_name_1,
        "tokenizer_2": default_tokenizer_name_2,
    },
}
playground_util.py
ADDED
@@ -0,0 +1,107 @@
import json
from functools import lru_cache
from typing import Any

import gradio as gr
import pandas as pd
from playground_examples import (
    default_tokenizer_name_1,
    default_tokenizer_name_2,
    default_user_input,
)
from utils.i18n_util import get_lang
from utils.log_util import logger
from vocab import tokenizer_factory


@lru_cache
def _tokenize(text: str, tokenizer_name: str, color_num: int = 5, add_special_token: bool = False):
    logger.info(
        "param=" + json.dumps({"text": text, "tokenizer_type": tokenizer_name}, ensure_ascii=False)
    )
    pos_tokens = []
    tokenizer = tokenizer_factory.get_tokenizer(tokenizer_name)
    # NOTE: both branches are identical, so add_special_token currently has no effect.
    encoding = tokenizer.encode(text) if add_special_token else tokenizer.encode(text)
    table = []

    for idx, token_id in enumerate(encoding):
        decoded_text = tokenizer.decode([token_id])
        decoded_text = decoded_text.replace(
            " ", "⋅"
        )  # replace space with ⋅ for better visualization
        pos_tokens.extend([(decoded_text, str(idx % color_num))])

        try:
            token = tokenizer.decode([token_id])[0]
        except Exception:
            token = {v: k for k, v in tokenizer.get_vocab().items()}[token_id]

        if isinstance(token, bytes):
            try:
                token_str = token.decode("utf-8")
            except Exception:
                token_str = token.decode("utf-8", errors="ignore")
                logger.error(
                    f"{idx}: decode_error: "
                    + json.dumps(  # gpt_35_turbo often has tokens that fail to decode; log them here
                        {
                            "tokenizer_type": tokenizer_name,
                            "token": str(token),
                            "token_str": token_str,
                        },
                        ensure_ascii=False,
                    )
                )

            # json_dumps = json.dumps(token_str)
        elif isinstance(token, str):
            token_str = token
        else:
            logger.error(
                f"{idx}: wrong type for token {token_id} {type(token)} "
                + json.dumps({"text": text, "tokenizer_type": tokenizer_name}, ensure_ascii=False)
            )
            token_str = token

        table.append({"TokenID": token_id, "Text": decoded_text})

    table_df = pd.DataFrame(table)
    logger.info(f"tokenizer_type={tokenizer_name}, Tokens={table[:4]}")
    return pos_tokens, len(encoding), table_df


def tokenize(
    text: str, tokenizer_name: str, color_num: int = 5
) -> tuple[dict[Any, Any], pd.DataFrame]:
    """Tokenize an input text."""
    pos_tokens, num_tokens, table_df = _tokenize(text, tokenizer_name, color_num)
    return gr.update(value=pos_tokens, label=f"Tokens: {num_tokens}"), table_df


def tokenize_pair(text, tokenizer_type_1, tokenizer_type_2, color_num: int = 5):
    """input_text.change."""
    pos_tokens_1, table_df_1 = tokenize(text, tokenizer_type_1, color_num)
    pos_tokens_2, table_df_2 = tokenize(text, tokenizer_type_2, color_num)
    return pos_tokens_1, table_df_1, pos_tokens_2, table_df_2


def on_load(url_params: str, request: gr.Request = None) -> tuple[str, str, str]:
    """Function triggered on page load to get URL parameters."""
    text = default_user_input
    tokenizer_type_1 = default_tokenizer_name_1
    tokenizer_type_2 = default_tokenizer_name_2
    try:
        url_params_dict = json.loads(url_params)
    except json.JSONDecodeError:
        url_params_dict = {}

    if request:
        lang, _ = get_lang(request)
        logger.info(str(request.headers))
        client_ip = request.client.host

        tokenizer_type_1 = url_params_dict.get("tokenizer1", default_tokenizer_name_1)
        tokenizer_type_2 = url_params_dict.get("tokenizer2", default_tokenizer_name_2)
        text = url_params_dict.get("text", default_user_input)
        logger.info(f"client_ip: {client_ip}; lang: {lang} params: {url_params}")
    return text, tokenizer_type_1, tokenizer_type_2
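A minimal sketch (not part of this commit) of the colouring scheme _tokenize() feeds to gr.HighlightedText: each decoded token is paired with a category "0".."4" that cycles with the token index. It assumes a plain transformers tokenizer (openai-community/gpt2) as a stand-in for tokenizer_factory:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")

text = "Replace this text to see how tokenization works."
ids = tokenizer.encode(text)
# (token text, colour category) pairs, with spaces made visible as ⋅
pos_tokens = [
    (tokenizer.decode([token_id]).replace(" ", "⋅"), str(idx % 5))
    for idx, token_id in enumerate(ids)
]
print(len(ids), "tokens")
print(pos_tokens[:5])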
requirements.txt
ADDED
@@ -0,0 +1,13 @@
gradio>=4.38.1
transformers>4.40.0
sentencepiece
tiktoken
icetk
torch
nltk
boto3
protobuf==4.25.3
ai2-olmo
ipadic
fugashi
datasets
utils/__pycache__/i18n_util.cpython-311.pyc
ADDED
Binary file (1.61 kB)
utils/__pycache__/lang_util.cpython-311.pyc
ADDED
Binary file (3.24 kB)
utils/__pycache__/log_util.cpython-311.pyc
ADDED
Binary file (633 Bytes)
utils/__pycache__/text_util.cpython-311.pyc
ADDED
Binary file (2.21 kB)
utils/i18n_util.py
ADDED
@@ -0,0 +1,26 @@
import gradio as gr


def get_lang(request: gr.Request):
    """Parse the request's Accept-Language header, e.g. ('accept-language', b'zh,en;q=0.9,zh-CN;q=0.8')."""
    accept_language = None
    langs = []
    try:
        accept_language = request.headers["Accept-Language"]
        for lang in accept_language.split(",")[:5]:
            lang = lang.lower()
            if lang.startswith("en"):
                langs.append("en")
            elif lang.startswith("es"):
                langs.append("es")
            elif lang.startswith("zh"):
                langs.append("zh")
            elif lang.startswith("fr"):
                langs.append("fr")
            elif lang.startswith("de"):
                langs.append("de")
    except Exception as e:
        print(e)
    return accept_language, langs
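Not part of this commit: what get_lang() extracts from a typical header value, shown with the parsing logic inlined (get_lang itself expects a gr.Request):

header = "zh,en;q=0.9,zh-CN;q=0.8"
langs = []
for lang in header.split(",")[:5]:
    lang = lang.lower()
    for prefix in ("en", "es", "zh", "fr", "de"):
        if lang.startswith(prefix):
            langs.append(prefix)
            break
print(langs)  # ['zh', 'en', 'zh']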
utils/lang_util.py
ADDED
@@ -0,0 +1,89 @@
"""
The detect_language function defines Unicode ranges for a set of scripts and uses regular expressions
to check whether the input string contains characters in those ranges, in order to guess which
language(s) the string may use. It returns a list of all matching languages; if no characters from
the defined ranges are detected, it returns an empty list.

Note that some languages (such as Chinese and Japanese) share parts of their character ranges, so a
string may be identified as several languages. The Latin range is also very broad and covers the
basic alphabet of almost all Western languages, so more fine-grained logic would be needed to tell
specific Latin-script languages apart.


Some Latin-script languages can be distinguished by checking for language-specific letters and
accented characters. However, the accuracy of that approach is limited by how comprehensive and
distinctive the chosen features are. For example, English detection is limited to the basic A-Z
letters, which overlap with other languages using the same alphabet, and some languages (such as
French and Spanish) share certain accented characters, so a string may be wrongly identified as
several languages.

## common language
English | 简体中文 | 繁體中文 | 한국어 | Español | 日本語 | हिन्दी | Русский | Рortuguês | తెలుగు | Français | Deutsch | Tiếng Việt |
"""

import re
from typing import List

# Most tokens are 'latin', so Latin itself is not counted.
common_lang = ["Chinese", "Japanese-Kana", "Korean", "Arabic", "number"]

# Unicode range of different languages
language_ranges = {
    (
        "Arabic",
        "ar",
    ): r"[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]",
    # 'CJK' https://en.wikipedia.org/wiki/CJK_Unified_Ideographs
    ("Chinese", "zh"): r"[\u4e00-\u9fff]",
    ("Japanese", "ja"): r"[\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF]",
    # https://stackoverflow.com/questions/19899554/unicode-range-for-japanese
    # Kana type refers to Japanese hiragana and katakana characters that represent phonetic sounds in the Japanese language.
    (
        "Japanese-Kana",
        "ja-kana",
    ): r"[\u3040-\u309F\u30A0-\u30FF]",  # Hiragana & Katakana
    ("Korean", "ko"): r"[\uac00-\ud7a3]",
    # Latin-script languages
    # ('Latin', 'la'): r'[\u0000-\u007F\u0080-\u00FF]',
    # ('English', 'en'): r'[A-Za-z]',  # may overlap with other languages that use the basic Latin alphabet
    # ('French', 'fr'): r'[\u00C0-\u00FF]',
    # ('German', 'de'): r'[\u00C4\u00D6\u00DC\u00E4\u00F6\u00FC\u00DF]',
    # ('Spanish-specific'): r'[\u00C1\u00E1\u00C9\u00E9\u00CD\u00ED\u00D3\u00F3\u00DA\u00FA\u00D1\u00F1\u00FC]',  # characters specific to Spanish
    # Slavic languages
    # ('Cyrillic', ''): r'[\u0400-\u04FF\u0500-\u052F\u2DE0-\u2DFF\uA640-\uA69F]',
    #
    # 'Greek': r'[\u0370-\u03FF\u1F00-\u1FFF]',  # Greek alphabet
    # 'Hebrew': r'[\u0590-\u05FF\uFB1D-\uFB4F]',  # Hebrew
}


def detect_language_by_unicode(text: str) -> List:
    """Return the (name, code) pairs of all language ranges that match characters in `text`."""
    detected_languages = []
    for language, pattern in language_ranges.items():
        if re.search(pattern, text):
            detected_languages.append(language)

    return detected_languages


if __name__ == "__main__":
    # quick test
    test_strings = {
        # Latin-script languages
        "Hello, world!": "English/Latin",
        "Hola": "Spanish",
        "Bonjour": "French",
        "Guten Tag": "German",
        "Empieza donde estás. ": "Spanish",
        # CJK
        "你好": "Chinese",
        "こんにちは": "Japanese",
        "안녕하세요": "Korean",
        # Others
        "Привет": "Russian/Cyrillic",
        "مرحبا": "Arabic",
    }

    for s, expected in test_strings.items():
        # print(f"'{s}' === Detected lang: {detect_language(s)} === Expected: {expected}")
        print(
            f"'{s}'\nDetected lang: {detect_language_by_unicode(s)}\nExpected lang: {expected}"
        )
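A quick illustration (not part of this commit) of the overlap caveat described in the module docstring: CJK ideographs fall inside both the Chinese and the Japanese ranges, so a purely ideographic string matches both, while kana additionally matches the Japanese-Kana range.

from utils.lang_util import detect_language_by_unicode

print(detect_language_by_unicode("漢字"))      # [('Chinese', 'zh'), ('Japanese', 'ja')]
print(detect_language_by_unicode("ひらがな"))  # [('Japanese', 'ja'), ('Japanese-Kana', 'ja-kana')]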
utils/log_util.py
ADDED
@@ -0,0 +1,10 @@
import logging

logging.basicConfig(
    format="[%(asctime)s] [%(levelname)s] [%(process)d:%(thread)d] [%(filename)s:%(lineno)d:%(funcName)s] %(message)s",
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S",
)

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
utils/oov_util.py
ADDED
@@ -0,0 +1,122 @@
import json

from vocab import TokenizerImpl, all_tokenizer_config, load_tokenizer

text = (
    "hello; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속;"
    " 確実に春が近づいてること; a közoktatással? _ Belföld;"
    " pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ;"
    " निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:;"
    " « અમરેલીનાં મહિલા વિકાસ; 🦙❤❥웃유♋☮✊;"
    "װיקיװערטערבוך "
)
whitespace = "\t \n\n\r "
bytes = b"\x00\x01\x02\x03\x04".decode("utf-8")

text += whitespace


def get_unk(tokenizer_config):
    tokenizer = load_tokenizer(tokenizer_config)
    if hasattr(tokenizer, "unk_token"):
        return f"{tokenizer.unk_token}, {tokenizer.unk_token_id}"
    else:
        return "unk_token not found"


# def infer_tokenizer_impl(tokenizer_config):
def infer_tokenizer_type(tokenizer_config):
    tokenizer = load_tokenizer(tokenizer_config)
    if tokenizer_config.impl == TokenizerImpl.TikToken:
        return "tiktoken"
    if hasattr(tokenizer, "backend_tokenizer"):
        return str(
            type(tokenizer.backend_tokenizer.model)
        )  # type(tokenizer._tokenizer.model))
    # orion: sp_model.Load(vocab_file), inherits from PreTrainedTokenizer
    elif hasattr(tokenizer, "sp_model"):  # based on the sentencepiece package
        # for i in range(tokenizer.sp_model.piece_size()):
        #     if tokenizer.sp_model.is_byte(i):
        #         print("")
        return f"sp_model, byte_num: {sum([tokenizer.sp_model.is_byte(i) for i in range(tokenizer.sp_model.piece_size())])}"

    # sp.Load(model_path), includes an image_tokenizer
    elif "glm-" in tokenizer_config.name_or_path:
        return f"byte_num: {sum([tokenizer.sp_tokenizer.text_tokenizer.sp.is_byte(i) for i in range(tokenizer.sp_tokenizer.text_tokenizer.sp.piece_size())])}"
    # sp.Load(model_path), no image_tokenizer
    elif (
        "glm2-" in tokenizer_config.name_or_path
        or "glm3-" in tokenizer_config.name_or_path
        or "CharacterGLM-6B" in tokenizer_config.name_or_path
    ):
        return f"byte_num: {sum([tokenizer.tokenizer.sp_model.is_byte(i) for i in range(tokenizer.tokenizer.sp_model.piece_size())])}"
    elif (
        "abeja/gpt-neox-japanese-2.7b" == tokenizer_config.name_or_path
    ):  # supports byte-level fallback, which avoids OOV
        return "japanese-bpe: https://github.com/tanreinama/Japanese-BPEEncoder_V2"
    # bert-base-japanese is special in that "word_tokenizer_type": "mecab", see https://huggingface.co/tohoku-nlp/bert-base-japanese/blob/main/tokenizer_config.json
    elif "bert-base-japanese" in tokenizer_config.name_or_path:
        return (
            "wordpiece.MecabTokenizer, supports byte-level https://taku910.github.io/mecab/"
        )
    elif "moss" in tokenizer_config.name_or_path:
        return "probably sentencepiece.byte_bpe, to be confirmed"
    elif "byt5" in tokenizer_config.name_or_path:
        return "unknown, to be determined"
    else:
        print("catch", tokenizer_config.name_or_path)
        raise ValueError("unknown tokenizer implementation")


def test_lossless(tokenizer_config):
    """
    Why does xlm-roberta-base have so few OOV tokens? Is it because it has byte fallback?
    :param tokenizer_config:
    :return:
    """
    tokenizer = load_tokenizer(tokenizer_config)
    encoding = tokenizer.encode(text, add_special_tokens=False)
    decoding = tokenizer.decode(encoding)

    if text in decoding:
        # print(tokenizer_config.name, tokenizer_config.impl, "lossless: true")
        pass
    else:
        unk_count = sum(
            [1 for token_id in encoding if token_id == tokenizer.unk_token_id]
        )
        oov_tokens = []
        # if tokenizer_config.impl == TokenizerImpl.SentencePiece:
        #     print(sum([tokenizer.is_byte(i) for i in range(tokenizer.piece_size())]))

        print("#######" * 5)
        print(
            f"{tokenizer_config.name_or_path}, {infer_tokenizer_type(tokenizer_config)}\n"
            f"lossless: false; unk_token: {get_unk(tokenizer_config)},"
            f" unk_ratio: {unk_count/len(encoding):.4f}; oov: []"
        )
        for i in range(len(text)):
            if text[i] != decoding[i]:
                # print(f"text[{i}] = {str(bytes(text[i:], 'utf-8'))}\n"
                #       f"decoding[{i}] = {str(bytes(decoding[i:], 'utf-8'))}")
                print(
                    f"text[{i}] = {json.dumps(text[i:], ensure_ascii=False)}, \n"
                    f"decoding[{i}] = {json.dumps(decoding[i:], ensure_ascii=False)}"
                )

                break


for config in all_tokenizer_config:
    # if "xlm-roberta-base" in config.name:
    # if "chatglm3-6b" in config.name:
    # if "bert-base-japanese" in config.name:
    # if "moss" in config.name:
    # if "byt5" in config.name:
    if "baichuan" in config.name_or_path:
        # if "CharacterGLM-6B" in config.name:
        # if "fastchat-t5" in config.name:  # raises pyo3_runtime.PanicException: AddedVocabulary bad split
        # if True:
        # test_unk(config)
        test_lossless(config)
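A standalone sketch (not part of this commit) of the lossless round-trip check that test_lossless() performs, using a public checkpoint as an assumed example model instead of this repo's tokenizer configs: encode, decode, and see whether the original string survives.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")

sample = "hello; Замглавы управления развития; 🦙"
ids = tokenizer.encode(sample, add_special_tokens=False)
decoded = tokenizer.decode(ids)
print("lossless:", sample in decoded)
print("unk_ratio:", sum(i == tokenizer.unk_token_id for i in ids) / len(ids))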
utils/text_util.py
ADDED
@@ -0,0 +1,47 @@
"""
char_
"""


def detect_lang_from_unicode():
    pass


def is_digit_char(uchar):
    return uchar in "0123456789"


def contains_digit(text):
    return any(is_digit_char(ch) for ch in text)


def is_all_digit(text):
    return all(is_digit_char(char) for char in text)


def get_digit_count(text):
    digit_count = 0
    for char in text:
        if char in "0123456789":
            digit_count += 1
    return digit_count


def has_space(text):
    pass


def is_all_space(text):
    pass


def get_space_count(text):
    space_count = 0
    for char in text:
        if len(char.strip()) == 0:
            space_count += 1
    return space_count
vocab.py
ADDED
@@ -0,0 +1,754 @@
from dataclasses import dataclass, field
from enum import Enum, auto
from typing import Any, Dict

import tiktoken
from transformers import AutoTokenizer
from utils.log_util import logger

"""Interface:
# https://github.com/huggingface/transformers/blob/main/src/transformers/tokenization_utils_base.py

tokenizer.encode -> List[int]: Converts a string to a sequence of ids (integer)
tokenizer.decode
tokenizer.convert_tokens_to_string  # gpt4 does not have this method
tokenizer.convert_ids_to_tokens
tokenizer.tokenize -> List[str]: Converts a string into a sequence of tokens ->


tokenizer.parent = ""
tokenizer.vocab_size
tokenizer.get_vocab()  # gpt-neox-20b, llama
tokenizer.type = TokenizerType.ByteBPE.name
tokenizer.implementation = TokenizerImpl.SentencePiece.name  # https://github.com/facebookresearch/llama/blob/main/llama/tokenizer.py
"HFGPT2Tokenizer", "HFTokenizer", "GPT2BPETokenizer", "CharLevelTokenizer", "TiktokenTokenizer", "SPMTokenizer", https://github.com/EleutherAI/gpt-neox/blob/main/tools/preprocess_data.py


tokenizer.comments = "split all numbers into individual digits, " \
                     "and fallback to bytes to decompose unknown UTF-8 characters"

tokenizer.all_special_tokens  # baichuan
tokenizer.special_tokens_set  # gpt3.5_turbo
tokenizer.special_tokens_map
"""


class TokenizerImpl(Enum):
    """
    - https://github.com/huggingface/tokenizers/blob/main/bindings/python/py_src/tokenizers/implementations/__init__.py
    - https://huggingface.co/docs/transformers/tokenizer_summary
    - https://github.com/EleutherAI/gpt-neox/blob/main/megatron/tokenizer/tokenizer.py

    ## google/BertTokenizer
    - https://github.com/huggingface/tokenizers/blob/main/bindings/python/py_src/tokenizers/implementations/bert_wordpiece.py
    - characteristics
        - algorithm: BERT's encoder is BPE-WordPiece, splitting words into prefixed subword units (e.g. ## in BERT)
        - vocabulary: tokens starting with ## denote subwords
        - Chinese is tokenized at the character level
        - English uses WordPiece




    ## google/sentencepiece
    - https://github.com/google/sentencepiece/blob/3863f7648e5d8edb571ac592f3ac4f5f0695275a/src/sentencepiece_model.proto#L48
    - supports sentencepiece and wordpiece
        - does sentencepiece have byte-bpe?
        - UNIGRAM = 1;  // Unigram language model with dynamic algorithm
        - BPE = 2;  // Byte Pair Encoding
        - WORD = 3;  // Delimitered by whitespace.
        - CHAR = 4;  // tokenizes into character sequence
    - wordpiece
    - characteristics:
        - training: spm_train --model_type unigram/bpe/char/word
        - special symbol: Ġ
        - files: *.sp_model or *.model (optional .vocab file), "spm" for short (other formats such as tokenizer.json exist for hf_tokenizer compatibility)
        - implementation:
            - dependency: protobuf
            - training: `import sentencepiece as spm; spm.SentencePieceTrainer.train` or `spm_train`
            - loading: `import sentencepiece as spm; spm.SentencePieceProcessor().Load(vocab_file)`
            - methods: the object is a SentencePieceProcessor, sp_model.id_to_piece; there are tokenizer.json tokenizer.model,
            - tokenization:
                - pre_tokenizers.ByteLevel(add_prefix_space=True, use_regex=False)
        - vocabulary: contains the character ▁ (U+2581), which marks a space or sentence start.
    - examples: google-t5, llama, baichuan, orion,
        - llama: tokenizer.json (contains model.vocab and model.merges), tokenizer.model
        - grok: originally a .model file, later converted to tokenizer.json
        - google-t5: tokenizer.json, spiece.model
        - Skywork-13B-Math: tokenizer.model
        - xlm_roberta: sentencepiece.bpe.model
    - GPT2Tokenizer
        - tokenizer.json, vocab.json, merges.txt (https://huggingface.co/openai-community/gpt2)
        - vocab.bpe, encoder.json, dict.txt (fairseq version, rarely used, can be ignored)



    ## thu/icetk
    - icetk: a fork of sentencepiece that adds an image_tokenizer.
        - glm, chatglm1, chatglm2

    ## huggingface/tokenizers
    - https://github.com/huggingface/tokenizers
    - VS sentencepiece
        - supports sentencepiece
            - .model converted to (merges.txt + vocab.json) or tokenizer.json
                - https://github.com/huggingface/tokenizers/blob/main/bindings/python/scripts/sentencepiece_extractor.py
            - loads merges.txt, vocab.json
            - SentencePieceBPETokenizer https://github.com/huggingface/tokenizers/blob/v0.19.1/bindings/python/py_src/tokenizers/implementations/sentencepiece_bpe.py#L10
        - on top of sentencepiece, hf_tokenizer supports pre-tokenization regexes, handles tabs and newlines better, and supports special tokens
    - types: supports BBPE, WordPiece or Unigram
    - characteristics:
        - files: tokenizer.json (contains the content of the next two files), merges.txt, vocab.json
        - added_tokens do not necessarily exist in the vocab.
    - implementation:
        - training: `from tokenizers.trainers import BpeTrainer, UnigramTrainer, WordLevelTrainer, WordPieceTrainer`
        - loading:
        - methods: .model.from_file .model.save .model.token_to_id .model.tokenize
        - .model is of type tokenizer.models.BPE
        - vocabulary entries start with Ġ "\u0120"
    - advantages
        -
    - examples: gpt2, gpt_neox_20b, moss, bloom, qwen2
    - advantages: compared to sentencepiece,
        - ss

    ## openai/tiktoken
    - characteristics: a space is just a space,
    - examples: gpt3.5 gpt4, qwen,
    """

    """ Algorithm taxonomy  https://www.huaxiaozhuan.com/%E5%B7%A5%E5%85%B7/huggingface_transformer/chapters/1_tokenizer.html
    - word-based tokenizer:
    - char-based tokenizer:
    - subword-based tokenizer
        - BPE
            - byte-bpe: the base vocabulary has 256 entries
        - WordPiece:
            - unlike BPE, WordPiece only keeps the final vocabulary, not the learned merge rules
        - Unigram
        - SentencePiece

    """

    # Classification: https://github.com/huggingface/tokenizers/blob/main/bindings/python/py_src/tokenizers/implementations/
    BertTokenizer = "wordpiece.BertTokenizer"
    JapaneseTokenizer = (
        "wordpiece.MecabTokenizer",
        "https://github.com/polm/fugashi",
    )  # common Japanese packages: ipadic, fugashi,
    ByteLevelBPETokenizer = "byte_level_bpe"  # BBPE
    SentencePieceBPETokenizer = "sentencepiece_bpe"

    # Classification

    # SentencePiece(BPE)
    SentencePiece = auto()  # sentencepiece.bpe, sentencepiece.unigram, sentencepiece.char, sentencepiece.word,
    byte_level_bpe = auto()
    # HFTokenizer = auto()  # , supports
    TikToken = auto()
    # subword-nmt
    # WordPiece


# load_vocab_with_SPECIAL_TOKEN = True  # Leaving special tokens out makes the vocab size wrong and the overlap_token computation inconsistent.


@dataclass
class TokenizerConfig:
    """
    https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/blob/main/src/leaderboard/read_evals.py
    """

    name_or_path: str  # org/model (path on hub), as unique id
    name_display: str = None  #
    impl: TokenizerImpl = None  # implementation, tokenizer_class/type
    org: str = None
    link: str = None  # http://**
    desc: str = None  # description
    meta: str = None
    level: str = None  # char-level, word-level, byte-level
    lang: str = None
    init_kwargs: Dict[str, Any] = field(
        default_factory=dict,
    )

    def __post_init__(self):
        if self.link is None:
            self.link = "https://huggingface.co/" + self.name_or_path  # TODO + revision
        if self.name_display is None:
            self.name_display = self.name_or_path

    @classmethod
    def init_from_json_file(cls, json_filepath: str) -> "TokenizerConfig":
        pass

    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return self.__dict__ == other.__dict__
        else:
            return False

    def __hash__(self):
        return hash(self.name_or_path)


# TODO: append link and description to the end of dropdown button.
#       Add tokenizer_class/type, comments
_all_tokenizer_config = [
    # bert style tokenizers
    TokenizerConfig(
        "google-bert/bert-base-cased",
        impl=TokenizerImpl.BertTokenizer,
        org="Google",
        desc="first add whitespace around any CJK character, then perform wordpiece tokenization.",
    ),
    TokenizerConfig(
        "google-bert/bert-base-uncased",
        impl=TokenizerImpl.BertTokenizer,
        org="Google",
        desc="first add whitespace around any CJK character, then perform wordpiece tokenization.",
    ),
    TokenizerConfig(
        "google-bert/bert-base-chinese",
        impl=TokenizerImpl.BertTokenizer,
        org="Google",
        desc="first add whitespace around any CJK character, then perform wordpiece tokenization.",
    ),
    TokenizerConfig(
        "google-bert/bert-base-german-cased",
        impl=TokenizerImpl.BertTokenizer,
        org="Google",
    ),
    TokenizerConfig(
        "dbmdz/bert-base-german-uncased", impl=TokenizerImpl.BertTokenizer, org="dbmdz"
    ),
    TokenizerConfig(
        "asafaya/bert-base-arabic", impl=TokenizerImpl.BertTokenizer, org="-"
    ),
    TokenizerConfig(
        "google-bert/bert-base-multilingual-uncased",
        impl=TokenizerImpl.BertTokenizer,
        org="Google",
    ),
    TokenizerConfig(
        "google-bert/bert-base-multilingual-cased",
        impl=TokenizerImpl.BertTokenizer,
        org="Google",
    ),
    TokenizerConfig(
        "tohoku-nlp/bert-base-japanese",
        impl=TokenizerImpl.BertTokenizer,
        org="Tohoku",
        desc="The texts are first tokenized by MeCab morphological parser with the IPA dictionary, "
        "then split into subwords by the WordPiece algorithm.",
    ),
    TokenizerConfig(
        "clue/roberta_chinese_clue_tiny",
        name_display="clue/roberta-chinese-clue",
        impl=TokenizerImpl.BertTokenizer,
        org="CLUE",
        init_kwargs={"revision": "refs/pr/1"},
        desc="",
        meta="traditional Chinese characters removed, https://github.com/CLUEbenchmark/CLUEPretrainedModels/blob/master/README.md",
    ),
    TokenizerConfig(
        "eson/kplug-base-encoder",
        name_display="eson/kplug",
        impl=TokenizerImpl.BertTokenizer,
        org="JD",
    ),
    TokenizerConfig(
        "ckiplab/gpt2-base-chinese", impl=TokenizerImpl.BertTokenizer, org="SINICA"
    ),  # Academia Sinica, Taiwan
    # WoBERT https://kexue.fm/archives/7758
    # WoBERT Plus https://github.com/ZhuiyiTechnology/WoBERT
    # gpt2 style tokenizers
    TokenizerConfig(
        "openai-community/gpt2", impl=TokenizerImpl.SentencePiece, org="OpenAI"
    ),
    # byte-level BPE; no byte tokens — is it unicode-level?
    TokenizerConfig(
        "ClassCat/gpt2-base-french", impl=TokenizerImpl.SentencePiece, org="ClassCat"
    ),
    TokenizerConfig(
        "ClassCat/gpt2-base-spanish", impl=TokenizerImpl.SentencePiece, org="ClassCat"
    ),
    TokenizerConfig(
        "fnlp/moss-moon-003-sft",
        impl=TokenizerImpl.SentencePiece,
        init_kwargs={"revision": "refs/pr/6"},
        org="Fudan",
        desc="This tokenizer has been trained to treat spaces like parts of the tokens "
        "(a bit like sentencepiece) so a word will be encoded differently whether "
        "it is at the beginning of the sentence (without space) or not",
        meta="extends the gpt2 vocabulary with 50k Chinese tokens",
    ),
    TokenizerConfig(
        "bigscience/bloom",
        impl=TokenizerImpl.SentencePiece,
        org="BigScience",
        meta="better Chinese support than the gpt_neox vocabulary.",
    ),
    # ("bloomz_6b4_zh",
    # ("BelleGroup/BELLE-7B-2M",  # model and vocabulary are both based on bloom
    #
    TokenizerConfig(
        "EleutherAI/gpt-neox-20b", impl=TokenizerImpl.SentencePiece, org="EleutherAI"
    ),  # 50k
    TokenizerConfig(
        "cyberagent/open-calm-7b", impl=TokenizerImpl.SentencePiece, org="CyberAgent"
    ),  # GPTNeoXTokenizer
    TokenizerConfig(
        "abeja/gpt-neox-japanese-2.7b", impl=TokenizerImpl.SentencePiece, org="ABEJA"
    ),
    TokenizerConfig(
        "rinna/bilingual-gpt-neox-4b",
        impl=TokenizerImpl.SentencePiece,
        org="ABEJA",
        lang="en/ja",
    ),
    TokenizerConfig(
        "Qwen/Qwen1.5-14B", impl=TokenizerImpl.SentencePiece, org="Alibaba"
    ),  # 150k tokens, a bit slow
    TokenizerConfig(
        "Qwen/Qwen1.5-110B", impl=TokenizerImpl.SentencePiece, org="Alibaba"
    ),
    TokenizerConfig(
        "Qwen/Qwen1.5-1.8B", impl=TokenizerImpl.SentencePiece, org="Alibaba"
    ),
    TokenizerConfig("Qwen/Qwen2-0.5B", impl=TokenizerImpl.SentencePiece, org="Alibaba"),
    TokenizerConfig("Qwen/Qwen2-72B", impl=TokenizerImpl.SentencePiece, org="Alibaba"),
    TokenizerConfig(
        "Qwen/Qwen2.5-0.5B", impl=TokenizerImpl.SentencePiece, org="Alibaba"
    ),
    TokenizerConfig(
        "Qwen/Qwen2.5-72B", impl=TokenizerImpl.SentencePiece, org="Alibaba"
    ),
    TokenizerConfig(
        "HuggingFaceH4/starchat-alpha", impl=TokenizerImpl.SentencePiece, org="-"
    ),
    ####### google/sentencepiece tokenizer:
    # T5 llama internlm
    TokenizerConfig(
        "google-t5/t5-large",
        name_display="google-t5/t5",
        impl=TokenizerImpl.SentencePiece,
        org="Google",
    ),
    # t5_small, t5_base, t5_large, flan_t5_base,
    # ("t5_base", "", "sentencepiece"),
    # TokenizerConfig("google/flan-t5-base", impl=TokenizerImpl.SentencePiece, ),
    TokenizerConfig(
        "lmsys/fastchat-t5-3b-v1.0",
        impl=TokenizerImpl.SentencePiece,
        org="LMSYS",
        init_kwargs={
            "use_fast": False
        },  # works around pyo3_runtime.PanicException: AddedVocabulary bad split
    ),
    TokenizerConfig(
        "CohereForAI/aya-101", org="Cohere For AI"
    ),  # "tokenizer_class": "T5Tokenizer",
    TokenizerConfig(
        "ClueAI/ChatYuan-large-v2", impl=TokenizerImpl.SentencePiece, org="CLUE"
    ),
    TokenizerConfig(
        "ClueAI/PromptCLUE-base", impl=TokenizerImpl.SentencePiece, org="CLUE"
    ),
    # byte-level BPE
    # 'single Chinese chars': 700, 'multi-char Chinese words': 0  meta-llama/Meta-Llama-3.1-405B
    #
    TokenizerConfig(
        "meta-llama/Llama-3.2-1B-Instruct", impl=TokenizerImpl.SentencePiece, org="Meta"
    ),
    TokenizerConfig(
        "meta-llama/Llama-3.2-3B-Instruct", impl=TokenizerImpl.SentencePiece, org="Meta"
    ),
    # TokenizerConfig("meta-llama/Llama-3.3-70B-Instruct", impl=TokenizerImpl.SentencePiece,
    #                 org="Meta"),
    TokenizerConfig(
        "meta-llama/Meta-Llama-3.1-405B", impl=TokenizerImpl.SentencePiece, org="Meta"
    ),
    TokenizerConfig(
        "NousResearch/Hermes-3-Llama-3.1-405B",
        impl=TokenizerImpl.SentencePiece,
        org="NousResearch",
    ),
    TokenizerConfig(
        "gradientai/Llama-3-8B-Instruct-Gradient-1048k",
        name_display="Meta/llama3",
        impl=TokenizerImpl.SentencePiece,
        org="Meta",
        desc="llama split all numbers into individual digits, and fallback to bytes to decompose unknown UTF-8 characters",
    ),
    TokenizerConfig(
        "NousResearch/Llama-2-7b-chat-hf",
        name_display="Meta/llama2",
        impl=TokenizerImpl.SentencePiece,
        org="Meta",
    ),
    TokenizerConfig(
        "huggyllama/llama-7b",
        name_display="Meta/llama",
        impl=TokenizerImpl.SentencePiece,
        org="Meta",
    ),
    TokenizerConfig(
        "hpcai-tech/grok-1",
        name_display="xai-org/grok-1",
        impl=TokenizerImpl.SentencePiece,
        org="xAI",
    ),
    # converted from a .model file
    TokenizerConfig(
        "hfl/chinese-llama-lora-7b",
        impl=TokenizerImpl.SentencePiece,
        org="-",
        meta="adds 20k Chinese tokens to the original LLaMA vocabulary, extending the Chinese vocabulary and improving Chinese encoding/decoding efficiency",
    ),
    #
    TokenizerConfig(
        "hfl/chinese-llama-2-7b",
        impl=TokenizerImpl.SentencePiece,
        org="-",
        meta="redesigned vocabulary (size: 55296) with further improved coverage of Chinese characters and words",
    ),  #
    TokenizerConfig(
        "hfl/llama-3-chinese-8b", impl=TokenizerImpl.SentencePiece, org="-"
    ),
    TokenizerConfig(
        "hfl/chinese-alpaca-lora-7b", impl=TokenizerImpl.SentencePiece, org="-"
    ),
    # The Chinese Alpaca models are further instruction-tuned on top of the Chinese LLaMA models above. "Has one more `[PAD]` token than the chinese_llama vocabulary; do not mix them up."
    #
    # ("belle_llama_ext_7b",
    # ("alpaca_7b",
    TokenizerConfig(
        "baichuan-inc/Baichuan-7B",
        name_display="baichuan-inc/baichuan",
        impl=TokenizerImpl.SentencePiece,
        level="byte-level",
        org="Baichuan",
    ),
    TokenizerConfig(
        "baichuan-inc/Baichuan2-7B-Chat",
        name_display="baichuan-inc/baichuan2",
        impl=TokenizerImpl.SentencePiece,
        org="Baichuan",
        desc="expand the vocabulary size from 64000 in Baichuan1 to 125696",
    ),
    TokenizerConfig(
        "internlm/internlm-chat-7b",
        impl=TokenizerImpl.SentencePiece,
        org="Shanghai AI Lab",
    ),
    # Shanghai AI Lab + SenseTime
    TokenizerConfig(
        "internlm/internlm2-chat-7b",
        impl=TokenizerImpl.SentencePiece,
        org="Shanghai AI Lab",
    ),
    TokenizerConfig(
        "internlm/internlm2-math-7b",
        impl=TokenizerImpl.SentencePiece,
        org="Shanghai AI Lab",
    ),
    TokenizerConfig(
        "internlm/internlm-xcomposer-7b",
        impl=TokenizerImpl.SentencePiece,
        org="Shanghai AI Lab",
    ),
    TokenizerConfig("tiiuae/falcon-7b", impl=TokenizerImpl.SentencePiece, org="TII"),
    TokenizerConfig("tiiuae/falcon-180b", impl=TokenizerImpl.SentencePiece, org="TII"),
    TokenizerConfig(
        "Skywork/Skywork-13B-base", impl=TokenizerImpl.SentencePiece, org="Kunlun"
    ),
    TokenizerConfig(
        "Skywork/Skywork-13B-Math", impl=TokenizerImpl.SentencePiece, org="Kunlun"
    ),  # file: tokenizer.model
    TokenizerConfig(
        "FacebookAI/xlm-roberta-base", impl=TokenizerImpl.SentencePiece, org="Facebook"
    ),
    # Why does this one's tokenizer.json have no merges? Why does the vocab contain probability values?
    # "goat",
    # ##### glm family
    # "glm_chinese",),
    TokenizerConfig(
        "THUDM/chatglm-6b",
        impl=TokenizerImpl.SentencePiece,
        org="Tsinghua",
        meta=f"num_image_tokens: {12}; num_image_tokens: {34} ",
        init_kwargs={"revision": "refs/pr/100"},
    ),
    TokenizerConfig(
        "THUDM/chatglm2-6b",
        impl=TokenizerImpl.SentencePiece,
        org="Tsinghua",
    ),
    TokenizerConfig(
        "THUDM/chatglm3-6b",
        impl=TokenizerImpl.SentencePiece,
        org="Tsinghua",
    ),
    TokenizerConfig(
        "thu-coai/CharacterGLM-6B",
        impl=TokenizerImpl.SentencePiece,
        org="Tsinghua",
    ),
    # tiktoken family
    TokenizerConfig(
        "openai/text-davinci-003",
        impl=TokenizerImpl.TikToken,
        org="OpenAI",
        link="https://github.com/openai/tiktoken",
    ),
    #
    TokenizerConfig(
        "openai/code-davinci-002",
        impl=TokenizerImpl.TikToken,
        org="OpenAI",
        link="https://github.com/openai/tiktoken",
    ),
    TokenizerConfig(
        "openai/gpt-3.5-turbo",
        impl=TokenizerImpl.TikToken,
        org="OpenAI",
        link="https://github.com/openai/tiktoken",
        desc="tiktoken is a fast BPE tokeniser for use with OpenAI's models. There are 16 tokens KeyError",
    ),
    TokenizerConfig(
        "openai/gpt-4",
        impl=TokenizerImpl.TikToken,
        org="OpenAI",
        link="https://github.com/openai/tiktoken",
    ),
    TokenizerConfig(
        "openai/gpt-4o",
        impl=TokenizerImpl.TikToken,
        org="OpenAI",
        link="https://github.com/openai/tiktoken",
    ),
    TokenizerConfig(
        "Qwen/Qwen-7B-Chat",
        name_display="Qwen/Qwen",
        impl=TokenizerImpl.TikToken,
        org="Alibaba",
        init_kwargs={"revision": "refs/pr/56"},
        meta="based on the gpt4 vocabulary, with about 100 multi-digit number tokens removed and 10,000 Chinese word tokens added; special_token splitting is also optimized",
    ),
    # https://huggingface.co/Qwen/Qwen-7B-Chat#%E6%A8%A1%E5%9E%8B%E7%BB%86%E8%8A%82%EF%BC%88model%EF%BC%89
    # The vocabulary is based on the cl100k_base BPE vocabulary used by GPT-4 and is optimized for Chinese and multilingual text. On top of efficient encoding of Chinese, English and code,
    # it is friendlier to several other languages, so users can strengthen those languages without extending the vocabulary. Numbers are split into individual digits.
    # TokenizerConfig("Qwen/Qwen-72B-Chat", impl=TokenizerImpl.TikToken),
    # unclassified
    # ("amber", ""),
    TokenizerConfig("LLM360/CrystalCoder", org="MBZUAI"),
    TokenizerConfig("apple/DCLM-7B", org="Apple"),
    TokenizerConfig("mistralai/Mistral-7B-v0.1", org="Mistral"),
    TokenizerConfig("mistralai/Mixtral-8x7B-v0.1", org="Mistral"),
    TokenizerConfig("mistralai/Mistral-Large-Instruct-2407", org="Mistral"),
    TokenizerConfig("mistralai/Mistral-Nemo-Instruct-2407", org="Mistral"),
    TokenizerConfig("paust/pko-t5-large", org="PAUST"),
    TokenizerConfig("01-ai/Yi-6B", org="Yi"),
    TokenizerConfig("01-ai/Yi-34B", org="Yi"),
    TokenizerConfig("01-ai/Yi-VL-34B", org="Yi"),
    TokenizerConfig("01-ai/Yi-1.5-34B", org="Yi"),
    TokenizerConfig("OrionStarAI/Orion-14B-Chat", org="OrionStar"),
    TokenizerConfig("microsoft/phi-1", org="Microsoft"),
    TokenizerConfig("microsoft/phi-2", org="Microsoft"),
    TokenizerConfig(
        "microsoft/Phi-3-mini-4k-instruct", org="Microsoft", meta="i.e. the llama vocab"
    ),
    TokenizerConfig("Upstage/SOLAR-10.7B-v1.0", org="-"),
    TokenizerConfig("google/mobilebert-uncased", org="Google"),
    # ("google/mobilenet_v2_1.0_224",),  # error
    TokenizerConfig("google/switch-c-2048", org="Google"),
    TokenizerConfig("google/byt5-small", org="Google"),
    TokenizerConfig("google/mt5-large", org="Google"),
    TokenizerConfig("WizardLM/WizardCoder-Python-7B-V1.0", org="Microsoft"),
    TokenizerConfig("WizardLM/WizardCoder-15B-V1.0", org="Microsoft"),
    TokenizerConfig("WizardLM/WizardLM-7B-V1.0", org="Microsoft"),
    TokenizerConfig("WizardLM/WizardMath-70B-V1.0", org="Microsoft"),
|
572 |
+
TokenizerConfig("TigerResearch/tigerbot-70b-chat-v4-4k", org="Tigerobo"),
|
573 |
+
TokenizerConfig("TigerResearch/tigerbot-13b-chat-v2", org="Tigerobo"),
|
574 |
+
TokenizerConfig("deepseek-ai/deepseek-coder-33b-instruct", org="DeepSeek"),
|
575 |
+
TokenizerConfig("deepseek-ai/deepseek-llm-7b-base", org="DeepSeek"),
|
576 |
+
TokenizerConfig("deepseek-ai/DeepSeek-V2", org="DeepSeek"),
|
577 |
+
TokenizerConfig("deepseek-ai/DeepSeek-V3", org="DeepSeek"),
|
578 |
+
TokenizerConfig(
|
579 |
+
"deepseek-ai/DeepSeek-R1", org="DeepSeek"
|
580 |
+
), # 在llama3的词典上,增加了一些中文token,删掉了一部分token
|
581 |
+
TokenizerConfig("deepseek-ai/DeepSeek-R1-Zero", org="DeepSeek"),
|
582 |
+
TokenizerConfig("deepseek-ai/DeepSeek-R1-Distill-Llama-70B", org="DeepSeek"),
|
583 |
+
TokenizerConfig("google/gemma-7b", org="Google"),
|
584 |
+
TokenizerConfig("google/gemma-2-9b", org="Google"),
|
585 |
+
TokenizerConfig("allenai/OLMo-7B-hf", org="Allen AI"),
|
586 |
+
TokenizerConfig("HuggingFaceH4/zephyr-7b-beta", org="HuggingFace"),
|
587 |
+
TokenizerConfig("ai21labs/Jamba-v0.1", org="AI21"),
|
588 |
+
TokenizerConfig("databricks/dbrx-instruct", org="Databricks"),
|
589 |
+
TokenizerConfig("MiniMaxAI/MiniMax-Text-01", org="MiniMax"),
|
590 |
+
# TokenizerConfig("nvidia/Nemotron-4-340B-Instruct", org="Nvidia"),
|
591 |
+
# ("claude",),
|
592 |
+
# https://github.com/Duxiaoman-DI/XuanYuan
|
593 |
+
# https://huggingface.co/apple/OpenELM-3B-Instruct https://huggingface.co/apple/OpenELM-3B
|
594 |
+
]
|
595 |
+
|
596 |
+
assert len(set([config.name_display for config in _all_tokenizer_config])) == len(
    _all_tokenizer_config
)
assert len(set([config.name_or_path for config in _all_tokenizer_config])) == len(
    _all_tokenizer_config
)
assert len(
    set([config.name_or_path.split("/")[-1] for config in _all_tokenizer_config])
) == len(_all_tokenizer_config)


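# TokenizerFactory resolves a tokenizer from any of three name forms (the full repo id,
# the display name, or the bare name after the last "/"); the uniqueness assertions above
# keep these lookups unambiguous.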
class TokenizerFactory:
    def __init__(self):
        # self.all_tokenizer_configs = sorted(_all_tokenizer_config, key=lambda k: k.name_or_path)
        self.all_tokenizer_configs = sorted(
            _all_tokenizer_config, key=lambda k: k.name_display
        )
        self.all_tokenizer_names = [
            config.name_or_path for config in self.all_tokenizer_configs
        ]
        self.name_to_config_list = [
            {config.name_or_path: config for config in self.all_tokenizer_configs},
            {config.name_display: config for config in self.all_tokenizer_configs},
            {
                config.name_display.split("/")[-1]: config
                for config in self.all_tokenizer_configs
            },
        ]
        self.tokenizer_cache = {}

    def get_tokenizer_config(self, tokenizer_name: str) -> TokenizerConfig:
        for name_to_config in self.name_to_config_list:
            if tokenizer_name in name_to_config:
                return name_to_config[tokenizer_name]
        return None

    def get_tokenizer(self, tokenizer_name: str):
        """
        Return the tokenizer for `tokenizer_name`, loading and caching it on first use.

        :param tokenizer_name: repo id, display name, or bare display name
        :return: the loaded tokenizer
        """
        tokenizer_config = self.get_tokenizer_config(tokenizer_name)

        # 1. load from cache
        if tokenizer_config in self.tokenizer_cache:
            return self.tokenizer_cache[tokenizer_config]

        # 2. load tokenizer
        tokenizer = self.load_tokenizer(tokenizer_config)

        self.tokenizer_cache[tokenizer_config] = tokenizer
        return tokenizer

    def get_name_with_hyperlink(self, tokenizer_name: str) -> str:
        def model_hyperlink(link, model_name):
            return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'

        tokenizer_config = self.get_tokenizer_config(tokenizer_name)
        return model_hyperlink(
            tokenizer_config.link, tokenizer_config.name_display.split("/")[-1]
        )

    def load_tokenizer(self, tokenizer_config):
        if tokenizer_config is None:
            raise ValueError("unknown tokenizer: no matching TokenizerConfig found")
        logger.info(f"loading tokenizer {tokenizer_config.name_or_path}")
        if (
            tokenizer_config.impl == TokenizerImpl.TikToken
            and "openai" in tokenizer_config.name_or_path
        ):
            tokenizer = tiktoken.encoding_for_model(
                tokenizer_config.name_or_path.replace("openai/", "")
            )
        else:
            tokenizer = AutoTokenizer.from_pretrained(
                tokenizer_config.name_or_path,
                trust_remote_code=True,
                **tokenizer_config.init_kwargs,
            )
        return tokenizer

    def add_config(
        self,
    ):
        pass

    def add_tokenizer(self, tokenizer_name):
        pass


tokenizer_factory = TokenizerFactory()
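# Illustrative usage (given the configs defined above): any of the three name forms
# resolves to the same cached tokenizer instance, e.g.
#   tokenizer_factory.get_tokenizer("Qwen/Qwen-7B-Chat")  # full repo id (name_or_path)
#   tokenizer_factory.get_tokenizer("Qwen/Qwen")          # display name
#   tokenizer_factory.get_tokenizer("Qwen")               # bare display name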


def add_tokenizer(tokenizer_name: str):
    """
    Register a tokenizer that is not yet listed in `_all_tokenizer_config`.

    :param tokenizer_name: repo id of the tokenizer to add
    :return: None
    """
    if tokenizer_factory.get_tokenizer_config(tokenizer_name) is not None:
        logger.info(f"{tokenizer_name} already exists")
    else:
        # add to config
        tokenizer_config = TokenizerConfig(tokenizer_name, org="-")

        try:
            # add to tokenizer
            tokenizer = tokenizer_factory.load_tokenizer(tokenizer_config)

            # refresh config list and cache
            tokenizer_factory.all_tokenizer_configs.append(tokenizer_config)
            tokenizer_factory.tokenizer_cache[tokenizer_config] = tokenizer
        except Exception as e:
            logger.error(e)


# class TokenizerType(Enum):
#
#     # BERTTokenizer
#     # depends on a single txt file
#
#
#     # https://github.com/EleutherAI/gpt-neox/blob/v2.0/megatron/tokenizer/tokenizer.py#L231
#     # depends on a single json file, Tokenizer.from_file(vocab_file)
#     # example: gpt-neox-20B
#     HFTokenizer = auto()
#
#     # depends on: model_file, sentencepiece.SentencePieceProcessor(model_file)
#     # example:
#     SentencePieceTokenizer = auto()
#
#
#     # depends on three files: vocab.json, merges.txt, special_tokens.txt
#     # source:
#     #   - https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/tokenizer/gpt2_tokenization.py#L92
#     # Byte-level BPE
#     GPT2BPETokenizer = auto()


if __name__ == "__main__":
    for tokenizer_config in tokenizer_factory.all_tokenizer_configs:
        if True:
            # if "t5" in tokenizer_config.name_or_path:
            tokenizer1 = tokenizer_factory.get_tokenizer(tokenizer_config.name_or_path)
            tokenizer2 = tokenizer_factory.get_tokenizer(tokenizer_config.name_display)
            tokenizer3 = tokenizer_factory.get_tokenizer(
                tokenizer_config.name_display.split("/")[-1]
            )
            assert tokenizer1 == tokenizer2 == tokenizer3
            print(tokenizer_config.name_or_path, len(tokenizer1))
|