gpantaz committed
Commit b4dc5cb · 1 Parent(s): 05a8ffa

Add application file
LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Athens NLP Summer School
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,12 +1 @@
1
- ---
2
- title: Test
3
- emoji: 📉
4
- colorFrom: blue
5
- colorTo: red
6
- sdk: gradio
7
- sdk_version: 5.34.2
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ # tokenization_playground
__pycache__/character_util.cpython-311.pyc ADDED
Binary file (11.9 kB)

__pycache__/playground_app.cpython-311.pyc ADDED
Binary file (5.94 kB)

__pycache__/playground_examples.cpython-311.pyc ADDED
Binary file (1.04 kB)

__pycache__/playground_util.cpython-311.pyc ADDED
Binary file (5.98 kB)

__pycache__/vocab.cpython-311.pyc ADDED
Binary file (27.5 kB)
app.py ADDED
@@ -0,0 +1,24 @@
1
+ import os
2
+
3
+ import gradio as gr
4
+ from huggingface_hub import login
5
+ from playground_app import demo as playground_tab
6
+
7
+ auth_token = os.environ.get("HF_TOKEN", None)
8
+ if auth_token:
9
+ login(token=auth_token)
10
+
11
+
12
+ title = """
13
+ <div align="center">
14
+ <span>Tokenization Playground</span>
15
+ </div>
16
+ """
17
+
18
+ with gr.Blocks() as demo:
19
+ _ = gr.HTML(f"<h1 style='text-align: center; margin-bottom: 1rem'>{title}</h1>")
20
+ _ = playground_tab.render()
21
+
22
+ if __name__ == "__main__":
23
+ # demo.launch()
24
+ demo.launch(share=True)
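
A quick way to exercise this entry point outside of the Space is sketched below; this is my own assumption, not part of the commit, and it presumes the packages in requirements.txt are installed and that HF_TOKEN is optional (only needed for gated tokenizers).

# Hypothetical local smoke test for app.py (not included in this commit).
import os

os.environ.setdefault("GRADIO_ANALYTICS_ENABLED", "False")  # keep the local run quiet

from app import demo  # app.py calls huggingface_hub.login() at import time when HF_TOKEN is set

if __name__ == "__main__":
    # Bind to localhost only; app.py itself launches with share=True.
    demo.launch(share=False, server_name="127.0.0.1", server_port=7860)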
character_util.py ADDED
@@ -0,0 +1,178 @@
1
+ import json
2
+ import os
3
+ from pathlib import Path
4
+ from typing import Any, Literal
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+ from utils.lang_util import detect_language_by_unicode, language_ranges
9
+ from utils.log_util import logger
10
+ from utils.text_util import contains_digit, get_space_count
11
+ from vocab import tokenizer_factory
12
+
13
+ CURRENT_DIR = Path(__file__).resolve().parent
14
+
15
+ cache = {}
16
+ default_columns = ["digit", "zh"]
17
+
18
+
19
+ def text_to_unicode(text: str) -> str:
20
+ """Convert text to unicode representation."""
21
+ return "".join(rf"\u{ord(character):04X}" for character in text)
22
+
23
+
24
+ def calculate_dist(token_lens: list[int]) -> str:
25
+ """Calculate the distribution of token lengths."""
26
+ if not token_lens:
27
+ return "-"
28
+ return f"{min(token_lens)},{round(np.median(token_lens))},{max(token_lens)}"
29
+
30
+
31
+ def iter_vocab(
32
+ tokenizer_name: str,
33
+ from_cache: bool = True,
34
+ cache_dir: str = "stats",
35
+ ) -> pd.DataFrame | dict:
36
+ """:param tokenizer_name:
37
+ :param from_cache:
38
+ :param cache_dir:
39
+ :return:
40
+ """
41
+ tokenizer_config = tokenizer_factory.get_tokenizer_config(tokenizer_name)
42
+
43
+ cache_dir = os.path.join(CURRENT_DIR, cache_dir)
44
+ os.makedirs(cache_dir, exist_ok=True)
45
+
46
+ # load from cache
47
+ cache_path = os.path.join(cache_dir, "character_stats.json")
48
+ if not cache and os.path.exists(cache_path):
49
+ with open(cache_path, encoding="utf-8") as f_tmp:
50
+ cache.update(json.load(f_tmp))
51
+ if from_cache and tokenizer_name in cache:
52
+ # logger.info(f"load {tokenizer_config.name_or_path} from cache")
53
+ return cache[tokenizer_name]
54
+
55
+ tokenizer = tokenizer_factory.get_tokenizer(tokenizer_name)
56
+
57
+ tokens_by_lang = {lang[1]: [] for lang in language_ranges}
58
+ digit_tokens = []
59
+ space_tokens = []
60
+ byte_tokens = []
61
+
62
+ buffer = []
63
+ for token_id in range(tokenizer.vocab_size):
64
+ # for token_id in tokenizer.get_vocab():
65
+ # for token_id in range(len(tokenizer)):
66
+ decode_str = tokenizer.decode([token_id], skip_special_tokens=False)
67
+ token = tokenizer.convert_ids_to_tokens([token_id], skip_special_tokens=False)[0]
68
+ tags = []
69
+ if token is None: # some vocabularies have unused (non-contiguous) ids
70
+ continue
71
+ if isinstance(token, bytes):
72
+ token = token.decode("utf-8", errors="ignore")
73
+
74
+ if hasattr(tokenizer, "sp_model") and tokenizer.sp_model.is_byte(token_id):
75
+ tags.append("is_byte")
76
+ byte_tokens.append(token)
77
+
78
+ language_tags = detect_language_by_unicode(decode_str)
79
+ for language in language_tags:
80
+ tokens_by_lang[language[1]].append(decode_str)
81
+
82
+ if contains_digit(decode_str):
83
+ tags.append("digit")
84
+ digit_tokens.append(decode_str)
85
+
86
+ space_count = get_space_count(decode_str)
87
+ if space_count > 0:
88
+ space_tokens.append(decode_str)
89
+
90
+ buffer.append(
91
+ json.dumps(
92
+ {
93
+ "id": token_id,
94
+ "token": token,
95
+ "token_decode": decode_str,
96
+ "token_dumps": json.dumps(token),
97
+ "token_unicode": text_to_unicode(token),
98
+ "token_len": len(decode_str),
99
+ },
100
+ ensure_ascii=False,
101
+ )
102
+ + "\n"
103
+ )
104
+
105
+ result = {
106
+ "tokenizer": tokenizer_factory.get_name_with_hyperlink(tokenizer_name),
107
+ "organization": tokenizer_config.org,
108
+ "vocab_size": len(tokenizer),
109
+ "num(digit)": len(digit_tokens),
110
+ "len(digit)": calculate_dist([len(token) for token in digit_tokens]),
111
+ "num(space)": len(space_tokens),
112
+ "len(space)": calculate_dist([len(token) for token in space_tokens]),
113
+ }
114
+
115
+ for lang, tokens in tokens_by_lang.items():
116
+ result[f"num({lang})"] = len(tokens)
117
+ result["len(" + lang + ")"] = calculate_dist([len(token) for token in tokens])
118
+
119
+ out_path = os.path.join(
120
+ cache_dir, f"iter_vocab/{tokenizer_name.replace('/', '_')}.vocab.jsonl"
121
+ )
122
+ os.makedirs(os.path.dirname(out_path), exist_ok=True)
+ with open(out_path, "w", encoding="utf-8") as f_out:
123
+ for line in buffer:
124
+ f_out.write(line)
125
+ len_before = len(cache)
126
+ cache[tokenizer_name] = result
127
+ len_after = len(cache)
128
+ logger.info(f"saving {tokenizer_name} to memory and file cache: {len_before}->{len_after}")
129
+ with open(cache_path, "w", encoding="utf-8") as f_out:
130
+ f_out.write(json.dumps(cache, ensure_ascii=False, indent=2))
131
+ return result
132
+
133
+
134
+ def to_dataframe(stats: dict[str, Any], columns: list[str]) -> pd.DataFrame:
135
+ table = []
136
+ for stat in stats.values():
137
+ filtered_stat = {}
138
+ for k, v in stat.items():
139
+ if not k.startswith("num") and not k.startswith("len"):
140
+ filtered_stat[k] = v
141
+ if any(column in k for column in columns):
142
+ k = k.replace("ja-kana", "kana")
143
+ filtered_stat[k] = v
144
+ table.append(filtered_stat)
145
+ return pd.DataFrame(table)
146
+
147
+
148
+ def get_character_table(
149
+ tokenizer_filter: str | None = None,
150
+ columns: list | None = None,
151
+ return_type: Literal["dict", "dataframe"] | None = "dataframe",
152
+ ) -> pd.DataFrame | dict:
153
+ logger.info(f"columns: {columns}, tokenizer_filter: {tokenizer_filter}")
154
+ stats = {}
155
+ if columns is None:
156
+ columns = default_columns
157
+ if tokenizer_filter is not None:
158
+ tokenizer_names = [
159
+ tokenizer_config.name_or_path
160
+ for tokenizer_config in tokenizer_factory.all_tokenizer_configs
161
+ if tokenizer_filter.lower() in tokenizer_config.name_or_path.lower()
162
+ ]
163
+ else:
164
+ tokenizer_names = tokenizer_factory.all_tokenizer_names
165
+
166
+ for tokenizer_name in tokenizer_names:
167
+ stat = iter_vocab(tokenizer_name)
168
+ stats[tokenizer_name] = stat
169
+
170
+ if return_type == "dataframe":
171
+ stats = to_dataframe(stats, columns)
172
+ return stats
173
+
174
+
175
+ if __name__ == "__main__":
176
+ # aa = get_character_table(tokenizer_filter="baichuan")
177
+ df = get_character_table()
178
+ logger.info(f"\n{df.to_markdown(index=False)}")
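
character_util.py is not wired into app.py, but it can be driven directly; a hedged usage sketch (assuming the requirements are installed and the selected tokenizers can be downloaded):

# Hypothetical use of character_util.get_character_table (not part of the commit).
from character_util import get_character_table

# Restrict to tokenizers whose name contains "qwen" and keep the digit / Chinese columns.
df = get_character_table(tokenizer_filter="qwen", columns=["digit", "zh"])
print(df.to_markdown(index=False))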
playground_app.py ADDED
@@ -0,0 +1,91 @@
1
+ import gradio as gr
2
+ from playground_examples import examples
3
+ from playground_util import on_load, tokenize, tokenize_pair
4
+ from vocab import tokenizer_factory
5
+
6
+ get_window_url_params = """
7
+ function(url_params) {
8
+ const params = new URLSearchParams(window.location.search);
9
+ url_params = JSON.stringify(Object.fromEntries(params));
10
+ return url_params;
11
+ }
12
+ """
13
+
14
+ all_tokenizer_name = [
15
+ (config.name_display, config.name_or_path)
16
+ for config in tokenizer_factory.all_tokenizer_configs
17
+ ]
18
+
19
+ with gr.Blocks() as demo:
20
+ with gr.Row():
21
+ gr.Markdown("## Input Text")
22
+ dropdown_examples = gr.Dropdown(
23
+ sorted(examples.keys()),
24
+ value="Examples",
25
+ type="index",
26
+ allow_custom_value=True,
27
+ show_label=False,
28
+ container=False,
29
+ scale=0,
30
+ elem_classes="example-style",
31
+ )
32
+ user_input = gr.Textbox(
33
+ label="Input Text",
34
+ lines=5,
35
+ show_label=False,
36
+ )
37
+
38
+ with gr.Row():
39
+ with gr.Column(scale=6), gr.Group():
40
+ tokenizer_name_1 = gr.Dropdown(all_tokenizer_name, label="Tokenizer 1")
41
+
42
+ with gr.Column(scale=6), gr.Group():
43
+ tokenizer_name_2 = gr.Dropdown(all_tokenizer_name, label="Tokenizer 2")
44
+
45
+ with gr.Row():
46
+ # the labels below are updated dynamically with the token count
47
+ with gr.Column():
48
+ output_text_1 = gr.Highlightedtext(show_legend=False, show_inline_category=False)
49
+ with gr.Column():
50
+ output_text_2 = gr.Highlightedtext(show_legend=False, show_inline_category=False)
51
+
52
+ with gr.Row():
53
+ output_table_1 = gr.Dataframe()
54
+ output_table_2 = gr.Dataframe()
55
+
56
+ tokenizer_name_1.change(
57
+ tokenize, [user_input, tokenizer_name_1], [output_text_1, output_table_1]
58
+ )
59
+
60
+ tokenizer_name_2.change(
61
+ tokenize, [user_input, tokenizer_name_2], [output_text_2, output_table_2]
62
+ )
63
+
64
+ user_input.change(
65
+ tokenize_pair,
66
+ [user_input, tokenizer_name_1, tokenizer_name_2],
67
+ [output_text_1, output_table_1, output_text_2, output_table_2],
68
+ show_api=False,
69
+ )
70
+
71
+ dropdown_examples.change(
72
+ lambda example_idx: (
73
+ examples[sorted(examples.keys())[example_idx]]["text"],
74
+ examples[sorted(examples.keys())[example_idx]]["tokenizer_1"],
75
+ examples[sorted(examples.keys())[example_idx]]["tokenizer_2"],
76
+ ),
77
+ dropdown_examples,
78
+ [user_input, tokenizer_name_1, tokenizer_name_2],
79
+ show_api=False,
80
+ )
81
+
82
+ demo.load(
83
+ fn=on_load,
84
+ inputs=[user_input],
85
+ outputs=[user_input, tokenizer_name_1, tokenizer_name_2],
86
+ js=get_window_url_params,
87
+ show_api=False,
88
+ )
89
+
90
+ if __name__ == "__main__":
91
+ demo.launch(share=True)
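
Because demo.load passes the window URL parameters through get_window_url_params into playground_util.on_load, a deployed Space can be pre-filled from the query string. The parameter names below come from on_load; the host is a placeholder and values must be URL-encoded:

https://<space-host>/?text=hello%20world&tokenizer1=openai/gpt-4o&tokenizer2=Qwen/Qwen2.5-72B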
playground_examples.py ADDED
@@ -0,0 +1,42 @@
1
+ default_user_input = """Replace this text in the input field to see how tokenization works."""
2
+ default_tokenizer_name_1 = "openai/gpt-4o"
3
+ default_tokenizer_name_2 = "Qwen/Qwen2.5-72B"
4
+
5
+
6
+ number_example = """127+677=804
7
+ 127 + 677 = 804\n
8
+ 1275+6773 = 8048
9
+ 1275 + 6773 = 8048"""
10
+
11
+ code_example = """for i in range(1, 101):
12
+ if i % 3 == 0 and i % 5 == 0:
13
+ print("FizzBuzz")
14
+ elif i % 3 == 0:
15
+ print("Fizz")
16
+ elif i % 5 == 0:
17
+ print("Buzz")
18
+ else:
19
+ print(i)
20
+ """
21
+
22
+ spelling_example = """How do you spell "accommodate"?
23
+ How many letters are in the word "accommodate"?
24
+ How many r's are in the word strawberry?"""
25
+
26
+ examples = {
27
+ "number": {
28
+ "text": number_example,
29
+ "tokenizer_1": default_tokenizer_name_1,
30
+ "tokenizer_2": default_tokenizer_name_2,
31
+ },
32
+ "code": {
33
+ "text": code_example,
34
+ "tokenizer_1": default_tokenizer_name_1,
35
+ "tokenizer_2": default_tokenizer_name_2,
36
+ },
37
+ "spelling": {
38
+ "text": spelling_example,
39
+ "tokenizer_1": default_tokenizer_name_1,
40
+ "tokenizer_2": default_tokenizer_name_2,
41
+ },
42
+ }
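
The examples dict is keyed by the dropdown label, and each entry supplies the input text plus the two tokenizer names. A sketch of one more entry following the same schema (the "multilingual" key and its text are hypothetical):

# Hypothetical extra example entry, same schema as the existing ones.
examples["multilingual"] = {
    "text": "Καλημέρα κόσμε\nこんにちは世界\nПривет, мир",
    "tokenizer_1": default_tokenizer_name_1,
    "tokenizer_2": default_tokenizer_name_2,
}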
playground_util.py ADDED
@@ -0,0 +1,107 @@
1
+ import json
2
+ from functools import lru_cache
3
+ from typing import Any
4
+
5
+ import gradio as gr
6
+ import pandas as pd
7
+ from playground_examples import (
8
+ default_tokenizer_name_1,
9
+ default_tokenizer_name_2,
10
+ default_user_input,
11
+ )
12
+ from utils.i18n_util import get_lang
13
+ from utils.log_util import logger
14
+ from vocab import tokenizer_factory
15
+
16
+
17
+ @lru_cache
18
+ def _tokenize(text: str, tokenizer_name: str, color_num: int = 5, add_special_token: bool = False):
19
+ logger.info(
20
+ "param=" + json.dumps({"text": text, "tokenizer_type": tokenizer_name}, ensure_ascii=False)
21
+ )
22
+ pos_tokens = []
23
+ tokenizer = tokenizer_factory.get_tokenizer(tokenizer_name)
24
+ encoding = tokenizer.encode(text)  # add_special_token is currently not applied; HF and tiktoken encoders take different kwargs
25
+ table = []
26
+
27
+ for idx, token_id in enumerate(encoding):
28
+ decoded_text = tokenizer.decode([token_id])
29
+ decoded_text = decoded_text.replace(
30
+ " ", "⋅"
31
+ ) # replace space with ⋅ for better visualization
32
+ pos_tokens.extend([(decoded_text, str(idx % color_num))])
33
+
34
+ try:
35
+ token = tokenizer.decode([token_id])[0]
36
+ except Exception:
37
+ token = {v: k for k, v in tokenizer.get_vocab().items()}[token_id]
38
+
39
+ if isinstance(token, bytes):
40
+ try:
41
+ token_str = token.decode("utf-8")
42
+ except UnicodeDecodeError:
43
+ token_str = token.decode("utf-8", errors="ignore")
44
+ logger.error(
45
+ f"{idx}: decode_error: "
46
+ + json.dumps( # gpt-3.5-turbo tokens often fail to decode; log them here for inspection
47
+ {
48
+ "tokenizer_type": tokenizer_name,
49
+ "token": str(token),
50
+ "token_str": token_str,
51
+ },
52
+ ensure_ascii=False,
53
+ )
54
+ )
55
+
56
+ # json_dumps = json.dumps(token_str)
57
+ elif isinstance(token, str):
58
+ token_str = token
59
+ else:
60
+ logger.error(
61
+ f"{idx}: wrong type for token {token_id} {type(token)} "
62
+ + json.dumps({"text": text, "tokenizer_type": tokenizer_name}, ensure_ascii=False)
63
+ )
64
+ token_str = token
65
+
66
+ table.append({"TokenID": token_id, "Text": decoded_text})
67
+
68
+ table_df = pd.DataFrame(table)
69
+ logger.info(f"tokenizer_type={tokenizer_name}, Tokens={table[:4]}")
70
+ return pos_tokens, len(encoding), table_df
71
+
72
+
73
+ def tokenize(
74
+ text: str, tokenizer_name: str, color_num: int = 5
75
+ ) -> tuple[dict[Any, Any], pd.DataFrame]:
76
+ """Tokenize an input text."""
77
+ pos_tokens, num_tokens, table_df = _tokenize(text, tokenizer_name, color_num)
78
+ return gr.update(value=pos_tokens, label=f"Tokens: {num_tokens}"), table_df
79
+
80
+
81
+ def tokenize_pair(text, tokenizer_type_1, tokenizer_type_2, color_num: int = 5):
82
+ """input_text.change."""
83
+ pos_tokens_1, table_df_1 = tokenize(text, tokenizer_type_1, color_num)
84
+ pos_tokens_2, table_df_2 = tokenize(text, tokenizer_type_2, color_num)
85
+ return pos_tokens_1, table_df_1, pos_tokens_2, table_df_2
86
+
87
+
88
+ def on_load(url_params: str, request: gr.Request = None) -> tuple[str, str, str]:
89
+ """Function triggered on page load to get URL parameters."""
90
+ text = default_user_input
91
+ tokenizer_type_1 = default_tokenizer_name_1
92
+ tokenizer_type_2 = default_tokenizer_name_2
93
+ try:
94
+ url_params_dict = json.loads(url_params)
95
+ except json.JSONDecodeError:
96
+ url_params_dict = {}
97
+
98
+ if request:
99
+ lang, _ = get_lang(request)
100
+ logger.info(str(request.headers))
101
+ client_ip = request.client.host
102
+
103
+ tokenizer_type_1 = url_params_dict.get("tokenizer1", default_tokenizer_name_1)
104
+ tokenizer_type_2 = url_params_dict.get("tokenizer2", default_tokenizer_name_2)
105
+ text = url_params_dict.get("text", default_user_input)
106
+ logger.info(f"client_ip: {client_ip}; lang: {lang} params: {url_params}")
107
+ return text, tokenizer_type_1, tokenizer_type_2
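
tokenize() can also be called outside Gradio: it returns a gr.update payload (value holds the highlighted segments, label the token count) plus a DataFrame of TokenID/Text rows. A hedged sketch, assuming the Space's modules are importable and the tokenizer can be fetched:

# Hypothetical direct call to playground_util.tokenize (not part of the commit).
from playground_util import tokenize

update, table = tokenize("Replace this text", "openai/gpt-4o")
print(update["label"])  # e.g. "Tokens: 4"; the exact count depends on the tokenizer
print(table)            # DataFrame with TokenID / Text columns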
requirements.txt ADDED
@@ -0,0 +1,13 @@
1
+ gradio>=4.38.1
2
+ transformers>4.40.0
3
+ sentencepiece
4
+ tiktoken
5
+ icetk
6
+ torch
7
+ nltk
8
+ boto3
9
+ protobuf==4.25.3
10
+ ai2-olmo
11
+ ipadic
12
+ fugashi
13
+ datasets
utils/__pycache__/i18n_util.cpython-311.pyc ADDED
Binary file (1.61 kB)

utils/__pycache__/lang_util.cpython-311.pyc ADDED
Binary file (3.24 kB)

utils/__pycache__/log_util.cpython-311.pyc ADDED
Binary file (633 Bytes)

utils/__pycache__/text_util.cpython-311.pyc ADDED
Binary file (2.21 kB)
utils/i18n_util.py ADDED
@@ -0,0 +1,26 @@
1
+ import gradio as gr
2
+
3
+
4
+ def get_lang(request: gr.Request):
5
+ """
6
+ Parse the request's Accept-Language header, e.g. 'accept-language: zh,en;q=0.9,zh-CN;q=0.8'.
7
+ """
8
+ accept_language = None
9
+ langs = []
10
+ try:
11
+ accept_language = request.headers["Accept-Language"]
12
+ for lang in accept_language.split(",")[:5]:
13
+ lang = lang.lower()
14
+ if lang.startswith("en"):
15
+ langs.append("en")
16
+ elif lang.startswith("es"):
17
+ langs.append("es")
18
+ elif lang.startswith("zh"):
19
+ langs.append("zh")
20
+ elif lang.startswith("fr"):
21
+ langs.append("fr")
22
+ elif lang.startswith("de"):
23
+ langs.append("de")
24
+ except Exception as e:
25
+ print(e)
26
+ return accept_language, langs
utils/lang_util.py ADDED
@@ -0,0 +1,89 @@
1
+ """
2
+ 这个detect_language函数通过定义一系列语言字符的Unicode范围,然后使用regex包来检查输入字符串是否包含这些范围内的字符,
3
+ 从而尝试确定字符串可能使用的语言。函数返回一个列表,包含所有匹配的语言名称;如果没有检测到已定义范围内的字符,则返回['Unknown']。
4
+
5
+ 请注意,由于某些语言(如中文和日文)共享字符集的部分范围,这可能导致某些字符串被识别为多种语言。
6
+ 此外,Latin范围非常广泛,几乎包括了所有西方语言的基本字母,因此可能需要更细致的逻辑来区分使用拉丁字母的具体语言。
7
+
8
+
9
+ 通过检查特定的字母和重音符号来区分一些使用拉丁字母的语言。
10
+ 然而,需要强调的是,这种方法的准确性受限于所选语言特征的全面性和独特性。
11
+ 例如,English的检测范围仅限于基本的A-Z字母,这可能导致它与其他使用相同字母集的语言重叠。
12
+ 此外,有些语言(如法语和西班牙语)在某些情况下可能共享特定的重音符号,这可能导致一个字符串被错误地识别为多种语言。
13
+
14
+ ## common language
15
+ English | 简体中文 | 繁體中文 | 한국어 | Español | 日本語 | हिन्दी | Русский | Рortuguês | తెలుగు | Français | Deutsch | Tiếng Việt |
16
+ """
17
+
18
+ import re
19
+ from typing import List
20
+
21
+ # Latin is not counted here, since most tokens are Latin anyway.
22
+ common_lang = ["Chinese", "Japanese-Kana", "Korean", "Arabic", "number"]
23
+
24
+ # Unicode range of different language
25
+ language_ranges = {
26
+ (
27
+ "Arabic",
28
+ "ar",
29
+ ): r"[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]",
30
+ # 'CJK' https://en.wikipedia.org/wiki/CJK_Unified_Ideographs
31
+ ("Chinese", "zh"): r"[\u4e00-\u9fff]",
32
+ ("Japanese", "ja"): r"[\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF]",
33
+ # https://stackoverflow.com/questions/19899554/unicode-range-for-japanese
34
+ # Kana type refers to Japanese hiragana and katakana characters that represent phonetic sounds in the Japanese language.
35
+ (
36
+ "Japanese-Kana",
37
+ "ja-kana",
38
+ ): r"[\u3040-\u309F\u30A0-\u30FF]", # Hiragana & Katakana
39
+ ("Korean", "ko"): r"[\uac00-\ud7a3]",
40
+ # Latin-script family
+ # ('Latin', 'la'): r'[\u0000-\u007F\u0080-\u00FF]',
+ # ('English', 'en'): r'[A-Za-z]', # may overlap with other languages that use the basic Latin alphabet
+ # ('French', 'fr'): r'[\u00C0-\u00FF]',
+ # ('German', 'de'): r'[\u00C4\u00D6\u00DC\u00E4\u00F6\u00FC\u00DF]',
+ # ('Spanish-specific'): r'[\u00C1\u00E1\u00C9\u00E9\u00CD\u00ED\u00D3\u00F3\u00DA\u00FA\u00D1\u00F1\u00FC]', # characters specific to Spanish
+ # Slavic family
+ # ('Cyrillic', ''): r'[\u0400-\u04FF\u0500-\u052F\u2DE0-\u2DFF\uA640-\uA69F]',
+ #
+ # 'Greek': r'[\u0370-\u03FF\u1F00-\u1FFF]', # Greek alphabet
+ # 'Hebrew': r'[\u0590-\u05FF\uFB1D-\uFB4F]', # Hebrew
51
+ }
52
+
53
+
54
+ def detect_language_by_unicode(text: str) -> List:
55
+ """
56
+ :param text: input string
+ :return: list of matching (language name, code) tuples from language_ranges; empty if none match
58
+ """
59
+ detected_languages = []
60
+ for language, pattern in language_ranges.items():
61
+ if re.search(pattern, text):
62
+ detected_languages.append(language)
63
+
64
+ return detected_languages
65
+
66
+
67
+ if __name__ == "__main__":
68
+ # quick sanity checks
69
+ test_strings = {
70
+ # Latin scripts
71
+ "Hello, world!": "English/Latin",
72
+ "Hola": "Spanish",
73
+ "Bonjour": "French",
74
+ "Guten Tag": "German",
75
+ "Empieza donde estás. ": "Spanish",
76
+ # CJK
77
+ "你好": "Chinese",
78
+ "こんにちは": "Japanese",
79
+ "안녕하세요": "Korean",
80
+ # other scripts
81
+ "Привет": "Russian/Cyrillic",
82
+ "مرحبا": "Arabic",
83
+ }
84
+
85
+ for s, expected in test_strings.items():
86
+ # print(f"'{s}' === Detected lang: {detect_language(s)} === Expected: {expected}")
87
+ print(
88
+ f"'{s}'\nDetected lang: {detect_language_by_unicode(s)}\nExpected lang: {expected}"
89
+ )
utils/log_util.py ADDED
@@ -0,0 +1,10 @@
1
+ import logging
2
+
3
+ logging.basicConfig(
4
+ format="[%(asctime)s] [%(levelname)s] [%(process)d:%(thread)d] [%(filename)s:%(lineno)d:%(funcName)s] %(message)s",
5
+ level=logging.INFO,
6
+ datefmt="%Y-%m-%d %H:%M:%S",
7
+ )
8
+
9
+ logger = logging.getLogger(__name__)
10
+ logger.setLevel(logging.INFO)
utils/oov_util.py ADDED
@@ -0,0 +1,122 @@
1
+ import json
2
+
3
+ from vocab import TokenizerImpl, tokenizer_factory
+
+ all_tokenizer_config = tokenizer_factory.all_tokenizer_configs
+ load_tokenizer = tokenizer_factory.load_tokenizer
4
+
5
+ text = (
6
+ "hello; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속;"
7
+ " 確実に春が近づいてること; a közoktatással? _ Belföld;"
8
+ " pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ;"
9
+ " निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:;"
10
+ " « અમરેલીનાં મહિલા વિકાસ; 🦙❤❥웃유♋☮✊;"
11
+ "װיקיװערטערבוך "
12
+ )
13
+ whitespace = "\t \n\n\r "
14
+ control_chars = b"\x00\x01\x02\x03\x04".decode("utf-8")  # currently unused
15
+
16
+ text += whitespace
17
+
18
+
19
+ def get_unk(tokenizer_config):
20
+ tokenizer = load_tokenizer(tokenizer_config)
21
+ if hasattr(tokenizer, "unk_token"):
22
+ return f"{tokenizer.unk_token}, {tokenizer.unk_token_id}"
23
+ else:
24
+ return "unk_token not found"
25
+
26
+
27
+ # def infer_tokenizer_impl(tokenizer_config):
28
+ def infer_tokenizer_type(tokenizer_config):
29
+ tokenizer = load_tokenizer(tokenizer_config)
30
+ if tokenizer_config.impl == TokenizerImpl.TikToken:
31
+ return "tiktoken"
32
+ if hasattr(tokenizer, "backend_tokenizer"):
33
+ return str(
34
+ type(tokenizer.backend_tokenizer.model)
35
+ ) # type(tokenizer._tokenizer.model))
36
+ # orion: sp_model.Load(vocab_file), inherits from PreTrainedTokenizer
+ elif hasattr(tokenizer, "sp_model"): # based on the sentencepiece package
38
+ # for i in range(tokenizer.sp_model.piece_size()):
39
+ # if tokenizer.sp_model.is_byte(i):
40
+ # print("")
41
+ return f"sp_model, byte_num: {sum([tokenizer.sp_model.is_byte(i) for i in range(tokenizer.sp_model.piece_size())])}"
42
+
43
+ # sp.Load(model_path), and includes an image_tokenizer
44
+ elif "glm-" in tokenizer_config.name_or_path:
45
+ return f"byte_num: {sum([tokenizer.sp_tokenizer.text_tokenizer.sp.is_byte(i) for i in range(tokenizer.sp_tokenizer.text_tokenizer.sp.piece_size())])}"
46
+ # sp.Load(model_path), without an image_tokenizer
47
+ elif (
48
+ "glm2-" in tokenizer_config.name_or_path
49
+ or "glm3-" in tokenizer_config.name_or_path
50
+ or "CharacterGLM-6B" in tokenizer_config.name_or_path
51
+ ):
52
+ return f"byte_num: {sum([tokenizer.tokenizer.sp_model.is_byte(i) for i in range(tokenizer.tokenizer.sp_model.piece_size())])}"
53
+ elif (
54
+ "abeja/gpt-neox-japanese-2.7b" == tokenizer_config.name_or_path
55
+ ): # supports byte-level fallback, which avoids OOV issues
56
+ return "japanese-bpe: https://github.com/tanreinama/Japanese-BPEEncoder_V2"
57
+ # bert-base-japanese: notable for "word_tokenizer_type": "mecab", see https://huggingface.co/tohoku-nlp/bert-base-japanese/blob/main/tokenizer_config.json
58
+ elif "bert-base-japanese" in tokenizer_config.name_or_path:
59
+ return (
60
+ "wordpiece.MecabTokenizer, 支持byte-level https://taku910.github.io/mecab/"
61
+ )
62
+ elif "moss" in tokenizer_config.name_or_path:
63
+ return "应该是 sentencepiece.byte_bpe,待确认"
64
+ elif "byt5" in tokenizer_config.name_or_path:
65
+ return "未知,待定"
66
+ else:
67
+ print("catch", tokenizer_config.name_or_path)
68
+ raise "error"
69
+
70
+
71
+ def test_lossless(tokenizer_config):
72
+ """
73
+ Why does xlm-roberta-base have so few OOV tokens? Is it because of byte fallback?
74
+ :param tokenizer_config:
75
+ :return:
76
+ """
77
+ tokenizer = load_tokenizer(tokenizer_config)
78
+ encoding = tokenizer.encode(text, add_special_tokens=False)
79
+ decoding = tokenizer.decode(encoding)
80
+
81
+ if text in decoding:
82
+ # print(tokenizer_config.name, tokenizer_config.impl, "lossless: true")
83
+ pass
84
+ else:
85
+ unk_count = sum(
86
+ [1 for token_id in encoding if token_id == tokenizer.unk_token_id]
87
+ )
88
+ oov_tokens = []
89
+ # if tokenizer_config.impl == TokenizerImpl.SentencePiece:
90
+ # print(sum([tokenizer.is_byte(i) for i in range(tokenizer.piece_size())]))
91
+
92
+ print("#######" * 5)
93
+ print(
94
+ f"{tokenizer_config.name_or_path}, {infer_tokenizer_type(tokenizer_config)}\n"
95
+ f"lossless: false; unk_token: {get_unk(tokenizer_config)},"
96
+ f" unk_ratio: {unk_count/len(encoding):.4f}; oov: []"
97
+ )
98
+ for i in range(len(text)):
99
+ if text[i] != decoding[i]:
100
+ # print(f"text[{i}] = {str(bytes(text[i:], 'utf-8'))}\n"
101
+ # f"decoding[{i}] = {str(bytes(decoding[i:], 'utf-8'))}")
102
+ print(
103
+ f"text[{i}] = {json.dumps(text[i:], ensure_ascii=False)}, \n"
104
+ f"decoding[{i}] = {json.dumps(decoding[i:], ensure_ascii=False)}"
105
+ )
106
+
107
+ break
108
+
109
+
110
+ for config in all_tokenizer_config:
111
+ # if "xlm-roberta-base" in config.name:
112
+ # if "xlm-roberta-base" in config.name:
113
+ # if "chatglm3-6b" in config.name:
114
+ # if "bert-base-japanese" in config.name:
115
+ # if "moss" in config.name:
116
+ # if "byt5" in config.name:
117
+ if "baichuan" in config.name_or_path:
118
+ # if "CharacterGLM-6B" in config.name:
119
+ # if "fastchat-t5" in config.name: # 报错 pyo3_runtime.PanicException: AddedVocabulary bad split
120
+ # if True:
121
+ # test_unk(config)
122
+ test_lossless(config)
utils/text_util.py ADDED
@@ -0,0 +1,47 @@
1
+ """
2
+ Character-level text helpers (digit and whitespace counting).
3
+ """
4
+
5
+
6
+ def detect_lang_from_unicode():
7
+ pass
8
+
9
+
10
+ def is_digit_char(uchar):
11
+ return uchar in "0123456789"
12
+
13
+
14
+ def contains_digit(text):
15
+ return any(is_digit_char(ch) for ch in text)
16
+
17
+
18
+
21
+
22
+ def is_all_digit(text):
23
+ return all(is_digit_char(char) for char in text)
24
+
25
+
26
+ def get_digit_count(text):
27
+ digit_count = 0
28
+ for char in text:
29
+ if char in "0123456789":
30
+ digit_count += 1
31
+ return digit_count
32
+
33
+
34
+ def has_space(text):
35
+ pass
36
+
37
+
38
+ def is_all_space(text):
39
+ pass
40
+
41
+
42
+ def get_space_count(text):
43
+ space_count = 0
44
+ for char in text:
45
+ if len(char.strip()) == 0:
46
+ space_count += 1
47
+ return space_count
vocab.py ADDED
@@ -0,0 +1,754 @@
1
+ from dataclasses import dataclass, field
2
+ from enum import Enum, auto
3
+ from typing import Any, Dict
4
+
5
+ import tiktoken
6
+ from transformers import AutoTokenizer
7
+ from utils.log_util import logger
8
+
9
+ """Interface:
10
+ # https://github.com/huggingface/transformers/blob/main/src/transformers/tokenization_utils_base.py
11
+
12
+ tokenizer.encode -> List[int]: Converts a string to a sequence of ids (integer)
13
+ tokenizer.decode
14
+ tokenizer.convert_tokens_to_string # gpt4 does not have this method
15
+ tokenizer.convert_ids_to_tokens
16
+ tokenizer.tokenize -> List[str]: Converts a string into a sequence of tokens ->
17
+
18
+
19
+ tokenizer.parent = ""
20
+ tokenizer.vocab_size
21
+ tokenizer.get_vocab() # gpt-neox-20b, llama
22
+ tokenizer.type = TokenizerType.ByteBPE.name
23
+ tokenizer.implementation = TokenizerImpl.SentencePiece.name # https://github.com/facebookresearch/llama/blob/main/llama/tokenizer.py
24
+ "HFGPT2Tokenizer", "HFTokenizer", "GPT2BPETokenizer", "CharLevelTokenizer", "TiktokenTokenizer", "SPMTokenizer", https://github.com/EleutherAI/gpt-neox/blob/main/tools/preprocess_data.py
25
+
26
+
27
+ tokenizer.comments = "split all numbers into individual digits, " \
28
+ "and fallback to bytes to decompose unknown UTF-8 characters"
29
+
30
+ tokenizer.all_special_tokens # baichuan
31
+ tokenizer.special_tokens_set # gpt3.5_turbo
32
+ tokenizer.special_tokens_map
33
+ """
34
+
35
+
36
+ class TokenizerImpl(Enum):
37
+ """
38
+ - https://github.com/huggingface/tokenizers/blob/main/bindings/python/py_src/tokenizers/implementations/__init__.py
39
+ - https://huggingface.co/docs/transformers/tokenizer_summary
40
+ - https://github.com/EleutherAI/gpt-neox/blob/main/megatron/tokenizer/tokenizer.py
41
+
42
+ ## google/BertTokenizer
43
+ - https://github.com/huggingface/tokenizers/blob/main/bindings/python/py_src/tokenizers/implementations/bert_wordpiece.py
44
+ - 特征
45
+ - 算法:BERT的编码器是 BPE-WordPiece,将单词拆分成多个前缀符号(比如BERT中的##)最小单元
46
+ - 词典:有##开头的token,表示subword,
47
+ - 中文采用char粒度分词
48
+ - 英文采用 WordPiece
49
+
50
+
51
+
52
+
53
+ ## google/sentencepiece
54
+ - https://github.com/google/sentencepiece/blob/3863f7648e5d8edb571ac592f3ac4f5f0695275a/src/sentencepiece_model.proto#L48
55
+ - 支持 sentencepiece 和 wordpiece
56
+ - sentencepiece 有byte-bpe吗?
57
+ - UNIGRAM = 1; // Unigram language model with dynamic algorithm
58
+ - BPE = 2; // Byte Pair Encoding
59
+ - WORD = 3; // Delimitered by whitespace.
60
+ - CHAR = 4; // tokenizes into character sequence
61
+ - wordpiece
62
+ - 特征:
63
+ - 训练: spm_train --model_type unigram/bpe/char/word
64
+ - 特殊符号: Ġ
65
+ - 文件: *.sp_model 或 *.model (可选文件 .vocab,) spm简称 (其他格式比如 tokenizer.json是给hf_tokenizer兼容用的)
66
+ - 实现:
67
+ - 依赖: protobuf
68
+ - 训练: `import sentencepiece as spm; spm.SentencePieceTrainer.train` 或 `spm_train`
69
+ - 加载: `import sentencepiece as spm; spm.SentencePieceProcessor().Load(vocab_file)`
70
+ - 方法: 是SentencePieceProcessor类型,sp_model.id_to_piece,有tokenizer.json tokenizer.model,
71
+ - 分词:
72
+ - pre_tokenizers.ByteLevel(add_prefix_space=True, use_regex=False)
73
+ - 词典: 词典字符有 ▁ (U+2581) ,表示空格或句首。
74
+ - 示例:google-t5, llama,baichuan, orion,
75
+ - llama: tokenizer.json(包含model.vocab model.merges) tokenizer.model
76
+ - grok: 原始是 .model文件,后面转成了 tokenizer.json
77
+ - google-t5: tokenizer.json, spiece.model
78
+ - Skywork-13B-Math: tokenizer.model
79
+ - xlm_roberta: sentencepiece.bpe.model
80
+ - GPT2Tokenizer
81
+ - tokenizer.json, vocab.json, merges.txt (https://huggingface.co/openai-community/gpt2)
82
+ - vocab.bpe, encoder.json, dict.txt (fairseq版本,不常用,可以忽略这个版本)
83
+
84
+
85
+
86
+ ## thu/icetk
87
+ - icetk: sentencepiece的分支,支持image_tokenizer。
88
+ - glm, chatglm1, chatglm2
89
+
90
+ ## huggingface/tokenizers
91
+ - https://github.com/huggingface/tokenizers
92
+ - VS sentencepiece
93
+ - 支持sentencepiece
94
+ - .model转化为 (merges.txt + vocab.json) 或者 tokenizer.json
95
+ - https://github.com/huggingface/tokenizers/blob/main/bindings/python/scripts/sentencepiece_extractor.py
96
+ - 加载 merges.txt, vocab.json
97
+ - SentencePieceBPETokenizer https://github.com/huggingface/tokenizers/blob/v0.19.1/bindings/python/py_src/tokenizers/implementations/sentencepiece_bpe.py#L10
98
+ - 在 sentencepiece基础上,hf_tokenizer支持pre-tokenization的正则表达式,对tab和换行支持更好,支持special token
99
+ - 类型: 支持 BBPE, WordPiece or Unigram
100
+ - 特征:
101
+ - 文件: tokenizer.json(包含后两个文件的内容), merges.txt, vocab.json
102
+ - added_tokens 在vocab中不一定存在。
103
+ - 实现:
104
+ - 训练: `from tokenizers.trainers import BpeTrainer, UnigramTrainer, WordLevelTrainer, WordPieceTrainer`
105
+ - 加载:
106
+ - 方法: .model.from_file .model.save .model.token_to_id .model.tokenize
107
+ - .model 是 tokenizer.models.BPE 类型
108
+ - 词典有 Ġ "\u0120" 开头
109
+ - 优势
110
+ -
111
+ - 示例:gpt2, gpt_neox_20b, moss, bloom, qwen2
112
+ - 优势:相对sentence piece,
113
+ - ss
114
+
115
+ ## openai/tiktoken
116
+ - 特征:空格就是空格,
117
+ - 示例:gpt3.5 gpt4, qwen,
118
+ """
119
+
120
+ """ 算法体系 https://www.huaxiaozhuan.com/%E5%B7%A5%E5%85%B7/huggingface_transformer/chapters/1_tokenizer.html
121
+ - word-base tokenizer:
122
+ - char-base tokenizer:
123
+ - subword-based Tokenizer
124
+ - BPE
125
+ - byte-bpe: base vocabulary大小是256
126
+ - WordPiece:
127
+ - 相比BPE,WordPiece 仅保存最终词表,而不保存学到的 merge rule
128
+ - Unigram
129
+ - SentencePiece
130
+
131
+ """
132
+
133
+ # 分类体系:https://github.com/huggingface/tokenizers/blob/main/bindings/python/py_src/tokenizers/implementations/
134
+ BertTokenizer = "wordpiece.BertTokenizer"
135
+ JapaneseTokenizer = (
136
+ "wordpiece.MecabTokenizer",
137
+ "https://github.com/polm/fugashi",
138
+ ) # 常用日语包 ipadic,fugashi,
139
+ ByteLevelBPETokenizer = "byte_level_bpe" # BBPE
140
+ SentencePieceBPETokenizer = "sentencepiece_bpe"
141
+
142
+ # 分类体系
143
+
144
+ # SentencePeice(BPE)
145
+ SentencePiece = auto() # sentencepiece.bpe, sentencepiece.unigram, sentencepiece.char, sentencepiece.word,
146
+ byte_level_bpe = auto()
147
+ # HFTokenizer = auto() # , 支持
148
+ TikToken = auto()
149
+ # subword-nmt
150
+ # WordPiece
151
+
152
+
153
+ # load_vocab_with_SPECIAL_TOKEN = True # 如果不包含会导致计算词典大小错误、overlap_token计算不一致。
154
+
155
+
156
+ @dataclass
157
+ class TokenizerConfig:
158
+ """
159
+ https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/blob/main/src/leaderboard/read_evals.py
160
+ """
161
+
162
+ name_or_path: str # org/model (path on hub), as unique id
163
+ name_display: str = None #
164
+ impl: TokenizerImpl = None # implementation, tokenizer_class/type
165
+ org: str = None
166
+ link: str = None # http://**
167
+ desc: str = None # description
168
+ meta: str = None
169
+ level: str = None # char-level, word-level, byte-level
170
+ lang: str = None
171
+ init_kwargs: Dict[str, Any] = field(
172
+ default_factory=dict,
173
+ )
174
+
175
+ def __post_init__(self):
176
+ if self.link is None:
177
+ self.link = "https://huggingface.co/" + self.name_or_path # TODO + revision
178
+ if self.name_display is None:
179
+ self.name_display = self.name_or_path
180
+
181
+ @classmethod
182
+ def init_from_json_file(cls, json_filepath: str) -> "TokenizerConfig":
183
+ pass
184
+
185
+ def __eq__(self, other):
186
+ if isinstance(other, self.__class__):
187
+ return self.__dict__ == other.__dict__
188
+ else:
189
+ return False
190
+
191
+ def __hash__(self):
192
+ return hash(self.name_or_path)
193
+
194
+
195
+ # TODO: append link and description to the end of dropdown button.
196
+ # Add tokenizer_class/type, comments
197
+ _all_tokenizer_config = [
198
+ # bert style tokenizers
199
+ TokenizerConfig(
200
+ "google-bert/bert-base-cased",
201
+ impl=TokenizerImpl.BertTokenizer,
202
+ org="Google",
203
+ desc="first add whitespace around any CJK character, then perform wordpiece tokenization.",
204
+ ),
205
+ TokenizerConfig(
206
+ "google-bert/bert-base-uncased",
207
+ impl=TokenizerImpl.BertTokenizer,
208
+ org="Google",
209
+ desc="first add whitespace around any CJK character, then perform wordpiece tokenization.",
210
+ ),
211
+ TokenizerConfig(
212
+ "google-bert/bert-base-chinese",
213
+ impl=TokenizerImpl.BertTokenizer,
214
+ org="Google",
215
+ desc="first add whitespace around any CJK character, then perform wordpiece tokenization.",
216
+ ),
217
+ TokenizerConfig(
218
+ "google-bert/bert-base-german-cased",
219
+ impl=TokenizerImpl.BertTokenizer,
220
+ org="Google",
221
+ ),
222
+ TokenizerConfig(
223
+ "dbmdz/bert-base-german-uncased", impl=TokenizerImpl.BertTokenizer, org="dbmdz"
224
+ ),
225
+ TokenizerConfig(
226
+ "asafaya/bert-base-arabic", impl=TokenizerImpl.BertTokenizer, org="-"
227
+ ),
228
+ TokenizerConfig(
229
+ "google-bert/bert-base-multilingual-uncased",
230
+ impl=TokenizerImpl.BertTokenizer,
231
+ org="Google",
232
+ ),
233
+ TokenizerConfig(
234
+ "google-bert/bert-base-multilingual-cased",
235
+ impl=TokenizerImpl.BertTokenizer,
236
+ org="Google",
237
+ ),
238
+ TokenizerConfig(
239
+ "tohoku-nlp/bert-base-japanese",
240
+ impl=TokenizerImpl.BertTokenizer,
241
+ org="Tohoku",
242
+ desc="The texts are first tokenized by MeCab morphological parser with the IPA dictionary, "
243
+ "then split into subwords by the WordPiece algorithm.",
244
+ ),
245
+ TokenizerConfig(
246
+ "clue/roberta_chinese_clue_tiny",
247
+ name_display="clue/roberta-chinese-clue",
248
+ impl=TokenizerImpl.BertTokenizer,
249
+ org="CLUE",
250
+ init_kwargs={"revision": "refs/pr/1"},
251
+ desc="",
252
+ meta="去掉了繁体字, https://github.com/CLUEbenchmark/CLUEPretrainedModels/blob/master/README.md",
253
+ ),
254
+ TokenizerConfig(
255
+ "eson/kplug-base-encoder",
256
+ name_display="eson/kplug",
257
+ impl=TokenizerImpl.BertTokenizer,
258
+ org="JD",
259
+ ),
260
+ TokenizerConfig(
261
+ "ckiplab/gpt2-base-chinese", impl=TokenizerImpl.BertTokenizer, org="SINICA"
262
+ ), # Academia Sinica, Taiwan
263
+ # WoBERT https://kexue.fm/archives/7758
264
+ # WoBERT Plus https://github.com/ZhuiyiTechnology/WoBERT
265
+ # gpt2 style tokenizers
266
+ TokenizerConfig(
267
+ "openai-community/gpt2", impl=TokenizerImpl.SentencePiece, org="OpenAI"
268
+ ),
269
+ # byte-level BPE; no byte tokens, so is it unicode-level?
270
+ TokenizerConfig(
271
+ "ClassCat/gpt2-base-french", impl=TokenizerImpl.SentencePiece, org="ClassCat"
272
+ ),
273
+ TokenizerConfig(
274
+ "ClassCat/gpt2-base-spanish", impl=TokenizerImpl.SentencePiece, org="ClassCat"
275
+ ),
276
+ TokenizerConfig(
277
+ "fnlp/moss-moon-003-sft",
278
+ impl=TokenizerImpl.SentencePiece,
279
+ init_kwargs={"revision": "refs/pr/6"},
280
+ org="Fudan",
281
+ desc="This tokenizer has been trained to treat spaces like parts of the tokens "
282
+ "(a bit like sentencepiece) so a word will be encoded differently whether "
283
+ "it is at the beginning of the sentence (without space) or not",
284
+ meta="在gpt2词典基础上,扩充了5万中文",
285
+ ),
286
+ TokenizerConfig(
287
+ "bigscience/bloom",
288
+ impl=TokenizerImpl.SentencePiece,
289
+ org="BigScience",
290
+ meta="比gpt_neox的词典 对中文支持更好。",
291
+ ),
292
+ # ("bloomz_6b4_zh",
293
+ # ("BelleGroup/BELLE-7B-2M", # 模型和词典都基于bloom
294
+ #
295
+ TokenizerConfig(
296
+ "EleutherAI/gpt-neox-20b", impl=TokenizerImpl.SentencePiece, org="EleutherAI"
297
+ ), # 50k vocab
298
+ TokenizerConfig(
299
+ "cyberagent/open-calm-7b", impl=TokenizerImpl.SentencePiece, org="CyberAgent"
300
+ ), # GPTNeoXTokenizer
301
+ TokenizerConfig(
302
+ "abeja/gpt-neox-japanese-2.7b", impl=TokenizerImpl.SentencePiece, org="ABEJA"
303
+ ),
304
+ TokenizerConfig(
305
+ "rinna/bilingual-gpt-neox-4b",
306
+ impl=TokenizerImpl.SentencePiece,
307
+ org="ABEJA",
308
+ lang="en/ja",
309
+ ),
310
+ TokenizerConfig(
311
+ "Qwen/Qwen1.5-14B", impl=TokenizerImpl.SentencePiece, org="Alibaba"
312
+ ), # 150k vocab, a bit slow to load
313
+ TokenizerConfig(
314
+ "Qwen/Qwen1.5-110B", impl=TokenizerImpl.SentencePiece, org="Alibaba"
315
+ ),
316
+ TokenizerConfig(
317
+ "Qwen/Qwen1.5-1.8B", impl=TokenizerImpl.SentencePiece, org="Alibaba"
318
+ ),
319
+ TokenizerConfig("Qwen/Qwen2-0.5B", impl=TokenizerImpl.SentencePiece, org="Alibaba"),
320
+ TokenizerConfig("Qwen/Qwen2-72B", impl=TokenizerImpl.SentencePiece, org="Alibaba"),
321
+ TokenizerConfig(
322
+ "Qwen/Qwen2.5-0.5B", impl=TokenizerImpl.SentencePiece, org="Alibaba"
323
+ ),
324
+ TokenizerConfig(
325
+ "Qwen/Qwen2.5-72B", impl=TokenizerImpl.SentencePiece, org="Alibaba"
326
+ ),
327
+ TokenizerConfig(
328
+ "HuggingFaceH4/starchat-alpha", impl=TokenizerImpl.SentencePiece, org="-"
329
+ ),
330
+ ####### google/sentencepiece tokenizer:
331
+ # T5 llama internlm
332
+ TokenizerConfig(
333
+ "google-t5/t5-large",
334
+ name_display="google-t5/t5",
335
+ impl=TokenizerImpl.SentencePiece,
336
+ org="Google",
337
+ ),
338
+ # t5_small, t5_base, t5_large, flan_t5_base,
339
+ # ("t5_base", "", "sentencepiece"),
340
+ # TokenizerConfig("google/flan-t5-base", impl=TokenizerImpl.SentencePiece, ),
341
+ TokenizerConfig(
342
+ "lmsys/fastchat-t5-3b-v1.0",
343
+ impl=TokenizerImpl.SentencePiece,
344
+ org="LMSYS",
345
+ init_kwargs={
346
+ "use_fast": False
347
+ }, # 解决 pyo3_runtime.PanicException: AddedVocabulary bad split
348
+ ),
349
+ TokenizerConfig(
350
+ "CohereForAI/aya-101", org="Cohere For AI"
351
+ ), # "tokenizer_class": "T5Tokenizer",
352
+ TokenizerConfig(
353
+ "ClueAI/ChatYuan-large-v2", impl=TokenizerImpl.SentencePiece, org="CLUE"
354
+ ),
355
+ TokenizerConfig(
356
+ "ClueAI/PromptCLUE-base", impl=TokenizerImpl.SentencePiece, org="CLUE"
357
+ ),
358
+ # byte-level BPE
359
+ # 'single Chinese characters': 700, 'multi-character Chinese tokens': 0 (meta-llama/Meta-Llama-3.1-405B)
360
+ #
361
+ TokenizerConfig(
362
+ "meta-llama/Llama-3.2-1B-Instruct", impl=TokenizerImpl.SentencePiece, org="Meta"
363
+ ),
364
+ TokenizerConfig(
365
+ "meta-llama/Llama-3.2-3B-Instruct", impl=TokenizerImpl.SentencePiece, org="Meta"
366
+ ),
367
+ # TokenizerConfig("meta-llama/Llama-3.3-70B-Instruct", impl=TokenizerImpl.SentencePiece,
368
+ # org="Meta"),
369
+ TokenizerConfig(
370
+ "meta-llama/Meta-Llama-3.1-405B", impl=TokenizerImpl.SentencePiece, org="Meta"
371
+ ),
372
+ TokenizerConfig(
373
+ "NousResearch/Hermes-3-Llama-3.1-405B",
374
+ impl=TokenizerImpl.SentencePiece,
375
+ org="NousResearch",
376
+ ),
377
+ TokenizerConfig(
378
+ "gradientai/Llama-3-8B-Instruct-Gradient-1048k",
379
+ name_display="Meta/llama3",
380
+ impl=TokenizerImpl.SentencePiece,
381
+ org="Meta",
382
+ desc="llama split all numbers into individual digits, and fallback to bytes to decompose unknown UTF-8 characters",
383
+ ),
384
+ TokenizerConfig(
385
+ "NousResearch/Llama-2-7b-chat-hf",
386
+ name_display="Meta/llama2",
387
+ impl=TokenizerImpl.SentencePiece,
388
+ org="Meta",
389
+ ),
390
+ TokenizerConfig(
391
+ "huggyllama/llama-7b",
392
+ name_display="Meta/llama",
393
+ impl=TokenizerImpl.SentencePiece,
394
+ org="Meta",
395
+ ),
396
+ TokenizerConfig(
397
+ "hpcai-tech/grok-1",
398
+ name_display="xai-org/grok-1",
399
+ impl=TokenizerImpl.SentencePiece,
400
+ org="xAI",
401
+ ),
402
+ # converted from a .model file
403
+ TokenizerConfig(
404
+ "hfl/chinese-llama-lora-7b",
405
+ impl=TokenizerImpl.SentencePiece,
406
+ org="-",
407
+ meta="向原始LLaMA的词汇表中添加2w个中文词汇,针对原版LLaMA模型扩充了中文词表, 提升了中文编解码效率",
408
+ ),
409
+ #
410
+ TokenizerConfig(
411
+ "hfl/chinese-llama-2-7b",
412
+ impl=TokenizerImpl.SentencePiece,
413
+ org="-",
414
+ meta="重新设计了新词表(大小:55296),进一步提升了中文字词的覆盖程度",
415
+ ), #
416
+ TokenizerConfig(
417
+ "hfl/llama-3-chinese-8b", impl=TokenizerImpl.SentencePiece, org="-"
418
+ ),
419
+ TokenizerConfig(
420
+ "hfl/chinese-alpaca-lora-7b", impl=TokenizerImpl.SentencePiece, org="-"
421
+ ),
422
+ # The Chinese Alpaca models further fine-tune the Chinese LLaMA models above on instruction data. "One more `[PAD]` token than the chinese_llama vocabulary; do not mix them up."
423
+ #
424
+ # ("belle_llama_ext_7b",
425
+ # ("alpaca_7b",
426
+ TokenizerConfig(
427
+ "baichuan-inc/Baichuan-7B",
428
+ name_display="baichuan-inc/baichuan",
429
+ impl=TokenizerImpl.SentencePiece,
430
+ level="byte-level",
431
+ org="Baichuan",
432
+ ),
433
+ TokenizerConfig(
434
+ "baichuan-inc/Baichuan2-7B-Chat",
435
+ name_display="baichuan-inc/baichuan2",
436
+ impl=TokenizerImpl.SentencePiece,
437
+ org="Baichuan",
438
+ desc="expand the vocabulary size from 64000 in Baichuan1 to 125696",
439
+ ),
440
+ TokenizerConfig(
441
+ "internlm/internlm-chat-7b",
442
+ impl=TokenizerImpl.SentencePiece,
443
+ org="Shanghai AI Lab",
444
+ ),
445
+ # Shanghai AI Lab + SenseTime
446
+ TokenizerConfig(
447
+ "internlm/internlm2-chat-7b",
448
+ impl=TokenizerImpl.SentencePiece,
449
+ org="Shanghai AI Lab",
450
+ ),
451
+ TokenizerConfig(
452
+ "internlm/internlm2-math-7b",
453
+ impl=TokenizerImpl.SentencePiece,
454
+ org="Shanghai AI Lab",
455
+ ),
456
+ TokenizerConfig(
457
+ "internlm/internlm-xcomposer-7b",
458
+ impl=TokenizerImpl.SentencePiece,
459
+ org="Shanghai AI Lab",
460
+ ),
461
+ TokenizerConfig("tiiuae/falcon-7b", impl=TokenizerImpl.SentencePiece, org="TII"),
462
+ TokenizerConfig("tiiuae/falcon-180b", impl=TokenizerImpl.SentencePiece, org="TII"),
463
+ TokenizerConfig(
464
+ "Skywork/Skywork-13B-base", impl=TokenizerImpl.SentencePiece, org="Kunlun"
465
+ ),
466
+ TokenizerConfig(
467
+ "Skywork/Skywork-13B-Math", impl=TokenizerImpl.SentencePiece, org="Kunlun"
468
+ ), # file: tokenizer.model
469
+ TokenizerConfig(
470
+ "FacebookAI/xlm-roberta-base", impl=TokenizerImpl.SentencePiece, org="Facebook"
471
+ ),
472
+ # why does this tokenizer.json have no merges? why does the vocab contain probability values?
473
+ # "goat",
474
+ # ##### glm family
475
+ # "glm_chinese",),
476
+ TokenizerConfig(
477
+ "THUDM/chatglm-6b",
478
+ impl=TokenizerImpl.SentencePiece,
479
+ org="Tsinghua",
480
+ meta=f"num_image_tokens: {12}; num_image_tokens: {34} ",
481
+ init_kwargs={"revision": "refs/pr/100"},
482
+ ),
483
+ TokenizerConfig(
484
+ "THUDM/chatglm2-6b",
485
+ impl=TokenizerImpl.SentencePiece,
486
+ org="Tsinghua",
487
+ ),
488
+ TokenizerConfig(
489
+ "THUDM/chatglm3-6b",
490
+ impl=TokenizerImpl.SentencePiece,
491
+ org="Tsinghua",
492
+ ),
493
+ TokenizerConfig(
494
+ "thu-coai/CharacterGLM-6B",
495
+ impl=TokenizerImpl.SentencePiece,
496
+ org="Tsinghua",
497
+ ),
498
+ # tiktoken family
499
+ TokenizerConfig(
500
+ "openai/text-davinci-003",
501
+ impl=TokenizerImpl.TikToken,
502
+ org="OpenAI",
503
+ link="https://github.com/openai/tiktoken",
504
+ ),
505
+ #
506
+ TokenizerConfig(
507
+ "openai/code-davinci-002",
508
+ impl=TokenizerImpl.TikToken,
509
+ org="OpenAI",
510
+ link="https://github.com/openai/tiktoken",
511
+ ),
512
+ TokenizerConfig(
513
+ "openai/gpt-3.5-turbo",
514
+ impl=TokenizerImpl.TikToken,
515
+ org="OpenAI",
516
+ link="https://github.com/openai/tiktoken",
517
+ desc="tiktoken is a fast BPE tokeniser for use with OpenAI's models. There are 16 tokens KeyError",
518
+ ),
519
+ TokenizerConfig(
520
+ "openai/gpt-4",
521
+ impl=TokenizerImpl.TikToken,
522
+ org="OpenAI",
523
+ link="https://github.com/openai/tiktoken",
524
+ ),
525
+ TokenizerConfig(
526
+ "openai/gpt-4o",
527
+ impl=TokenizerImpl.TikToken,
528
+ org="OpenAI",
529
+ link="https://github.com/openai/tiktoken",
530
+ ),
531
+ TokenizerConfig(
532
+ "Qwen/Qwen-7B-Chat",
533
+ name_display="Qwen/Qwen",
534
+ impl=TokenizerImpl.TikToken,
535
+ org="Alibaba",
536
+ init_kwargs={"revision": "refs/pr/56"},
537
+ meta="在gpt4词典基础上,删除了100个多数字token,增加10000中文词token;并优化了special_token的分词",
538
+ ),
539
+ # https://huggingface.co/Qwen/Qwen-7B-Chat#%E6%A8%A1%E5%9E%8B%E7%BB%86%E8%8A%82%EF%BC%88model%EF%BC%89
540
+ # This vocabulary builds on cl100k_base (the BPE vocabulary used by GPT-4) with optimizations for Chinese and other languages,
+ # so several languages can be handled well without extending the vocabulary. Numbers are split into single digits.
542
+ # TokenizerConfig("Qwen/Qwen-72B-Chat", impl=TokenizerImpl.TikToken),
543
+ # 未分类
544
+ # ("amber", ""),
545
+ TokenizerConfig("LLM360/CrystalCoder", org="MBZUAI"),
546
+ TokenizerConfig("apple/DCLM-7B", org="Apple"),
547
+ TokenizerConfig("mistralai/Mistral-7B-v0.1", org="Mistral"),
548
+ TokenizerConfig("mistralai/Mixtral-8x7B-v0.1", org="Mistral"),
549
+ TokenizerConfig("mistralai/Mistral-Large-Instruct-2407", org="Mistral"),
550
+ TokenizerConfig("mistralai/Mistral-Nemo-Instruct-2407", org="Mistral"),
551
+ TokenizerConfig("paust/pko-t5-large", org="PAUST"),
552
+ TokenizerConfig("01-ai/Yi-6B", org="Yi"),
553
+ TokenizerConfig("01-ai/Yi-34B", org="Yi"),
554
+ TokenizerConfig("01-ai/Yi-VL-34B", org="Yi"),
555
+ TokenizerConfig("01-ai/Yi-1.5-34B", org="Yi"),
556
+ TokenizerConfig("OrionStarAI/Orion-14B-Chat", org="OrionStar"),
557
+ TokenizerConfig("microsoft/phi-1", org="Microsoft"),
558
+ TokenizerConfig("microsoft/phi-2", org="Microsoft"),
559
+ TokenizerConfig(
560
+ "microsoft/Phi-3-mini-4k-instruct", org="Microsoft", meta="即llama vocab"
561
+ ),
562
+ TokenizerConfig("Upstage/SOLAR-10.7B-v1.0", org="-"),
563
+ TokenizerConfig("google/mobilebert-uncased", org="Google"),
564
+ # ("google/mobilenet_v2_1.0_224",), # error
565
+ TokenizerConfig("google/switch-c-2048", org="Google"),
566
+ TokenizerConfig("google/byt5-small", org="Google"),
567
+ TokenizerConfig("google/mt5-large", org="Google"),
568
+ TokenizerConfig("WizardLM/WizardCoder-Python-7B-V1.0", org="Microsoft"),
569
+ TokenizerConfig("WizardLM/WizardCoder-15B-V1.0", org="Microsoft"),
570
+ TokenizerConfig("WizardLM/WizardLM-7B-V1.0", org="Microsoft"),
571
+ TokenizerConfig("WizardLM/WizardMath-70B-V1.0", org="Microsoft"),
572
+ TokenizerConfig("TigerResearch/tigerbot-70b-chat-v4-4k", org="Tigerobo"),
573
+ TokenizerConfig("TigerResearch/tigerbot-13b-chat-v2", org="Tigerobo"),
574
+ TokenizerConfig("deepseek-ai/deepseek-coder-33b-instruct", org="DeepSeek"),
575
+ TokenizerConfig("deepseek-ai/deepseek-llm-7b-base", org="DeepSeek"),
576
+ TokenizerConfig("deepseek-ai/DeepSeek-V2", org="DeepSeek"),
577
+ TokenizerConfig("deepseek-ai/DeepSeek-V3", org="DeepSeek"),
578
+ TokenizerConfig(
579
+ "deepseek-ai/DeepSeek-R1", org="DeepSeek"
580
+ ), # based on the llama3 vocabulary, with some Chinese tokens added and some tokens removed
581
+ TokenizerConfig("deepseek-ai/DeepSeek-R1-Zero", org="DeepSeek"),
582
+ TokenizerConfig("deepseek-ai/DeepSeek-R1-Distill-Llama-70B", org="DeepSeek"),
583
+ TokenizerConfig("google/gemma-7b", org="Google"),
584
+ TokenizerConfig("google/gemma-2-9b", org="Google"),
585
+ TokenizerConfig("allenai/OLMo-7B-hf", org="Allen AI"),
586
+ TokenizerConfig("HuggingFaceH4/zephyr-7b-beta", org="HuggingFace"),
587
+ TokenizerConfig("ai21labs/Jamba-v0.1", org="AI21"),
588
+ TokenizerConfig("databricks/dbrx-instruct", org="Databricks"),
589
+ TokenizerConfig("MiniMaxAI/MiniMax-Text-01", org="MiniMax"),
590
+ # TokenizerConfig("nvidia/Nemotron-4-340B-Instruct", org="Nvidia"),
591
+ # ("claude",),
592
+ # https://github.com/Duxiaoman-DI/XuanYuan
593
+ # https://huggingface.co/apple/OpenELM-3B-Instruct https://huggingface.co/apple/OpenELM-3B
594
+ ]
595
+
596
+ assert len(set([config.name_display for config in _all_tokenizer_config])) == len(
597
+ _all_tokenizer_config
598
+ )
599
+ assert len(set([config.name_or_path for config in _all_tokenizer_config])) == len(
600
+ _all_tokenizer_config
601
+ )
602
+ assert len(
603
+ set([config.name_or_path.split("/")[-1] for config in _all_tokenizer_config])
604
+ ) == len(_all_tokenizer_config)
605
+
606
+
607
+ class TokenizerFactory:
608
+ def __init__(self):
609
+ # self.all_tokenizer_configs = sorted(_all_tokenizer_config, key=lambda k: k.name_or_path)
610
+ self.all_tokenizer_configs = sorted(
611
+ _all_tokenizer_config, key=lambda k: k.name_display
612
+ )
613
+ self.all_tokenizer_names = [
614
+ config.name_or_path for config in self.all_tokenizer_configs
615
+ ]
616
+ self.name_to_config_list = [
617
+ {config.name_or_path: config for config in self.all_tokenizer_configs},
618
+ {config.name_display: config for config in self.all_tokenizer_configs},
619
+ {
620
+ config.name_display.split("/")[-1]: config
621
+ for config in self.all_tokenizer_configs
622
+ },
623
+ ]
624
+ self.tokenizer_cache = {}
625
+
626
+ def get_tokenizer_config(self, tokenizer_name: str) -> TokenizerConfig:
627
+ for name_to_config in self.name_to_config_list:
628
+ if tokenizer_name in name_to_config:
629
+ return name_to_config[tokenizer_name]
630
+ return None
631
+
632
+ def get_tokenizer(self, tokenizer_name: str):
633
+ """
634
+ :param tokenizer_name:
635
+ :return:
636
+ """
637
+ tokenizer_config = self.get_tokenizer_config(tokenizer_name)
638
+
639
+ # 1. load from cache
640
+ if tokenizer_config in self.tokenizer_cache:
641
+ return self.tokenizer_cache[tokenizer_config]
642
+
643
+ # 2. load tokenizer
644
+ tokenizer = self.load_tokenizer(tokenizer_config)
645
+
646
+ self.tokenizer_cache[tokenizer_config] = tokenizer
647
+ return tokenizer
648
+
649
+ def get_name_with_hyperlink(self, tokenizer_name: str) -> str:
650
+ def model_hyperlink(link, model_name):
651
+ model_name = model_name
652
+ return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
653
+
654
+ tokenizer_config = self.get_tokenizer_config(tokenizer_name)
655
+ return model_hyperlink(
656
+ tokenizer_config.link, tokenizer_config.name_display.split("/")[-1]
657
+ )
658
+
659
+ def load_tokenizer(self, tokenizer_config):
660
+ if tokenizer_config is None:
+ raise ValueError("tokenizer_config is None (unknown tokenizer name)")
662
+ logger.info(f"loading tokenizer {tokenizer_config.name_or_path}")
663
+ if (
664
+ tokenizer_config.impl == TokenizerImpl.TikToken
665
+ and "openai" in tokenizer_config.name_or_path
666
+ ):
667
+ tokenizer = tiktoken.encoding_for_model(
668
+ tokenizer_config.name_or_path.replace("openai/", "")
669
+ )
670
+ else:
671
+ tokenizer = AutoTokenizer.from_pretrained(
672
+ tokenizer_config.name_or_path,
673
+ trust_remote_code=True,
674
+ **tokenizer_config.init_kwargs,
675
+ )
676
+ return tokenizer
677
+
678
+ def add_config(
679
+ self,
680
+ ):
681
+ pass
682
+
683
+ def add_tokenizer(self, tokenizer_name):
684
+ pass
685
+
686
+
687
+ tokenizer_factory = TokenizerFactory()
688
+
689
+
690
+ def add_tokenizer(tokenizer_name: str):
691
+ """
692
+ :param tokenizer_name:
693
+ :return:
694
+ """
695
+ if tokenizer_name in []:
696
+ logger.info(f"{tokenizer_name} already exits")
697
+ else:
698
+ # add to config
699
+ tokenizer_config = TokenizerConfig(tokenizer_name, org="-")
700
+
701
+ # add to tokenizer
702
+ tokenizer = tokenizer_factory.load_tokenizer(tokenizer_config)
703
+
704
+ # refresh cache
705
+
706
+ try:
707
+ tokenizer = AutoTokenizer.from_pretrained(
708
+ tokenizer_name, trust_remote_code=True, **tokenizer_config.init_kwargs
709
+ )
710
+ tokenizer_factory.all_tokenizer_configs.append(
711
+ "",
712
+ )
713
+ tokenizer_factory
714
+
715
+ except Exception as e:
716
+ logger.error(e)
717
+
718
+ pass
719
+
720
+
721
+ # class TokenizerType(Enum):
+ #
+ # # BERTTokenizer
+ # # depends on a single txt vocabulary file
+ #
+ # # https://github.com/EleutherAI/gpt-neox/blob/v2.0/megatron/tokenizer/tokenizer.py#L231
+ # # depends on a single json file, Tokenizer.from_file(vocab_file)
+ # # example: gpt-neox-20B
+ # HFTokenizer = auto()
+ #
+ # # depends on: model_file, sentencepiece.SentencePieceProcessor(model_file)
+ # # examples:
+ # SentencePieceTokenizer = auto()
+ #
+ # # depends on 3 files: vocab.json, merges.txt, special_tokens.txt
+ # # source:
+ # # - https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/tokenizer/gpt2_tokenization.py#L92
+ # # Byte-level BPE
+ # GPT2BPETokenizer = auto()
742
+
743
+
744
+ if __name__ == "__main__":
745
+ for tokenizer_config in tokenizer_factory.all_tokenizer_configs:
746
+ if True:
747
+ # if "t5" in tokenizer_config.name_or_path:
748
+ tokenizer1 = tokenizer_factory.get_tokenizer(tokenizer_config.name_or_path)
749
+ tokenizer2 = tokenizer_factory.get_tokenizer(tokenizer_config.name_display)
750
+ tokenizer3 = tokenizer_factory.get_tokenizer(
751
+ tokenizer_config.name_display.split("/")[-1]
752
+ )
753
+ assert tokenizer1 == tokenizer2 == tokenizer3
754
+ print(tokenizer_config.name_or_path, len(tokenizer1))
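
The rest of the Space goes through the shared tokenizer_factory defined above. A hedged sketch of the lookup-and-load flow (tokenizers are downloaded on first use, so this needs network access):

# Hypothetical standalone use of vocab.tokenizer_factory (not part of the commit).
from vocab import tokenizer_factory

config = tokenizer_factory.get_tokenizer_config("Qwen/Qwen2.5-72B")
print(config.org, config.link)  # organization and hub link from the config table

tokenizer = tokenizer_factory.get_tokenizer(config.name_or_path)
print(len(tokenizer.encode("tokenization playground")))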