George Pantazopoulos committed on
Commit f34a973 · 1 Parent(s): 2a67d66

chore: cleanup

.gitattributes DELETED
@@ -1,35 +0,0 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
LICENSE DELETED
@@ -1,21 +0,0 @@
- MIT License
-
- Copyright (c) 2025 Athens NLP Summer School
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in all
- copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
 
README.md DELETED
@@ -1,12 +0,0 @@
- ---
- title: Tokenization Playground
- emoji: 📝
- colorFrom: indigo
- colorTo: purple
- sdk: gradio
- pinned: false
- short_description: Compare different tokenizers
- ---
-
- # tokenization_playground
- Link to source code: https://github.com/athnlp/tokenization_playground
 
app.py DELETED
@@ -1,25 +0,0 @@
- import os
-
- import gradio as gr
- from huggingface_hub import login
-
- from playground_app import demo as playground_tab
-
- auth_token = os.environ.get("HF_TOKEN", None)
- if auth_token:
-     login(token=auth_token)
-
-
- title = """
- <div align="center">
-     <span>Tokenization Playground</span>
- </div>
- """
-
- with gr.Blocks() as demo:
-     _ = gr.HTML(f"<h1 style='text-align: center; margin-bottom: 1rem'>{title}</h1>")
-     _ = playground_tab.render()
-
- if __name__ == "__main__":
-     demo.launch()
-     # demo.launch(share=True)
 
character_util.py DELETED
@@ -1,178 +0,0 @@
- import json
- import os
- from pathlib import Path
- from typing import Any, Literal
-
- import numpy as np
- import pandas as pd
- from utils.lang_util import detect_language_by_unicode, language_ranges
- from utils.log_util import logger
- from utils.text_util import contains_digit, get_space_count
- from vocab import tokenizer_factory
-
- CURRENT_DIR = Path(__file__).resolve().parent
-
- cache = {}
- default_columns = ["digit", "zh"]
-
-
- def text_to_unicode(text: str) -> str:
-     """Convert text to its \\uXXXX escape representation."""
-     return "".join(rf"\u{ord(character):04X}" for character in text)
-
-
- def calculate_dist(token_lens: list[int]) -> str:
-     """Summarize a list of token lengths as a 'min,median,max' string."""
-     if not token_lens:
-         return "-"
-     return f"{min(token_lens)},{round(np.median(token_lens))},{max(token_lens)}"
-
-
- def iter_vocab(
-     tokenizer_name: str,
-     from_cache: bool = True,
-     cache_dir: str = "stats",
- ) -> pd.DataFrame | dict:
-     """Iterate over a tokenizer's vocabulary and collect per-category statistics.
-
-     :param tokenizer_name: unique tokenizer id (org/model)
-     :param from_cache: reuse cached statistics if available
-     :param cache_dir: directory for the statistics cache
-     :return: a dict of statistics for the given tokenizer
-     """
-     tokenizer_config = tokenizer_factory.get_tokenizer_config(tokenizer_name)
-
-     cache_dir = os.path.join(CURRENT_DIR, cache_dir)
-     os.makedirs(cache_dir, exist_ok=True)
-
-     # load from cache
-     cache_path = os.path.join(cache_dir, "character_stats.json")
-     if not cache and os.path.exists(cache_path):
-         with open(cache_path, encoding="utf-8") as f_tmp:
-             cache.update(json.load(f_tmp))
-     if from_cache and tokenizer_name in cache:
-         # logger.info(f"load {tokenizer_config.name_or_path} from cache")
-         return cache[tokenizer_name]
-
-     tokenizer = tokenizer_factory.get_tokenizer(tokenizer_name)
-
-     tokens_by_lang = {lang[1]: [] for lang in language_ranges}
-     digit_tokens = []
-     space_tokens = []
-     byte_tokens = []
-
-     buffer = []
-     for token_id in range(tokenizer.vocab_size):
-         # for token_id in tokenizer.get_vocab():
-         # for token_id in range(len(tokenizer)):
-         decode_str = tokenizer.decode([token_id], skip_special_tokens=False)
-         token = tokenizer.convert_ids_to_tokens([token_id], skip_special_tokens=False)[0]
-         tags = []
-         if token is None:  # some vocabularies have empty (non-contiguous) ids
-             continue
-         if isinstance(token, bytes):
-             token = token.decode("utf-8", errors="ignore")
-
-         if hasattr(tokenizer, "sp_model") and tokenizer.sp_model.is_byte(token_id):
-             tags.append("is_byte")
-             byte_tokens.append(token)
-
-         language_tags = detect_language_by_unicode(decode_str)
-         for language in language_tags:
-             tokens_by_lang[language[1]].append(decode_str)
-
-         if contains_digit(decode_str):
-             tags.append("digit")
-             digit_tokens.append(decode_str)
-
-         space_count = get_space_count(decode_str)
-         if space_count > 0:
-             space_tokens.append(decode_str)
-
-         buffer.append(
-             json.dumps(
-                 {
-                     "id": token_id,
-                     "token": token,
-                     "token_decode": decode_str,
-                     "token_dumps": json.dumps(token),
-                     "token_unicode": text_to_unicode(token),
-                     "token_len": len(decode_str),
-                 },
-                 ensure_ascii=False,
-             )
-             + "\n"
-         )
-
-     result = {
-         "tokenizer": tokenizer_factory.get_name_with_hyperlink(tokenizer_name),
-         "organization": tokenizer_config.org,
-         "vocab_size": len(tokenizer),
-         "num(digit)": len(digit_tokens),
-         "len(digit)": calculate_dist([len(token) for token in digit_tokens]),
-         "num(space)": len(space_tokens),
-         "len(space)": calculate_dist([len(token) for token in space_tokens]),
-     }
-
-     for lang, tokens in tokens_by_lang.items():
-         result[f"num({lang})"] = len(tokens)
-         result["len(" + lang + ")"] = calculate_dist([len(token) for token in tokens])
-
-     out_path = os.path.join(
-         cache_dir, f"iter_vocab/{tokenizer_name.replace('/', '_')}.vocab.jsonl"
-     )
-     os.makedirs(os.path.dirname(out_path), exist_ok=True)
-     with open(out_path, "w", encoding="utf-8") as f_out:
-         for line in buffer:
-             f_out.write(line)
-     len_before = len(cache)
-     cache[tokenizer_name] = result
-     len_after = len(cache)
-     logger.info(f"saving {tokenizer_name} to memory and file cache: {len_before}->{len_after}")
-     with open(cache_path, "w", encoding="utf-8") as f_out:
-         f_out.write(json.dumps(cache, ensure_ascii=False, indent=2))
-     return result
-
-
- def to_dataframe(stats: dict[str, Any], columns: list[str]) -> pd.DataFrame:
-     table = []
-     for stat in stats.values():
-         filtered_stat = {}
-         for k, v in stat.items():
-             if not k.startswith("num") and not k.startswith("len"):
-                 filtered_stat[k] = v
-             if any(column in k for column in columns):
-                 k = k.replace("ja-kana", "kana")
-                 filtered_stat[k] = v
-         table.append(filtered_stat)
-     return pd.DataFrame(table)
-
-
- def get_character_table(
-     tokenizer_filter: str | None = None,
-     columns: list | None = None,
-     return_type: Literal["dict", "dataframe"] | None = "dataframe",
- ) -> pd.DataFrame | dict:
-     logger.info(f"columns: {columns}, tokenizer_filter: {tokenizer_filter}")
-     stats = {}
-     if columns is None:
-         columns = default_columns
-     if tokenizer_filter is not None:
-         tokenizer_names = [
-             tokenizer_config.name_or_path
-             for tokenizer_config in tokenizer_factory.all_tokenizer_configs
-             if tokenizer_filter.lower() in tokenizer_config.name_or_path.lower()
-         ]
-     else:
-         tokenizer_names = tokenizer_factory.all_tokenizer_names
-
-     for tokenizer_name in tokenizer_names:
-         stat = iter_vocab(tokenizer_name)
-         stats[tokenizer_name] = stat
-
-     if return_type == "dataframe":
-         stats = to_dataframe(stats, columns)
-     return stats
-
-
- if __name__ == "__main__":
-     # aa = get_character_table(tokenizer_filter="baichuan")
-     df = get_character_table()
-     logger.info(f"\n{df.to_markdown(index=False)}")
 
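A note on reading the statistics this module produces: each "len(...)" column is the "min,median,max" summary string returned by calculate_dist. A minimal, standalone illustration (it simply mirrors the helper as defined in the deleted file above):

    import numpy as np

    def calculate_dist(token_lens: list[int]) -> str:
        # "min,median,max" of the observed token lengths, or "-" when empty
        if not token_lens:
            return "-"
        return f"{min(token_lens)},{round(np.median(token_lens))},{max(token_lens)}"

    print(calculate_dist([1, 2, 2, 5]))  # 1,2,5
    print(calculate_dist([]))            # -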
playground_app.py DELETED
@@ -1,91 +0,0 @@
- import gradio as gr
- from playground_examples import examples
- from playground_util import on_load, tokenize, tokenize_pair
- from vocab import tokenizer_factory
-
- get_window_url_params = """
- function(url_params) {
-     const params = new URLSearchParams(window.location.search);
-     url_params = JSON.stringify(Object.fromEntries(params));
-     return url_params;
- }
- """
-
- all_tokenizer_name = [
-     (config.name_display, config.name_or_path)
-     for config in tokenizer_factory.all_tokenizer_configs
- ]
-
- with gr.Blocks() as demo:
-     with gr.Row():
-         gr.Markdown("## Input Text")
-         dropdown_examples = gr.Dropdown(
-             sorted(examples.keys()),
-             value="Examples",
-             type="index",
-             allow_custom_value=True,
-             show_label=False,
-             container=False,
-             scale=0,
-             elem_classes="example-style",
-         )
-     user_input = gr.Textbox(
-         label="Input Text",
-         lines=5,
-         show_label=False,
-     )
-
-     with gr.Row():
-         with gr.Column(scale=6), gr.Group():
-             tokenizer_name_1 = gr.Dropdown(all_tokenizer_name, label="Tokenizer 1")
-
-         with gr.Column(scale=6), gr.Group():
-             tokenizer_name_2 = gr.Dropdown(all_tokenizer_name, label="Tokenizer 2")
-
-     with gr.Row():
-         # dynamic change label
-         with gr.Column():
-             output_text_1 = gr.Highlightedtext(show_legend=False, show_inline_category=False)
-         with gr.Column():
-             output_text_2 = gr.Highlightedtext(show_legend=False, show_inline_category=False)
-
-     with gr.Row():
-         output_table_1 = gr.Dataframe()
-         output_table_2 = gr.Dataframe()
-
-     tokenizer_name_1.change(
-         tokenize, [user_input, tokenizer_name_1], [output_text_1, output_table_1]
-     )
-
-     tokenizer_name_2.change(
-         tokenize, [user_input, tokenizer_name_2], [output_text_2, output_table_2]
-     )
-
-     user_input.change(
-         tokenize_pair,
-         [user_input, tokenizer_name_1, tokenizer_name_2],
-         [output_text_1, output_table_1, output_text_2, output_table_2],
-         show_api=False,
-     )
-
-     dropdown_examples.change(
-         lambda example_idx: (
-             examples[sorted(examples.keys())[example_idx]]["text"],
-             examples[sorted(examples.keys())[example_idx]]["tokenizer_1"],
-             examples[sorted(examples.keys())[example_idx]]["tokenizer_2"],
-         ),
-         dropdown_examples,
-         [user_input, tokenizer_name_1, tokenizer_name_2],
-         show_api=False,
-     )
-
-     demo.load(
-         fn=on_load,
-         inputs=[user_input],
-         outputs=[user_input, tokenizer_name_1, tokenizer_name_2],
-         js=get_window_url_params,
-         show_api=False,
-     )
-
- if __name__ == "__main__":
-     demo.launch(share=True)
 
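The demo.load hook above feeds the page's query string (collected by get_window_url_params) into on_load, so a comparison can be deep-linked. As an illustration only (the host placeholder is not a real URL; the parameter names are the ones read in playground_util.on_load), a link of this form pre-fills the input text and both tokenizer dropdowns on load:

    https://<space-host>/?text=hello&tokenizer1=openai/gpt-4o&tokenizer2=Qwen/Qwen2.5-72B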
playground_examples.py DELETED
@@ -1,42 +0,0 @@
- default_user_input = """Replace this text in the input field to see how tokenization works."""
- default_tokenizer_name_1 = "openai/gpt-4o"
- default_tokenizer_name_2 = "Qwen/Qwen2.5-72B"
-
-
- number_example = """127+677=804
- 127 + 677 = 804\n
- 1275+6773 = 8041
- 1275 + 6773 = 8048"""
-
- code_example = """for i in range(1, 101):
-     if i % 3 == 0 and i % 5 == 0:
-         print("FizzBuzz")
-     elif i % 3 == 0:
-         print("Fizz")
-     elif i % 5 == 0:
-         print("Buzz")
-     else:
-         print(i)
- """
-
- spelling_example = """How do you spell "accommodate"?
- How many letters are in the word "accommodate"?
- How many r's are in the word strawberry?"""
-
- examples = {
-     "number": {
-         "text": number_example,
-         "tokenizer_1": default_tokenizer_name_1,
-         "tokenizer_2": default_tokenizer_name_2,
-     },
-     "code": {
-         "text": code_example,
-         "tokenizer_1": default_tokenizer_name_1,
-         "tokenizer_2": default_tokenizer_name_2,
-     },
-     "spelling": {
-         "text": spelling_example,
-         "tokenizer_1": default_tokenizer_name_1,
-         "tokenizer_2": default_tokenizer_name_2,
-     },
- }
 
playground_util.py DELETED
@@ -1,107 +0,0 @@
- import json
- from functools import lru_cache
- from typing import Any
-
- import gradio as gr
- import pandas as pd
- from playground_examples import (
-     default_tokenizer_name_1,
-     default_tokenizer_name_2,
-     default_user_input,
- )
- from utils.i18n_util import get_lang
- from utils.log_util import logger
- from vocab import tokenizer_factory
-
-
- @lru_cache
- def _tokenize(text: str, tokenizer_name: str, color_num: int = 5, add_special_token: bool = False):
-     logger.info(
-         "param=" + json.dumps({"text": text, "tokenizer_type": tokenizer_name}, ensure_ascii=False)
-     )
-     pos_tokens = []
-     tokenizer = tokenizer_factory.get_tokenizer(tokenizer_name)
-     encoding = tokenizer.encode(text)  # add_special_token is accepted but not currently applied
-     table = []
-
-     for idx, token_id in enumerate(encoding):
-         decoded_text = tokenizer.decode([token_id])
-         decoded_text = decoded_text.replace(
-             " ", "⋅"
-         )  # replace space with ⋅ for better visualization
-         pos_tokens.extend([(decoded_text, str(idx % color_num))])
-
-         try:
-             token = tokenizer.decode([token_id])[0]
-         except Exception:
-             token = {v: k for k, v in tokenizer.get_vocab().items()}[token_id]
-
-         if isinstance(token, bytes):
-             try:
-                 token_str = token.decode("utf-8")
-             except Exception:
-                 token_str = token.decode("utf-8", errors="ignore")
-                 logger.error(
-                     f"{idx}: decode_error: "
-                     + json.dumps(  # gpt_35_turbo tokens often fail to decode; record them here
-                         {
-                             "tokenizer_type": tokenizer_name,
-                             "token": str(token),
-                             "token_str": token_str,
-                         },
-                         ensure_ascii=False,
-                     )
-                 )
-
-             # json_dumps = json.dumps(token_str)
-         elif isinstance(token, str):
-             token_str = token
-         else:
-             logger.error(
-                 f"{idx}: wrong type for token {token_id} {type(token)} "
-                 + json.dumps({"text": text, "tokenizer_type": tokenizer_name}, ensure_ascii=False)
-             )
-             token_str = token
-
-         table.append({"TokenID": token_id, "Text": decoded_text})
-
-     table_df = pd.DataFrame(table)
-     logger.info(f"tokenizer_type={tokenizer_name}, Tokens={table[:4]}")
-     return pos_tokens, len(encoding), table_df
-
-
- def tokenize(
-     text: str, tokenizer_name: str, color_num: int = 5
- ) -> tuple[dict[Any, Any], pd.DataFrame]:
-     """Tokenize an input text with a single tokenizer."""
-     pos_tokens, num_tokens, table_df = _tokenize(text, tokenizer_name, color_num)
-     return gr.update(value=pos_tokens, label=f"Tokens: {num_tokens}"), table_df
-
-
- def tokenize_pair(text, tokenizer_type_1, tokenizer_type_2, color_num: int = 5):
-     """Handler for input_text.change: tokenize the text with both tokenizers."""
-     pos_tokens_1, table_df_1 = tokenize(text, tokenizer_type_1, color_num)
-     pos_tokens_2, table_df_2 = tokenize(text, tokenizer_type_2, color_num)
-     return pos_tokens_1, table_df_1, pos_tokens_2, table_df_2
-
-
- def on_load(url_params: str, request: gr.Request = None) -> tuple[str, str, str]:
-     """Function triggered on page load to read URL parameters."""
-     text = default_user_input
-     tokenizer_type_1 = default_tokenizer_name_1
-     tokenizer_type_2 = default_tokenizer_name_2
-     lang, client_ip = None, None
-     try:
-         url_params_dict = json.loads(url_params)
-     except json.JSONDecodeError:
-         url_params_dict = {}
-
-     if request:
-         lang, _ = get_lang(request)
-         logger.info(str(request.headers))
-         client_ip = request.client.host
-
-     tokenizer_type_1 = url_params_dict.get("tokenizer1", default_tokenizer_name_1)
-     tokenizer_type_2 = url_params_dict.get("tokenizer2", default_tokenizer_name_2)
-     text = url_params_dict.get("text", default_user_input)
-     logger.info(f"client_ip: {client_ip}; lang: {lang} params: {url_params}")
-     return text, tokenizer_type_1, tokenizer_type_2
 
requirements.txt DELETED
@@ -1,13 +0,0 @@
- gradio>=4.38.1
- transformers>4.40.0
- sentencepiece
- tiktoken
- icetk
- torch
- nltk
- boto3
- protobuf==4.25.3
- ai2-olmo
- ipadic
- fugashi
- datasets
 
utils/__pycache__/i18n_util.cpython-311.pyc DELETED
Binary file (1.61 kB)
 
utils/__pycache__/lang_util.cpython-311.pyc DELETED
Binary file (3.24 kB)
 
utils/__pycache__/log_util.cpython-311.pyc DELETED
Binary file (633 Bytes)
 
utils/__pycache__/text_util.cpython-311.pyc DELETED
Binary file (2.21 kB)
 
utils/i18n_util.py DELETED
@@ -1,26 +0,0 @@
- import gradio as gr
-
-
- def get_lang(request: gr.Request):
-     """Parse the Accept-Language header, e.g. 'zh,en;q=0.9,zh-CN;q=0.8'.
-
-     Returns the raw header value and a list of recognized language codes.
-     """
-     accept_language = None
-     langs = []
-     try:
-         accept_language = request.headers["Accept-Language"]
-         for lang in accept_language.split(",")[:5]:
-             lang = lang.lower()
-             if lang.startswith("en"):
-                 langs.append("en")
-             elif lang.startswith("es"):
-                 langs.append("es")
-             elif lang.startswith("zh"):
-                 langs.append("zh")
-             elif lang.startswith("fr"):
-                 langs.append("fr")
-             elif lang.startswith("de"):
-                 langs.append("de")
-     except Exception as e:
-         print(e)
-     return accept_language, langs
 
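For clarity, a small standalone sketch of the parsing rule get_lang applies to the example header in its docstring (illustration only; it mirrors the loop above rather than importing the module):

    header = "zh,en;q=0.9,zh-CN;q=0.8"
    codes = []
    for entry in header.split(",")[:5]:
        entry = entry.lower()
        for code in ("en", "es", "zh", "fr", "de"):
            if entry.startswith(code):
                codes.append(code)
                break
    print(codes)  # ['zh', 'en', 'zh']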
utils/lang_util.py DELETED
@@ -1,89 +0,0 @@
- """
- Language detection by Unicode ranges.
-
- detect_language_by_unicode defines Unicode ranges for a set of scripts and uses regex matching
- to check whether the input string contains characters from those ranges, in order to guess
- which language(s) the string may use. It returns a list of all matching languages; if no
- characters from the defined ranges are detected, the list is empty.
-
- Note that some languages (such as Chinese and Japanese) share parts of their character ranges,
- so a string can be identified as several languages at once. The Latin range is also very broad
- and covers the basic alphabet of nearly all Western languages, so more fine-grained logic would
- be needed to distinguish individual Latin-script languages.
-
- Some Latin-script languages can be told apart by checking for specific letters and diacritics,
- but the accuracy of that approach is limited by how comprehensive and distinctive the chosen
- features are. For example, English detection restricted to the basic A-Z letters overlaps with
- every other language using the same alphabet, and languages such as French and Spanish share
- certain accented characters, which can again cause a string to match multiple languages.
-
- ## common languages
- English | 简体中文 | 繁體中文 | 한국어 | Español | 日本語 | हिन्दी | Русский | Рortuguês | తెలుగు | Français | Deutsch | Tiếng Việt |
- """
-
- import re
- from typing import List
-
- # Most tokens are 'latin', so that script is not counted here.
- common_lang = ["Chinese", "Japanese-Kana", "Korean", "Arabic", "number"]
-
- # Unicode ranges of different languages
- language_ranges = {
-     (
-         "Arabic",
-         "ar",
-     ): r"[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]",
-     # 'CJK' https://en.wikipedia.org/wiki/CJK_Unified_Ideographs
-     ("Chinese", "zh"): r"[\u4e00-\u9fff]",
-     ("Japanese", "ja"): r"[\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF]",
-     # https://stackoverflow.com/questions/19899554/unicode-range-for-japanese
-     # Kana refers to the Japanese hiragana and katakana characters that represent phonetic sounds.
-     (
-         "Japanese-Kana",
-         "ja-kana",
-     ): r"[\u3040-\u309F\u30A0-\u30FF]",  # Hiragana & Katakana
-     ("Korean", "ko"): r"[\uac00-\ud7a3]",
-     # Latin-script family
-     # ('Latin', 'la'): r'[\u0000-\u007F\u0080-\u00FF]',
-     # ('English', 'en'): r'[A-Za-z]',  # may overlap with other languages using the basic Latin alphabet
-     # ('French', 'fr'): r'[\u00C0-\u00FF]',
-     # ('German', 'de'): r'[\u00C4\u00D6\u00DC\u00E4\u00F6\u00FC\u00DF]',
-     # ('Spanish-specific'): r'[\u00C1\u00E1\u00C9\u00E9\u00CD\u00ED\u00D3\u00F3\u00DA\u00FA\u00D1\u00F1\u00FC]',  # characters specific to Spanish
-     # Slavic family
-     # ('Cyrillic', ''): r'[\u0400-\u04FF\u0500-\u052F\u2DE0-\u2DFF\uA640-\uA69F]',
-     #
-     # 'Greek': r'[\u0370-\u03FF\u1F00-\u1FFF]',  # Greek alphabet
-     # 'Hebrew': r'[\u0590-\u05FF\uFB1D-\uFB4F]',  # Hebrew
- }
-
-
- def detect_language_by_unicode(text: str) -> List:
-     """Return the (language, code) pairs whose Unicode ranges match characters in text.
-
-     :param text: input string
-     :return: list of matching (language, code) tuples
-     """
-     detected_languages = []
-     for language, pattern in language_ranges.items():
-         if re.search(pattern, text):
-             detected_languages.append(language)
-
-     return detected_languages
-
-
- if __name__ == "__main__":
-     # test the function
-     test_strings = {
-         # Latin scripts
-         "Hello, world!": "English/Latin",
-         "Hola": "Spanish",
-         "Bonjour": "French",
-         "Guten Tag": "German",
-         "Empieza donde estás. ": "Spanish",
-         # CJK
-         "你好": "Chinese",
-         "こんにちは": "Japanese",
-         "안녕하세요": "Korean",
-         # others
-         "Привет": "Russian/Cyrillic",
-         "مرحبا": "Arabic",
-     }
-
-     for s, expected in test_strings.items():
-         # print(f"'{s}' === Detected lang: {detect_language(s)} === Expected: {expected}")
-         print(
-             f"'{s}'\nDetected lang: {detect_language_by_unicode(s)}\nExpected lang: {expected}"
-         )
 
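Because the CJK ranges overlap (as the module docstring notes), one string can match several entries. A quick illustration, assuming the module above is importable as utils.lang_util:

    from utils.lang_util import detect_language_by_unicode

    # the kana match both Japanese ranges, and the kanji range is shared with Chinese
    print(detect_language_by_unicode("こんにちは、世界"))
    # [('Chinese', 'zh'), ('Japanese', 'ja'), ('Japanese-Kana', 'ja-kana')]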
utils/log_util.py DELETED
@@ -1,10 +0,0 @@
- import logging
-
- logging.basicConfig(
-     format="[%(asctime)s] [%(levelname)s] [%(process)d:%(thread)d] [%(filename)s:%(lineno)d:%(funcName)s] %(message)s",
-     level=logging.INFO,
-     datefmt="%Y-%m-%d %H:%M:%S",
- )
-
- logger = logging.getLogger(__name__)
- logger.setLevel(logging.INFO)
 
utils/oov_util.py DELETED
@@ -1,122 +0,0 @@
- import json
-
- from vocab import TokenizerImpl, tokenizer_factory
-
- # aliases over the factory so the helpers below can be called directly
- all_tokenizer_config = tokenizer_factory.all_tokenizer_configs
- load_tokenizer = tokenizer_factory.load_tokenizer
-
- text = (
-     "hello; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속;"
-     " 確実に春が近づいてること; a közoktatással? _ Belföld;"
-     " pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ;"
-     " निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:;"
-     " « અમરેલીનાં મહિલા વિકાસ; 🦙❤❥웃유♋☮✊;"
-     "װיקיװערטערבוך "
- )
- whitespace = "\t \n\n\r "
- control_chars = b"\x00\x01\x02\x03\x04".decode("utf-8")  # control characters (not appended below)
-
- text += whitespace
-
-
- def get_unk(tokenizer_config):
-     tokenizer = load_tokenizer(tokenizer_config)
-     if hasattr(tokenizer, "unk_token"):
-         return f"{tokenizer.unk_token}, {tokenizer.unk_token_id}"
-     else:
-         return "unk_token not found"
-
-
- # def infer_tokenizer_impl(tokenizer_config):
- def infer_tokenizer_type(tokenizer_config):
-     tokenizer = load_tokenizer(tokenizer_config)
-     if tokenizer_config.impl == TokenizerImpl.TikToken:
-         return "tiktoken"
-     if hasattr(tokenizer, "backend_tokenizer"):
-         return str(
-             type(tokenizer.backend_tokenizer.model)
-         )  # type(tokenizer._tokenizer.model))
-     # orion: sp_model.Load(vocab_file), inherits from PreTrainedTokenizer
-     elif hasattr(tokenizer, "sp_model"):  # based on the sentencepiece package
-         # for i in range(tokenizer.sp_model.piece_size()):
-         #     if tokenizer.sp_model.is_byte(i):
-         #         print("")
-         return f"sp_model, byte_num: {sum([tokenizer.sp_model.is_byte(i) for i in range(tokenizer.sp_model.piece_size())])}"
-
-     # sp.Load(model_path), and includes an image_tokenizer
-     elif "glm-" in tokenizer_config.name_or_path:
-         return f"byte_num: {sum([tokenizer.sp_tokenizer.text_tokenizer.sp.is_byte(i) for i in range(tokenizer.sp_tokenizer.text_tokenizer.sp.piece_size())])}"
-     # sp.Load(model_path), without an image_tokenizer
-     elif (
-         "glm2-" in tokenizer_config.name_or_path
-         or "glm3-" in tokenizer_config.name_or_path
-         or "CharacterGLM-6B" in tokenizer_config.name_or_path
-     ):
-         return f"byte_num: {sum([tokenizer.tokenizer.sp_model.is_byte(i) for i in range(tokenizer.tokenizer.sp_model.piece_size())])}"
-     elif (
-         "abeja/gpt-neox-japanese-2.7b" == tokenizer_config.name_or_path
-     ):  # supports byte-level fallback, which avoids the OOV problem
-         return "japanese-bpe: https://github.com/tanreinama/Japanese-BPEEncoder_V2"
-     # bert-base-japanese is special: "word_tokenizer_type": "mecab", see https://huggingface.co/tohoku-nlp/bert-base-japanese/blob/main/tokenizer_config.json
-     elif "bert-base-japanese" in tokenizer_config.name_or_path:
-         return (
-             "wordpiece.MecabTokenizer, supports byte-level https://taku910.github.io/mecab/"
-         )
-     elif "moss" in tokenizer_config.name_or_path:
-         return "probably sentencepiece.byte_bpe, to be confirmed"
-     elif "byt5" in tokenizer_config.name_or_path:
-         return "unknown, to be determined"
-     else:
-         print("catch", tokenizer_config.name_or_path)
-         raise ValueError(f"unknown tokenizer implementation: {tokenizer_config.name_or_path}")
-
-
- def test_lossless(tokenizer_config):
-     """Check whether encode->decode round-trips the test text losslessly.
-
-     Why does xlm-roberta-base have so few OOVs? Is it because of byte fallback?
-     :param tokenizer_config:
-     :return:
-     """
-     tokenizer = load_tokenizer(tokenizer_config)
-     encoding = tokenizer.encode(text, add_special_tokens=False)
-     decoding = tokenizer.decode(encoding)
-
-     if text in decoding:
-         # print(tokenizer_config.name, tokenizer_config.impl, "lossless: true")
-         pass
-     else:
-         unk_count = sum(
-             [1 for token_id in encoding if token_id == tokenizer.unk_token_id]
-         )
-         oov_tokens = []
-         # if tokenizer_config.impl == TokenizerImpl.SentencePiece:
-         #     print(sum([tokenizer.is_byte(i) for i in range(tokenizer.piece_size())]))
-
-         print("#######" * 5)
-         print(
-             f"{tokenizer_config.name_or_path}, {infer_tokenizer_type(tokenizer_config)}\n"
-             f"lossless: false; unk_token: {get_unk(tokenizer_config)},"
-             f" unk_ratio: {unk_count/len(encoding):.4f}; oov: []"
-         )
-         for i in range(len(text)):
-             if text[i] != decoding[i]:
-                 # print(f"text[{i}] = {str(bytes(text[i:], 'utf-8'))}\n"
-                 #       f"decoding[{i}] = {str(bytes(decoding[i:], 'utf-8'))}")
-                 print(
-                     f"text[{i}] = {json.dumps(text[i:], ensure_ascii=False)}, \n"
-                     f"decoding[{i}] = {json.dumps(decoding[i:], ensure_ascii=False)}"
-                 )
-
-                 break
-
-
- for config in all_tokenizer_config:
-     # if "xlm-roberta-base" in config.name:
-     # if "chatglm3-6b" in config.name:
-     # if "bert-base-japanese" in config.name:
-     # if "moss" in config.name:
-     # if "byt5" in config.name:
-     if "baichuan" in config.name_or_path:
-         # if "CharacterGLM-6B" in config.name:
-         # if "fastchat-t5" in config.name:  # raises pyo3_runtime.PanicException: AddedVocabulary bad split
-         # if True:
-         # test_unk(config)
-         test_lossless(config)
 
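The core check in test_lossless is an encode/decode round trip. A minimal, self-contained version of that idea with an arbitrary Hugging Face tokenizer (the model id and sample string are illustrative, not taken from this script's config list):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("bert-base-uncased")
    sample = "hello 🦙"
    ids = tok.encode(sample, add_special_tokens=False)
    roundtrip = tok.decode(ids)
    # BERT's WordPiece vocab has no byte fallback, so the emoji maps to [UNK] and is not recovered
    print(sample in roundtrip)  # False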
utils/text_util.py DELETED
@@ -1,47 +0,0 @@
- """Character-level text helpers."""
-
-
- def detect_lang_from_unicode():
-     pass
-
-
- def is_digit_char(uchar):
-     return uchar in "0123456789"
-
-
- def contains_digit(text):
-     return any(is_digit_char(ch) for ch in text)
-
-
- def is_all_digit(text):
-     return all(is_digit_char(char) for char in text)
-
-
- def get_digit_count(text):
-     digit_count = 0
-     for char in text:
-         if char in "0123456789":
-             digit_count += 1
-     return digit_count
-
-
- def has_space(text):
-     pass
-
-
- def is_all_space(text):
-     pass
-
-
- def get_space_count(text):
-     space_count = 0
-     for char in text:
-         if len(char.strip()) == 0:
-             space_count += 1
-     return space_count
 
vocab.py DELETED
@@ -1,754 +0,0 @@
1
- from dataclasses import dataclass, field
2
- from enum import Enum, auto
3
- from typing import Any, Dict
4
-
5
- import tiktoken
6
- from transformers import AutoTokenizer
7
- from utils.log_util import logger
8
-
9
- """Interface:
10
- # https://github.com/huggingface/transformers/blob/main/src/transformers/tokenization_utils_base.py
11
-
12
- tokenizer.encode -> List[int]: Converts a string to a sequence of ids (integer)
13
- tokenizer.decode
14
- tokenizer.convert_tokens_to_string  # gpt4 does not have this method
15
- tokenizer.convert_ids_to_tokens
16
- tokenizer.tokenize -> List[str]: Converts a string into a sequence of tokens ->
17
-
18
-
19
- tokenizer.parent = ""
20
- tokenizer.vocab_size
21
- tokenizer.get_vocab() # gpt-neox-20b, llama
22
- tokenizer.type = TokenizerType.ByteBPE.name
23
- tokenizer.implementation = TokenizerImpl.SentencePiece.name # https://github.com/facebookresearch/llama/blob/main/llama/tokenizer.py
24
- "HFGPT2Tokenizer", "HFTokenizer", "GPT2BPETokenizer", "CharLevelTokenizer", "TiktokenTokenizer", "SPMTokenizer", https://github.com/EleutherAI/gpt-neox/blob/main/tools/preprocess_data.py
25
-
26
-
27
- tokenizer.comments = "split all numbers into individual digits, " \
28
- "and fallback to bytes to decompose unknown UTF-8 characters"
29
-
30
- tokenizer.all_special_tokens # baichuan
31
- tokenizer.special_tokens_set # gpt3.5_turbo
32
- tokenizer.special_tokens_map
33
- """
34
-
35
-
36
- class TokenizerImpl(Enum):
-     """
-     - https://github.com/huggingface/tokenizers/blob/main/bindings/python/py_src/tokenizers/implementations/__init__.py
-     - https://huggingface.co/docs/transformers/tokenizer_summary
-     - https://github.com/EleutherAI/gpt-neox/blob/main/megatron/tokenizer/tokenizer.py
-
-     ## google/BertTokenizer
-     - https://github.com/huggingface/tokenizers/blob/main/bindings/python/py_src/tokenizers/implementations/bert_wordpiece.py
-     - characteristics
-         - algorithm: BERT's encoder is BPE-WordPiece; words are split into subword units marked by a prefix (## in BERT)
-         - vocabulary: tokens starting with ## denote subwords
-         - Chinese is tokenized at character granularity
-         - English uses WordPiece
-
-     ## google/sentencepiece
-     - https://github.com/google/sentencepiece/blob/3863f7648e5d8edb571ac592f3ac4f5f0695275a/src/sentencepiece_model.proto#L48
-     - supports sentencepiece and wordpiece
-         - does sentencepiece have byte-bpe?
-         - UNIGRAM = 1;  // Unigram language model with dynamic algorithm
-         - BPE = 2;  // Byte Pair Encoding
-         - WORD = 3;  // Delimitered by whitespace.
-         - CHAR = 4;  // tokenizes into character sequence
-         - wordpiece
-     - characteristics:
-         - training: spm_train --model_type unigram/bpe/char/word
-         - special symbol: Ġ
-         - files: *.sp_model or *.model (optional .vocab file), "spm" for short (other formats such as tokenizer.json exist for hf_tokenizer compatibility)
-         - implementation:
-             - dependency: protobuf
-             - training: `import sentencepiece as spm; spm.SentencePieceTrainer.train` or `spm_train`
-             - loading: `import sentencepiece as spm; spm.SentencePieceProcessor().Load(vocab_file)`
-             - methods: the object is a SentencePieceProcessor, sp_model.id_to_piece; there are tokenizer.json and tokenizer.model files
-         - tokenization:
-             - pre_tokenizers.ByteLevel(add_prefix_space=True, use_regex=False)
-         - vocabulary: contains the ▁ character (U+2581), which marks a space or the start of a sentence.
-     - examples: google-t5, llama, baichuan, orion,
-         - llama: tokenizer.json (contains model.vocab and model.merges), tokenizer.model
-         - grok: originally a .model file, later converted to tokenizer.json
-         - google-t5: tokenizer.json, spiece.model
-         - Skywork-13B-Math: tokenizer.model
-         - xlm_roberta: sentencepiece.bpe.model
-     - GPT2Tokenizer
-         - tokenizer.json, vocab.json, merges.txt (https://huggingface.co/openai-community/gpt2)
-         - vocab.bpe, encoder.json, dict.txt (fairseq version, rarely used and can be ignored)
-
-     ## thu/icetk
-     - icetk: a fork of sentencepiece that adds an image_tokenizer.
-     - glm, chatglm1, chatglm2
-
-     ## huggingface/tokenizers
-     - https://github.com/huggingface/tokenizers
-     - vs sentencepiece
-         - supports sentencepiece
-             - .model can be converted to (merges.txt + vocab.json) or tokenizer.json
-                 - https://github.com/huggingface/tokenizers/blob/main/bindings/python/scripts/sentencepiece_extractor.py
-             - loads merges.txt, vocab.json
-             - SentencePieceBPETokenizer https://github.com/huggingface/tokenizers/blob/v0.19.1/bindings/python/py_src/tokenizers/implementations/sentencepiece_bpe.py#L10
-         - on top of sentencepiece, hf_tokenizer supports regex pre-tokenization, handles tabs and newlines better, and supports special tokens
-     - types: supports BBPE, WordPiece or Unigram
-     - characteristics:
-         - files: tokenizer.json (contains the content of the next two files), merges.txt, vocab.json
-             - added_tokens do not necessarily exist in the vocab.
-     - implementation:
-         - training: `from tokenizers.trainers import BpeTrainer, UnigramTrainer, WordLevelTrainer, WordPieceTrainer`
-         - loading:
-         - methods: .model.from_file .model.save .model.token_to_id .model.tokenize
-             - .model is of type tokenizers.models.BPE
-         - vocabulary entries may start with Ġ "\u0120"
-     - examples: gpt2, gpt_neox_20b, moss, bloom, qwen2
-     - advantages: compared with sentencepiece,
-
-     ## openai/tiktoken
-     - characteristics: a space is just a space,
-     - examples: gpt3.5 gpt4, qwen,
-     """
-
-     """ Algorithm families https://www.huaxiaozhuan.com/%E5%B7%A5%E5%85%B7/huggingface_transformer/chapters/1_tokenizer.html
-     - word-based tokenizer:
-     - char-based tokenizer:
-     - subword-based tokenizer
-         - BPE
-             - byte-bpe: the base vocabulary has 256 entries
-         - WordPiece:
-             - compared with BPE, WordPiece keeps only the final vocabulary, not the learned merge rules
-         - Unigram
-         - SentencePiece
-
-     """
132
-
133
- # taxonomy: https://github.com/huggingface/tokenizers/blob/main/bindings/python/py_src/tokenizers/implementations/
134
- BertTokenizer = "wordpiece.BertTokenizer"
135
- JapaneseTokenizer = (
136
- "wordpiece.MecabTokenizer",
137
- "https://github.com/polm/fugashi",
138
- )  # common Japanese morphology packages: ipadic, fugashi,
139
- ByteLevelBPETokenizer = "byte_level_bpe" # BBPE
140
- SentencePieceBPETokenizer = "sentencepiece_bpe"
141
-
142
- # taxonomy
143
-
144
- # SentencePiece (BPE)
145
- SentencePiece = auto() # sentencepiece.bpe, sentencepiece.unigram, sentencepiece.char, sentencepiece.word,
146
- byte_level_bpe = auto()
147
- # HFTokenizer = auto()
148
- TikToken = auto()
149
- # subword-nmt
150
- # WordPiece
151
-
152
-
153
- # load_vocab_with_SPECIAL_TOKEN = True  # if special tokens are excluded, the vocab size is computed incorrectly and overlap_token counts become inconsistent.
154
-
155
-
156
- @dataclass
157
- class TokenizerConfig:
158
- """
159
- https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/blob/main/src/leaderboard/read_evals.py
160
- """
161
-
162
- name_or_path: str # org/model (path on hub), as unique id
163
- name_display: str = None #
164
- impl: TokenizerImpl = None # implementation, tokenizer_class/type
165
- org: str = None
166
- link: str = None # http://**
167
- desc: str = None # description
168
- meta: str = None
169
- level: str = None # char-level, word-level, byte-level
170
- lang: str = None
171
- init_kwargs: Dict[str, Any] = field(
172
- default_factory=dict,
173
- )
174
-
175
- def __post_init__(self):
176
- if self.link is None:
177
- self.link = "https://huggingface.co/" + self.name_or_path # TODO + revision
178
- if self.name_display is None:
179
- self.name_display = self.name_or_path
180
-
181
- @classmethod
182
- def init_from_json_file(cls, json_filepath: str) -> "TokenizerConfig":
183
- pass
184
-
185
- def __eq__(self, other):
186
- if isinstance(other, self.__class__):
187
- return self.__dict__ == other.__dict__
188
- else:
189
- return False
190
-
191
- def __hash__(self):
192
- return hash(self.name_or_path)
193
-
194
-
195
- # TODO: append link and description to the end of dropdown button.
196
- # Add tokenizer_class/type, comments
197
- _all_tokenizer_config = [
198
- # bert style tokenizers
199
- TokenizerConfig(
200
- "google-bert/bert-base-cased",
201
- impl=TokenizerImpl.BertTokenizer,
202
- org="Google",
203
- desc="first add whitespace around any CJK character, then perform wordpiece tokenization.",
204
- ),
205
- TokenizerConfig(
206
- "google-bert/bert-base-uncased",
207
- impl=TokenizerImpl.BertTokenizer,
208
- org="Google",
209
- desc="first add whitespace around any CJK character, then perform wordpiece tokenization.",
210
- ),
211
- TokenizerConfig(
212
- "google-bert/bert-base-chinese",
213
- impl=TokenizerImpl.BertTokenizer,
214
- org="Google",
215
- desc="first add whitespace around any CJK character, then perform wordpiece tokenization.",
216
- ),
217
- TokenizerConfig(
218
- "google-bert/bert-base-german-cased",
219
- impl=TokenizerImpl.BertTokenizer,
220
- org="Google",
221
- ),
222
- TokenizerConfig(
223
- "dbmdz/bert-base-german-uncased", impl=TokenizerImpl.BertTokenizer, org="dbmdz"
224
- ),
225
- TokenizerConfig(
226
- "asafaya/bert-base-arabic", impl=TokenizerImpl.BertTokenizer, org="-"
227
- ),
228
- TokenizerConfig(
229
- "google-bert/bert-base-multilingual-uncased",
230
- impl=TokenizerImpl.BertTokenizer,
231
- org="Google",
232
- ),
233
- TokenizerConfig(
234
- "google-bert/bert-base-multilingual-cased",
235
- impl=TokenizerImpl.BertTokenizer,
236
- org="Google",
237
- ),
238
- TokenizerConfig(
239
- "tohoku-nlp/bert-base-japanese",
240
- impl=TokenizerImpl.BertTokenizer,
241
- org="Tohoku",
242
- desc="The texts are first tokenized by MeCab morphological parser with the IPA dictionary, "
243
- "then split into subwords by the WordPiece algorithm.",
244
- ),
245
- TokenizerConfig(
246
- "clue/roberta_chinese_clue_tiny",
247
- name_display="clue/roberta-chinese-clue",
248
- impl=TokenizerImpl.BertTokenizer,
249
- org="CLUE",
250
- init_kwargs={"revision": "refs/pr/1"},
251
- desc="",
252
- meta="traditional Chinese characters removed, https://github.com/CLUEbenchmark/CLUEPretrainedModels/blob/master/README.md",
253
- ),
254
- TokenizerConfig(
255
- "eson/kplug-base-encoder",
256
- name_display="eson/kplug",
257
- impl=TokenizerImpl.BertTokenizer,
258
- org="JD",
259
- ),
260
- TokenizerConfig(
261
- "ckiplab/gpt2-base-chinese", impl=TokenizerImpl.BertTokenizer, org="SINICA"
262
- ),  # Academia Sinica, Taiwan
263
- # WoBERT https://kexue.fm/archives/7758
264
- # WoBERT Plus https://github.com/ZhuiyiTechnology/WoBERT
265
- # gpt2 style tokenizers
266
- TokenizerConfig(
267
- "openai-community/gpt2", impl=TokenizerImpl.SentencePiece, org="OpenAI"
268
- ),
269
- # byte-level BPE; no raw bytes -- is it unicode-level?
270
- TokenizerConfig(
271
- "ClassCat/gpt2-base-french", impl=TokenizerImpl.SentencePiece, org="ClassCat"
272
- ),
273
- TokenizerConfig(
274
- "ClassCat/gpt2-base-spanish", impl=TokenizerImpl.SentencePiece, org="ClassCat"
275
- ),
276
- TokenizerConfig(
277
- "fnlp/moss-moon-003-sft",
278
- impl=TokenizerImpl.SentencePiece,
279
- init_kwargs={"revision": "refs/pr/6"},
280
- org="Fudan",
281
- desc="This tokenizer has been trained to treat spaces like parts of the tokens "
282
- "(a bit like sentencepiece) so a word will be encoded differently whether "
283
- "it is at the beginning of the sentence (without space) or not",
284
- meta="based on the gpt2 vocab, extended with 50k Chinese tokens",
285
- ),
286
- TokenizerConfig(
287
- "bigscience/bloom",
288
- impl=TokenizerImpl.SentencePiece,
289
- org="BigScience",
290
- meta="better Chinese coverage than the gpt_neox vocab.",
291
- ),
292
- # ("bloomz_6b4_zh",
293
- # ("BelleGroup/BELLE-7B-2M", # 模型和词典都基于bloom
294
- #
295
- TokenizerConfig(
296
- "EleutherAI/gpt-neox-20b", impl=TokenizerImpl.SentencePiece, org="EleutherAI"
297
- ),  # 50k vocab
298
- TokenizerConfig(
299
- "cyberagent/open-calm-7b", impl=TokenizerImpl.SentencePiece, org="CyberAgent"
300
- ), # GPTNeoXTokenizer
301
- TokenizerConfig(
302
- "abeja/gpt-neox-japanese-2.7b", impl=TokenizerImpl.SentencePiece, org="ABEJA"
303
- ),
304
- TokenizerConfig(
305
- "rinna/bilingual-gpt-neox-4b",
306
- impl=TokenizerImpl.SentencePiece,
307
- org="ABEJA",
308
- lang="en/ja",
309
- ),
310
- TokenizerConfig(
311
- "Qwen/Qwen1.5-14B", impl=TokenizerImpl.SentencePiece, org="Alibaba"
312
- ),  # 150k vocab, somewhat slow
313
- TokenizerConfig(
314
- "Qwen/Qwen1.5-110B", impl=TokenizerImpl.SentencePiece, org="Alibaba"
315
- ),
316
- TokenizerConfig(
317
- "Qwen/Qwen1.5-1.8B", impl=TokenizerImpl.SentencePiece, org="Alibaba"
318
- ),
319
- TokenizerConfig("Qwen/Qwen2-0.5B", impl=TokenizerImpl.SentencePiece, org="Alibaba"),
320
- TokenizerConfig("Qwen/Qwen2-72B", impl=TokenizerImpl.SentencePiece, org="Alibaba"),
321
- TokenizerConfig(
322
- "Qwen/Qwen2.5-0.5B", impl=TokenizerImpl.SentencePiece, org="Alibaba"
323
- ),
324
- TokenizerConfig(
325
- "Qwen/Qwen2.5-72B", impl=TokenizerImpl.SentencePiece, org="Alibaba"
326
- ),
327
- TokenizerConfig(
328
- "HuggingFaceH4/starchat-alpha", impl=TokenizerImpl.SentencePiece, org="-"
329
- ),
330
- ####### google/sentencepiece tokenizer:
331
- # T5 llama internlm
332
- TokenizerConfig(
333
- "google-t5/t5-large",
334
- name_display="google-t5/t5",
335
- impl=TokenizerImpl.SentencePiece,
336
- org="Google",
337
- ),
338
- # t5_small, t5_base, t5_large, flan_t5_base,
339
- # ("t5_base", "", "sentencepiece"),
340
- # TokenizerConfig("google/flan-t5-base", impl=TokenizerImpl.SentencePiece, ),
341
- TokenizerConfig(
342
- "lmsys/fastchat-t5-3b-v1.0",
343
- impl=TokenizerImpl.SentencePiece,
344
- org="LMSYS",
345
- init_kwargs={
346
- "use_fast": False
347
- },  # fixes pyo3_runtime.PanicException: AddedVocabulary bad split
348
- ),
349
- TokenizerConfig(
350
- "CohereForAI/aya-101", org="Cohere For AI"
351
- ), # "tokenizer_class": "T5Tokenizer",
352
- TokenizerConfig(
353
- "ClueAI/ChatYuan-large-v2", impl=TokenizerImpl.SentencePiece, org="CLUE"
354
- ),
355
- TokenizerConfig(
356
- "ClueAI/PromptCLUE-base", impl=TokenizerImpl.SentencePiece, org="CLUE"
357
- ),
358
- # byte-level BPE
359
- # 'single-character Chinese tokens': 700, 'multi-character Chinese tokens': 0  meta-llama/Meta-Llama-3.1-405B
360
- #
361
- TokenizerConfig(
362
- "meta-llama/Llama-3.2-1B-Instruct", impl=TokenizerImpl.SentencePiece, org="Meta"
363
- ),
364
- TokenizerConfig(
365
- "meta-llama/Llama-3.2-3B-Instruct", impl=TokenizerImpl.SentencePiece, org="Meta"
366
- ),
367
- # TokenizerConfig("meta-llama/Llama-3.3-70B-Instruct", impl=TokenizerImpl.SentencePiece,
368
- # org="Meta"),
369
- TokenizerConfig(
370
- "meta-llama/Meta-Llama-3.1-405B", impl=TokenizerImpl.SentencePiece, org="Meta"
371
- ),
372
- TokenizerConfig(
373
- "NousResearch/Hermes-3-Llama-3.1-405B",
374
- impl=TokenizerImpl.SentencePiece,
375
- org="NousResearch",
376
- ),
377
- TokenizerConfig(
378
- "gradientai/Llama-3-8B-Instruct-Gradient-1048k",
379
- name_display="Meta/llama3",
380
- impl=TokenizerImpl.SentencePiece,
381
- org="Meta",
382
- desc="llama split all numbers into individual digits, and fallback to bytes to decompose unknown UTF-8 characters",
383
- ),
384
- TokenizerConfig(
385
- "NousResearch/Llama-2-7b-chat-hf",
386
- name_display="Meta/llama2",
387
- impl=TokenizerImpl.SentencePiece,
388
- org="Meta",
389
- ),
390
- TokenizerConfig(
391
- "huggyllama/llama-7b",
392
- name_display="Meta/llama",
393
- impl=TokenizerImpl.SentencePiece,
394
- org="Meta",
395
- ),
396
- TokenizerConfig(
397
- "hpcai-tech/grok-1",
398
- name_display="xai-org/grok-1",
399
- impl=TokenizerImpl.SentencePiece,
400
- org="xAI",
401
- ),
402
- # converted from a .model file
403
- TokenizerConfig(
404
- "hfl/chinese-llama-lora-7b",
405
- impl=TokenizerImpl.SentencePiece,
406
- org="-",
407
- meta="adds 20k Chinese tokens to the original LLaMA vocabulary, expanding the Chinese vocab and improving Chinese encoding/decoding efficiency",
408
- ),
409
- #
410
- TokenizerConfig(
411
- "hfl/chinese-llama-2-7b",
412
- impl=TokenizerImpl.SentencePiece,
413
- org="-",
414
- meta="redesigned vocabulary (size: 55296) that further improves coverage of Chinese characters and words",
415
- ), #
416
- TokenizerConfig(
417
- "hfl/llama-3-chinese-8b", impl=TokenizerImpl.SentencePiece, org="-"
418
- ),
419
- TokenizerConfig(
420
- "hfl/chinese-alpaca-lora-7b", impl=TokenizerImpl.SentencePiece, org="-"
421
- ),
422
- # The Chinese Alpaca models further fine-tune the Chinese LLaMA models above on instruction data. "Has one extra `[PAD]` compared to the chinese_llama vocab; do not mix them."
423
- #
424
- # ("belle_llama_ext_7b",
425
- # ("alpaca_7b",
426
- TokenizerConfig(
427
- "baichuan-inc/Baichuan-7B",
428
- name_display="baichuan-inc/baichuan",
429
- impl=TokenizerImpl.SentencePiece,
430
- level="byte-level",
431
- org="Baichuan",
432
- ),
433
- TokenizerConfig(
434
- "baichuan-inc/Baichuan2-7B-Chat",
435
- name_display="baichuan-inc/baichuan2",
436
- impl=TokenizerImpl.SentencePiece,
437
- org="Baichuan",
438
- desc="expand the vocabulary size from 64000 in Baichuan1 to 125696",
439
- ),
440
- TokenizerConfig(
441
- "internlm/internlm-chat-7b",
442
- impl=TokenizerImpl.SentencePiece,
443
- org="Shanghai AI Lab",
444
- ),
445
- # Shanghai AI Lab + SenseTime
446
- TokenizerConfig(
447
- "internlm/internlm2-chat-7b",
448
- impl=TokenizerImpl.SentencePiece,
449
- org="Shanghai AI Lab",
450
- ),
451
- TokenizerConfig(
452
- "internlm/internlm2-math-7b",
453
- impl=TokenizerImpl.SentencePiece,
454
- org="Shanghai AI Lab",
455
- ),
456
- TokenizerConfig(
457
- "internlm/internlm-xcomposer-7b",
458
- impl=TokenizerImpl.SentencePiece,
459
- org="Shanghai AI Lab",
460
- ),
461
- TokenizerConfig("tiiuae/falcon-7b", impl=TokenizerImpl.SentencePiece, org="TII"),
462
- TokenizerConfig("tiiuae/falcon-180b", impl=TokenizerImpl.SentencePiece, org="TII"),
463
- TokenizerConfig(
464
- "Skywork/Skywork-13B-base", impl=TokenizerImpl.SentencePiece, org="Kunlun"
465
- ),
466
- TokenizerConfig(
467
- "Skywork/Skywork-13B-Math", impl=TokenizerImpl.SentencePiece, org="Kunlun"
468
- ),  # file: tokenizer.model
469
- TokenizerConfig(
470
- "FacebookAI/xlm-roberta-base", impl=TokenizerImpl.SentencePiece, org="Facebook"
471
- ),
472
- # why does this tokenizer.json have no merges? and why does the vocab contain probability values?
473
- # "goat",
474
- # ##### glm family
475
- # "glm_chinese",),
476
- TokenizerConfig(
477
- "THUDM/chatglm-6b",
478
- impl=TokenizerImpl.SentencePiece,
479
- org="Tsinghua",
480
- meta=f"num_image_tokens: {12}; num_image_tokens: {34} ",
481
- init_kwargs={"revision": "refs/pr/100"},
482
- ),
483
- TokenizerConfig(
484
- "THUDM/chatglm2-6b",
485
- impl=TokenizerImpl.SentencePiece,
486
- org="Tsinghua",
487
- ),
488
- TokenizerConfig(
489
- "THUDM/chatglm3-6b",
490
- impl=TokenizerImpl.SentencePiece,
491
- org="Tsinghua",
492
- ),
493
- TokenizerConfig(
494
- "thu-coai/CharacterGLM-6B",
495
- impl=TokenizerImpl.SentencePiece,
496
- org="Tsinghua",
497
- ),
498
- # tiktoken family
499
- TokenizerConfig(
500
- "openai/text-davinci-003",
501
- impl=TokenizerImpl.TikToken,
502
- org="OpenAI",
503
- link="https://github.com/openai/tiktoken",
504
- ),
505
- #
506
- TokenizerConfig(
507
- "openai/code-davinci-002",
508
- impl=TokenizerImpl.TikToken,
509
- org="OpenAI",
510
- link="https://github.com/openai/tiktoken",
511
- ),
512
- TokenizerConfig(
513
- "openai/gpt-3.5-turbo",
514
- impl=TokenizerImpl.TikToken,
515
- org="OpenAI",
516
- link="https://github.com/openai/tiktoken",
517
- desc="tiktoken is a fast BPE tokeniser for use with OpenAI's models. There are 16 tokens KeyError",
518
- ),
519
- TokenizerConfig(
520
- "openai/gpt-4",
521
- impl=TokenizerImpl.TikToken,
522
- org="OpenAI",
523
- link="https://github.com/openai/tiktoken",
524
- ),
525
- TokenizerConfig(
526
- "openai/gpt-4o",
527
- impl=TokenizerImpl.TikToken,
528
- org="OpenAI",
529
- link="https://github.com/openai/tiktoken",
530
- ),
531
- TokenizerConfig(
532
- "Qwen/Qwen-7B-Chat",
533
- name_display="Qwen/Qwen",
534
- impl=TokenizerImpl.TikToken,
535
- org="Alibaba",
536
- init_kwargs={"revision": "refs/pr/56"},
537
- meta="based on the gpt4 vocab: removes 100 multi-digit number tokens, adds 10000 Chinese word tokens, and improves special_token segmentation",
538
- ),
539
- # https://huggingface.co/Qwen/Qwen-7B-Chat#%E6%A8%A1%E5%9E%8B%E7%BB%86%E8%8A%82%EF%BC%88model%EF%BC%89
540
- # This vocabulary builds on cl100k_base (the BPE vocab used by GPT-4) and is optimized for Chinese and other languages, on top of efficient encoding of Chinese, English and code,
541
- # making it friendlier to several languages so users can strengthen those languages without extending the vocab. Numbers are split into individual digits.
542
- # TokenizerConfig("Qwen/Qwen-72B-Chat", impl=TokenizerImpl.TikToken),
543
- # uncategorized
544
- # ("amber", ""),
545
- TokenizerConfig("LLM360/CrystalCoder", org="MBZUAI"),
546
- TokenizerConfig("apple/DCLM-7B", org="Apple"),
547
- TokenizerConfig("mistralai/Mistral-7B-v0.1", org="Mistral"),
548
- TokenizerConfig("mistralai/Mixtral-8x7B-v0.1", org="Mistral"),
549
- TokenizerConfig("mistralai/Mistral-Large-Instruct-2407", org="Mistral"),
550
- TokenizerConfig("mistralai/Mistral-Nemo-Instruct-2407", org="Mistral"),
551
- TokenizerConfig("paust/pko-t5-large", org="PAUST"),
552
- TokenizerConfig("01-ai/Yi-6B", org="Yi"),
553
- TokenizerConfig("01-ai/Yi-34B", org="Yi"),
554
- TokenizerConfig("01-ai/Yi-VL-34B", org="Yi"),
555
- TokenizerConfig("01-ai/Yi-1.5-34B", org="Yi"),
556
- TokenizerConfig("OrionStarAI/Orion-14B-Chat", org="OrionStar"),
557
- TokenizerConfig("microsoft/phi-1", org="Microsoft"),
558
- TokenizerConfig("microsoft/phi-2", org="Microsoft"),
559
- TokenizerConfig(
560
- "microsoft/Phi-3-mini-4k-instruct", org="Microsoft", meta="即llama vocab"
561
- ),
562
- TokenizerConfig("Upstage/SOLAR-10.7B-v1.0", org="-"),
563
- TokenizerConfig("google/mobilebert-uncased", org="Google"),
564
- # ("google/mobilenet_v2_1.0_224",), # error
565
- TokenizerConfig("google/switch-c-2048", org="Google"),
566
- TokenizerConfig("google/byt5-small", org="Google"),
567
- TokenizerConfig("google/mt5-large", org="Google"),
568
- TokenizerConfig("WizardLM/WizardCoder-Python-7B-V1.0", org="Microsoft"),
569
- TokenizerConfig("WizardLM/WizardCoder-15B-V1.0", org="Microsoft"),
570
- TokenizerConfig("WizardLM/WizardLM-7B-V1.0", org="Microsoft"),
571
- TokenizerConfig("WizardLM/WizardMath-70B-V1.0", org="Microsoft"),
572
- TokenizerConfig("TigerResearch/tigerbot-70b-chat-v4-4k", org="Tigerobo"),
573
- TokenizerConfig("TigerResearch/tigerbot-13b-chat-v2", org="Tigerobo"),
574
- TokenizerConfig("deepseek-ai/deepseek-coder-33b-instruct", org="DeepSeek"),
575
- TokenizerConfig("deepseek-ai/deepseek-llm-7b-base", org="DeepSeek"),
576
- TokenizerConfig("deepseek-ai/DeepSeek-V2", org="DeepSeek"),
577
- TokenizerConfig("deepseek-ai/DeepSeek-V3", org="DeepSeek"),
578
- TokenizerConfig(
579
- "deepseek-ai/DeepSeek-R1", org="DeepSeek"
580
- ),  # based on the llama3 vocab, with some Chinese tokens added and some tokens removed
581
- TokenizerConfig("deepseek-ai/DeepSeek-R1-Zero", org="DeepSeek"),
582
- TokenizerConfig("deepseek-ai/DeepSeek-R1-Distill-Llama-70B", org="DeepSeek"),
583
- TokenizerConfig("google/gemma-7b", org="Google"),
584
- TokenizerConfig("google/gemma-2-9b", org="Google"),
585
- TokenizerConfig("allenai/OLMo-7B-hf", org="Allen AI"),
586
- TokenizerConfig("HuggingFaceH4/zephyr-7b-beta", org="HuggingFace"),
587
- TokenizerConfig("ai21labs/Jamba-v0.1", org="AI21"),
588
- TokenizerConfig("databricks/dbrx-instruct", org="Databricks"),
589
- TokenizerConfig("MiniMaxAI/MiniMax-Text-01", org="MiniMax"),
590
- # TokenizerConfig("nvidia/Nemotron-4-340B-Instruct", org="Nvidia"),
591
- # ("claude",),
592
- # https://github.com/Duxiaoman-DI/XuanYuan
593
- # https://huggingface.co/apple/OpenELM-3B-Instruct https://huggingface.co/apple/OpenELM-3B
594
- ]
595
-
596
- assert len(set([config.name_display for config in _all_tokenizer_config])) == len(
597
- _all_tokenizer_config
598
- )
599
- assert len(set([config.name_or_path for config in _all_tokenizer_config])) == len(
600
- _all_tokenizer_config
601
- )
602
- assert len(
603
- set([config.name_or_path.split("/")[-1] for config in _all_tokenizer_config])
604
- ) == len(_all_tokenizer_config)
605
-
606
-
607
- class TokenizerFactory:
608
- def __init__(self):
609
- # self.all_tokenizer_configs = sorted(_all_tokenizer_config, key=lambda k: k.name_or_path)
610
- self.all_tokenizer_configs = sorted(
611
- _all_tokenizer_config, key=lambda k: k.name_display
612
- )
613
- self.all_tokenizer_names = [
614
- config.name_or_path for config in self.all_tokenizer_configs
615
- ]
616
- self.name_to_config_list = [
617
- {config.name_or_path: config for config in self.all_tokenizer_configs},
618
- {config.name_display: config for config in self.all_tokenizer_configs},
619
- {
620
- config.name_display.split("/")[-1]: config
621
- for config in self.all_tokenizer_configs
622
- },
623
- ]
624
- self.tokenizer_cache = {}
625
-
626
- def get_tokenizer_config(self, tokenizer_name: str) -> TokenizerConfig:
627
- for name_to_config in self.name_to_config_list:
628
- if tokenizer_name in name_to_config:
629
- return name_to_config[tokenizer_name]
630
- return None
631
-
632
- def get_tokenizer(self, tokenizer_name: str):
633
- """
634
- :param tokenizer_name:
635
- :return:
636
- """
637
- tokenizer_config = self.get_tokenizer_config(tokenizer_name)
638
-
639
- # 1. load from cache
640
- if tokenizer_config in self.tokenizer_cache:
641
- return self.tokenizer_cache[tokenizer_config]
642
-
643
- # 2. load tokenizer
644
- tokenizer = self.load_tokenizer(tokenizer_config)
645
-
646
- self.tokenizer_cache[tokenizer_config] = tokenizer
647
- return tokenizer
648
-
649
- def get_name_with_hyperlink(self, tokenizer_name: str) -> str:
650
- def model_hyperlink(link, model_name):
651
- model_name = model_name
652
- return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
653
-
654
- tokenizer_config = self.get_tokenizer_config(tokenizer_name)
655
- return model_hyperlink(
656
- tokenizer_config.link, tokenizer_config.name_display.split("/")[-1]
657
- )
658
-
659
- def load_tokenizer(self, tokenizer_config):
660
- if tokenizer_config is None:
661
- raise ValueError("tokenizer config not found")
662
- logger.info(f"loading tokenizer {tokenizer_config.name_or_path}")
663
- if (
664
- tokenizer_config.impl == TokenizerImpl.TikToken
665
- and "openai" in tokenizer_config.name_or_path
666
- ):
667
- tokenizer = tiktoken.encoding_for_model(
668
- tokenizer_config.name_or_path.replace("openai/", "")
669
- )
670
- else:
671
- tokenizer = AutoTokenizer.from_pretrained(
672
- tokenizer_config.name_or_path,
673
- trust_remote_code=True,
674
- **tokenizer_config.init_kwargs,
675
- )
676
- return tokenizer
677
-
678
- def add_config(
679
- self,
680
- ):
681
- pass
682
-
683
- def add_tokenizer(self, tokenizer_name):
684
- pass
685
-
686
-
687
- tokenizer_factory = TokenizerFactory()
688
-
689
-
690
- def add_tokenizer(tokenizer_name: str):
691
- """
692
- :param tokenizer_name:
693
- :return:
694
- """
695
- if tokenizer_name in []:
696
- logger.info(f"{tokenizer_name} already exists")
697
- else:
698
- # add to config
699
- tokenizer_config = TokenizerConfig(tokenizer_name, org="-")
700
-
701
- # add to tokenizer
702
- tokenizer = tokenizer_factory.load_tokenizer(tokenizer_config)
703
-
704
- # refresh cache
705
-
706
- try:
707
- tokenizer = AutoTokenizer.from_pretrained(
708
- tokenizer_name, trust_remote_code=True, **tokenizer_config.init_kwargs
709
- )
710
- tokenizer_factory.all_tokenizer_configs.append(
711
- "",
712
- )
713
- tokenizer_factory
714
-
715
- except Exception as e:
716
- logger.error(e)
717
-
718
- pass
719
-
720
-
721
- # class TokenizerType(Enum):
- #
- #     # BERTTokenizer
- #     # depends on a txt vocab file
- #
- #
- #     # https://github.com/EleutherAI/gpt-neox/blob/v2.0/megatron/tokenizer/tokenizer.py#L231
- #     # depends on a json file, Tokenizer.from_file(vocab_file)
- #     # example: gpt-neox-20B
- #     HFTokenizer = auto()
- #
- #     # depends on: model_file, sentencepiece.SentencePieceProcessor(model_file)
- #     # examples:
- #     SentencePieceTokenizer = auto()
- #
- #
- #     # depends on: 3 json files: vocab.json, merges.txt, special_tokens.txt
- #     # source:
- #     # - https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/tokenizer/gpt2_tokenization.py#L92
- #     # Byte-level BPE
- #     GPT2BPETokenizer = auto()
742
-
743
-
744
- if __name__ == "__main__":
745
- for tokenizer_config in tokenizer_factory.all_tokenizer_configs:
746
- if True:
747
- # if "t5" in tokenizer_config.name_or_path:
748
- tokenizer1 = tokenizer_factory.get_tokenizer(tokenizer_config.name_or_path)
749
- tokenizer2 = tokenizer_factory.get_tokenizer(tokenizer_config.name_display)
750
- tokenizer3 = tokenizer_factory.get_tokenizer(
751
- tokenizer_config.name_display.split("/")[-1]
752
- )
753
- assert tokenizer1 == tokenizer2 == tokenizer3
754
- print(tokenizer_config.name_or_path, len(tokenizer1))
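As a companion to the implementation notes in the vocab.py docstring above, a minimal sketch of the three loading paths it describes. The file names and the model id are illustrative, not paths from this repo:

    import sentencepiece as spm          # google/sentencepiece: *.model / *.sp_model files
    import tiktoken                      # openai/tiktoken: encodings resolved by model name
    from tokenizers import Tokenizer     # huggingface/tokenizers: serialized tokenizer.json

    # sentencepiece: load a trained model file and encode to pieces
    sp = spm.SentencePieceProcessor()
    sp.Load("tokenizer.model")
    print(sp.encode("hello world", out_type=str))

    # huggingface/tokenizers: load tokenizer.json (merges + vocab in one file)
    hf_tok = Tokenizer.from_file("tokenizer.json")
    print(hf_tok.encode("hello world").tokens)

    # tiktoken: no local file needed, the encoding is looked up from the model name
    enc = tiktoken.encoding_for_model("gpt-4o")
    print(enc.encode("hello world"))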