jxy committed
Commit 7f125a7 · 1 Parent(s): 471a254

Upload tokenizer
qwen.tiktoken ADDED
The diff for this file is too large to render. See raw diff
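For reference, `qwen.tiktoken` is a plain tiktoken BPE vocabulary: one base64-encoded token and its integer rank per line, which is exactly the format `_load_tiktoken_bpe` in `tokenization_qwen.py` below expects. A minimal sketch for inspecting the first few entries locally (assuming the file sits in the working directory):

```python
import base64
from itertools import islice

# Each line of the vocab file is "<base64 token> <rank>".
with open("qwen.tiktoken", "rb") as f:
    for line in islice(f, 5):
        token_b64, rank = line.split()
        print(base64.b64decode(token_b64), int(rank))
```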
 
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {}
tokenization_qwen.py ADDED
@@ -0,0 +1,275 @@
+ # Copyright (c) Alibaba Cloud.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """Tokenization classes for QWen."""
+
+ import base64
+ import logging
+ import os
+ import unicodedata
+ from typing import Collection, Dict, List, Set, Tuple, Union
+
+ import tiktoken
+ from transformers import PreTrainedTokenizer, AddedToken
+
+ logger = logging.getLogger(__name__)
+
+
+ VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken"}
+
+ PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
+ ENDOFTEXT = "<|endoftext|>"
+ IMSTART = "<|im_start|>"
+ IMEND = "<|im_end|>"
+ BEGINOFMASK = "<|beginofmask|>"
+ ENDOFMASK = "<|endofmask|>"
+ # as the default behavior is changed to allow special tokens in
+ # regular texts, the surface forms of special tokens need to be
+ # as different as possible to minimize the impact
+ EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
+ # changed to use actual index to avoid misconfiguration with vocabulary expansion
+ SPECIAL_START_ID = 153719
+ SPECIAL_TOKENS = tuple(
+     enumerate(
+         (
+             (
+                 ENDOFTEXT,
+                 IMSTART,
+                 IMEND,
+                 BEGINOFMASK,
+                 ENDOFMASK,
+             )
+             + EXTRAS
+         ),
+         start=SPECIAL_START_ID,
+     )
+ )
+ SPECIAL_TOKENS_SET = set(t for i, t in SPECIAL_TOKENS)
+
+
+ def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
+     with open(tiktoken_bpe_file, "rb") as f:
+         contents = f.read()
+     return {
+         base64.b64decode(token): int(rank)
+         for token, rank in (line.split() for line in contents.splitlines() if line)
+     }
+
+
+ class QWenTokenizer(PreTrainedTokenizer):
+     """QWen tokenizer."""
+
+     vocab_files_names = VOCAB_FILES_NAMES
+
+     def __init__(
+         self,
+         vocab_file,
+         errors="replace",
+         extra_vocab_file=None,
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+
+         # how to handle errors in decoding UTF-8 byte sequences
+         # use "ignore" if you are doing streaming inference
+         self.errors = errors
+
+         self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)  # type: Dict[bytes, int]
+         self.special_tokens = {
+             token: index
+             for index, token in SPECIAL_TOKENS
+         }
+
+         # try to load extra vocab from file
+         if extra_vocab_file is not None:
+             used_ids = set(self.mergeable_ranks.values()) | set(self.special_tokens.values())
+             extra_mergeable_ranks = _load_tiktoken_bpe(extra_vocab_file)
+             for token, index in extra_mergeable_ranks.items():
+                 if token in self.mergeable_ranks:
+                     logger.info(f"extra token {token} exists, skipping")
+                     continue
+                 if index in used_ids:
+                     logger.info(f'the index {index} for extra token {token} exists, skipping')
+                     continue
+                 self.mergeable_ranks[token] = index
+             # the index may be sparse after this, but tiktoken.Encoding will handle it
+
+         enc = tiktoken.Encoding(
+             "Qwen",
+             pat_str=PAT_STR,
+             mergeable_ranks=self.mergeable_ranks,
+             special_tokens=self.special_tokens,
+         )
+         assert (
+             len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
+         ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"
+
+         self.decoder = {
+             v: k for k, v in self.mergeable_ranks.items()
+         }  # type: dict[int, bytes|str]
+         self.decoder.update({v: k for k, v in self.special_tokens.items()})
+
+         self.tokenizer = enc  # type: tiktoken.Encoding
+
+         self.eod_id = self.tokenizer.eot_token
+         self.im_start_id = self.special_tokens[IMSTART]
+         self.im_end_id = self.special_tokens[IMEND]
+
+     def __getstate__(self):
+         # for pickle lovers
+         state = self.__dict__.copy()
+         del state["tokenizer"]
+         return state
+
+     def __setstate__(self, state):
+         # tokenizer is not python native; don't pass it; rebuild it
+         self.__dict__.update(state)
+         enc = tiktoken.Encoding(
+             "Qwen",
+             pat_str=PAT_STR,
+             mergeable_ranks=self.mergeable_ranks,
+             special_tokens=self.special_tokens,
+         )
+         self.tokenizer = enc
+
+     def __len__(self) -> int:
+         return self.tokenizer.n_vocab
+
+     def get_vocab(self) -> Dict[bytes, int]:
+         return self.mergeable_ranks
+
+     def convert_tokens_to_ids(
+         self, tokens: Union[bytes, str, List[Union[bytes, str]]]
+     ) -> Union[int, List[int]]:
+         ids = []
+         if isinstance(tokens, (str, bytes)):
+             if tokens in self.special_tokens:
+                 return self.special_tokens[tokens]
+             else:
+                 return self.mergeable_ranks.get(tokens)
+         for token in tokens:
+             if token in self.special_tokens:
+                 ids.append(self.special_tokens[token])
+             else:
+                 ids.append(self.mergeable_ranks.get(token))
+         return ids
+
+     def _add_tokens(
+         self,
+         new_tokens: Union[List[str], List[AddedToken]],
+         special_tokens: bool = False,
+     ) -> int:
+         if not special_tokens and new_tokens:
+             raise ValueError("Adding regular tokens is not supported")
+         for token in new_tokens:
+             surface_form = token.content if isinstance(token, AddedToken) else token
+             if surface_form not in SPECIAL_TOKENS_SET:
+                 raise ValueError("Adding unknown special tokens is not supported")
+         return 0
+
+     def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
+         """
+         Save only the vocabulary of the tokenizer (the BPE merge ranks).
+         Returns:
+             `Tuple[str]`: Paths to the files saved.
+         """
+         file_path = os.path.join(save_directory, "qwen.tiktoken")
+         with open(file_path, "w", encoding="utf8") as w:
+             for k, v in self.mergeable_ranks.items():
+                 line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n"
+                 w.write(line)
+         return (file_path,)
+
+     def tokenize(
+         self,
+         text: str,
+         allowed_special: Union[Set, str] = "all",
+         disallowed_special: Union[Collection, str] = (),
+         **kwargs,
+     ) -> List[Union[bytes, str]]:
+         """
+         Converts a string into a sequence of tokens.
+         Args:
+             text (`str`):
+                 The sequence to be encoded.
+             allowed_special (`Literal["all"]` or `set`):
+                 The surface forms of the tokens to be encoded as special tokens in regular texts.
+                 Defaults to "all".
+             disallowed_special (`Literal["all"]` or `Collection`):
+                 The surface forms of the tokens that should not be in regular texts and trigger errors.
+                 Defaults to an empty tuple.
+             kwargs (additional keyword arguments, *optional*):
+                 Will be passed to the underlying model specific encode method.
+         Returns:
+             `List[bytes|str]`: The list of tokens.
+         """
+         tokens = []
+         text = unicodedata.normalize("NFC", text)
+
+         # this implementation takes a detour: text -> token id -> token surface forms
+         for t in self.tokenizer.encode(
+             text, allowed_special=allowed_special, disallowed_special=disallowed_special
+         ):
+             tokens.append(self.decoder[t])
+         return tokens
+
+     def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
+         """
+         Converts a sequence of tokens into a single string.
+         """
+         text = ""
+         temp = b""
+         for t in tokens:
+             if isinstance(t, str):
+                 if temp:
+                     text += temp.decode("utf-8", errors=self.errors)
+                     temp = b""
+                 text += t
+             elif isinstance(t, bytes):
+                 temp += t
+             else:
+                 raise TypeError("token should only be of type bytes or str")
+         if temp:
+             text += temp.decode("utf-8", errors=self.errors)
+         return text
+
+     @property
+     def vocab_size(self):
+         return self.tokenizer.n_vocab
+
+     def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
+         """Converts an id to a token, special tokens included"""
+         if index in self.decoder:
+             return self.decoder[index]
+         raise ValueError("unknown id")
+
+     def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
+         """Converts a token to an id using the vocab, special tokens included"""
+         if token in self.special_tokens:
+             return self.special_tokens[token]
+         if token in self.mergeable_ranks:
+             return self.mergeable_ranks[token]
+         raise ValueError("unknown token")
+
+     def _tokenize(self, text: str, **kwargs):
+         """
+         Converts a string into a sequence of tokens (string), using the tokenizer. Splits into words for
+         word-based vocabularies or sub-words for sub-word-based vocabularies (BPE/SentencePiece/WordPiece).
+         Does NOT take care of added tokens.
+         """
+         raise NotImplementedError
+
+     def _decode(
+         self,
+         token_ids: Union[int, List[int]],
+         skip_special_tokens: bool = False,
+         errors: str = None,
+         **kwargs,
+     ) -> str:
+         if isinstance(token_ids, int):
+             token_ids = [token_ids]
+         if skip_special_tokens:
+             token_ids = [i for i in token_ids if i < self.eod_id]
+         return self.tokenizer.decode(token_ids, errors=errors or self.errors)
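A minimal usage sketch of the class above; the local paths and the example string are illustrative only, and it assumes a `tiktoken`/`transformers` environment compatible with this implementation:

```python
# Hypothetical local usage: tokenization_qwen.py and qwen.tiktoken in the current directory.
from tokenization_qwen import QWenTokenizer

tok = QWenTokenizer("qwen.tiktoken")

# Special tokens are allowed inside regular text by default (allowed_special="all").
tokens = tok.tokenize("How are markets doing today?<|endoftext|>")
ids = tok.convert_tokens_to_ids(tokens)
print(ids[-1] == tok.eod_id)                      # trailing <|endoftext|> maps to eod_id
print(tok.decode(ids, skip_special_tokens=True))  # round-trips back to the plain text
```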
tokenizer_config.json ADDED
@@ -0,0 +1,11 @@
+ {
+   "auto_map": {
+     "AutoTokenizer": [
+       "tokenization_qwen.QWenTokenizer",
+       null
+     ]
+   },
+   "clean_up_tokenization_spaces": true,
+   "model_max_length": 16384,
+   "tokenizer_class": "QWenTokenizer"
+ }
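Because `auto_map` points `AutoTokenizer` at `tokenization_qwen.QWenTokenizer`, the tokenizer can be loaded with `trust_remote_code=True`. A minimal loading sketch (the repo/path argument is a placeholder):

```python
from transformers import AutoTokenizer

# Placeholder: a local directory or Hub repo id containing the files from this commit.
tok = AutoTokenizer.from_pretrained("path/to/this-repo", trust_remote_code=True)

enc = tok("Hello, world!")
print(enc["input_ids"])
print(tok.decode(enc["input_ids"]))
```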