JosephusCheung committed
Commit dc2add3
Parent: a31bc4b

Delete tokenization_qwen.py

Files changed (1)
  1. tokenization_qwen.py +0 -230
tokenization_qwen.py DELETED
@@ -1,230 +0,0 @@
-# Copyright (c) Alibaba Cloud.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-
-"""Tokenization classes for QWen."""
-
-import base64
-import logging
-import os
-import unicodedata
-from typing import Collection, Dict, List, Set, Tuple, Union
-
-import tiktoken
-from transformers import PreTrainedTokenizer, AddedToken
-
-logger = logging.getLogger(__name__)
-
-
-VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken"}
-
-PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
-ENDOFTEXT = "<|endoftext|>"
-IMSTART = "<|im_start|>"
-IMEND = "<|im_end|>"
-# as the default behavior is changed to allow special tokens in
-# regular texts, the surface forms of special tokens need to be
-# as different as possible to minimize the impact
-EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
-SPECIAL_TOKENS = (
-    ENDOFTEXT,
-    IMSTART,
-    IMEND,
-) + EXTRAS
-
-
-def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
-    with open(tiktoken_bpe_file, "rb") as f:
-        contents = f.read()
-    return {
-        base64.b64decode(token): int(rank)
-        for token, rank in (line.split() for line in contents.splitlines() if line)
-    }
-
-class QWenTokenizer(PreTrainedTokenizer):
-    """QWen tokenizer."""
-
-    vocab_files_names = VOCAB_FILES_NAMES
-
-    def __init__(
-        self,
-        vocab_file,
-        errors="replace",
-        **kwargs,
-    ):
-        super().__init__(**kwargs)
-
-        self.errors = errors  # how to handle errors in decoding
-
-        self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)  # type: dict[bytes, int]
-        self.special_tokens = {
-            token: index
-            for index, token in enumerate(
-                SPECIAL_TOKENS, start=len(self.mergeable_ranks)
-            )
-        }
-
-        enc = tiktoken.Encoding(
-            "Qwen",
-            pat_str=PAT_STR,
-            mergeable_ranks=self.mergeable_ranks,
-            special_tokens=self.special_tokens,
-        )
-        assert (
-            len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
-        ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"
-
-        self.decoder = {
-            v: k for k, v in self.mergeable_ranks.items()
-        }  # type: dict[int, bytes|str]
-        self.decoder.update({v: k for k, v in self.special_tokens.items()})
-
-        self.tokenizer = enc  # type: tiktoken.Encoding
-
-        self.eod_id = self.tokenizer.eot_token
-        self.im_start_id = self.special_tokens[IMSTART]
-        self.im_end_id = self.special_tokens[IMEND]
-
-    def __len__(self) -> int:
-        return self.tokenizer.n_vocab
-
-    def get_vocab(self) -> Dict[bytes, int]:
-        return self.mergeable_ranks
-
-    def convert_tokens_to_ids(
-        self, tokens: Union[bytes, str, List[Union[bytes, str]]]
-    ) -> List[int]:
-        ids = []
-        if isinstance(tokens, (str, bytes)):
-            if tokens in self.special_tokens:
-                return self.special_tokens[tokens]
-            else:
-                return self.mergeable_ranks.get(tokens)
-        for token in tokens:
-            if token in self.special_tokens:
-                ids.append(self.special_tokens[token])
-            else:
-                ids.append(self.mergeable_ranks.get(token))
-        return ids
-
-    def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
-        if not special_tokens and new_tokens:
-            raise ValueError('Adding regular tokens is not supported')
-        for token in new_tokens:
-            surface_form = token.content if isinstance(token, AddedToken) else token
-            if surface_form not in SPECIAL_TOKENS:
-                raise ValueError('Adding unknown special tokens is not supported')
-        return 0
-
-    def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
-        """
-        Save only the vocabulary of the tokenizer (vocabulary).
-
-        Returns:
-            `Tuple(str)`: Paths to the files saved.
-        """
-        file_path = os.path.join(save_directory, "qwen.tiktoken")
-        with open(file_path, "w", encoding="utf8") as w:
-            for k, v in self.mergeable_ranks.items():
-                line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n"
-                w.write(line)
-        return (file_path,)
-
-    def tokenize(
-        self,
-        text: str,
-        allowed_special: Union[Set, str] = "all",
-        disallowed_special: Union[Collection, str] = (),
-        **kwargs,
-    ) -> List[Union[bytes, str]]:
-        """
-        Converts a string in a sequence of tokens.
-
-        Args:
-            text (`str`):
-                The sequence to be encoded.
-            allowed_special (`Literal["all"]` or `set`):
-                The surface forms of the tokens to be encoded as special tokens in regular texts.
-                Default to "all".
-            disallowed_special (`Literal["all"]` or `Collection`):
-                The surface forms of the tokens that should not be in regular texts and trigger errors.
-                Default to an empty tuple.
-
-            kwargs (additional keyword arguments, *optional*):
-                Will be passed to the underlying model specific encode method.
-
-        Returns:
-            `List[bytes|str]`: The list of tokens.
-        """
-
-
-        tokens = []
-        text = unicodedata.normalize("NFC", text)
-
-        # this implementation takes a detour: text -> token id -> token surface forms
-        for t in self.tokenizer.encode(
-            text, allowed_special=allowed_special, disallowed_special=disallowed_special
-        ):
-            tokens.append(self.decoder[t])
-        return tokens
-
-    def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
-        """
-        Converts a sequence of tokens in a single string.
-        """
-        text = ""
-        temp = b""
-        for t in tokens:
-            if isinstance(t, str):
-                if temp:
-                    text += temp.decode("utf-8", errors=self.errors)
-                    temp = b""
-                text += t
-            elif isinstance(t, bytes):
-                temp += t
-            else:
-                raise TypeError("token should only be of type bytes or str")
-        if temp:
-            text += temp.decode("utf-8", errors=self.errors)
-        return text
-
-    @property
-    def vocab_size(self):
-        return self.tokenizer.n_vocab
-
-    def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
-        """Converts an id to a token, special tokens included"""
-        if index in self.decoder:
-            return self.decoder[index]
-        raise ValueError("unknown ids")
-
-    def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
-        """Converts a token to an id using the vocab, special tokens included"""
-        if token in self.special_tokens:
-            return self.special_tokens[token]
-        if token in self.mergeable_ranks:
-            return self.mergeable_ranks[token]
-        raise ValueError("unknown token")
-
-    def _tokenize(self, text: str, **kwargs):
-        """
-        Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
-        vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
-
-        Do NOT take care of added tokens.
-        """
-        raise NotImplementedError
-
-    def _decode(
-        self,
-        token_ids: Union[int, List[int]],
-        skip_special_tokens: bool = False,
-        errors: str = None,
-        **kwargs,
-    ) -> str:
-        if isinstance(token_ids, int):
-            token_ids = [token_ids]
-        if skip_special_tokens:
-            token_ids = [i for i in token_ids if i < self.eod_id]
-        return self.tokenizer.decode(token_ids, errors=errors or self.errors)
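For reference, the deleted class wrapped a tiktoken BPE encoding behind the transformers PreTrainedTokenizer interface; in Qwen model repositories it was typically loaded indirectly via AutoTokenizer.from_pretrained(..., trust_remote_code=True). The lines below are a minimal usage sketch, not the official loading path: they assume tokenization_qwen.py and a matching qwen.tiktoken vocabulary sit in the working directory, and the file names and sample text are illustrative only.

# Minimal sketch: drive the deleted QWenTokenizer class directly.
# Assumes tokenization_qwen.py and qwen.tiktoken are in the working
# directory; names and sample text are illustrative, not from this repo.
from tokenization_qwen import QWenTokenizer

tokenizer = QWenTokenizer(vocab_file="qwen.tiktoken")

text = "Hello, world!<|im_end|>"
# tokenize() returns surface forms: raw bytes for ordinary BPE pieces,
# plain strings for special tokens such as <|im_end|>.
tokens = tokenizer.tokenize(text)
ids = tokenizer.convert_tokens_to_ids(tokens)

# Round-trip back to text; _decode() can drop special tokens.
print(tokenizer.convert_tokens_to_string(tokens))
print(tokenizer._decode(ids, skip_special_tokens=True))

Note the detour the class itself documents: tokenize() maps text to ids and then back through self.decoder, so ordinary tokens surface as raw bytes while only special tokens appear as strings.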