mashirong committed
Commit e346937 · Parent: 5a3cf15

Remove unused file
Browse files: tokenization_deepseek.py (+0 -328)

tokenization_deepseek.py (DELETED)
@@ -1,328 +0,0 @@
```python
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Forked from the file src/transformers/models/bert_generation/tokenization_bert_generation.py from the HuggingFace Transformers library.
Permalink: https://github.com/huggingface/transformers/blob/04ab5605fbb4ef207b10bf2772d88c53fc242e83/src/transformers/models/bert_generation/tokenization_bert_generation.py
Tokenizer class for ReplitLM
Class is modified for compatibility with custom vocabulary and to achieve desired encode/decode behavior for Replit Code V1 3B model.
"""
import os
import sentencepiece as spm
from sentencepiece import SentencePieceProcessor
from shutil import copyfile
from transformers import PreTrainedTokenizer
from typing import Any, Dict, List, Optional, Tuple
import base64

VOCAB_FILES_NAMES = {'vocab_file': 'spiece.model'}

class Tokenizer:
    def __init__(self, model_path="/weka-jd/prod/deepseek/permanent/shared/mingchuan/llama_data/tokenizer.model"):
        # reload tokenizer
        assert os.path.isfile(model_path), model_path
        self.sp_model = SentencePieceProcessor(model_file=model_path)

        # # ? print spm for debugging
        # spm_proto = sp_pb2_model.ModelProto()
        # spm_proto.ParseFromString(self.sp_model.serialized_model_proto())
        # print(dir(spm_proto))
        # attrs = ['denormalizer_spec', 'normalizer_spec', 'trainer_spec']
        # print('=======' * 5)
        # for attr in attrs:
        #     print('=======', attr, '=======')
        #     print(getattr(spm_proto, attr))

        # BOS / EOS token IDs
        self.n_words: int = self.sp_model.vocab_size()
        self.bos_id: int = self.sp_model.bos_id()
        self.eos_id: int = self.sp_model.eos_id()
        self.pad_id: int = self.sp_model.pad_id()
        assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()

    def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
        assert type(s) is str
        t = self.sp_model.encode(s)
        if bos:
            t = [self.bos_id] + t
        if eos:
            t = t + [self.eos_id]
        return t

    def decode(self, t: List[int]) -> str:
        return self.sp_model.decode(t)

class LineBBPETokenizer(Tokenizer):
    def __init__(self,
                 model_path="/3fs-jd/prod/deepseek/shared/daidamai/data/bbpe/spm_0717_final/100000/bbpe_full_bytes.model",
                 ignore_decode_err=False, attachfile_path=None):
        super().__init__(model_path=model_path)
        self.ignore_decode_err = ignore_decode_err
        Bvocab_path = attachfile_path + "/byteVocab.txt"
        #'/3fs-jd/prod/deepseek/shared/daidamai/data/bbpe/byteVocab.txt'
        punct_path = attachfile_path + "/all_punct.txt"
        #punct_path = '/3fs-jd/prod/deepseek/shared/daidamai/data/bbpe/all_punct.txt'
        Bvocab = open(Bvocab_path, 'r', encoding = 'utf-8')
        self.punct = []
        with open(punct_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
            for line in lines:
                line = line.strip()
                if line:
                    self.punct.append(line)

        self.numchars = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
        self.white_space = [' ']
        self.special_chars = set(self.numchars) | set(self.punct) | set(self.white_space)

        # ! remove chars that will be encoded to 0 (unk_id)
        unk_ch = set()
        for ch in self.special_chars:
            ids = self.sp_model.encode(ch)
            if 0 in ids:
                unk_ch.update(ch)
        self.special_chars = self.special_chars - unk_ch

        self.byte2ch = [-1] * 256
        self.ch2byte = {}
        for line in list(Bvocab.readlines())[:256]:
            tokens = line.strip().split('\t')
            self.byte2ch[int(tokens[0])] = tokens[1]
            self.ch2byte[tokens[1]] = int(tokens[0])
        self.b16_dec = {}
        self.b16_enc = ['x'] * 16
        for i in range(10):
            self.b16_dec[str(i)] = i
            self.b16_enc[i] = str(i)
        self.b16_dec['A'] = 10
        self.b16_dec['B'] = 11
        self.b16_dec['C'] = 12
        self.b16_dec['D'] = 13
        self.b16_dec['E'] = 14
        self.b16_dec['F'] = 15
        self.b16_enc[10] = 'A'
        self.b16_enc[11] = 'B'
        self.b16_enc[12] = 'C'
        self.b16_enc[13] = 'D'
        self.b16_enc[14] = 'E'
        self.b16_enc[15] = 'F'

        self.new_line_id = self.sp_model.encode(self.mapping_raw_to_256ch('\n'))[-1]

    def base16encode(self, n):
        return self.b16_enc[n // 16] + self.b16_enc[n % 16]

    def base16decode(self, s):
        return self.b16_dec[s[0]] * 16 + self.b16_dec[s[1]]

    def mapping_raw_to_256ch(self, s: str) -> str:
        mapped_s = []
        for token in s:
            if token in self.special_chars:
                mapped_s.append(token)
                continue
            tk = str(base64.b16encode(token.encode("utf-8")))[2:-1]
            num = len(tk) // 2
            for i in range(num):
                mapped_s.append(self.byte2ch[(self.base16decode(tk[2*i:2*i+2]))])
        return ''.join(mapped_s)

    def mapping_256ch_to_raw(self, s: str) -> str:
        mapped_s = ''
        for token in s:
            if token in self.ch2byte:
                mapped_s += self.base16encode(self.ch2byte[token])
            else:
                mapped_s += str(base64.b16encode(token.encode("utf-8")))[2:-1]
        # decode utf-8 string to text string
        byte_s = bytes.fromhex(mapped_s)
        if self.ignore_decode_err:
            try:
                mapped_s = byte_s.decode('utf-8')
            except UnicodeDecodeError:
                mapped_s = ''
        else:
            mapped_s = byte_s.decode('utf-8')
        return mapped_s

    def encode_line(self, s):
        if s == '\n':
            return [self.new_line_id]
        ss = self.mapping_raw_to_256ch(s)
        t = self.sp_model.encode(ss)
        return t

    def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
        assert type(s) is str
        t = []
        lines = s.split('\n')
        n_lines = len(lines)
        for i in range(n_lines):
            if i != n_lines - 1:
                line = lines[i] + '\n'
            else:
                line = lines[i]
            tt = self.encode_line(line)
            t += tt
        if bos:
            t = [self.bos_id] + t
        if eos:
            t = t + [self.eos_id]
        return t

    def get_restored_white_space(self, t):
        t = t[:3]
        if t[0] == self.bos_id:
            t = t[1:]
        decoded = self.sp_model.decode(t)
        encoded = self.sp_model.encode(decoded)
        if len(encoded) < len(t):
            return ' '
        else:
            return ''

    def decode_line(self, t):
        if len(t) == 1 and t[0] == self.new_line_id:
            return '\n'
        # ? special bug fixing for a single whitespace in the line beginning, sentencepiece will consume it, we restore it
        restored_white_space = self.get_restored_white_space(t)
        ss = self.sp_model.decode(t)
        s = restored_white_space + self.mapping_256ch_to_raw(ss)
        return s

    def decode(self, t: List[int]) -> str:
        s = ''
        new_line_indices = [index for index, value in enumerate(t) if value == self.new_line_id]
        last_idx = 0
        for i in range(len(new_line_indices)):
            line_id = t[last_idx:new_line_indices[i] + 1]
            ss = self.decode_line(line_id)
            s += ss
            last_idx = new_line_indices[i] + 1
        if last_idx < len(t):
            line_id = t[last_idx:]
            ss = self.decode_line(line_id)
            s += ss
        return s

    def add_special(self, special_tokens):
        '''
        add special tokens to the tokenizer
        '''
        spm_proto = sp_pb2_model.ModelProto()
        spm_proto.ParseFromString(self.sp_model.serialized_model_proto())
        for special_token in special_tokens:
            new_p = sp_pb2_model.ModelProto().SentencePiece()
            new_p.piece = self.mapping_raw_to_256ch(special_token)
            new_p.score = 0.0
            new_p.type = 4
            spm_proto.pieces.append(new_p)
            print(f'special token added: {special_token}')
        self.sp_model.LoadFromSerializedProto(spm_proto.SerializeToString())

class DeepSeekTokenizer(PreTrainedTokenizer):
    """
    Construct a ReplitLMTokenizer tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
    Args:
        vocab_file (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The end of sequence token.
        bos_token (`str`, *optional*, defaults to `None`):
            The begin of sequence token.
        unk_token (`str`, *optional*, defaults to `"<|unk|>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<|pad|>"`):
            The token used for padding, for example when batching sequences of different lengths.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:
            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                using forward-filtering-and-backward-sampling algorithm.
            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.
    """
    vocab_files_names = VOCAB_FILES_NAMES
    prefix_tokens: List[int] = []
    model_input_names = ['input_ids', 'attention_mask']

    def __init__(self, vocab_file, bos_token="<s>", eos_token='</s>', unk_token=None, pad_token=None, sep_token='</s>', sp_model_kwargs: Optional[Dict[str, Any]]=None, name_or_path=None, **kwargs) -> None:
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        super().__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, pad_token=pad_token, sep_token=sep_token, sp_model_kwargs=self.sp_model_kwargs, **kwargs)
        #obtain the current directory of py
        vocab_path = name_or_path
        print("vocab_path: ", vocab_path)
        self.vocab_path = vocab_path
        self.vocab_file = vocab_path + '/tokenizer.model'
        self.token = LineBBPETokenizer(model_path=self.vocab_file, attachfile_path=vocab_path, ignore_decode_err=True)

    @property
    def vocab_size(self):
        return self.token.sp_model.get_piece_size()

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def __getstate__(self):
        state = self.__dict__.copy()
        state['token'] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d
        if not hasattr(self, 'sp_model_kwargs'):
            self.sp_model_kwargs = {}
        self.token = LineBBPETokenizer(model_path=self.vocab_file, attachfile_path=self.vocab_path)

    def _tokenize(self, text: str) -> List[str]:
        """Take as input a string and return a list of strings (tokens) for words/sub-words"""
        token_ids = self.token.encode(text, bos=True, eos=False)
        string_tokens = [self._convert_id_to_token(token_id) for token_id in token_ids]
        return string_tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.token.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        token = self.token.sp_model.id_to_piece(index)
        return token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        ids = [self._convert_token_to_id(token) for token in tokens]
        return self.token.decode(ids)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str]=None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            raise ValueError(f'Vocabulary path ({save_directory}) should be a directory')
        out_vocab_file = os.path.join(save_directory, (filename_prefix + '-' if filename_prefix else '') + VOCAB_FILES_NAMES['vocab_file'])
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, 'wb') as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)
        return (out_vocab_file,)
```
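For reference, the core idea in the removed `LineBBPETokenizer` is byte-level tokenization on top of SentencePiece: every character that is not a digit, punctuation mark, or space is expanded into its UTF-8 bytes, and each byte is remapped to one of 256 printable characters taken from `byteVocab.txt`; decoding inverts the mapping and reassembles the bytes. The sketch below reproduces that roundtrip with a hypothetical stand-in byte table (the real `byteVocab.txt` is not in this repository) and omits the special-character pass-through and the intermediate base-16 string used by the removed code.

```python
# Minimal sketch of the byte <-> printable-character mapping behind the removed
# mapping_raw_to_256ch / mapping_256ch_to_raw methods. The byte table here is a
# hypothetical stand-in (Unicode Private Use Area characters), not byteVocab.txt.
BYTE2CH = [chr(0xE000 + i) for i in range(256)]
CH2BYTE = {ch: i for i, ch in enumerate(BYTE2CH)}

def raw_to_256ch(s: str) -> str:
    """Replace every character by the mapped forms of its UTF-8 bytes."""
    return ''.join(BYTE2CH[b] for ch in s for b in ch.encode('utf-8'))

def ch256_to_raw(s: str) -> str:
    """Invert the mapping: recover the byte values and decode them as UTF-8."""
    return bytes(CH2BYTE[ch] for ch in s).decode('utf-8')

text = "def f(x):\n    return x  # 深度求索\n"
assert ch256_to_raw(raw_to_256ch(text)) == text
```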
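The removed `encode` also works line by line: the input is split on `'\n'`, each line is encoded together with its trailing newline, and BOS/EOS ids are added around the concatenated result (decoding later splits the id sequence again at the newline token so the per-line whitespace fix can be applied). A model-free sketch of that control flow, with `encode_line`, `bos_id`, and `eos_id` supplied by the caller as stand-ins for the removed tokenizer's own:

```python
from typing import Callable, List

def encode_by_lines(text: str, encode_line: Callable[[str], List[int]],
                    bos_id: int, eos_id: int, bos: bool = True, eos: bool = False) -> List[int]:
    # Split on newlines; every line except the last keeps its trailing '\n',
    # mirroring the loop in the removed LineBBPETokenizer.encode.
    ids: List[int] = []
    lines = text.split('\n')
    for i, line in enumerate(lines):
        if i != len(lines) - 1:
            line = line + '\n'
        ids += encode_line(line)
    if bos:
        ids = [bos_id] + ids
    if eos:
        ids = ids + [eos_id]
    return ids

# Toy per-line encoder: map each character to its code point.
print(encode_by_lines("a\nb", lambda line: [ord(c) for c in line], bos_id=1, eos_id=2))
# -> [1, 97, 10, 98]
```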
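Finally, the removed `add_special` method grows the vocabulary by editing the serialized SentencePiece model proto; note that it references `sp_pb2_model`, which the deleted file mentioned only in a commented-out debug block and never imported, so the method could not have run as written. Below is a self-contained sketch of that technique using the protobuf module bundled with the `sentencepiece` package; the model path in the usage comment is a placeholder.

```python
from typing import List

import sentencepiece as spm
from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model

def add_user_defined_tokens(sp: spm.SentencePieceProcessor, tokens: List[str]) -> None:
    """Append USER_DEFINED pieces to an already-loaded SentencePieceProcessor."""
    proto = sp_pb2_model.ModelProto()
    proto.ParseFromString(sp.serialized_model_proto())
    existing = {p.piece for p in proto.pieces}
    for tok in tokens:
        if tok in existing:
            continue
        piece = sp_pb2_model.ModelProto.SentencePiece()
        piece.piece = tok
        piece.score = 0.0
        piece.type = 4  # USER_DEFINED in sentencepiece_model.proto
        proto.pieces.append(piece)
    # Reload the processor from the modified proto.
    sp.LoadFromSerializedProto(proto.SerializeToString())

# Usage (placeholder model path; requires a real SentencePiece model file):
# sp = spm.SentencePieceProcessor(model_file="tokenizer.model")
# add_user_defined_tokens(sp, ["<|fim_begin|>", "<|fim_end|>"])
```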