mashirong committed on
Commit
e346937
1 Parent(s): 5a3cf15

Remove unused file

Files changed (1)
  1. tokenization_deepseek.py +0 -328
tokenization_deepseek.py DELETED
@@ -1,328 +0,0 @@
- # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- """
- Forked from the file src/transformers/models/bert_generation/tokenization_bert_generation.py from the HuggingFace Transformers library.
- Permalink: https://github.com/huggingface/transformers/blob/04ab5605fbb4ef207b10bf2772d88c53fc242e83/src/transformers/models/bert_generation/tokenization_bert_generation.py
- Tokenizer classes for DeepSeek models.
- The classes are adapted from the Replit Code V1 3B tokenizer for compatibility with a custom vocabulary and to achieve the desired encode/decode behavior.
- """
- import os
- import sentencepiece as spm
- from sentencepiece import SentencePieceProcessor
- from shutil import copyfile
- from transformers import PreTrainedTokenizer
- from typing import Any, Dict, List, Optional, Tuple
- import base64
-
- VOCAB_FILES_NAMES = {'vocab_file': 'spiece.model'}
-
- class Tokenizer:
-     def __init__(self, model_path="/weka-jd/prod/deepseek/permanent/shared/mingchuan/llama_data/tokenizer.model"):
-         # reload tokenizer
-         assert os.path.isfile(model_path), model_path
-         self.sp_model = SentencePieceProcessor(model_file=model_path)
-
-         # # ? print spm for debugging
-         # spm_proto = sp_pb2_model.ModelProto()
-         # spm_proto.ParseFromString(self.sp_model.serialized_model_proto())
-         # print(dir(spm_proto))
-         # attrs = ['denormalizer_spec', 'normalizer_spec', 'trainer_spec']
-         # print('=======' * 5)
-         # for attr in attrs:
-         # print('=======', attr, '=======')
-         # print(getattr(spm_proto, attr))
-
-         # BOS / EOS token IDs
-         self.n_words: int = self.sp_model.vocab_size()
-         self.bos_id: int = self.sp_model.bos_id()
-         self.eos_id: int = self.sp_model.eos_id()
-         self.pad_id: int = self.sp_model.pad_id()
-         assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()
-
-     def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
-         assert type(s) is str
-         t = self.sp_model.encode(s)
-         if bos:
-             t = [self.bos_id] + t
-         if eos:
-             t = t + [self.eos_id]
-         return t
-
-     def decode(self, t: List[int]) -> str:
-         return self.sp_model.decode(t)
-
- class LineBBPETokenizer(Tokenizer):
-     def __init__(self,
-                  model_path="/3fs-jd/prod/deepseek/shared/daidamai/data/bbpe/spm_0717_final/100000/bbpe_full_bytes.model",
-                  ignore_decode_err=False, attachfile_path=None):
-         super().__init__(model_path=model_path)
-         self.ignore_decode_err = ignore_decode_err
-         Bvocab_path = attachfile_path + "/byteVocab.txt"
-         #'/3fs-jd/prod/deepseek/shared/daidamai/data/bbpe/byteVocab.txt'
-         punct_path = attachfile_path + "/all_punct.txt"
-         #punct_path = '/3fs-jd/prod/deepseek/shared/daidamai/data/bbpe/all_punct.txt'
-         Bvocab = open(Bvocab_path, 'r', encoding = 'utf-8')
-         self.punct = []
-         with open(punct_path, 'r', encoding='utf-8') as f:
-             lines = f.readlines()
-             for line in lines:
-                 line = line.strip()
-                 if line:
-                     self.punct.append(line)
-
-         self.numchars = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
-         self.white_space = [' ']
-         self.special_chars = set(self.numchars) | set(self.punct) | set(self.white_space)
-
-         # ! remove chars that will be encoded to 0 (unk_id)
-         unk_ch = set()
-         for ch in self.special_chars:
-             ids = self.sp_model.encode(ch)
-             if 0 in ids:
-                 unk_ch.update(ch)
-         self.special_chars = self.special_chars - unk_ch
-
-         self.byte2ch = [-1] * 256
-         self.ch2byte = {}
-         for line in list(Bvocab.readlines())[:256]:
-             tokens = line.strip().split('\t')
-             self.byte2ch[int(tokens[0])] = tokens[1]
-             self.ch2byte[tokens[1]] = int(tokens[0])
-         self.b16_dec = {}
-         self.b16_enc = ['x'] * 16
-         for i in range(10):
-             self.b16_dec[str(i)] = i
-             self.b16_enc[i] = str(i)
-         self.b16_dec['A'] = 10
-         self.b16_dec['B'] = 11
-         self.b16_dec['C'] = 12
-         self.b16_dec['D'] = 13
-         self.b16_dec['E'] = 14
-         self.b16_dec['F'] = 15
-         self.b16_enc[10] = 'A'
-         self.b16_enc[11] = 'B'
-         self.b16_enc[12] = 'C'
-         self.b16_enc[13] = 'D'
-         self.b16_enc[14] = 'E'
-         self.b16_enc[15] = 'F'
-
-         self.new_line_id = self.sp_model.encode(self.mapping_raw_to_256ch('\n'))[-1]
-
-     def base16encode(self, n):
-         return self.b16_enc[n // 16] + self.b16_enc[n % 16]
-
-     def base16decode(self, s):
-         return self.b16_dec[s[0]] * 16 + self.b16_dec[s[1]]
-
-     def mapping_raw_to_256ch(self, s: str) -> str:
-         mapped_s = []
-         for token in s:
-             if token in self.special_chars:
-                 mapped_s.append(token)
-                 continue
-             tk = str(base64.b16encode(token.encode("utf-8")))[2:-1]
-             num = len(tk) // 2
-             for i in range(num):
-                 mapped_s.append(self.byte2ch[(self.base16decode(tk[2*i:2*i+2]))])
-         return ''.join(mapped_s)
-
-     def mapping_256ch_to_raw(self, s: str) -> str:
-         mapped_s = ''
-         for token in s:
-             if token in self.ch2byte:
-                 mapped_s += self.base16encode(self.ch2byte[token])
-             else:
-                 mapped_s += str(base64.b16encode(token.encode("utf-8")))[2:-1]
-         # decode utf-8 string to text string
-         byte_s = bytes.fromhex(mapped_s)
-         if self.ignore_decode_err:
-             try:
-                 mapped_s = byte_s.decode('utf-8')
-             except UnicodeDecodeError:
-                 mapped_s = ''
-         else:
-             mapped_s = byte_s.decode('utf-8')
-         return mapped_s
-
-     def encode_line(self, s):
-         if s == '\n':
-             return [self.new_line_id]
-         ss = self.mapping_raw_to_256ch(s)
-         t = self.sp_model.encode(ss)
-         return t
-
-     def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
-         assert type(s) is str
-         t = []
-         lines = s.split('\n')
-         n_lines = len(lines)
-         for i in range(n_lines):
-             if i != n_lines - 1:
-                 line = lines[i] + '\n'
-             else:
-                 line = lines[i]
-             tt = self.encode_line(line)
-             t += tt
-         if bos:
-             t = [self.bos_id] + t
-         if eos:
-             t = t + [self.eos_id]
-         return t
-
-     def get_restored_white_space(self, t):
-         t = t[:3]
-         if t[0] == self.bos_id:
-             t = t[1:]
-         decoded = self.sp_model.decode(t)
-         encoded = self.sp_model.encode(decoded)
-         if len(encoded) < len(t):
-             return ' '
-         else:
-             return ''
-
-     def decode_line(self, t):
-         if len(t) == 1 and t[0] == self.new_line_id:
-             return '\n'
-         # ? special fix: sentencepiece consumes a single whitespace at the beginning of a line, so we restore it here
-         restored_white_space = self.get_restored_white_space(t)
-         ss = self.sp_model.decode(t)
-         s = restored_white_space + self.mapping_256ch_to_raw(ss)
-         return s
-
-     def decode(self, t: List[int]) -> str:
-         s = ''
-         new_line_indices = [index for index, value in enumerate(t) if value == self.new_line_id]
-         last_idx = 0
-         for i in range(len(new_line_indices)):
-             line_id = t[last_idx:new_line_indices[i] + 1]
-             ss = self.decode_line(line_id)
-             s += ss
-             last_idx = new_line_indices[i] + 1
-         if last_idx < len(t):
-             line_id = t[last_idx:]
-             ss = self.decode_line(line_id)
-             s += ss
-         return s
-
-     def add_special(self, special_tokens):
-         '''
-         add special tokens to the tokenizer
-         '''
-         spm_proto = sp_pb2_model.ModelProto()
-         spm_proto.ParseFromString(self.sp_model.serialized_model_proto())
-         for special_token in special_tokens:
-             new_p = sp_pb2_model.ModelProto().SentencePiece()
-             new_p.piece = self.mapping_raw_to_256ch(special_token)
-             new_p.score = 0.0
-             new_p.type = 4
-             spm_proto.pieces.append(new_p)
-             print(f'special token added: {special_token}')
-         self.sp_model.LoadFromSerializedProto(spm_proto.SerializeToString())
-
- class DeepSeekTokenizer(PreTrainedTokenizer):
-     """
-     Construct a DeepSeekTokenizer tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
-     This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
-     Args:
-         vocab_file (`str`):
-             [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
-             contains the vocabulary necessary to instantiate a tokenizer.
-         eos_token (`str`, *optional*, defaults to `"</s>"`):
-             The end-of-sequence token.
-         bos_token (`str`, *optional*, defaults to `"<s>"`):
-             The beginning-of-sequence token.
-         unk_token (`str`, *optional*, defaults to `None`):
-             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-             token instead.
-         pad_token (`str`, *optional*, defaults to `None`):
-             The token used for padding, for example when batching sequences of different lengths.
-         sp_model_kwargs (`dict`, *optional*):
-             Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
-             SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
-             to set:
-             - `enable_sampling`: Enable subword regularization.
-             - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
-               - `nbest_size = {0,1}`: No sampling is performed.
-               - `nbest_size > 1`: samples from the nbest_size results.
-               - `nbest_size < 0`: assuming that nbest_size is infinite and samples from all hypotheses (lattice)
-                 using the forward-filtering-and-backward-sampling algorithm.
-             - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
-               BPE-dropout.
-     """
-     vocab_files_names = VOCAB_FILES_NAMES
-     prefix_tokens: List[int] = []
-     model_input_names = ['input_ids', 'attention_mask']
-
-     def __init__(self, vocab_file, bos_token="<s>", eos_token='</s>', unk_token=None, pad_token=None, sep_token='</s>', sp_model_kwargs: Optional[Dict[str, Any]]=None, name_or_path=None, **kwargs) -> None:
-         self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
-         super().__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, pad_token=pad_token, sep_token=sep_token, sp_model_kwargs=self.sp_model_kwargs, **kwargs)
-         # resolve the directory that holds the tokenizer model and its attachment files
-         vocab_path = name_or_path
-         print("vocab_path: ", vocab_path)
-         self.vocab_path = vocab_path
-         self.vocab_file = vocab_path + '/tokenizer.model'
-         self.token = LineBBPETokenizer(model_path=self.vocab_file, attachfile_path=vocab_path, ignore_decode_err=True)
-
-     @property
-     def vocab_size(self):
-         return self.token.sp_model.get_piece_size()
-
-     def get_vocab(self):
-         vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
-         vocab.update(self.added_tokens_encoder)
-         return vocab
-
-     def __getstate__(self):
-         state = self.__dict__.copy()
-         state['token'] = None
-         return state
-
-     def __setstate__(self, d):
-         self.__dict__ = d
-         if not hasattr(self, 'sp_model_kwargs'):
-             self.sp_model_kwargs = {}
-         self.token = LineBBPETokenizer(model_path=self.vocab_file, attachfile_path=self.vocab_path)
-
-     def _tokenize(self, text: str) -> List[str]:
-         """Take as input a string and return a list of strings (tokens) for words/sub-words"""
-         token_ids = self.token.encode(text, bos=True, eos=False)
-         string_tokens = [self._convert_id_to_token(token_id) for token_id in token_ids]
-         return string_tokens
-
-     def _convert_token_to_id(self, token):
-         """Converts a token (str) to an id using the vocab."""
-         return self.token.sp_model.piece_to_id(token)
-
-     def _convert_id_to_token(self, index):
-         """Converts an index (integer) to a token (str) using the vocab."""
-         token = self.token.sp_model.id_to_piece(index)
-         return token
-
-     def convert_tokens_to_string(self, tokens):
-         """Converts a sequence of tokens (string) into a single string."""
-         ids = [self._convert_token_to_id(token) for token in tokens]
-         return self.token.decode(ids)
-
-     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str]=None) -> Tuple[str]:
-         if not os.path.isdir(save_directory):
-             raise ValueError(f'Vocabulary path ({save_directory}) should be a directory')
-         out_vocab_file = os.path.join(save_directory, (filename_prefix + '-' if filename_prefix else '') + VOCAB_FILES_NAMES['vocab_file'])
-         if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
-             copyfile(self.vocab_file, out_vocab_file)
-         elif not os.path.isfile(self.vocab_file):
-             with open(out_vocab_file, 'wb') as fi:
-                 content_spiece_model = self.token.sp_model.serialized_model_proto()
-                 fi.write(content_spiece_model)
-         return (out_vocab_file,)
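
For context, the removed file wrapped a SentencePiece-backed, line-wise byte-level BPE tokenizer (LineBBPETokenizer) inside a Hugging Face DeepSeekTokenizer. A minimal usage sketch of the removed class follows; it is illustrative only. The directory name "./deepseek_tokenizer" and its contents (tokenizer.model, byteVocab.txt, all_punct.txt) are assumptions, not something recorded in this commit; the class located those attachment files via name_or_path.

    # Hypothetical sketch: exercising the removed DeepSeekTokenizer directly.
    # "./deepseek_tokenizer" is an assumed local directory containing
    # tokenizer.model, byteVocab.txt and all_punct.txt.
    from tokenization_deepseek import DeepSeekTokenizer

    tok = DeepSeekTokenizer(
        vocab_file=None,                      # accepted by the signature but unused; paths come from name_or_path
        name_or_path="./deepseek_tokenizer",  # directory with the model and attachment files
    )

    text = "def add(a, b):\n    return a + b\n"
    ids = tok.token.encode(text, bos=True, eos=False)   # line-by-line byte-level BPE encoding
    print(ids)
    print(tok.token.decode(ids))                        # should round-trip back to `text`

In the repository itself the class would presumably have been reached through AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True), which passes the repository path in as name_or_path so the attachment files can be resolved.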