fnlp
/

txsun committed on
Commit
002f447
·
1 Parent(s): d7170f1

Upload tokenization_moss.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. tokenization_moss.py +380 -0
tokenization_moss.py ADDED
@@ -0,0 +1,380 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tokenization classes for Moss"""
2
+
3
+ import json
4
+ import os
5
+ import numpy as np
6
+ import regex as re
7
+
8
+ from functools import lru_cache
9
+ from typing import TYPE_CHECKING, List, Optional, Tuple, Union
10
+
11
+ from transformers.utils import is_tf_available, is_torch_available, logging
12
+ from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
13
+
14
+
15
+ if TYPE_CHECKING:
16
+ if is_torch_available():
17
+ import torch
18
+ if is_tf_available():
19
+ import tensorflow as tf
20
+
21
+
22
+ logger = logging.get_logger(__name__)
23
+
# File names under which the vocabulary and the BPE merge table are stored.
VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
}

# Hub checkpoints that ship this tokenizer; the order is the iteration order of
# both lookup tables built below.
_MOSS_CHECKPOINTS = (
    "fnlp/moss-moon-003-base",
    "fnlp/moss-moon-003-sft",
    "fnlp/moss-moon-003-sft-plugin",
    "fnlp/moss-moon-003-sft-int8",
    "fnlp/moss-moon-003-sft-plugin-int8",
    "fnlp/moss-moon-003-sft-int4",
    "fnlp/moss-moon-003-sft-plugin-int4",
)

# For each tokenizer file kind, the download URL of that file for every checkpoint.
PRETRAINED_VOCAB_FILES_MAP = {
    file_key: {
        checkpoint: f"https://huggingface.co/{checkpoint}/resolve/main/{file_name}"
        for checkpoint in _MOSS_CHECKPOINTS
    }
    for file_key, file_name in VOCAB_FILES_NAMES.items()
}

# Size recorded per checkpoint; exposed on the tokenizer class as `max_model_input_sizes`.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = dict.fromkeys(_MOSS_CHECKPOINTS, 2048)
59
+
60
+
@lru_cache()
def bytes_to_unicode():
    """
    Build the reversible table that maps every byte value (0-255) to a single unicode character.

    Byte-level BPE operates on unicode strings, so each raw byte needs a stand-in character. Bytes that are
    already printable, non-whitespace characters map to themselves; every remaining byte (whitespace, control
    characters, ...) is assigned the next unused code point starting at 256, so no byte ever maps to a character
    the BPE code would choke on.

    Returns:
        `dict`: 256 entries, `{byte_value: character}`. The result is memoized by `lru_cache`, so every call
        returns the same dictionary object.
    """
    # Byte values whose own character is safe to use as-is.
    printable = [
        *range(ord("!"), ord("~") + 1),
        *range(ord("¡"), ord("¬") + 1),
        *range(ord("®"), ord("ÿ") + 1),
    ]
    byte_values = list(printable)
    code_points = list(printable)

    # Remaining bytes are remapped, in ascending order, to 256, 257, ...
    shift = 0
    for byte in range(256):
        if byte in printable:
            continue
        byte_values.append(byte)
        code_points.append(256 + shift)
        shift += 1

    return {byte: chr(point) for byte, point in zip(byte_values, code_points)}
84
+
85
+
def get_pairs(word):
    """
    Return the set of adjacent symbol pairs in a word.

    Args:
        word: Sequence of symbols (symbols being variable-length strings), typically a tuple. A plain string is
            treated as a sequence of single-character symbols.

    Returns:
        `set`: Every `(left, right)` pair of neighbouring symbols. Empty when the word has fewer than two
        symbols, including an empty word.
    """
    # Zipping the word with itself shifted by one yields each neighbouring pair and
    # naturally produces nothing for words of length 0 or 1.
    return set(zip(word, word[1:]))
98
+
99
+
class MossTokenizer(PreTrainedTokenizer):
    """
    Construct a Moss tokenizer. Based on byte-level Byte-Pair-Encoding.

    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
    be encoded differently depending on whether it is at the beginning of the sentence (without space) or not.

    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
    call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.

    <Tip>

    When used with `is_split_into_words=True`, this tokenizer will add a space before each word (even the first one).

    </Tip>

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        errors (`str`, *optional*, defaults to `"replace"`):
            Paradigm to follow when decoding bytes to UTF-8. See
            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
        unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str`, *optional*, defaults to `<|endoftext|>`):
            The beginning of sequence token.
        eos_token (`str`, *optional*, defaults to `<eom>`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `None`):
            The token used for padding. No padding token is set when `None`.
        add_prefix_space (`bool`, *optional*, defaults to `False`):
            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
            other word. (Moss tokenizer detect beginning of words by the preceding space).
        add_bos_token (`bool`, *optional*, defaults to `False`):
            Whether or not `build_inputs_with_special_tokens` prepends the beginning of sequence token to each
            sequence.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        merges_file,
        errors="replace",
        unk_token="<|endoftext|>",
        bos_token="<|endoftext|>",
        eos_token="<eom>",
        pad_token=None,
        add_prefix_space=False,
        add_bos_token=False,
        **kwargs,
    ):
        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token

        # All vocabulary state is set up BEFORE calling the base initializer: base-class initializers may query
        # `get_vocab()` / `vocab_size` while registering special tokens, which requires `self.encoder` to exist.
        self.add_bos_token = add_bos_token

        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.errors = errors  # how to handle errors in decoding
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        with open(merges_file, encoding="utf-8") as merges_handle:
            # Skip the first line (version header) and the last split element (empty after the trailing newline).
            bpe_merges = merges_handle.read().split("\n")[1:-1]
        bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
        # Lower rank means the merge is applied earlier.
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        self.cache = {}  # memoizes `bpe()` results per pre-token
        self.add_prefix_space = add_prefix_space

        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

        super().__init__(
            errors=errors,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            add_prefix_space=add_prefix_space,
            add_bos_token=add_bos_token,
            **kwargs,
        )

    @property
    def vocab_size(self):
        """Number of entries in the base vocabulary (added tokens excluded)."""
        return len(self.encoder)

    def get_vocab(self):
        """Return the token -> id mapping of the base vocabulary merged with the added tokens."""
        return dict(self.encoder, **self.added_tokens_encoder)

    def bpe(self, token):
        """
        Apply the learned BPE merges to one pre-token.

        Args:
            token (`str`): A pre-token whose bytes were already mapped to unicode characters.

        Returns:
            `str`: The resulting sub-word symbols joined by single spaces. A token with no symbol pair is
            returned unchanged.
        """
        if token in self.cache:
            return self.cache[token]
        word = tuple(token)
        pairs = get_pairs(word)

        if not pairs:
            return token

        while True:
            # Pick the adjacent pair with the best (lowest) merge rank; unknown pairs rank as infinity.
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    # No further occurrence of `first`: copy the remainder as is.
                    new_word.extend(word[i:])
                    break
                else:
                    new_word.extend(word[i:j])
                    i = j

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = " ".join(word)
        self.cache[token] = word
        return word

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from one or two id sequences, prepending the BOS id to each sequence when
        `add_bos_token` is enabled.
        """
        if self.add_bos_token:
            bos_token_ids = [self.bos_token_id]
        else:
            bos_token_ids = []

        output = bos_token_ids + token_ids_0

        if token_ids_1 is None:
            return output

        return output + bos_token_ids + token_ids_1

    def _tokenize(self, text):
        """Tokenize a string."""
        bpe_tokens = []
        for token in re.findall(self.pat, text):
            token = "".join(
                self.byte_encoder[b] for b in token.encode("utf-8")
            )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
        return bpe_tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.decoder.get(index)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        text = "".join(tokens)
        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
        return text

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Write the vocabulary and merges files into `save_directory`.

        Args:
            save_directory (`str`): Existing directory to write into.
            filename_prefix (`str`, *optional*): Prefix prepended, followed by `-`, to both file names.

        Returns:
            The paths `(vocab_file, merge_file)`. If `save_directory` is not a directory, an error is logged and
            `None` is returned.
        """
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )
        merge_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        index = 0
        with open(merge_file, "w", encoding="utf-8") as writer:
            writer.write("#version: 0.2\n")
            # Merges are written in rank order; a gap in the ranks is reported once per gap.
            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
                        " Please check that the tokenizer is not corrupted!"
                    )
                    index = token_index
                writer.write(" ".join(bpe_tokens) + "\n")
                index += 1

        return vocab_file, merge_file

    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
        """
        Prepend a space to `text` when `is_split_into_words` is set or prefix-space mode is active (taken from the
        `add_prefix_space` keyword if given, otherwise from the instance setting). Returns `(text, kwargs)` with
        `add_prefix_space` removed from `kwargs`.
        """
        add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
        if is_split_into_words or add_prefix_space:
            text = " " + text
        return (text, kwargs)

    def decode(
        self,
        token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: Optional[bool] = None,
        truncate_before_pattern: Optional[List[str]] = None,
        **kwargs,
    ) -> str:
        """
        Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special
        tokens and clean up tokenization spaces.

        Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.

        Args:
            token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Can be obtained using the `__call__` method.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (`bool`, *optional*):
                Whether or not to clean up the tokenization spaces. If `None`, will default to
                `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
            truncate_before_pattern (`List[str]`, *optional*, defaults to `None`):
                A list of regular expression strings that will be used to truncate the returned string. This can be
                used to remove extra pieces of code (e.g. truncate if observing a comment symbol "#" at the beginning
                of a new line). An example pattern could be `["^#", re.escape("<|endoftext|>"), "^'''", "\n\n\n"]`.
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model specific decode method.

        Returns:
            `str`: The decoded sentence.
        """
        # Function-scope import: only this method needs the helper.
        from transformers.utils import to_py_obj

        # This override bypasses the base `decode`, so array/tensor ids are converted to plain Python
        # objects here before being handed to `_decode`.
        token_ids = to_py_obj(token_ids)

        decoded_text = super()._decode(
            token_ids=token_ids,
            skip_special_tokens=skip_special_tokens,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )

        if truncate_before_pattern is not None and len(truncate_before_pattern) > 0:
            decoded_text = self.truncate(decoded_text, truncate_before_pattern)

        return decoded_text

    def truncate(self, completion, truncate_before_pattern):
        """
        Cut a generated completion short.

        The text is first cut before the second line starting with `print`, then before the second line starting
        with `def`, and finally before the earliest match of any pattern in `truncate_before_pattern`.

        Args:
            completion (`str`): Text to truncate.
            truncate_before_pattern (`List[str]`): Regular expressions, compiled in multi-line mode.

        Returns:
            `str`: The truncated text, or the text after the `print`/`def` cuts when no pattern matches.
        """
        terminals = [re.compile(pattern, re.MULTILINE) for pattern in truncate_before_pattern]

        prints = list(re.finditer("^print", completion, re.MULTILINE))
        if len(prints) > 1:
            completion = completion[: prints[1].start()]

        defs = list(re.finditer("^def", completion, re.MULTILINE))
        if len(defs) > 1:
            completion = completion[: defs[1].start()]

        # Start offset of each terminal pattern's first match, for the patterns that match at all.
        matches = (terminal.search(completion) for terminal in terminals)
        terminals_pos = [match.start() for match in matches if match is not None]

        if terminals_pos:
            return completion[: min(terminals_pos)]
        return completion