Tonic committed
Commit 455fa04 · 1 Parent(s): 6cebfe1

Delete tokenization_xgen.py

Files changed (1)
  1. tokenization_xgen.py +0 -246
tokenization_xgen.py DELETED
@@ -1,246 +0,0 @@
- # Copyright (c) 2023, salesforce.com, inc.
- # All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
- # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/Apache-2.0
- """Tokenization classes for xgen."""
-
- from typing import List, Optional
-
- from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
- from transformers.utils import logging
-
- try:
-     import tiktoken
- except ModuleNotFoundError as e:
-     raise ModuleNotFoundError("XGen requires the installation of tiktoken. Please install it via `pip install tiktoken`.") from e
-
-
- logger = logging.get_logger(__name__)
-
- MAX_MODEL_INPUT_SIZES = {
-     "Salesforce/xgen-7b-4k-base": 4096,
-     "Salesforce/xgen-7b-8k-base": 8192,
-     "Salesforce/xgen-7b-4k-inst": 4096,
-     "Salesforce/xgen-7b-8k-inst": 8192
- }
-
-
- def tiktoken_tokenizer(base="gpt2", pad_token=None, add_special=True):
-     if not add_special:
-         return tiktoken.get_encoding(base)
-
-     def include_whitespace(n_min=2, n_max=20):
-         whitespaces = [" " * n for n in reversed(range(n_min, n_max))]
-         return whitespaces
-
-     def include_tabs(n_min=2, n_max=20):
-         tabs = ["\t" * n for n in reversed(range(n_min, n_max))]
-         return tabs
-
-     def include_fim_tokens():
-         fim_tokens = [
-             "<fim_prefix>",
-             "<fim_middle>",
-             "<fim_suffix>",
-             "<fim_pad>",
-             "<filename>",
-             "<gh_stars>",
-             "<issue_start>",
-             "<issue_comment>",
-             "<issue_closed>",
-             "<jupyter_start>",
-             "<jupyter_text>",
-             "<jupyter_code>",
-             "<jupyter_output>",
-             "<empty_output>",
-             "<commit_before>",
-             "<commit_msg>",
-             "<commit_after>",
-             "<reponame>"
-         ]
-         return fim_tokens
-
-     def include_additional_tokens():
-         tokens = []
-         tokens += [f"<dummy_{i}>" for i in range(4)]
-         tokens.append("<sep>") # 50317
-         tokens.append("<eom>") # 50318
-         tokens += [f"<mask_{i}>" for i in reversed(range(1, 51199-50318+1))]
-         return tokens
-
-     add_whitespaces = include_whitespace(n_min=2, n_max=32)
-     add_tabs = include_tabs(n_min=2, n_max=10)
-     fim_tokens = include_fim_tokens()
-     additional_tokens = include_additional_tokens()
-
-     tokenizer = tiktoken.get_encoding(base)
-
-     idx = tokenizer.n_vocab
-
-     bpe_ranks = tokenizer._mergeable_ranks
-
-     for wsp in add_whitespaces:
-         bpe_ranks[bytes(wsp, 'ascii')] = idx
-         idx += 1
-     for t in add_tabs:
-         bpe_ranks[bytes(t, 'ascii')] = idx
-         idx += 1
-
-     special_tokens = dict()
-
-     for sp in fim_tokens:
-         special_tokens[sp] = idx
-         idx += 1
-     for sp in additional_tokens:
-         special_tokens[sp] = idx
-         idx += 1
-
-     if pad_token and pad_token not in tokenizer._special_tokens and pad_token not in special_tokens:
-         special_tokens[pad_token] = idx
-         idx += 1
-     # In production, load the arguments directly instead of accessing private attributes
-     # See openai_public.py for examples of arguments for specific encodings
-     enc = tiktoken.Encoding(
-         # If you're changing the set of special tokens, make sure to use a different name
-         # It should be clear from the name what behaviour to expect.
-         name=base.replace("base", "im"),
-         pat_str=tokenizer._pat_str,
-         mergeable_ranks=bpe_ranks,
-         special_tokens={
-             **tokenizer._special_tokens,
-             **special_tokens
-         }
-     )
-     return enc
-
-
- class XgenTokenizer(PreTrainedTokenizer):
-     """
-     Construct an XGen tokenizer. Based on byte-level Byte-Pair-Encoding (tiktoken).
-     Args:
-         pad_token (`str`, *optional*):
-             Padding token added to the vocabulary.
-         eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
-             End-of-sequence token.
-         add_eos_token (`bool`, *optional*, defaults to `False`):
-             Whether to append the EOS token to sequences.
-         add_special_tokens (`bool`, *optional*, defaults to `True`):
-             Whether to extend the base encoding with the additional special tokens.
-     """
-     max_model_input_sizes = MAX_MODEL_INPUT_SIZES
-     model_input_names = ["input_ids", "attention_mask"]
-
-     def __init__(
-         self,
-         pad_token=None,
-         eos_token="<|endoftext|>",
-         add_eos_token=False,
-         add_special_tokens=True,
-         **kwargs,
-     ):
-         pad_token_added = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
-         eos_token_added = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
-         self.add_eos_token = add_eos_token
-         self.encoder = tiktoken_tokenizer(base="gpt2", pad_token=pad_token, add_special=add_special_tokens)
-         super().__init__(
-             pad_token=pad_token_added,
-             eos_token=eos_token_added,
-             add_eos_token=add_eos_token,
-             add_special_tokens=add_special_tokens,
-             **kwargs,
-         )
-
-     @property
-     def vocab_size(self):
-         """Returns vocab size"""
-         return self.encoder.n_vocab
-
-     def get_vocab(self):
-         """Returns vocab as a dict"""
-         vocab = {self.encoder.decode_single_token_bytes(i): i for i in range(self.vocab_size)}
-         return vocab
-
-     def _tokenize(self, text, **kwargs):
-         """Returns a tokenized string."""
-         return self.encoder.encode(text, allowed_special="all")
-
-     def _convert_token_to_id(self, token):
-         """Converts a token (str) to an id using the vocab."""
-         if isinstance(token, str):
-             return self.encoder.encode_single_token(token)
-         else:
-             return token
-
-     def _convert_id_to_token(self, index):
-         """Converts an index (integer) to a token (str) using the vocab."""
-         return self.encoder.decode_single_token_bytes(index).decode("utf-8")
-
-     def _decode(self, token_ids: List[int], skip_special_tokens: bool = False, **kwargs):
-         if skip_special_tokens:
-             token_ids = [t for t in token_ids if t not in self.all_special_ids]
-         return self.encoder.decode(token_ids)
-
-     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
-         """Build model inputs from a sequence by appending eos_token_id."""
-         eos_token_id = [self.eos_token_id] if self.add_eos_token else []
-
-         output = token_ids_0 + eos_token_id
-
-         if token_ids_1 is not None:
-             output = output + token_ids_1 + eos_token_id
-
-         return output
-
-     def get_special_tokens_mask(
-         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None,
-         already_has_special_tokens: bool = False
-     ) -> List[int]:
-         """
-         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-         special tokens using the tokenizer `prepare_for_model` method.
-         Args:
-             token_ids_0 (`List[int]`):
-                 List of IDs.
-             token_ids_1 (`List[int]`, *optional*):
-                 Optional second list of IDs for sequence pairs.
-             already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                 Whether the token list is already formatted with special tokens for the model.
-         Returns:
-             `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-         """
-         if already_has_special_tokens:
-             return super().get_special_tokens_mask(
-                 token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-             )
-
-         eos_token_id = [1] if self.add_eos_token else []
-
-         if token_ids_1 is None:
-             return ([0] * len(token_ids_0)) + eos_token_id
-         return ([0] * len(token_ids_0)) + eos_token_id + ([0] * len(token_ids_1)) + eos_token_id
-
-     def create_token_type_ids_from_sequences(
-         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-     ) -> List[int]:
-         """
-         Creates a mask from the two sequences passed to be used in a sequence-pair classification task. A
-         sequence pair mask has the following format:
-         ```
-         0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-         | first sequence    | second sequence |
-         ```
-         If token_ids_1 is None, only returns the first portion of the mask (0s).
-         Args:
-             token_ids_0 (`List[int]`):
-                 List of ids.
-             token_ids_1 (`List[int]`, *optional*):
-                 Optional second list of IDs for sequence pairs.
-         Returns:
-             `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
-         """
-         eos_token_id = [self.eos_token_id] if self.add_eos_token else []
-
-         output = [0] * len(token_ids_0 + eos_token_id)
-
-         if token_ids_1 is not None:
-             output += [1] * len(token_ids_1 + eos_token_id)
-
-         return output
-
-     # has no vocab file
-     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None):
-         return ()
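
For reference, a hub-side tokenizer implemented this way is normally loaded through transformers' AutoTokenizer with trust_remote_code=True, which executes the repository's tokenization_xgen.py. The snippet below is a minimal sketch of that flow, not part of this commit; it assumes a revision of one of the Salesforce/xgen-7b repos listed in MAX_MODEL_INPUT_SIZES that still ships this file, and the input string is an arbitrary example.

from transformers import AutoTokenizer

# Minimal sketch: load the custom XgenTokenizer from the Hub.
# trust_remote_code=True is required so the repo's tokenization_xgen.py is executed;
# assumes a revision that still contains that file.
tok = AutoTokenizer.from_pretrained("Salesforce/xgen-7b-8k-base", trust_remote_code=True)

ids = tok("def hello():\n    print('hi')").input_ids  # encode with the tiktoken-backed vocab
print(tok.decode(ids))                                # round-trip back to text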