File size: 7,595 Bytes
b9f4adf 04d6513 d2cc612 04d6513 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 |
# Copyright (c) 2023, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/Apache-2.0
"""Tokenization classes for xgen."""
from typing import List, Optional
from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
from transformers.utils import logging
try:
import tiktoken
except ModuleNotFoundError as e:
raise ModuleNotFoundError("XGen requires the installation of tiktoken. Please install it via `pip install tiktoken`.") from e
# Module-level logger, created through the `transformers.utils.logging` helper imported above.
logger = logging.get_logger(__name__)
# Maximum input length, in tokens, keyed by checkpoint name.
# The "4k" checkpoints map to 4096 and the "8k" checkpoints to 8192.
MAX_MODEL_INPUT_SIZES = {
    "Salesforce/xgen-7b-4k-base": 4096,
    "Salesforce/xgen-7b-8k-base": 8192,
    "Salesforce/xgen-7b-4k-inst": 4096,
    "Salesforce/xgen-7b-8k-inst": 8192,
}
def tiktoken_tokenizer(base="gpt2", add_special=True):
    """Build the tiktoken encoding used by the XGen tokenizer.

    Args:
        base: Name of the registered tiktoken encoding to start from.
        add_special: If false, the stock ``base`` encoding is returned
            untouched. If true, the encoding is extended with extra tokens
            whose ids are assigned consecutively, starting at the base
            encoding's ``n_vocab``, in this order:

            1. runs of spaces, length 31 down to 2 (ordinary BPE tokens),
            2. runs of tabs, length 9 down to 2 (ordinary BPE tokens),
            3. the fill-in-the-middle / code-metadata markers listed below
               (special tokens).

    Returns:
        A ``tiktoken.Encoding``.
    """
    base_encoding = tiktoken.get_encoding(base)
    if not add_special:
        return base_encoding

    def repeated_runs(char, n_min, n_max):
        # Longest run first; `n_max` itself is excluded (half-open range).
        return [char * n for n in reversed(range(n_min, n_max))]

    fim_tokens = [
        "<fim_prefix>",
        "<fim_middle>",
        "<fim_suffix>",
        "<fim_pad>",
        "<filename>",
        "<gh_stars>",
        "<issue_start>",
        "<issue_comment>",
        "<issue_closed>",
        "<jupyter_start>",
        "<jupyter_text>",
        "<jupyter_code>",
        "<jupyter_output>",
        "<empty_output>",
        "<commit_before>",
        "<commit_msg>",
        "<commit_after>",
        "<reponame>",
    ]
    extra_byte_tokens = repeated_runs(" ", n_min=2, n_max=32) + repeated_runs("\t", n_min=2, n_max=10)

    next_id = base_encoding.n_vocab

    # Work on a copy: the base encoding object is shared with every other
    # caller of `tiktoken.get_encoding(base)`, so writing the extra entries
    # into its rank table in place would leak them outside this function.
    bpe_ranks = dict(base_encoding._mergeable_ranks)
    for token in extra_byte_tokens:
        bpe_ranks[token.encode("ascii")] = next_id
        next_id += 1

    # Start from the base special tokens, then append the new markers.
    special_tokens = dict(base_encoding._special_tokens)
    for token in fim_tokens:
        special_tokens[token] = next_id
        next_id += 1

    # In production, load the arguments directly instead of accessing private attributes
    # See openai_public.py for examples of arguments for specific encodings
    return tiktoken.Encoding(
        # If you're changing the set of special tokens, make sure to use a different name
        # It should be clear from the name what behaviour to expect.
        # NOTE(review): for a base without the substring "base" (e.g. the default
        # "gpt2") this replace is a no-op, so the extended encoding keeps the
        # base encoding's name.
        name=base.replace("base", "im"),
        pat_str=base_encoding._pat_str,
        mergeable_ranks=bpe_ranks,
        special_tokens=special_tokens,
    )
class XgenTokenizer(PreTrainedTokenizer):
    """
    Construct a Xgen tokenizer. Based on byte-level Byte-Pair-Encoding, backed by a `tiktoken` encoding.

    Unlike most `PreTrainedTokenizer` subclasses, `_tokenize` returns token ids (ints) rather than string
    tokens, and `_convert_id_to_token` returns the raw bytes of a token.

    Args:
        pad_token (`str` or `AddedToken`, *optional*):
            Padding token. A plain string is wrapped in an `AddedToken` with no stripping.
        add_eos_token (`bool`, *optional*, defaults to `False`):
            Whether `build_inputs_with_special_tokens` appends the end-of-text id after each sequence.
        add_special_tokens (`bool`, *optional*, defaults to `True`):
            Whether the GPT-2 base encoding is extended with the extra tokens defined by
            `tiktoken_tokenizer`.
        kwargs:
            Forwarded unchanged to `PreTrainedTokenizer.__init__`.
    """

    max_model_input_sizes = MAX_MODEL_INPUT_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        pad_token=None,
        add_eos_token=False,
        add_special_tokens=True,
        **kwargs,
    ):
        if isinstance(pad_token, str):
            pad_token = AddedToken(pad_token, lstrip=False, rstrip=False)
        # Set these before calling the base constructor: if it queries the
        # vocabulary while registering tokens (`get_vocab()`, `len(self)`, ...),
        # those calls need `self.encoder` to exist already.
        self.add_eos_token = add_eos_token
        self.encoder = tiktoken_tokenizer(base="gpt2", add_special=add_special_tokens)
        super().__init__(
            pad_token=pad_token,
            add_eos_token=add_eos_token,
            add_special_tokens=add_special_tokens,
            **kwargs,
        )

    @property
    def vocab_size(self):
        """Returns vocab size"""
        return self.encoder.n_vocab

    def get_vocab(self):
        """Returns vocab as a dict mapping each token (as bytes) to its id."""
        return {self._convert_id_to_token(i): i for i in range(self.vocab_size)}

    def _eos_ids(self) -> List[int]:
        """Return `[eos_id]` when `add_eos_token` is set, else an empty list."""
        # `eot_token` is the id of "<|endoftext|>" (50256 for the GPT-2 base encoding).
        return [self.encoder.eot_token] if self.add_eos_token else []

    def _tokenize(self, text, **kwargs):
        """Encode `text` to a list of token ids; special-token strings in the text are encoded as such."""
        return self.encoder.encode(text, allowed_special="all")

    def _convert_token_to_id(self, token):
        """
        Converts a token in an id using the vocab.

        Ids produced by `_tokenize` are passed through unchanged. A `str` or `bytes` token (for instance a
        special token such as the pad token) is looked up in the encoder; `None` is returned when it is not
        a single token of the vocabulary.
        """
        if isinstance(token, (str, bytes)):
            try:
                return self.encoder.encode_single_token(token)
            except KeyError:
                # Not a vocabulary entry and there is no unknown-token id to fall back on.
                return None
        return token

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (bytes) using the vocab."""
        return self.encoder.decode_single_token_bytes(index)

    def _decode(self, token_ids: List[int], skip_special_tokens: bool = False, **kwargs):
        """
        Decode ids back to text.

        Args:
            token_ids (`List[int]` or `int`):
                Ids to decode. A single id is treated as a one-element list.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether to drop the ids listed in `all_special_ids` before decoding.

        Returns:
            `str`: The decoded text.
        """
        if isinstance(token_ids, int):
            token_ids = [token_ids]
        if skip_special_tokens:
            special_ids = set(self.all_special_ids)
            token_ids = [tid for tid in token_ids if tid not in special_ids]
        return self.encoder.decode(token_ids)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
        """Build model inputs from a sequence (or pair) by appending eos_token_id after each sequence."""
        eos = self._eos_ids()
        output = token_ids_0 + eos
        if token_ids_1 is not None:
            output = output + token_ids_1 + eos
        return output

    def get_special_tokens_mask(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None,
        already_has_special_tokens: bool = False,
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # One `1` per appended EOS, mirroring `build_inputs_with_special_tokens`.
        eos_mask = [1] if self.add_eos_token else []
        mask = [0] * len(token_ids_0) + eos_mask
        if token_ids_1 is not None:
            mask += [0] * len(token_ids_1) + eos_mask
        return mask

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. The
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        When `add_eos_token` is set, each segment is one position longer to cover its trailing EOS.
        If token_ids_1 is None, only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of ids.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        n_eos = 1 if self.add_eos_token else 0
        output = [0] * (len(token_ids_0) + n_eos)
        if token_ids_1 is not None:
            output += [1] * (len(token_ids_1) + n_eos)
        return output
|