File size: 6,556 Bytes
ae541cd c3ceda6 ae541cd c3ceda6 ae541cd d7a3880 99c923e 49eb973 0599565 49eb973 99c923e 49eb973 0599565 99c923e 06459c6 49eb973 99c923e ae541cd e013219 8d0ce5f e013219 c3ceda6 e013219 c3ceda6 ae541cd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 |
import os
import json
from huggingface_hub import HfApi
from transformers import PreTrainedTokenizer
class CharacterTokenizer(PreTrainedTokenizer):
    """Simple character-level tokenizer.

    Every character of the input text is one token. The vocabulary is a plain
    ``{token: id}`` dict that always contains the configured special tokens;
    it can be supplied up front, loaded from a ``vocab.json`` file, or built
    from raw text with :meth:`train`.
    """

    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab=None,
        unk_token="[UNK]",
        pad_token="[PAD]",
        bos_token="[BOS]",
        eos_token="[EOS]",
        sep_token="[SEP]",
        **kwargs
    ):
        """Create the tokenizer.

        Args:
            vocab: Optional ``{token: id}`` mapping. It is copied, so the
                caller's dict is never modified. Special tokens missing from
                it are appended with fresh ids.
            unk_token: Token used for characters missing from the vocabulary.
            pad_token: Token used for padding.
            bos_token: Beginning-of-sequence token.
            eos_token: End-of-sequence token.
            sep_token: Separator token.
            **kwargs: Forwarded to the parent constructor.
        """
        # Copy first: registering special tokens must not mutate the caller's dict.
        vocab = dict(vocab) if vocab is not None else {}
        self._register_tokens(
            vocab, (unk_token, pad_token, bos_token, eos_token, sep_token)
        )

        # The vocabulary is assigned before the parent constructor runs, as in
        # the original implementation, so it is available during parent setup.
        self.vocab = vocab
        self.inv_vocab = {v: k for k, v in self.vocab.items()}

        super().__init__(
            unk_token=unk_token,
            pad_token=pad_token,
            bos_token=bos_token,
            eos_token=eos_token,
            sep_token=sep_token,
            **kwargs
        )

    @staticmethod
    def _register_tokens(vocab, tokens):
        """Give each token in *tokens* that is missing from *vocab* the next free id (in place)."""
        # Start after the largest id in use; for a contiguous vocabulary this
        # equals len(vocab), and for a sparse one it avoids id collisions.
        next_id = max(vocab.values(), default=-1) + 1
        for token in tokens:
            if token is not None and token not in vocab:
                vocab[token] = next_id
                next_id += 1
        return vocab

    @property
    def vocab_size(self):
        """Number of entries in the vocabulary dict."""
        return len(self.vocab)

    def get_vocab(self):
        """Return a shallow copy of the ``{token: id}`` vocabulary."""
        return dict(self.vocab)

    def _tokenize(self, text):
        """Split *text* into a list of single characters."""
        return list(text)

    def _convert_token_to_id(self, token):
        """Return the id of *token*, falling back to the id of the unknown token."""
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Return the token for *index*, falling back to the unknown token."""
        return self.inv_vocab.get(index, self.unk_token)

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """Write the vocabulary as JSON into *save_directory*.

        Args:
            save_directory: Target directory; created if it does not exist.
            filename_prefix: Optional prefix; the file is then named
                ``<prefix>-vocab.json`` instead of ``vocab.json``.

        Returns:
            A one-element tuple holding the path of the written file.
        """
        # exist_ok avoids the check-then-create race of isdir() + makedirs().
        os.makedirs(save_directory, exist_ok=True)
        vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + "vocab.json"
        )
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self.vocab, f, ensure_ascii=False)
        return (vocab_file,)

    def batch_encode(self, texts, add_special_tokens=False, padding=False, truncation=True, max_length=None):
        """Encode several texts into lists of token ids.

        Args:
            texts: Iterable of strings to encode.
            add_special_tokens: If true, wrap every sequence in BOS ... EOS.
            padding: If true, pad sequences with the pad token id up to
                ``max_length`` (or to the longest sequence in the batch when
                ``max_length`` is None), on the side given by
                ``self.padding_side``.
            truncation: If true and ``max_length`` is given, cut sequences so
                the final result, including BOS/EOS when requested, does not
                exceed ``max_length`` (as far as the two special tokens allow).
            max_length: Target length for truncation and padding.

        Returns:
            A list with one list of ids per input text. Sequences already
            longer than the padding target are returned unpadded.
        """
        encoded_texts = [self.encode(text) for text in texts]

        if truncation and max_length is not None:
            # Reserve two slots for BOS/EOS so the wrapped sequence still fits.
            budget = max_length - 2 if add_special_tokens else max_length
            budget = max(budget, 0)
            encoded_texts = [ids[:budget] for ids in encoded_texts]

        if add_special_tokens:
            bos_token_id = self.convert_tokens_to_ids(self.bos_token)
            eos_token_id = self.convert_tokens_to_ids(self.eos_token)
            encoded_texts = [[bos_token_id] + ids + [eos_token_id] for ids in encoded_texts]

        if padding:
            pad_id = self.vocab.get(self.pad_token, 0)
            if max_length is None:
                # default=0 keeps an empty batch from raising in max().
                max_len = max((len(ids) for ids in encoded_texts), default=0)
            else:
                max_len = max_length
            # A negative repeat count yields an empty list, so over-long
            # sequences pass through unchanged.
            if self.padding_side == "right":
                encoded_texts = [ids + [pad_id] * (max_len - len(ids)) for ids in encoded_texts]
            else:
                encoded_texts = [[pad_id] * (max_len - len(ids)) + ids for ids in encoded_texts]

        return encoded_texts

    def train(self, texts):
        """Rebuild the vocabulary from the characters found in *texts*.

        Special tokens get the lowest ids; every other distinct character is
        appended in order of first appearance. Occurrences of a special token
        inside a text are replaced by a single space before characters are
        collected, so the space character enters the vocabulary whenever a
        text contains a special token.

        Args:
            texts: Iterable of training strings.

        Returns:
            ``self``, to allow call chaining.
        """
        special_tokens = [
            token
            for token in (
                self.unk_token,
                self.pad_token,
                self.bos_token,
                self.eos_token,
                self.sep_token,
            )
            if token is not None  # unset tokens cannot be stored or replaced
        ]
        vocab = self._register_tokens({}, special_tokens)

        for text in texts:
            # Blank out special tokens so their individual characters
            # (e.g. the brackets) are not learned from them.
            processed_text = text
            for token in special_tokens:
                processed_text = processed_text.replace(token, " ")
            for char in processed_text:
                if char not in vocab:
                    vocab[char] = len(vocab)

        self.vocab = vocab
        self.inv_vocab = {v: k for k, v in self.vocab.items()}
        print(f"Vocabulary built with {len(self.vocab)} tokens")
        return self

    def convert_tokens_to_string(self, tokens):
        """Concatenate *tokens* back into a single string."""
        return "".join(tokens)

    @classmethod
    def from_json(cls, vocab_file, **kwargs):
        """Build a tokenizer from a JSON file holding a ``{token: id}`` mapping."""
        with open(vocab_file, 'r', encoding='utf-8') as f:
            vocab = json.load(f)
        return cls(vocab=vocab, **kwargs)

    @classmethod
    def from_vocab(cls, vocab, **kwargs):
        """Build a tokenizer from an in-memory ``{token: id}`` mapping."""
        return cls(vocab=vocab, **kwargs)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
        """Load a tokenizer from a Hub repository or a local directory.

        Resolution order:
            1. ``vocab.json`` resolved through ``cached_file``.
            2. ``vocab.json`` directly inside the given path.
            3. ``tokenizer.json`` / ``tokenizer_config.json`` inside the given
               path, loaded with ``PreTrainedTokenizerFast`` to obtain its
               vocabulary.

        Args:
            pretrained_model_name_or_path: Repository id or directory path.
            *inputs: Forwarded unchanged to :meth:`from_json` /
                :meth:`from_vocab`; those accept keyword arguments only, so
                options should be passed by keyword.
            **kwargs: Forwarded to the tokenizer constructor.

        Raises:
            ValueError: If no vocabulary source is found. An error raised
                during the ``cached_file`` lookup is chained as the cause.
        """
        from transformers import PreTrainedTokenizerFast
        from transformers.utils import cached_file

        # Only the lookup is guarded. The error is stored in a separate name
        # because Python unbinds the ``except ... as`` target when the handler
        # ends, and it is still needed for the final error message.
        lookup_error = None
        vocab_file = None
        try:
            vocab_file = cached_file(
                pretrained_model_name_or_path,
                "vocab.json",
                _raise_exceptions_for_missing_entries=False
            )
        except Exception as exc:
            lookup_error = exc

        if vocab_file:
            return cls.from_json(vocab_file, *inputs, **kwargs)

        # The lookup failed or found nothing: try the path as a plain directory.
        local_vocab_file = os.path.join(pretrained_model_name_or_path, "vocab.json")
        if os.path.exists(local_vocab_file):
            return cls.from_json(local_vocab_file, *inputs, **kwargs)

        # Last resort: borrow the vocabulary of a serialized fast tokenizer.
        tokenizer_files = [
            os.path.join(pretrained_model_name_or_path, "tokenizer.json"),
            os.path.join(pretrained_model_name_or_path, "tokenizer_config.json")
        ]
        for tokenizer_file in tokenizer_files:
            if os.path.exists(tokenizer_file):
                print(f"Loading using PreTrainedTokenizerFast from {tokenizer_file}")
                fast_tokenizer = PreTrainedTokenizerFast.from_pretrained(pretrained_model_name_or_path)
                vocab = dict(fast_tokenizer.get_vocab())
                return cls.from_vocab(vocab, *inputs, **kwargs)

        message = f"Could not find vocab.json in {pretrained_model_name_or_path}."
        if lookup_error is not None:
            message += f" Error: {str(lookup_error)}"
        raise ValueError(message) from lookup_error
|