import json
import os

from transformers import PreTrainedTokenizer


class CharacterTokenizer(PreTrainedTokenizer):
    """
    Simple character-level tokenizer: each character of the input text is its
    own token, plus a small set of special tokens ([UNK], [PAD], [BOS], [EOS], [SEP]).
    """

    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab=None,
        unk_token="[UNK]",
        pad_token="[PAD]",
        bos_token="[BOS]",
        eos_token="[EOS]",
        sep_token="[SEP]",
        **kwargs
    ):
        if vocab is None:
            vocab = {}

        # Reserve the lowest ids for the special tokens.
        special_tokens = [
            unk_token,
            pad_token,
            bos_token,
            eos_token,
            sep_token,
        ]
        for token in special_tokens:
            if token not in vocab:
                vocab[token] = len(vocab)

        # self.vocab must be set before calling the parent constructor, which
        # consults the vocabulary when registering the special tokens.
        self.vocab = vocab
        self.inv_vocab = {v: k for k, v in self.vocab.items()}

        super().__init__(
            unk_token=unk_token,
            pad_token=pad_token,
            bos_token=bos_token,
            eos_token=eos_token,
            sep_token=sep_token,
            **kwargs
        )
    @property
    def vocab_size(self):
        return len(self.vocab)

    def get_vocab(self):
        return dict(self.vocab)

    def _tokenize(self, text):
        return list(text)

    def _convert_token_to_id(self, token):
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index):
        return self.inv_vocab.get(index, self.unk_token)
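    # save_vocabulary() writes a plain token -> id mapping to vocab.json, the
    # same file that from_json() and from_pretrained() read back in.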
    def save_vocabulary(self, save_directory, filename_prefix=None):
        if not os.path.isdir(save_directory):
            os.makedirs(save_directory)

        vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + "vocab.json"
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self.vocab, f, ensure_ascii=False)

        return (vocab_file,)
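    # batch_encode() is a convenience helper that returns plain lists of ids
    # (no attention masks): truncation first, then optional [BOS]/[EOS], then padding.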
    def batch_encode(self, texts, add_special_tokens=False, padding=False, truncation=True, max_length=None):
        encoded_texts = [self.encode(text, add_special_tokens=False) for text in texts]

        if truncation and max_length is not None:
            # Leave room for [BOS]/[EOS] when they are added below, so the final
            # sequences never exceed max_length.
            effective_max = max_length - 2 if add_special_tokens else max_length
            encoded_texts = [ids[:effective_max] for ids in encoded_texts]

        if add_special_tokens:
            bos_token_id = self.convert_tokens_to_ids(self.bos_token)
            eos_token_id = self.convert_tokens_to_ids(self.eos_token)
            encoded_texts = [[bos_token_id] + ids + [eos_token_id] for ids in encoded_texts]

        if padding:
            pad_id = self.vocab.get(self.pad_token, 0)
            max_len = max(len(ids) for ids in encoded_texts) if max_length is None else max_length
            if self.padding_side == "right":
                encoded_texts = [ids + [pad_id] * (max_len - len(ids)) for ids in encoded_texts]
            else:
                encoded_texts = [[pad_id] * (max_len - len(ids)) + ids for ids in encoded_texts]

        return encoded_texts
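    # train() derives the character vocabulary from an iterable of raw texts;
    # the special tokens always keep the lowest ids so they stay stable across runs.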
    def train(self, texts):
        # Rebuild the vocabulary from scratch, special tokens first.
        vocab = {}
        special_tokens = [
            self.unk_token,
            self.pad_token,
            self.bos_token,
            self.eos_token,
            self.sep_token,
        ]
        for token in special_tokens:
            if token not in vocab:
                vocab[token] = len(vocab)

        for text in texts:
            # Strip literal special-token strings so they are not split into characters.
            processed_text = text
            for token in special_tokens:
                processed_text = processed_text.replace(token, " ")

            for char in processed_text:
                if char not in vocab:
                    vocab[char] = len(vocab)

        self.vocab = vocab
        self.inv_vocab = {v: k for k, v in self.vocab.items()}

        print(f"Vocabulary built with {len(self.vocab)} tokens")
        return self
    def convert_tokens_to_string(self, tokens):
        return "".join(tokens)

    @classmethod
    def from_json(cls, vocab_file, **kwargs):
        with open(vocab_file, "r", encoding="utf-8") as f:
            vocab = json.load(f)
        return cls(vocab=vocab, **kwargs)

    @classmethod
    def from_vocab(cls, vocab, **kwargs):
        return cls(vocab=vocab, **kwargs)
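    # from_pretrained() resolution order: vocab.json from the Hub (or its cache),
    # then a local vocab.json, then a fast-tokenizer file whose vocabulary is copied.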
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
        from transformers import PreTrainedTokenizerFast
        from transformers.utils import cached_file

        # 1. Try to resolve vocab.json via the Hub or its local cache.
        try:
            vocab_file = cached_file(
                pretrained_model_name_or_path,
                "vocab.json",
                _raise_exceptions_for_missing_entries=False
            )
            if vocab_file:
                return cls.from_json(vocab_file, *inputs, **kwargs)
        except Exception:
            # Fall through to the local-file fallbacks below.
            pass

        # 2. Fall back to a vocab.json in a local directory.
        local_vocab_file = os.path.join(pretrained_model_name_or_path, "vocab.json")
        if os.path.exists(local_vocab_file):
            return cls.from_json(local_vocab_file, *inputs, **kwargs)

        # 3. Fall back to a fast-tokenizer file and reuse its vocabulary.
        tokenizer_files = [
            os.path.join(pretrained_model_name_or_path, "tokenizer.json"),
            os.path.join(pretrained_model_name_or_path, "tokenizer_config.json")
        ]
        for tokenizer_file in tokenizer_files:
            if os.path.exists(tokenizer_file):
                print(f"Loading using PreTrainedTokenizerFast from {tokenizer_file}")
                fast_tokenizer = PreTrainedTokenizerFast.from_pretrained(pretrained_model_name_or_path)
                vocab = dict(fast_tokenizer.get_vocab())
                return cls.from_vocab(vocab, *inputs, **kwargs)

        raise ValueError(
            f"Could not find vocab.json or tokenizer files in {pretrained_model_name_or_path}."
        )
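
# Minimal usage sketch (illustrative only): the sample strings and the
# "char_tokenizer" output directory are placeholders chosen for this example,
# not part of the class API.
if __name__ == "__main__":
    tokenizer = CharacterTokenizer()
    tokenizer.train(["hello world", "character level tokenization"])

    # Round-trip a string through encode/decode.
    ids = tokenizer.encode("hello", add_special_tokens=False)
    print(ids)
    print(tokenizer.decode(ids))

    # Batch-encode with BOS/EOS markers and padding to the longest sequence.
    batch = tokenizer.batch_encode(["hi", "hello"], add_special_tokens=True, padding=True)
    print(batch)

    # Persist the vocabulary and reload it from the local directory.
    tokenizer.save_pretrained("char_tokenizer")
    reloaded = CharacterTokenizer.from_pretrained("char_tokenizer")
    print(reloaded.vocab_size)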