import os
import json
from huggingface_hub import HfApi
from transformers import PreTrainedTokenizer
class CharacterTokenizer(PreTrainedTokenizer):
"""
Simple character-level tokenizer
"""
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab=None,
unk_token="[UNK]",
pad_token="[PAD]",
bos_token="[BOS]",
eos_token="[EOS]",
sep_token="[SEP]",
**kwargs
):
if vocab is None:
vocab = {}
# Add special tokens
special_tokens = [
unk_token,
pad_token,
bos_token,
eos_token,
sep_token,
]
for token in special_tokens:
if token not in vocab:
vocab[token] = len(vocab)
self.vocab = vocab
self.inv_vocab = {v: k for k, v in self.vocab.items()}
# Call parent constructor
super().__init__(
unk_token=unk_token,
pad_token=pad_token,
bos_token=bos_token,
eos_token=eos_token,
sep_token=sep_token,
**kwargs
)
@property
def vocab_size(self):
return len(self.vocab)
def get_vocab(self):
return dict(self.vocab)
def _tokenize(self, text):
return list(text)
def _convert_token_to_id(self, token):
return self.vocab.get(token, self.vocab.get(self.unk_token))
def _convert_id_to_token(self, index):
return self.inv_vocab.get(index, self.unk_token)
def save_vocabulary(self, save_directory, filename_prefix=None):
if not os.path.isdir(save_directory):
os.makedirs(save_directory)
vocab_file = os.path.join(
save_directory,
(filename_prefix + "-" if filename_prefix else "") + "vocab.json"
)
with open(vocab_file, "w", encoding="utf-8") as f:
json.dump(self.vocab, f, ensure_ascii=False)
return (vocab_file,)
    def batch_encode(self, texts, add_special_tokens=False, padding=False, truncation=True, max_length=None):
        encoded_texts = [self.encode(text) for text in texts]
        # Truncate only when truncation is requested and a max_length is given
        if truncation and max_length is not None:
            # Reserve room for the BOS/EOS tokens that may be added below
            effective_max = max(max_length - 2, 0) if add_special_tokens else max_length
            encoded_texts = [ids[:effective_max] for ids in encoded_texts]
        if add_special_tokens:
            bos_token_id = self.convert_tokens_to_ids(self.bos_token)
            eos_token_id = self.convert_tokens_to_ids(self.eos_token)
            encoded_texts = [[bos_token_id] + ids + [eos_token_id] for ids in encoded_texts]
        # Handle padding, respecting the configured padding side
        if padding:
            pad_id = self.vocab.get(self.pad_token, 0)
            longest = max(len(ids) for ids in encoded_texts)
            # Pad to max_length if given, but never to less than the longest sequence
            max_len = longest if max_length is None else max(max_length, longest)
            if self.padding_side == "right":
                encoded_texts = [ids + [pad_id] * (max_len - len(ids)) for ids in encoded_texts]
            else:
                encoded_texts = [[pad_id] * (max_len - len(ids)) + ids for ids in encoded_texts]
        return encoded_texts
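    # Illustrative sketch of batch_encode (the IDs below are hypothetical; the real
    # values depend on the trained vocabulary):
    #   tokenizer.batch_encode(["ab", "abcd"], add_special_tokens=True, padding=True)
    #   -> [[BOS, a, b, EOS, PAD, PAD],
    #       [BOS, a, b, c, d, EOS]]
    #   with right padding; left padding moves the PAD ids to the front instead.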
def train(self, texts):
# Start with special tokens
vocab = {}
special_tokens = [
self.unk_token,
self.pad_token,
self.bos_token,
self.eos_token,
self.sep_token,
]
for token in special_tokens:
if token not in vocab:
vocab[token] = len(vocab)
# Add all unique characters from the training data
for text in texts:
# Replace all special tokens with placeholders
processed_text = text
for token in special_tokens:
processed_text = processed_text.replace(token, " ")
# Add remaining characters
for char in processed_text:
if char not in vocab:
vocab[char] = len(vocab)
self.vocab = vocab
self.inv_vocab = {v: k for k, v in self.vocab.items()}
print(f"Vocabulary built with {len(self.vocab)} tokens")
return self
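    # For example (hypothetical input): train(["hi", "ho"]) yields the 5 special
    # tokens plus the characters 'h', 'i' and 'o', i.e. a vocabulary of 8 tokens.
    # The special tokens keep the same order as in __init__, so their IDs stay stable.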
def convert_tokens_to_string(self, tokens):
return "".join(tokens)
@classmethod
def from_json(cls, vocab_file, **kwargs):
with open(vocab_file, 'r', encoding='utf-8') as f:
vocab = json.load(f)
return cls(vocab=vocab, **kwargs)
@classmethod
def from_vocab(cls, vocab, **kwargs):
return cls(vocab=vocab, **kwargs)
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
from transformers import PreTrainedTokenizerFast
from transformers.utils import cached_file
# Try to get the vocab file from the Hub or local directory
try:
# Try to get vocab.json using the HF Hub utilities
vocab_file = cached_file(
pretrained_model_name_or_path,
"vocab.json",
_raise_exceptions_for_missing_entries=False
)
            # If the vocab file is found, load the tokenizer from it; otherwise raise
            # so the local/fast-tokenizer fallbacks below are tried (cached_file
            # returns None for missing entries instead of raising)
            if vocab_file:
                return cls.from_json(vocab_file, *inputs, **kwargs)
            raise FileNotFoundError(f"vocab.json not found for {pretrained_model_name_or_path}")
except Exception as e:
# If the file is not found on the Hub, try to load it from a local directory
local_vocab_file = os.path.join(pretrained_model_name_or_path, "vocab.json")
if os.path.exists(local_vocab_file):
return cls.from_json(local_vocab_file, *inputs, **kwargs)
# If both approaches fail, try to load using the PreTrainedTokenizerFast fallback
tokenizer_files = [
os.path.join(pretrained_model_name_or_path, "tokenizer.json"),
os.path.join(pretrained_model_name_or_path, "tokenizer_config.json")
]
for tokenizer_file in tokenizer_files:
if os.path.exists(tokenizer_file):
print(f"Loading using PreTrainedTokenizerFast from {tokenizer_file}")
fast_tokenizer = PreTrainedTokenizerFast.from_pretrained(pretrained_model_name_or_path)
                    vocab = dict(fast_tokenizer.get_vocab())
return cls.from_vocab(vocab, *inputs, **kwargs)
# If all else fails, raise an error
raise ValueError(
f"Could not find vocab.json in {pretrained_model_name_or_path}. "
f"Error: {str(e)}"
)
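

# Minimal usage sketch, assuming the script is run directly. The sample texts and
# the "./char_tokenizer" directory are made up for illustration only.
if __name__ == "__main__":
    tokenizer = CharacterTokenizer()
    # Build the character vocabulary from a couple of example strings
    tokenizer.train(["hello world", "character level tokenization"])
    # Encode a small batch with BOS/EOS and padding
    batch = tokenizer.batch_encode(["hello", "tokens"], add_special_tokens=True, padding=True)
    print(batch)
    # Round-trip the first sequence back to text (special tokens stripped)
    print(tokenizer.decode(batch[0], skip_special_tokens=True))
    # Persist the vocabulary; it can be reloaded later with CharacterTokenizer.from_json
    # tokenizer.save_vocabulary("./char_tokenizer")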