import os
import json

from huggingface_hub import HfApi
from transformers import PreTrainedTokenizer


class CharacterTokenizer(PreTrainedTokenizer):
    """Simple character-level tokenizer."""

    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab=None,
        unk_token="[UNK]",
        pad_token="[PAD]",
        bos_token="[BOS]",
        eos_token="[EOS]",
        sep_token="[SEP]",
        **kwargs,
    ):
        if vocab is None:
            vocab = {}

        # Make sure every special token has an id.
        special_tokens = [unk_token, pad_token, bos_token, eos_token, sep_token]
        for token in special_tokens:
            if token not in vocab:
                vocab[token] = len(vocab)

        # The vocab must exist before the parent constructor runs, because it
        # may query vocab_size / get_vocab while registering special tokens.
        self.vocab = vocab
        self.inv_vocab = {v: k for k, v in self.vocab.items()}

        super().__init__(
            unk_token=unk_token,
            pad_token=pad_token,
            bos_token=bos_token,
            eos_token=eos_token,
            sep_token=sep_token,
            **kwargs,
        )

    @property
    def vocab_size(self):
        return len(self.vocab)

    def get_vocab(self):
        return dict(self.vocab)

    def _tokenize(self, text):
        # Every character is its own token.
        return list(text)

    def _convert_token_to_id(self, token):
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index):
        return self.inv_vocab.get(index, self.unk_token)

    def save_vocabulary(self, save_directory, filename_prefix=None):
        if not os.path.isdir(save_directory):
            os.makedirs(save_directory)
        vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + "vocab.json",
        )
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self.vocab, f, ensure_ascii=False)
        return (vocab_file,)

    def batch_encode(
        self,
        texts,
        add_special_tokens=False,
        padding=False,
        truncation=True,
        max_length=None,
    ):
        encoded_texts = [self.encode(text) for text in texts]

        # Truncate only when both truncation and max_length are requested.
        if truncation and max_length is not None:
            encoded_texts = [ids[:max_length] for ids in encoded_texts]

        if add_special_tokens:
            bos_token_id = self.convert_tokens_to_ids(self.bos_token)
            eos_token_id = self.convert_tokens_to_ids(self.eos_token)
            encoded_texts = [
                [bos_token_id] + ids + [eos_token_id] for ids in encoded_texts
            ]

        if padding:
            # Respect the configured padding side.
            pad_id = self.vocab.get(self.pad_token, 0)
            max_len = (
                max(len(ids) for ids in encoded_texts)
                if max_length is None
                else max_length
            )
            if self.padding_side == "right":
                encoded_texts = [
                    ids + [pad_id] * (max_len - len(ids)) for ids in encoded_texts
                ]
            else:
                encoded_texts = [
                    [pad_id] * (max_len - len(ids)) + ids for ids in encoded_texts
                ]

        return encoded_texts

    def train(self, texts):
        # Start with the special tokens so they keep the lowest ids.
        vocab = {}
        special_tokens = [
            self.unk_token,
            self.pad_token,
            self.bos_token,
            self.eos_token,
            self.sep_token,
        ]
        for token in special_tokens:
            if token not in vocab:
                vocab[token] = len(vocab)

        # Add all unique characters from the training data.
        for text in texts:
            # Replace literal special-token strings so their characters are
            # not accidentally added to the vocabulary.
            processed_text = text
            for token in special_tokens:
                processed_text = processed_text.replace(token, " ")
            for char in processed_text:
                if char not in vocab:
                    vocab[char] = len(vocab)

        self.vocab = vocab
        self.inv_vocab = {v: k for k, v in self.vocab.items()}
        print(f"Vocabulary built with {len(self.vocab)} tokens")
        return self

    def convert_tokens_to_string(self, tokens):
        return "".join(tokens)

    @classmethod
    def from_json(cls, vocab_file, **kwargs):
        with open(vocab_file, "r", encoding="utf-8") as f:
            vocab = json.load(f)
        return cls(vocab=vocab, **kwargs)

    @classmethod
    def from_vocab(cls, vocab, **kwargs):
        return cls(vocab=vocab, **kwargs)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
        from transformers import PreTrainedTokenizerFast
        from transformers.utils import cached_file

        # Try to get vocab.json via the HF Hub utilities (works for Hub repos
        # and cached local directories).
        error = None
        try:
            vocab_file = cached_file(
                pretrained_model_name_or_path,
                "vocab.json",
                _raise_exceptions_for_missing_entries=False,
            )
            if vocab_file:
                return cls.from_json(vocab_file, *inputs, **kwargs)
        except Exception as e:
            error = e

        # If the Hub lookup found nothing, try a vocab.json in a plain local
        # directory.
        local_vocab_file = os.path.join(pretrained_model_name_or_path, "vocab.json")
        if os.path.exists(local_vocab_file):
            return cls.from_json(local_vocab_file, *inputs, **kwargs)

        # Fall back to rebuilding the vocab from a fast tokenizer saved
        # alongside the model (tokenizer.json / tokenizer_config.json).
        tokenizer_files = [
            os.path.join(pretrained_model_name_or_path, "tokenizer.json"),
            os.path.join(pretrained_model_name_or_path, "tokenizer_config.json"),
        ]
        for tokenizer_file in tokenizer_files:
            if os.path.exists(tokenizer_file):
                print(f"Loading using PreTrainedTokenizerFast from {tokenizer_file}")
                fast_tokenizer = PreTrainedTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path
                )
                vocab = dict(fast_tokenizer.get_vocab())
                return cls.from_vocab(vocab, *inputs, **kwargs)

        # If all else fails, raise an error.
        message = f"Could not find vocab.json in {pretrained_model_name_or_path}."
        if error is not None:
            message += f" Error: {error}"
        raise ValueError(message)
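

# A minimal usage sketch (an illustrative addition, not part of the original
# module): it trains the tokenizer on a couple of sample strings, batch-encodes
# with special tokens and padding, then round-trips the vocabulary through
# disk. The "char_tokenizer" directory name and the sample texts are arbitrary
# examples.
if __name__ == "__main__":
    tokenizer = CharacterTokenizer()
    tokenizer.train(["hello world", "character level tokenization"])

    # Batch-encode with BOS/EOS added and right padding.
    batch = tokenizer.batch_encode(
        ["hello", "world!"], add_special_tokens=True, padding=True
    )
    print(batch)

    # Save vocab.json and reload it with the from_json constructor.
    tokenizer.save_vocabulary("char_tokenizer")
    reloaded = CharacterTokenizer.from_json("char_tokenizer/vocab.json")
    print(reloaded.decode(reloaded.encode("hello")))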