# File size: 6,556 Bytes

import os
import json
from huggingface_hub import HfApi
from transformers import PreTrainedTokenizer

class CharacterTokenizer(PreTrainedTokenizer):
    """
    Simple character-level tokenizer.

    Every character of the input text is one token. The vocabulary is a plain
    ``{token: id}`` dict stored in ``self.vocab`` (with the reverse mapping in
    ``self.inv_vocab``); it can be supplied directly, loaded from a
    ``vocab.json`` file, or built from raw texts with :meth:`train`.
    """

    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab=None,
        unk_token="[UNK]",
        pad_token="[PAD]",
        bos_token="[BOS]",
        eos_token="[EOS]",
        sep_token="[SEP]",
        **kwargs
    ):
        """
        Create the tokenizer.

        Args:
            vocab: Optional ``{token: id}`` mapping. When ``None``, the
                vocabulary starts with just the special tokens, numbered in
                the order unk, pad, bos, eos, sep.
            unk_token, pad_token, bos_token, eos_token, sep_token: Special
                token strings, forwarded to the parent constructor.
            **kwargs: Forwarded unchanged to the parent constructor.
        """
        if vocab is None:
            vocab = self._special_token_vocab(
                [unk_token, pad_token, bos_token, eos_token, sep_token]
            )
        # Copy so that later changes to the caller's dict cannot put
        # ``vocab`` and ``inv_vocab`` out of sync.
        self.vocab = dict(vocab)
        self.inv_vocab = {index: token for token, index in self.vocab.items()}

        # The vocabulary is assigned before the parent constructor runs, as in
        # the original ordering (presumably the base class consults the
        # vocabulary while registering special tokens -- confirm against the
        # installed transformers version).
        super().__init__(
            unk_token=unk_token,
            pad_token=pad_token,
            bos_token=bos_token,
            eos_token=eos_token,
            sep_token=sep_token,
            **kwargs
        )

    @staticmethod
    def _special_token_vocab(special_tokens):
        """Return a vocab that numbers the given special tokens from 0, skipping ``None`` and duplicates."""
        vocab = {}
        for token in special_tokens:
            if token is not None and token not in vocab:
                vocab[token] = len(vocab)
        return vocab

    @property
    def vocab_size(self):
        """Number of entries in ``self.vocab``."""
        return len(self.vocab)

    def get_vocab(self):
        """Return a shallow copy of the ``{token: id}`` mapping."""
        return dict(self.vocab)

    def _tokenize(self, text):
        """Split ``text`` into a list of single characters."""
        return list(text)

    def _convert_token_to_id(self, token):
        """Return the id of ``token``, or the id of the unknown token if it is not in the vocab."""
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Return the token for ``index``, or the unknown token string if the id is not known."""
        return self.inv_vocab.get(index, self.unk_token)

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """
        Write the vocabulary as JSON into ``save_directory``.

        Args:
            save_directory: Target directory; created if it does not exist.
            filename_prefix: Optional prefix; when given, the file is named
                ``"<prefix>-vocab.json"`` instead of ``"vocab.json"``.

        Returns:
            A one-element tuple containing the path of the written file.
        """
        # exist_ok avoids the check-then-create race of isdir() + makedirs().
        os.makedirs(save_directory, exist_ok=True)

        vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + "vocab.json"
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self.vocab, f, ensure_ascii=False)

        return (vocab_file,)

    def batch_encode(self, texts, add_special_tokens=False, padding=False, truncation=True, max_length=None):
        """
        Encode several texts into lists of token ids.

        Args:
            texts: Iterable of strings, each encoded with ``self.encode``.
            add_special_tokens: When true, wrap every sequence in the BOS and
                EOS token ids.
            padding: When true, pad every sequence with the pad token id on
                ``self.padding_side`` so that all sequences share one length.
            truncation: When true (the default) and ``max_length`` is given,
                cut sequences so the final length, including BOS/EOS, does
                not exceed ``max_length``. When false, nothing is cut.
            max_length: Optional length limit / padding target.

        Returns:
            A list with one list of ids per input text.
        """
        encoded_texts = [self.encode(text) for text in texts]

        if truncation and max_length is not None:
            # Leave room for BOS and EOS; truncating to max_length first and
            # adding them afterwards would overshoot the limit by two.
            budget = max_length - 2 if add_special_tokens else max_length
            budget = max(budget, 0)
            encoded_texts = [ids[:budget] for ids in encoded_texts]

        if add_special_tokens:
            bos_token_id = self.convert_tokens_to_ids(self.bos_token)
            eos_token_id = self.convert_tokens_to_ids(self.eos_token)
            encoded_texts = [[bos_token_id] + ids + [eos_token_id] for ids in encoded_texts]

        if padding:
            pad_id = self.vocab.get(self.pad_token, 0)
            # default=0 keeps an empty batch from raising in max().
            longest = max((len(ids) for ids in encoded_texts), default=0)
            # Never pad to less than the longest sequence, so the batch stays
            # rectangular even when truncation is off.
            target_len = longest if max_length is None else max(longest, max_length)
            if self.padding_side == "right":
                encoded_texts = [ids + [pad_id] * (target_len - len(ids)) for ids in encoded_texts]
            else:
                encoded_texts = [[pad_id] * (target_len - len(ids)) + ids for ids in encoded_texts]
        return encoded_texts

    def train(self, texts):
        """
        Rebuild the vocabulary from ``texts``.

        The new vocabulary starts with the special tokens (unk, pad, bos, eos,
        sep, in that order) and then receives every distinct character in
        order of first appearance. Occurrences of a special token inside a
        text are replaced by a single space first, so their individual
        characters are not added on their account; note that this makes the
        space character part of the vocabulary whenever a special token
        occurs in the data.

        Args:
            texts: Iterable of strings.

        Returns:
            ``self``, to allow chaining.
        """
        special_tokens = [
            token
            for token in (
                self.unk_token,
                self.pad_token,
                self.bos_token,
                self.eos_token,
                self.sep_token,
            )
            # An unset special token cannot be searched for in the text.
            if token is not None
        ]
        vocab = self._special_token_vocab(special_tokens)

        for text in texts:
            processed_text = text
            for token in special_tokens:
                processed_text = processed_text.replace(token, " ")

            for char in processed_text:
                if char not in vocab:
                    vocab[char] = len(vocab)

        self.vocab = vocab
        self.inv_vocab = {index: token for token, index in self.vocab.items()}

        print(f"Vocabulary built with {len(self.vocab)} tokens")
        return self

    def convert_tokens_to_string(self, tokens):
        """Concatenate ``tokens`` into one string without separators."""
        return "".join(tokens)

    @classmethod
    def from_json(cls, vocab_file, **kwargs):
        """Build a tokenizer from a JSON file holding a ``{token: id}`` mapping; ``kwargs`` go to the constructor."""
        with open(vocab_file, 'r', encoding='utf-8') as f:
            vocab = json.load(f)
        return cls(vocab=vocab, **kwargs)

    @classmethod
    def from_vocab(cls, vocab, **kwargs):
        """Build a tokenizer from an in-memory ``{token: id}`` mapping; ``kwargs`` go to the constructor."""
        return cls(vocab=vocab, **kwargs)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
        """
        Load a tokenizer from a Hub repository id or a local directory.

        Lookup order:
            1. ``vocab.json`` resolved through ``transformers.utils.cached_file``.
            2. ``vocab.json`` directly inside ``pretrained_model_name_or_path``.
            3. If a local ``tokenizer.json`` or ``tokenizer_config.json``
               exists, the vocabulary of a ``PreTrainedTokenizerFast`` loaded
               from the same location.

        Args:
            pretrained_model_name_or_path: Repository id or directory path.
            *inputs: Accepted for signature compatibility only. They are not
                forwarded, because ``from_json`` / ``from_vocab`` take no
                extra positional arguments.
            **kwargs: Forwarded to the constructor.

        Raises:
            ValueError: If none of the lookups finds a vocabulary.
        """
        from transformers import PreTrainedTokenizerFast
        from transformers.utils import cached_file

        # Only the lookup itself is guarded: a failure while parsing the file
        # or constructing the tokenizer should surface as-is instead of being
        # reported as "vocab.json not found".
        lookup_error = None
        try:
            vocab_file = cached_file(
                pretrained_model_name_or_path,
                "vocab.json",
                _raise_exceptions_for_missing_entries=False
            )
        except Exception as err:  # any lookup failure falls through to the local fallbacks
            lookup_error = err
            vocab_file = None

        if vocab_file:
            return cls.from_json(vocab_file, **kwargs)

        # Reached both when the lookup raised and when it returned nothing;
        # with missing-entry errors disabled the latter does not raise.
        local_vocab_file = os.path.join(pretrained_model_name_or_path, "vocab.json")
        if os.path.exists(local_vocab_file):
            return cls.from_json(local_vocab_file, **kwargs)

        for file_name in ("tokenizer.json", "tokenizer_config.json"):
            tokenizer_file = os.path.join(pretrained_model_name_or_path, file_name)
            if os.path.exists(tokenizer_file):
                print(f"Loading using PreTrainedTokenizerFast from {tokenizer_file}")
                fast_tokenizer = PreTrainedTokenizerFast.from_pretrained(pretrained_model_name_or_path)
                return cls.from_vocab(dict(fast_tokenizer.get_vocab()), **kwargs)

        detail = str(lookup_error) if lookup_error is not None else "no vocabulary file was found"
        raise ValueError(
            f"Could not find vocab.json in {pretrained_model_name_or_path}. "
            f"Error: {detail}"
        ) from lookup_error