diff --git "a/tokenizer.py" "b/tokenizer.py" deleted file mode 100755--- "a/tokenizer.py" +++ /dev/null @@ -1,2834 +0,0 @@ -from typing import List, Optional, Tuple, Dict, Union, Any, overload, Sequence, NamedTuple -import collections -import os -import re -import unicodedata -import itertools -import requests -import copy -import json -from contextlib import contextmanager -from collections import OrderedDict, UserDict -from enum import Enum -import numpy as np -from utils import cached_path, hf_bucket_url, is_remote_url, is_tf_available, is_torch_available -from tokenizers import AddedToken -from tokenizers import Encoding as EncodingFast - - -VERY_LARGE_INTEGER = int(1e30) # This is used to set the max input length for a model with infinite size input -LARGE_INTEGER = int(1e20) # This is used when we need something big but slightly smaller than VERY_LARGE_INTEGER - -SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json" -ADDED_TOKENS_FILE = "added_tokens.json" -TOKENIZER_CONFIG_FILE = "tokenizer_config.json" -FULL_TOKENIZER_FILE = "tokenizer.json" - -VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} -PRETRAINED_VOCAB_FILES_MAP = { - "vocab_file": { - "bert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt" - } -} -PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - "bert-base-uncased": 512 -} -PRETRAINED_INIT_CONFIGURATION = { - "bert-base-uncased": {"do_lower_case": True} -} - - -TextInput = str -PreTokenizedInput = List[str] -EncodedInput = List[int] -TextInputPair = Tuple[str, str] -PreTokenizedInputPair = Tuple[List[str], List[str]] -EncodedInputPair = Tuple[List[int], List[int]] - - -class ExplicitEnum(Enum): - @classmethod - def _missing_(cls, value): - raise ValueError( - "%r is not a valid %s, please select one of %s" - % (value, cls.__name__, str(list(cls._value2member_map_.keys()))) - ) - - -class TruncationStrategy(ExplicitEnum): - ONLY_FIRST = "only_first" - ONLY_SECOND = "only_second" - LONGEST_FIRST = "longest_first" - DO_NOT_TRUNCATE = "do_not_truncate" - - -class PaddingStrategy(ExplicitEnum): - LONGEST = "longest" - MAX_LENGTH = "max_length" - DO_NOT_PAD = "do_not_pad" - - -class TensorType(ExplicitEnum): - PYTORCH = "pt" - TENSORFLOW = "tf" - NUMPY = "np" - JAX = "jax" - - -class CharSpan(NamedTuple): - start: int - end: int - - -class TokenSpan(NamedTuple): - start: int - end: int - - -def to_py_obj(obj): - """ - Convert a TensorFlow tensor, PyTorch tensor, Numpy array or python list to a python list. - """ - if isinstance(obj, (dict, BatchEncoding)): - return {k: to_py_obj(v) for k, v in obj.items()} - elif isinstance(obj, (list, tuple)): - return [to_py_obj(o) for o in obj] - elif is_tf_available() and _is_tensorflow(obj): - return obj.numpy().tolist() - elif is_torch_available() and _is_torch(obj): - return obj.detach().cpu().tolist() - elif isinstance(obj, np.ndarray): - return obj.tolist() - else: - return obj - - -def _is_torch(x): - import torch - return isinstance(x, torch.Tensor) - - -def _is_torch_device(x): - import torch - return isinstance(x, torch.device) - - -def _is_end_of_word(text): - last_char = text[-1] - return bool(_is_control(last_char) | _is_punctuation(last_char) | _is_whitespace(last_char)) - - -def _is_start_of_word(text): - first_char = text[0] - return bool(_is_control(first_char) | _is_punctuation(first_char) | _is_whitespace(first_char)) - - -def _is_punctuation(char): - cp = ord(char) - # We treat all non-letter/number ASCII as punctuation. 
- # Characters such as "^", "$", and "`" are not in the Unicode - # Punctuation class but we treat them as punctuation anyways, for - # consistency. - if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126): - return True - cat = unicodedata.category(char) - if cat.startswith("P"): - return True - return False - - -def _is_whitespace(char): - # \t, \n, and \r are technically control characters but we treat them - # as whitespace since they are generally considered as such. - if char == " " or char == "\t" or char == "\n" or char == "\r": - return True - cat = unicodedata.category(char) - if cat == "Zs": - return True - return False - - -def _is_control(char): - # These are technically control characters but we count them as whitespace - # characters. - if char == "\t" or char == "\n" or char == "\r": - return False - cat = unicodedata.category(char) - if cat.startswith("C"): - return True - return False - - -def load_vocab(vocab_file): - vocab = collections.OrderedDict() - with open(vocab_file, "r", encoding="utf-8") as reader: - tokens = reader.readlines() - for index, token in enumerate(tokens): - token = token.rstrip("\n") - vocab[token] = index - return vocab - - -def whitespace_tokenize(text): - text = text.strip() - if not text: - return [] - tokens = text.split() - return tokens - - -class BatchEncoding(UserDict): - def __init__( - self, - data: Optional[Dict[str, Any]] = None, - encoding: Optional[Union[EncodingFast, Sequence[EncodingFast]]] = None, - tensor_type: Union[None, str, TensorType] = None, - prepend_batch_axis: bool = False, - n_sequences: Optional[int] = None, - ): - super().__init__(data) - - if isinstance(encoding, EncodingFast): - encoding = [encoding] - - self._encodings = encoding - - if n_sequences is None and encoding is not None and len(encoding): - n_sequences = encoding[0].n_sequences - - self._n_sequences = n_sequences - - self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis) - - @property - def n_sequences(self) -> Optional[int]: - return self._n_sequences - - @property - def is_fast(self) -> bool: - return self._encodings is not None - - def __getitem__(self, item: Union[int, str]) -> Union[Any, EncodingFast]: - if isinstance(item, str): - return self.data[item] - elif self._encodings is not None: - return self._encodings[item] - else: - raise KeyError( - "Indexing with integers (to access backend Encoding for a given batch index) " - "is not available when using Python based tokenizers" - ) - - def __getattr__(self, item: str): - try: - return self.data[item] - except KeyError: - raise AttributeError - - def __getstate__(self): - return {"data": self.data, "encodings": self._encodings} - - def __setstate__(self, state): - if "data" in state: - self.data = state["data"] - - if "encodings" in state: - self._encodings = state["encodings"] - - def keys(self): - return self.data.keys() - - def values(self): - return self.data.values() - - def items(self): - return self.data.items() - - # After this point: - # Extended properties and methods only available for fast (Rust-based) tokenizers - # provided by HuggingFace tokenizers library. 
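-    # Illustrative sketch (not from the original file): how the fast-only
-    # accessors below are typically used. `tok` is a hypothetical fast
-    # tokenizer, and the exact outputs depend on its vocabulary:
-    #
-    #     enc = tok("Hello world")
-    #     enc.tokens(0)          # e.g. ['[CLS]', 'hello', 'world', '[SEP]']
-    #     enc.word_ids(0)        # e.g. [None, 0, 1, None]
-    #     enc.token_to_chars(2)  # e.g. CharSpan(start=6, end=11) for "world"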
- - @property - def encodings(self) -> Optional[List[EncodingFast]]: - return self._encodings - - def tokens(self, batch_index: int = 0) -> List[str]: - if not self._encodings: - raise ValueError("tokens() is not available when using Python-based tokenizers") - return self._encodings[batch_index].tokens - - def sequence_ids(self, batch_index: int = 0) -> List[Optional[int]]: - if not self._encodings: - raise ValueError("sequence_ids() is not available when using Python-based tokenizers") - return self._encodings[batch_index].sequence_ids - - def words(self, batch_index: int = 0) -> List[Optional[int]]: - if not self._encodings: - raise ValueError("words() is not available when using Python-based tokenizers") - return self.word_ids(batch_index) - - def word_ids(self, batch_index: int = 0) -> List[Optional[int]]: - if not self._encodings: - raise ValueError("word_ids() is not available when using Python-based tokenizers") - return self._encodings[batch_index].word_ids - - def token_to_sequence(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int: - if not self._encodings: - raise ValueError("token_to_sequence() is not available when using Python based tokenizers") - if token_index is not None: - batch_index = batch_or_token_index - else: - batch_index = 0 - token_index = batch_or_token_index - if batch_index < 0: - batch_index = self._batch_size + batch_index - if token_index < 0: - token_index = self._seq_len + token_index - return self._encodings[batch_index].token_to_sequence(token_index) - - def token_to_word(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int: - if not self._encodings: - raise ValueError("token_to_word() is not available when using Python based tokenizers") - if token_index is not None: - batch_index = batch_or_token_index - else: - batch_index = 0 - token_index = batch_or_token_index - if batch_index < 0: - batch_index = self._batch_size + batch_index - if token_index < 0: - token_index = self._seq_len + token_index - return self._encodings[batch_index].token_to_word(token_index) - - def word_to_tokens( - self, batch_or_word_index: int, word_index: Optional[int] = None, sequence_index: int = 0 - ) -> Optional[TokenSpan]: - if not self._encodings: - raise ValueError("word_to_tokens() is not available when using Python based tokenizers") - if word_index is not None: - batch_index = batch_or_word_index - else: - batch_index = 0 - word_index = batch_or_word_index - if batch_index < 0: - batch_index = self._batch_size + batch_index - if word_index < 0: - word_index = self._seq_len + word_index - span = self._encodings[batch_index].word_to_tokens(word_index, sequence_index) - return TokenSpan(*span) if span is not None else None - - def token_to_chars(self, batch_or_token_index: int, token_index: Optional[int] = None) -> CharSpan: - if not self._encodings: - raise ValueError("token_to_chars() is not available when using Python based tokenizers") - if token_index is not None: - batch_index = batch_or_token_index - else: - batch_index = 0 - token_index = batch_or_token_index - return CharSpan(*(self._encodings[batch_index].token_to_chars(token_index))) - - def char_to_token( - self, batch_or_char_index: int, char_index: Optional[int] = None, sequence_index: int = 0 - ) -> int: - if not self._encodings: - raise ValueError("char_to_token() is not available when using Python based tokenizers") - if char_index is not None: - batch_index = batch_or_char_index - else: - batch_index = 0 - char_index = batch_or_char_index - return 
self._encodings[batch_index].char_to_token(char_index, sequence_index) - - def word_to_chars( - self, batch_or_word_index: int, word_index: Optional[int] = None, sequence_index: int = 0 - ) -> CharSpan: - if not self._encodings: - raise ValueError("word_to_chars() is not available when using Python based tokenizers") - if word_index is not None: - batch_index = batch_or_word_index - else: - batch_index = 0 - word_index = batch_or_word_index - return CharSpan(*(self._encodings[batch_index].word_to_chars(word_index, sequence_index))) - - def char_to_word(self, batch_or_char_index: int, char_index: Optional[int] = None, sequence_index: int = 0) -> int: - if not self._encodings: - raise ValueError("char_to_word() is not available when using Python based tokenizers") - if char_index is not None: - batch_index = batch_or_char_index - else: - batch_index = 0 - char_index = batch_or_char_index - return self._encodings[batch_index].char_to_word(char_index, sequence_index) - - def convert_to_tensors( - self, tensor_type: Optional[Union[str, TensorType]] = None, prepend_batch_axis: bool = False - ): - if tensor_type is None: - return self - - # Convert to TensorType - if not isinstance(tensor_type, TensorType): - tensor_type = TensorType(tensor_type) - - # Get a function reference for the correct framework - if tensor_type == TensorType.TENSORFLOW: - if not is_tf_available(): - raise ImportError( - "Unable to convert output to TensorFlow tensors format, TensorFlow is not installed." - ) - import tensorflow as tf - - as_tensor = tf.constant - is_tensor = tf.is_tensor - elif tensor_type == TensorType.PYTORCH: - if not is_torch_available(): - raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.") - import torch - - as_tensor = torch.tensor - is_tensor = torch.is_tensor - elif tensor_type == TensorType.JAX: - if not is_flax_available(): - raise ImportError("Unable to convert output to JAX tensors format, JAX is not installed.") - import jax.numpy as jnp # noqa: F811 - - as_tensor = jnp.array - is_tensor = _is_jax - else: - as_tensor = np.asarray - is_tensor = _is_numpy - # (mfuntowicz: This code is unreachable) - # else: - # raise ImportError( - # "Unable to convert output to tensors format {}".format(tensor_type) - # ) - - # Do the tensor conversion in batch - for key, value in self.items(): - try: - if prepend_batch_axis: - value = [value] - - if not is_tensor(value): - tensor = as_tensor(value) - - # Removing this for now in favor of controlling the shape with `prepend_batch_axis` - # # at-least2d - # if tensor.ndim > 2: - # tensor = tensor.squeeze(0) - # elif tensor.ndim < 2: - # tensor = tensor[None, :] - - self[key] = tensor - except: # noqa E722 - if key == "overflowing_tokens": - raise ValueError( - "Unable to create tensor returning overflowing tokens of different lengths. " - "Please see if a fast version of this tokenizer is available to have this feature available." - ) - raise ValueError( - "Unable to create tensor, you should probably activate truncation and/or padding " - "with 'padding=True' 'truncation=True' to have batched tensors with the same length." 
- ) - - return self - - def to(self, device: Union[str, "torch.device"]) -> "BatchEncoding": - # This check catches things like APEX blindly calling "to" on all inputs to a module - # Otherwise it passes the casts down and casts the LongTensor containing the token idxs - # into a HalfTensor - if isinstance(device, str) or _is_torch_device(device) or isinstance(device, int): - self.data = {k: v.to(device=device) for k, v in self.data.items()} - return self - - -class SpecialTokensMixin: - SPECIAL_TOKENS_ATTRIBUTES = [ - "bos_token", - "eos_token", - "unk_token", - "sep_token", - "pad_token", - "cls_token", - "mask_token", - "additional_special_tokens", - ] - - def __init__(self, verbose=True, **kwargs): - self._bos_token = None - self._eos_token = None - self._unk_token = None - self._sep_token = None - self._pad_token = None - self._cls_token = None - self._mask_token = None - self._pad_token_type_id = 0 - self._additional_special_tokens = [] - self.verbose = verbose - - # We directly set the hidden value to allow initialization with special tokens - # which are not yet in the vocabulary. Necessary for serialization/de-serialization - # TODO clean this up at some point (probably by switching to fast tokenizers) - for key, value in kwargs.items(): - if value is None: - continue - if key in self.SPECIAL_TOKENS_ATTRIBUTES: - if key == "additional_special_tokens": - assert isinstance(value, (list, tuple)), f"Value {value} is not a list or tuple" - assert all(isinstance(t, str) for t in value), "One of the tokens is not a string" - setattr(self, key, value) - elif isinstance(value, (str, AddedToken)): - setattr(self, key, value) - else: - raise TypeError( - "special token {} has to be either str or AddedToken but got: {}".format(key, type(value)) - ) - - def sanitize_special_tokens(self) -> int: - return self.add_tokens(self.all_special_tokens_extended, special_tokens=True) - - def add_special_tokens(self, special_tokens_dict: Dict[str, Union[str, AddedToken]]) -> int: - if not special_tokens_dict: - return 0 - - added_tokens = 0 - for key, value in special_tokens_dict.items(): - assert key in self.SPECIAL_TOKENS_ATTRIBUTES, f"Key {key} is not a special token" - - setattr(self, key, value) - - if key == "additional_special_tokens": - assert isinstance(value, (list, tuple)) and all( - isinstance(t, (str, AddedToken)) for t in value - ), f"Tokens {value} for key {key} should all be str or AddedToken instances" - added_tokens += self.add_tokens(value, special_tokens=True) - else: - assert isinstance( - value, (str, AddedToken) - ), f"Token {value} for key {key} should be a str or an AddedToken instance" - added_tokens += self.add_tokens([value], special_tokens=True) - - return added_tokens - - def add_tokens( - self, new_tokens: Union[str, AddedToken, List[Union[str, AddedToken]]], special_tokens: bool = False - ) -> int: - if not new_tokens: - return 0 - - if not isinstance(new_tokens, (list, tuple)): - new_tokens = [new_tokens] - - return self._add_tokens(new_tokens, special_tokens=special_tokens) - - def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int: - raise NotImplementedError - - @property - def bos_token(self) -> str: - if self._bos_token is None and self.verbose: - return None - return str(self._bos_token) - - @property - def eos_token(self) -> str: - if self._eos_token is None and self.verbose: - return None - return str(self._eos_token) - - @property - def unk_token(self) -> str: - if self._unk_token is None and self.verbose: - 
return None - return str(self._unk_token) - - @property - def sep_token(self) -> str: - if self._sep_token is None and self.verbose: - return None - return str(self._sep_token) - - @property - def pad_token(self) -> str: - if self._pad_token is None and self.verbose: - return None - return str(self._pad_token) - - @property - def cls_token(self) -> str: - if self._cls_token is None and self.verbose: - return None - return str(self._cls_token) - - @property - def mask_token(self) -> str: - if self._mask_token is None and self.verbose: - return None - return str(self._mask_token) - - @property - def additional_special_tokens(self) -> List[str]: - if self._additional_special_tokens is None and self.verbose: - return None - return [str(tok) for tok in self._additional_special_tokens] - - @bos_token.setter - def bos_token(self, value): - self._bos_token = value - - @eos_token.setter - def eos_token(self, value): - self._eos_token = value - - @unk_token.setter - def unk_token(self, value): - self._unk_token = value - - @sep_token.setter - def sep_token(self, value): - self._sep_token = value - - @pad_token.setter - def pad_token(self, value): - self._pad_token = value - - @cls_token.setter - def cls_token(self, value): - self._cls_token = value - - @mask_token.setter - def mask_token(self, value): - self._mask_token = value - - @additional_special_tokens.setter - def additional_special_tokens(self, value): - self._additional_special_tokens = value - - @property - def bos_token_id(self) -> Optional[int]: - if self._bos_token is None: - return None - return self.convert_tokens_to_ids(self.bos_token) - - @property - def eos_token_id(self) -> Optional[int]: - if self._eos_token is None: - return None - return self.convert_tokens_to_ids(self.eos_token) - - @property - def unk_token_id(self) -> Optional[int]: - if self._unk_token is None: - return None - return self.convert_tokens_to_ids(self.unk_token) - - @property - def sep_token_id(self) -> Optional[int]: - if self._sep_token is None: - return None - return self.convert_tokens_to_ids(self.sep_token) - - @property - def pad_token_id(self) -> Optional[int]: - if self._pad_token is None: - return None - return self.convert_tokens_to_ids(self.pad_token) - - @property - def pad_token_type_id(self) -> int: - return self._pad_token_type_id - - @property - def cls_token_id(self) -> Optional[int]: - if self._cls_token is None: - return None - return self.convert_tokens_to_ids(self.cls_token) - - @property - def mask_token_id(self) -> Optional[int]: - if self._mask_token is None: - return None - return self.convert_tokens_to_ids(self.mask_token) - - @property - def additional_special_tokens_ids(self) -> List[int]: - return self.convert_tokens_to_ids(self.additional_special_tokens) - - @bos_token_id.setter - def bos_token_id(self, value): - self._bos_token = self.convert_tokens_to_ids(value) - - @eos_token_id.setter - def eos_token_id(self, value): - self._eos_token = self.convert_tokens_to_ids(value) - - @unk_token_id.setter - def unk_token_id(self, value): - self._unk_token = self.convert_tokens_to_ids(value) - - @sep_token_id.setter - def sep_token_id(self, value): - self._sep_token = self.convert_tokens_to_ids(value) - - @pad_token_id.setter - def pad_token_id(self, value): - self._pad_token = self.convert_tokens_to_ids(value) - - @cls_token_id.setter - def cls_token_id(self, value): - self._cls_token = self.convert_tokens_to_ids(value) - - @mask_token_id.setter - def mask_token_id(self, value): - self._mask_token = self.convert_tokens_to_ids(value) - - 
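-    # Illustrative sketch (not from the original file; `tokenizer` is a
-    # hypothetical instance of this mixin): the string setters above take
-    # token strings, and the corresponding ids are resolved through the vocab.
-    #
-    #     tokenizer.pad_token = "[PAD]"   # set via the string setter
-    #     tokenizer.pad_token_id          # looked up with convert_tokens_to_ids
-    #     tokenizer.add_special_tokens({"additional_special_tokens": ["<ctx>"]})  # also grows the vocab
-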
@additional_special_tokens_ids.setter - def additional_special_tokens_ids(self, values): - self._additional_special_tokens = [self.convert_tokens_to_ids(value) for value in values] - - @property - def special_tokens_map(self) -> Dict[str, Union[str, List[str]]]: - set_attr = {} - for attr in self.SPECIAL_TOKENS_ATTRIBUTES: - attr_value = getattr(self, "_" + attr) - if attr_value: - set_attr[attr] = str(attr_value) - return set_attr - - @property - def special_tokens_map_extended(self) -> Dict[str, Union[str, AddedToken, List[Union[str, AddedToken]]]]: - set_attr = {} - for attr in self.SPECIAL_TOKENS_ATTRIBUTES: - attr_value = getattr(self, "_" + attr) - if attr_value: - set_attr[attr] = attr_value - return set_attr - - @property - def all_special_tokens(self) -> List[str]: - all_toks = [str(s) for s in self.all_special_tokens_extended] - return all_toks - - @property - def all_special_tokens_extended(self) -> List[Union[str, AddedToken]]: - all_toks = [] - set_attr = self.special_tokens_map_extended - for attr_value in set_attr.values(): - all_toks = all_toks + (list(attr_value) if isinstance(attr_value, (list, tuple)) else [attr_value]) - all_toks = list(OrderedDict.fromkeys(all_toks)) - return all_toks - - @property - def all_special_ids(self) -> List[int]: - all_toks = self.all_special_tokens - all_ids = self.convert_tokens_to_ids(all_toks) - return all_ids - - -class PreTrainedTokenizerBase(SpecialTokensMixin): - vocab_files_names: Dict[str, str] = {} - pretrained_vocab_files_map: Dict[str, Dict[str, str]] = {} - pretrained_init_configuration: Dict[str, Dict[str, Any]] = {} - max_model_input_sizes: Dict[str, Optional[int]] = {} - - # first name has to correspond to main model input name - # to make sure `tokenizer.pad(...)` works correctly - model_input_names: List[str] = ["input_ids", "token_type_ids", "attention_mask"] - padding_side: str = "right" - slow_tokenizer_class = None - - def __init__(self, **kwargs): - # inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``) - self.init_inputs = () - self.init_kwargs = copy.deepcopy(kwargs) - self.name_or_path = kwargs.pop("name_or_path", "") - - # For backward compatibility we fallback to set model_max_length from max_len if provided - model_max_length = kwargs.pop("model_max_length", kwargs.pop("max_len", None)) - self.model_max_length = model_max_length if model_max_length is not None else VERY_LARGE_INTEGER - - # Padding side is right by default and overridden in subclasses. If specified in the kwargs, it is changed. - self.padding_side = kwargs.pop("padding_side", self.padding_side) - assert self.padding_side in [ - "right", - "left", - ], f"Padding side should be selected between 'right' and 'left', current value: {self.padding_side}" - self.model_input_names = kwargs.pop("model_input_names", self.model_input_names) - - self.deprecation_warnings = ( - {} - ) # Use to store when we have already noticed a deprecation warning (avoid overlogging). - - super().__init__(**kwargs) - - @property - def max_len_single_sentence(self) -> int: - return self.model_max_length - self.num_special_tokens_to_add(pair=False) - - @property - def max_len_sentences_pair(self) -> int: - return self.model_max_length - self.num_special_tokens_to_add(pair=True) - - @max_len_single_sentence.setter - def max_len_single_sentence(self, value) -> int: - # For backward compatibility, allow to try to setup 'max_len_single_sentence'. 
- if value == self.model_max_length - self.num_special_tokens_to_add(pair=False) and self.verbose: - self.deprecation_warnings["max_len_single_sentence"] = True - else: - raise ValueError( - "Setting 'max_len_single_sentence' is now deprecated. " "This value is automatically set up." - ) - - @max_len_sentences_pair.setter - def max_len_sentences_pair(self, value) -> int: - # For backward compatibility, allow to try to setup 'max_len_sentences_pair'. - if value == self.model_max_length - self.num_special_tokens_to_add(pair=True) and self.verbose: - self.deprecation_warnings["max_len_sentences_pair"] = True - else: - raise ValueError( - "Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up." - ) - - def __repr__(self) -> str: - return ( - f"{'PreTrainedTokenizerFast' if self.is_fast else 'PreTrainedTokenizer'}(name_or_path='{self.name_or_path}', " - f"vocab_size={self.vocab_size}, model_max_len={self.model_max_length}, is_fast={self.is_fast}, " - f"padding_side='{self.padding_side}', special_tokens={self.special_tokens_map_extended})" - ) - - def get_vocab(self) -> Dict[str, int]: - raise NotImplementedError() - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], *init_inputs, **kwargs): - cache_dir = kwargs.pop("cache_dir", None) - force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", False) - proxies = kwargs.pop("proxies", None) - local_files_only = kwargs.pop("local_files_only", False) - use_auth_token = kwargs.pop("use_auth_token", None) - revision = kwargs.pop("revision", None) - subfolder = kwargs.pop("subfolder", None) - - s3_models = list(cls.max_model_input_sizes.keys()) - pretrained_model_name_or_path = str(pretrained_model_name_or_path) - vocab_files = {} - init_configuration = {} - if pretrained_model_name_or_path in s3_models: - # Get the vocabulary from AWS S3 bucket - for file_id, map_list in cls.pretrained_vocab_files_map.items(): - vocab_files[file_id] = map_list[pretrained_model_name_or_path] - if ( - cls.pretrained_init_configuration - and pretrained_model_name_or_path in cls.pretrained_init_configuration - ): - init_configuration = cls.pretrained_init_configuration[pretrained_model_name_or_path].copy() - else: - # Get the vocabulary from local files - if os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): - if len(cls.vocab_files_names) > 1: - raise ValueError( - "Calling {}.from_pretrained() with the path to a single file or url is not supported." 
- "Use a model identifier or the path to a directory instead.".format(cls.__name__) - ) - file_id = list(cls.vocab_files_names.keys())[0] - vocab_files[file_id] = pretrained_model_name_or_path - else: - # At this point pretrained_model_name_or_path is either a directory or a model identifier name - additional_files_names = { - "added_tokens_file": ADDED_TOKENS_FILE, - "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE, - "tokenizer_config_file": TOKENIZER_CONFIG_FILE, - "tokenizer_file": FULL_TOKENIZER_FILE, - } - # Look for the tokenizer files - for file_id, file_name in {**cls.vocab_files_names, **additional_files_names}.items(): - if os.path.isdir(pretrained_model_name_or_path): - if subfolder is not None: - full_file_name = os.path.join(pretrained_model_name_or_path, subfolder, file_name) - else: - full_file_name = os.path.join(pretrained_model_name_or_path, file_name) - if not os.path.exists(full_file_name): - full_file_name = None - else: - full_file_name = hf_bucket_url( - pretrained_model_name_or_path, - filename=file_name, - subfolder=subfolder, - revision=revision, - mirror=None, - ) - - vocab_files[file_id] = full_file_name - - # Get files from url, cache, or disk depending on the case - resolved_vocab_files = {} - unresolved_files = [] - for file_id, file_path in vocab_files.items(): - if file_path is None: - resolved_vocab_files[file_id] = None - else: - try: - try: - resolved_vocab_files[file_id] = cached_path( - file_path, - cache_dir=cache_dir, - force_download=force_download, - proxies=proxies, - resume_download=resume_download, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - ) - except FileNotFoundError as error: - if local_files_only: - unresolved_files.append(file_id) - else: - raise error - - except requests.exceptions.HTTPError as err: - if "404 Client Error" in str(err): - resolved_vocab_files[file_id] = None - else: - raise err - - if all(full_file_name is None for full_file_name in resolved_vocab_files.values()): - msg = ( - f"Can't load tokenizer for '{pretrained_model_name_or_path}'. Make sure that:\n\n" - f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n" - f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing relevant tokenizer files\n\n" - ) - raise EnvironmentError(msg) - - for file_id, file_path in vocab_files.items(): - if file_id not in resolved_vocab_files: - continue - - return cls._from_pretrained( - resolved_vocab_files, pretrained_model_name_or_path, init_configuration, *init_inputs, **kwargs - ) - - @classmethod - def _from_pretrained( - cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, *init_inputs, **kwargs - ): - # We instantiate fast tokenizers based on a slow tokenizer if we don't have access to the tokenizer.json - # file or if `from_slow` is set to True. - from_slow = kwargs.get("from_slow", False) - has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None - if (from_slow or not has_tokenizer_file) and cls.slow_tokenizer_class is not None: - slow_tokenizer = (cls.slow_tokenizer_class)._from_pretrained( - copy.deepcopy(resolved_vocab_files), - pretrained_model_name_or_path, - copy.deepcopy(init_configuration), - *init_inputs, - **(copy.deepcopy(kwargs)), - ) - else: - slow_tokenizer = None - - # Prepare tokenizer initialization kwargs - # Did we saved some inputs and kwargs to reload ? 
- tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None) - if tokenizer_config_file is not None: - with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle: - init_kwargs = json.load(tokenizer_config_handle) - saved_init_inputs = init_kwargs.pop("init_inputs", ()) - if not init_inputs: - init_inputs = saved_init_inputs - else: - init_kwargs = init_configuration - - # Update with newly provided kwargs - init_kwargs.update(kwargs) - - # Convert AddedTokens serialized as dict to class instances - def convert_added_tokens(obj: Union[AddedToken, Any]): - if isinstance(obj, dict) and "__type" in obj and obj["__type"] == "AddedToken": - obj.pop("__type") - return AddedToken(**obj) - elif isinstance(obj, (list, tuple)): - return list(convert_added_tokens(o) for o in obj) - elif isinstance(obj, dict): - return {k: convert_added_tokens(v) for k, v in obj.items()} - return obj - - init_kwargs = convert_added_tokens(init_kwargs) - - # Set max length if needed - if pretrained_model_name_or_path in cls.max_model_input_sizes: - # if we're using a pretrained model, ensure the tokenizer - # wont index sequences longer than the number of positional embeddings - model_max_length = cls.max_model_input_sizes[pretrained_model_name_or_path] - if model_max_length is not None and isinstance(model_max_length, (int, float)): - init_kwargs["model_max_length"] = min(init_kwargs.get("model_max_length", int(1e30)), model_max_length) - - # Merge resolved_vocab_files arguments in init_kwargs. - added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None) - for args_name, file_path in resolved_vocab_files.items(): - if args_name not in init_kwargs: - init_kwargs[args_name] = file_path - - if slow_tokenizer is not None: - init_kwargs["__slow_tokenizer"] = slow_tokenizer - - init_kwargs["name_or_path"] = pretrained_model_name_or_path - - # Instantiate tokenizer. - try: - tokenizer = cls(*init_inputs, **init_kwargs) - except OSError: - raise OSError( - "Unable to load vocabulary from file. " - "Please check that the provided vocabulary is accessible and not corrupted." - ) - - # Save inputs and kwargs for saving and re-loading with ``save_pretrained`` - # Removed: Now done at the base class level - # tokenizer.init_inputs = init_inputs - # tokenizer.init_kwargs = init_kwargs - - # If there is a complementary special token map, load it - special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None) - if special_tokens_map_file is not None: - with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle: - special_tokens_map = json.load(special_tokens_map_handle) - for key, value in special_tokens_map.items(): - if isinstance(value, dict): - value = AddedToken(**value) - elif isinstance(value, list): - value = [AddedToken(**token) if isinstance(token, dict) else token for token in value] - setattr(tokenizer, key, value) - - # Add supplementary tokens. - special_tokens = tokenizer.all_special_tokens - if added_tokens_file is not None: - with open(added_tokens_file, encoding="utf-8") as added_tokens_handle: - added_tok_encoder = json.load(added_tokens_handle) - - # Sort added tokens by index - added_tok_encoder_sorted = list(sorted(added_tok_encoder.items(), key=lambda x: x[1])) - - for token, index in added_tok_encoder_sorted: - assert index == len(tokenizer), ( - f"Non-consecutive added token '{token}' found. " - f"Should have index {len(tokenizer)} but has index {index} in saved vocabulary." 
- ) - tokenizer.add_tokens(token, special_tokens=bool(token in special_tokens)) - - # Check all our special tokens are registered as "no split" token (we don't cut them) and are in the vocab - added_tokens = tokenizer.sanitize_special_tokens() - - return tokenizer - - def save_pretrained( - self, - save_directory: Union[str, os.PathLike], - legacy_format: bool = True, - filename_prefix: Optional[str] = None, - ) -> Tuple[str]: - if os.path.isfile(save_directory): - return - os.makedirs(save_directory, exist_ok=True) - - special_tokens_map_file = os.path.join( - save_directory, (filename_prefix + "-" if filename_prefix else "") + SPECIAL_TOKENS_MAP_FILE - ) - tokenizer_config_file = os.path.join( - save_directory, (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_CONFIG_FILE - ) - - tokenizer_config = copy.deepcopy(self.init_kwargs) - if len(self.init_inputs) > 0: - tokenizer_config["init_inputs"] = copy.deepcopy(self.init_inputs) - for file_id in self.vocab_files_names.keys(): - tokenizer_config.pop(file_id, None) - - # Sanitize AddedTokens - def convert_added_tokens(obj: Union[AddedToken, Any], add_type_field=True): - if isinstance(obj, AddedToken): - out = obj.__getstate__() - if add_type_field: - out["__type"] = "AddedToken" - return out - elif isinstance(obj, (list, tuple)): - return list(convert_added_tokens(o, add_type_field=add_type_field) for o in obj) - elif isinstance(obj, dict): - return {k: convert_added_tokens(v, add_type_field=add_type_field) for k, v in obj.items()} - return obj - - # add_type_field=True to allow dicts in the kwargs / differentiate from AddedToken serialization - tokenizer_config = convert_added_tokens(tokenizer_config, add_type_field=True) - with open(tokenizer_config_file, "w", encoding="utf-8") as f: - f.write(json.dumps(tokenizer_config, ensure_ascii=False)) - - # Sanitize AddedTokens in special_tokens_map - write_dict = convert_added_tokens(self.special_tokens_map_extended, add_type_field=False) - with open(special_tokens_map_file, "w", encoding="utf-8") as f: - f.write(json.dumps(write_dict, ensure_ascii=False)) - - file_names = (tokenizer_config_file, special_tokens_map_file) - - return self._save_pretrained( - save_directory=save_directory, - file_names=file_names, - legacy_format=legacy_format, - filename_prefix=filename_prefix, - ) - - def _save_pretrained( - self, - save_directory: Union[str, os.PathLike], - file_names: Tuple[str], - legacy_format: bool = True, - filename_prefix: Optional[str] = None, - ) -> Tuple[str]: - if not legacy_format: - raise ValueError( - "Only fast tokenizers (instances of PretrainedTokenizerFast) can be saved in non legacy format." 
- ) - - save_directory = str(save_directory) - - added_tokens_file = os.path.join( - save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE - ) - added_vocab = self.get_added_vocab() - if added_vocab: - with open(added_tokens_file, "w", encoding="utf-8") as f: - out_str = json.dumps(added_vocab, ensure_ascii=False) - f.write(out_str) - - vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix) - - return file_names + vocab_files + (added_tokens_file,) - - def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: - raise NotImplementedError - - def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]: - raise NotImplementedError - - def encode( - self, - text: Union[TextInput, PreTokenizedInput, EncodedInput], - text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, - add_special_tokens: bool = True, - padding: Union[bool, str, PaddingStrategy] = False, - truncation: Union[bool, str, TruncationStrategy] = False, - max_length: Optional[int] = None, - stride: int = 0, - return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs - ) -> List[int]: - encoded_inputs = self.encode_plus( - text, - text_pair=text_pair, - add_special_tokens=add_special_tokens, - padding=padding, - truncation=truncation, - max_length=max_length, - stride=stride, - return_tensors=return_tensors, - **kwargs, - ) - - return encoded_inputs["input_ids"] - - def num_special_tokens_to_add(self, pair: bool = False) -> int: - raise NotImplementedError - - def _get_padding_truncation_strategies( - self, padding=False, truncation=False, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs - ): - old_truncation_strategy = kwargs.pop("truncation_strategy", "do_not_truncate") - old_pad_to_max_length = kwargs.pop("pad_to_max_length", False) - - # Backward compatibility for previous behavior, maybe we should deprecate it: - # If you only set max_length, it activates truncation for max_length - if max_length is not None and padding is False and truncation is False: - if verbose: - self.deprecation_warnings["Truncation-not-explicitly-activated"] = True - truncation = "longest_first" - - # Get padding strategy - if padding is False and old_pad_to_max_length: - if max_length is None: - padding_strategy = PaddingStrategy.LONGEST - else: - padding_strategy = PaddingStrategy.MAX_LENGTH - elif padding is not False: - if padding is True: - padding_strategy = PaddingStrategy.LONGEST # Default to pad to the longest sequence in the batch - elif not isinstance(padding, PaddingStrategy): - padding_strategy = PaddingStrategy(padding) - elif isinstance(padding, PaddingStrategy): - padding_strategy = padding - else: - padding_strategy = PaddingStrategy.DO_NOT_PAD - - # Get truncation strategy - if truncation is False and old_truncation_strategy != "do_not_truncate": - truncation_strategy = TruncationStrategy(old_truncation_strategy) - elif truncation is not False: - if truncation is True: - truncation_strategy = ( - TruncationStrategy.LONGEST_FIRST - ) # Default to truncate the longest sequences in pairs of inputs - elif not isinstance(truncation, TruncationStrategy): - truncation_strategy = TruncationStrategy(truncation) - elif isinstance(truncation, TruncationStrategy): - truncation_strategy = truncation - else: - truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE - - # Set max length if needed - if max_length is None: - if padding_strategy 
== PaddingStrategy.MAX_LENGTH:
-                if self.model_max_length > LARGE_INTEGER:
-                    if verbose:
-                        self.deprecation_warnings["Asking-to-pad-to-max_length"] = True
-                    padding_strategy = PaddingStrategy.DO_NOT_PAD
-                else:
-                    max_length = self.model_max_length
-
-            if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE:
-                if self.model_max_length > LARGE_INTEGER:
-                    if verbose:
-                        self.deprecation_warnings["Asking-to-truncate-to-max_length"] = True
-                    truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
-                else:
-                    max_length = self.model_max_length
-
-        # Test if we have a padding token
-        if padding_strategy != PaddingStrategy.DO_NOT_PAD and (not self.pad_token or self.pad_token_id < 0):
-            raise ValueError(
-                "Asking to pad but the tokenizer does not have a padding token. "
-                "Please select a token to use as `pad_token` (e.g. `tokenizer.pad_token = tokenizer.eos_token`) "
-                "or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`."
-            )
-
-        # Check that we will truncate to a multiple of pad_to_multiple_of if both are provided
-        if (
-            truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE
-            and padding_strategy != PaddingStrategy.DO_NOT_PAD
-            and pad_to_multiple_of is not None
-            and max_length is not None
-            and (max_length % pad_to_multiple_of != 0)
-        ):
-            raise ValueError(
-                f"Truncation and padding are both activated but "
-                f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})."
-            )
-
-        return padding_strategy, truncation_strategy, max_length, kwargs
-
-    def __call__(
-        self,
-        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
-        text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
-        add_special_tokens: bool = True,
-        padding: Union[bool, str, PaddingStrategy] = False,
-        truncation: Union[bool, str, TruncationStrategy] = False,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        is_split_into_words: bool = False,
-        pad_to_multiple_of: Optional[int] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_offsets_mapping: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-        **kwargs
-    ) -> BatchEncoding:
-        # Input type checking for clearer error messages
-        assert isinstance(text, str) or (
-            isinstance(text, (list, tuple))
-            and (
-                len(text) == 0
-                or (
-                    isinstance(text[0], str)
-                    or (isinstance(text[0], (list, tuple)) and (len(text[0]) == 0 or isinstance(text[0][0], str)))
-                )
-            )
-        ), (
-            "text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) "
-            "or `List[List[str]]` (batch of pretokenized examples)."
-        )
-
-        assert (
-            text_pair is None
-            or isinstance(text_pair, str)
-            or (
-                isinstance(text_pair, (list, tuple))
-                and (
-                    len(text_pair) == 0
-                    or (
-                        isinstance(text_pair[0], str)
-                        or (
-                            isinstance(text_pair[0], (list, tuple))
-                            and (len(text_pair[0]) == 0 or isinstance(text_pair[0][0], str))
-                        )
-                    )
-                )
-            )
-        ), (
-            "text_pair input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) "
-            "or `List[List[str]]` (batch of pretokenized examples)."
- ) - - is_batched = bool( - (not is_split_into_words and isinstance(text, (list, tuple))) - or ( - is_split_into_words and isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple)) - ) - ) - - if is_batched: - batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text - return self.batch_encode_plus( - batch_text_or_text_pairs=batch_text_or_text_pairs, - add_special_tokens=add_special_tokens, - padding=padding, - truncation=truncation, - max_length=max_length, - stride=stride, - is_split_into_words=is_split_into_words, - pad_to_multiple_of=pad_to_multiple_of, - return_tensors=return_tensors, - return_token_type_ids=return_token_type_ids, - return_attention_mask=return_attention_mask, - return_overflowing_tokens=return_overflowing_tokens, - return_special_tokens_mask=return_special_tokens_mask, - return_offsets_mapping=return_offsets_mapping, - return_length=return_length, - verbose=verbose, - **kwargs, - ) - else: - return self.encode_plus( - text=text, - text_pair=text_pair, - add_special_tokens=add_special_tokens, - padding=padding, - truncation=truncation, - max_length=max_length, - stride=stride, - is_split_into_words=is_split_into_words, - pad_to_multiple_of=pad_to_multiple_of, - return_tensors=return_tensors, - return_token_type_ids=return_token_type_ids, - return_attention_mask=return_attention_mask, - return_overflowing_tokens=return_overflowing_tokens, - return_special_tokens_mask=return_special_tokens_mask, - return_offsets_mapping=return_offsets_mapping, - return_length=return_length, - verbose=verbose, - **kwargs, - ) - - def encode_plus( - self, - text: Union[TextInput, PreTokenizedInput, EncodedInput], - text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, - add_special_tokens: bool = True, - padding: Union[bool, str, PaddingStrategy] = False, - truncation: Union[bool, str, TruncationStrategy] = False, - max_length: Optional[int] = None, - stride: int = 0, - is_split_into_words: bool = False, - pad_to_multiple_of: Optional[int] = None, - return_tensors: Optional[Union[str, TensorType]] = None, - return_token_type_ids: Optional[bool] = None, - return_attention_mask: Optional[bool] = None, - return_overflowing_tokens: bool = False, - return_special_tokens_mask: bool = False, - return_offsets_mapping: bool = False, - return_length: bool = False, - verbose: bool = True, - **kwargs - ) -> BatchEncoding: - # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' - padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( - padding=padding, - truncation=truncation, - max_length=max_length, - pad_to_multiple_of=pad_to_multiple_of, - verbose=verbose, - **kwargs, - ) - - return self._encode_plus( - text=text, - text_pair=text_pair, - add_special_tokens=add_special_tokens, - padding_strategy=padding_strategy, - truncation_strategy=truncation_strategy, - max_length=max_length, - stride=stride, - is_split_into_words=is_split_into_words, - pad_to_multiple_of=pad_to_multiple_of, - return_tensors=return_tensors, - return_token_type_ids=return_token_type_ids, - return_attention_mask=return_attention_mask, - return_overflowing_tokens=return_overflowing_tokens, - return_special_tokens_mask=return_special_tokens_mask, - return_offsets_mapping=return_offsets_mapping, - return_length=return_length, - verbose=verbose, - **kwargs, - ) - - def _encode_plus( - self, - text: Union[TextInput, PreTokenizedInput, EncodedInput], - text_pair: Optional[Union[TextInput, 
PreTokenizedInput, EncodedInput]] = None, - add_special_tokens: bool = True, - padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, - truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, - max_length: Optional[int] = None, - stride: int = 0, - is_split_into_words: bool = False, - pad_to_multiple_of: Optional[int] = None, - return_tensors: Optional[Union[str, TensorType]] = None, - return_token_type_ids: Optional[bool] = None, - return_attention_mask: Optional[bool] = None, - return_overflowing_tokens: bool = False, - return_special_tokens_mask: bool = False, - return_offsets_mapping: bool = False, - return_length: bool = False, - verbose: bool = True, - **kwargs - ) -> BatchEncoding: - raise NotImplementedError - - def batch_encode_plus( - self, - batch_text_or_text_pairs: Union[ - List[TextInput], - List[TextInputPair], - List[PreTokenizedInput], - List[PreTokenizedInputPair], - List[EncodedInput], - List[EncodedInputPair], - ], - add_special_tokens: bool = True, - padding: Union[bool, str, PaddingStrategy] = False, - truncation: Union[bool, str, TruncationStrategy] = False, - max_length: Optional[int] = None, - stride: int = 0, - is_split_into_words: bool = False, - pad_to_multiple_of: Optional[int] = None, - return_tensors: Optional[Union[str, TensorType]] = None, - return_token_type_ids: Optional[bool] = None, - return_attention_mask: Optional[bool] = None, - return_overflowing_tokens: bool = False, - return_special_tokens_mask: bool = False, - return_offsets_mapping: bool = False, - return_length: bool = False, - verbose: bool = True, - **kwargs - ) -> BatchEncoding: - # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' - padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( - padding=padding, - truncation=truncation, - max_length=max_length, - pad_to_multiple_of=pad_to_multiple_of, - verbose=verbose, - **kwargs, - ) - - return self._batch_encode_plus( - batch_text_or_text_pairs=batch_text_or_text_pairs, - add_special_tokens=add_special_tokens, - padding_strategy=padding_strategy, - truncation_strategy=truncation_strategy, - max_length=max_length, - stride=stride, - is_split_into_words=is_split_into_words, - pad_to_multiple_of=pad_to_multiple_of, - return_tensors=return_tensors, - return_token_type_ids=return_token_type_ids, - return_attention_mask=return_attention_mask, - return_overflowing_tokens=return_overflowing_tokens, - return_special_tokens_mask=return_special_tokens_mask, - return_offsets_mapping=return_offsets_mapping, - return_length=return_length, - verbose=verbose, - **kwargs, - ) - - def _batch_encode_plus( - self, - batch_text_or_text_pairs: Union[ - List[TextInput], - List[TextInputPair], - List[PreTokenizedInput], - List[PreTokenizedInputPair], - List[EncodedInput], - List[EncodedInputPair], - ], - add_special_tokens: bool = True, - padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, - truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, - max_length: Optional[int] = None, - stride: int = 0, - is_split_into_words: bool = False, - pad_to_multiple_of: Optional[int] = None, - return_tensors: Optional[Union[str, TensorType]] = None, - return_token_type_ids: Optional[bool] = None, - return_attention_mask: Optional[bool] = None, - return_overflowing_tokens: bool = False, - return_special_tokens_mask: bool = False, - return_offsets_mapping: bool = False, - return_length: bool = False, - verbose: bool = True, - **kwargs - ) -> 
BatchEncoding:
-        raise NotImplementedError
-
-    def pad(
-        self,
-        encoded_inputs: Union[
-            BatchEncoding,
-            List[BatchEncoding],
-            Dict[str, EncodedInput],
-            Dict[str, List[EncodedInput]],
-            List[Dict[str, EncodedInput]],
-        ],
-        padding: Union[bool, str, PaddingStrategy] = True,
-        max_length: Optional[int] = None,
-        pad_to_multiple_of: Optional[int] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        verbose: bool = True,
-    ) -> BatchEncoding:
-        # If we have a list of dicts, convert it into a dict of lists.
-        # We do this to allow using this method as a collate_fn function in a PyTorch DataLoader.
-        if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], (dict, BatchEncoding)):
-            encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()}
-
-        # The model's main input name, usually `input_ids`, has to be passed for padding
-        if self.model_input_names[0] not in encoded_inputs:
-            raise ValueError(
-                "You should supply an encoding or a list of encodings to this method "
-                f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"
-            )
-
-        required_input = encoded_inputs[self.model_input_names[0]]
-
-        if not required_input:
-            if return_attention_mask:
-                encoded_inputs["attention_mask"] = []
-            return encoded_inputs
-
-        # If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects
-        # and rebuild them afterwards if no return_tensors is specified
-        # Note that we lose the specific device the tensor may be on for PyTorch
-
-        first_element = required_input[0]
-        if isinstance(first_element, (list, tuple)):
-            # first_element might be an empty list/tuple in some edge cases so we grab the first non-empty element.
-            index = 0
-            while index < len(required_input) and len(required_input[index]) == 0:
-                index += 1
-            if index < len(required_input):
-                first_element = required_input[index][0]
-        # At this stage, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do.
-        if not isinstance(first_element, (int, list, tuple)):
-            if is_tf_available() and _is_tensorflow(first_element):
-                return_tensors = "tf" if return_tensors is None else return_tensors
-            elif is_torch_available() and _is_torch(first_element):
-                return_tensors = "pt" if return_tensors is None else return_tensors
-            elif isinstance(first_element, np.ndarray):
-                return_tensors = "np" if return_tensors is None else return_tensors
-            else:
-                raise ValueError(
-                    f"type of {first_element} unknown: {type(first_element)}. "
-                    f"Should be one of a python, numpy, pytorch or tensorflow object."
-                )
-
-            for key, value in encoded_inputs.items():
-                encoded_inputs[key] = to_py_obj(value)
-
-        # Convert the padding argument to a PaddingStrategy
-        padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
-            padding=padding, max_length=max_length, verbose=verbose
-        )
-
-        required_input = encoded_inputs[self.model_input_names[0]]
-        if required_input and not isinstance(required_input[0], (list, tuple)):
-            encoded_inputs = self._pad(
-                encoded_inputs,
-                max_length=max_length,
-                padding_strategy=padding_strategy,
-                pad_to_multiple_of=pad_to_multiple_of,
-                return_attention_mask=return_attention_mask,
-            )
-            return BatchEncoding(encoded_inputs, tensor_type=return_tensors)
-
-        batch_size = len(required_input)
-        assert all(
-            len(v) == batch_size for v in encoded_inputs.values()
-        ), "Some items in the output dictionary have a different batch size than others."
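-        # Illustrative usage sketch (not part of the original file): because `pad`
-        # accepts a list of feature dicts, it can serve as a collate_fn for a
-        # torch.utils.data.DataLoader; `dataset` and `tokenizer` are hypothetical:
-        #
-        #     loader = DataLoader(dataset, batch_size=8,
-        #                         collate_fn=lambda feats: tokenizer.pad(feats, return_tensors="pt"))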
- - if padding_strategy == PaddingStrategy.LONGEST: - max_length = max(len(inputs) for inputs in required_input) - padding_strategy = PaddingStrategy.MAX_LENGTH - - batch_outputs = {} - for i in range(batch_size): - inputs = dict((k, v[i]) for k, v in encoded_inputs.items()) - outputs = self._pad( - inputs, - max_length=max_length, - padding_strategy=padding_strategy, - pad_to_multiple_of=pad_to_multiple_of, - return_attention_mask=return_attention_mask, - ) - - for key, value in outputs.items(): - if key not in batch_outputs: - batch_outputs[key] = [] - batch_outputs[key].append(value) - - return BatchEncoding(batch_outputs, tensor_type=return_tensors) - - def create_token_type_ids_from_sequences( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None - ) -> List[int]: - if token_ids_1 is None: - return len(token_ids_0) * [0] - return [0] * len(token_ids_0) + [1] * len(token_ids_1) - - def build_inputs_with_special_tokens( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None - ) -> List[int]: - if token_ids_1 is None: - return token_ids_0 - return token_ids_0 + token_ids_1 - - def prepare_for_model( - self, - ids: List[int], - pair_ids: Optional[List[int]] = None, - add_special_tokens: bool = True, - padding: Union[bool, str, PaddingStrategy] = False, - truncation: Union[bool, str, TruncationStrategy] = False, - max_length: Optional[int] = None, - stride: int = 0, - pad_to_multiple_of: Optional[int] = None, - return_tensors: Optional[Union[str, TensorType]] = None, - return_token_type_ids: Optional[bool] = None, - return_attention_mask: Optional[bool] = None, - return_overflowing_tokens: bool = False, - return_special_tokens_mask: bool = False, - return_offsets_mapping: bool = False, - return_length: bool = False, - verbose: bool = True, - prepend_batch_axis: bool = False, - **kwargs - ) -> BatchEncoding: - # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' - padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( - padding=padding, - truncation=truncation, - max_length=max_length, - pad_to_multiple_of=pad_to_multiple_of, - verbose=verbose, - **kwargs, - ) - - pair = bool(pair_ids is not None) - len_ids = len(ids) - len_pair_ids = len(pair_ids) if pair else 0 - - if return_token_type_ids and not add_special_tokens: - raise ValueError( - "Asking to return token_type_ids while setting add_special_tokens to False " - "results in an undefined behavior. Please set add_special_tokens to True or " - "set return_token_type_ids to None." 
- ) - - # Load from model defaults - if return_token_type_ids is None: - return_token_type_ids = "token_type_ids" in self.model_input_names - if return_attention_mask is None: - return_attention_mask = "attention_mask" in self.model_input_names - - encoded_inputs = {} - - # Compute the total size of the returned encodings - total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0) - - # Truncation: Handle max sequence length - overflowing_tokens = [] - if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length: - ids, pair_ids, overflowing_tokens = self.truncate_sequences( - ids, - pair_ids=pair_ids, - num_tokens_to_remove=total_len - max_length, - truncation_strategy=truncation_strategy, - stride=stride, - ) - - if return_overflowing_tokens: - encoded_inputs["overflowing_tokens"] = overflowing_tokens - encoded_inputs["num_truncated_tokens"] = total_len - max_length - - # Add special tokens - if add_special_tokens: - sequence = self.build_inputs_with_special_tokens(ids, pair_ids) - token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) - else: - sequence = ids + pair_ids if pair else ids - token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else []) - - # Build output dictionary - encoded_inputs["input_ids"] = sequence - if return_token_type_ids: - encoded_inputs["token_type_ids"] = token_type_ids - if return_special_tokens_mask: - if add_special_tokens: - encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids) - else: - encoded_inputs["special_tokens_mask"] = [0] * len(sequence) - - # Check lengths - self._eventual_warn_about_too_long_sequence(encoded_inputs["input_ids"], max_length, verbose) - - # Padding - if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask: - encoded_inputs = self.pad( - encoded_inputs, - max_length=max_length, - padding=padding_strategy.value, - pad_to_multiple_of=pad_to_multiple_of, - return_attention_mask=return_attention_mask, - ) - - if return_length: - encoded_inputs["length"] = len(encoded_inputs["input_ids"]) - - batch_outputs = BatchEncoding( - encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis - ) - - return batch_outputs - - def truncate_sequences( - self, - ids: List[int], - pair_ids: Optional[List[int]] = None, - num_tokens_to_remove: int = 0, - truncation_strategy: Union[str, TruncationStrategy] = "longest_first", - stride: int = 0, - ) -> Tuple[List[int], List[int], List[int]]: - if num_tokens_to_remove <= 0: - return ids, pair_ids, [] - - if not isinstance(truncation_strategy, TruncationStrategy): - truncation_strategy = TruncationStrategy(truncation_strategy) - - overflowing_tokens = [] - if truncation_strategy == TruncationStrategy.LONGEST_FIRST: - for _ in range(num_tokens_to_remove): - if pair_ids is None or len(ids) > len(pair_ids): - if not overflowing_tokens: - window_len = min(len(ids), stride + 1) - else: - window_len = 1 - overflowing_tokens.extend(ids[-window_len:]) - ids = ids[:-1] - else: - if not overflowing_tokens: - window_len = min(len(pair_ids), stride + 1) - else: - window_len = 1 - overflowing_tokens.extend(pair_ids[-window_len:]) - pair_ids = pair_ids[:-1] - elif truncation_strategy == TruncationStrategy.ONLY_FIRST: - if len(ids) > num_tokens_to_remove: - window_len = min(len(ids), stride + num_tokens_to_remove) - overflowing_tokens = ids[-window_len:] - ids = ids[:-num_tokens_to_remove] - elif truncation_strategy == 
TruncationStrategy.ONLY_SECOND and pair_ids is not None:
-            if len(pair_ids) > num_tokens_to_remove:
-                window_len = min(len(pair_ids), stride + num_tokens_to_remove)
-                overflowing_tokens = pair_ids[-window_len:]
-                pair_ids = pair_ids[:-num_tokens_to_remove]
-
-        return (ids, pair_ids, overflowing_tokens)
-
-    def _pad(
-        self,
-        encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
-        max_length: Optional[int] = None,
-        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
-        pad_to_multiple_of: Optional[int] = None,
-        return_attention_mask: Optional[bool] = None,
-    ) -> dict:
-        # Load from model defaults
-        if return_attention_mask is None:
-            return_attention_mask = "attention_mask" in self.model_input_names
-
-        required_input = encoded_inputs[self.model_input_names[0]]
-
-        if padding_strategy == PaddingStrategy.LONGEST:
-            max_length = len(required_input)
-
-        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
-            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
-
-        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
-
-        if needs_to_be_padded:
-            difference = max_length - len(required_input)
-            if self.padding_side == "right":
-                if return_attention_mask:
-                    encoded_inputs["attention_mask"] = [1] * len(required_input) + [0] * difference
-                if "token_type_ids" in encoded_inputs:
-                    encoded_inputs["token_type_ids"] = (
-                        encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference
-                    )
-                if "special_tokens_mask" in encoded_inputs:
-                    encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
-                encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
-            elif self.padding_side == "left":
-                if return_attention_mask:
-                    encoded_inputs["attention_mask"] = [0] * difference + [1] * len(required_input)
-                if "token_type_ids" in encoded_inputs:
-                    encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
-                        "token_type_ids"
-                    ]
-                if "special_tokens_mask" in encoded_inputs:
-                    encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
-                encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
-            else:
-                raise ValueError("Invalid padding side: " + str(self.padding_side))
-        elif return_attention_mask and "attention_mask" not in encoded_inputs:
-            encoded_inputs["attention_mask"] = [1] * len(required_input)
-
-        return encoded_inputs
-
-    def convert_tokens_to_string(self, tokens: List[str]) -> str:
-        raise NotImplementedError
-
-    def batch_decode(
-        self,
-        sequences: Union[List[int], List[List[int]], "np.ndarray", "torch.Tensor", "tf.Tensor"],
-        skip_special_tokens: bool = False,
-        clean_up_tokenization_spaces: bool = True,
-        **kwargs
-    ) -> List[str]:
-        return [
-            self.decode(
-                seq,
-                skip_special_tokens=skip_special_tokens,
-                clean_up_tokenization_spaces=clean_up_tokenization_spaces,
-                **kwargs,
-            )
-            for seq in sequences
-        ]
-
-    def decode(
-        self,
-        token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"],
-        skip_special_tokens: bool = False,
-        clean_up_tokenization_spaces: bool = True,
-        **kwargs
-    ) -> str:
-        # Convert inputs to python lists
-        token_ids = to_py_obj(token_ids)
-
-        return self._decode(
-            token_ids=token_ids,
-            skip_special_tokens=skip_special_tokens,
-            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
-            **kwargs,
-        )
-
-    def _decode(
- self, - token_ids: Union[int, List[int]], - skip_special_tokens: bool = False, - clean_up_tokenization_spaces: bool = True, - **kwargs - ) -> str: - raise NotImplementedError - - def get_special_tokens_mask( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False - ) -> List[int]: - assert already_has_special_tokens and token_ids_1 is None, ( - "You cannot use ``already_has_special_tokens=False`` with this tokenizer. " - "Please use a slow (full python) tokenizer to activate this argument." - "Or set `return_special_tokens_mask=True` when calling the encoding method " - "to get the special tokens mask in any tokenizer. " - ) - - all_special_ids = self.all_special_ids # cache the property - - special_tokens_mask = [1 if token in all_special_ids else 0 for token in token_ids_0] - - return special_tokens_mask - - @staticmethod - def clean_up_tokenization(out_string: str) -> str: - """ - Clean up a list of simple English tokenization artifacts like spaces before punctuations and abbreviated forms. - Args: - out_string (:obj:`str`): The text to clean up. - Returns: - :obj:`str`: The cleaned-up string. - """ - out_string = ( - out_string.replace(" .", ".") - .replace(" ?", "?") - .replace(" !", "!") - .replace(" ,", ",") - .replace(" ' ", "'") - .replace(" n't", "n't") - .replace(" 'm", "'m") - .replace(" 's", "'s") - .replace(" 've", "'ve") - .replace(" 're", "'re") - ) - return out_string - - def _eventual_warn_about_too_long_sequence(self, ids: List[int], max_length: Optional[int], verbose: bool): - if max_length is None and len(ids) > self.model_max_length and verbose: - self.deprecation_warnings["sequence-length-is-longer-than-the-specified-maximum"] = True - - @contextmanager - def as_target_tokenizer(self): - yield - - def prepare_seq2seq_batch( - self, - src_texts: List[str], - tgt_texts: Optional[List[str]] = None, - max_length: Optional[int] = None, - max_target_length: Optional[int] = None, - padding: str = "longest", - return_tensors: str = None, - truncation: bool = True, - **kwargs, - ) -> BatchEncoding: - # mBART-specific kwargs that should be ignored by other models. - kwargs.pop("src_lang", None) - kwargs.pop("tgt_lang", None) - if max_length is None: - max_length = self.model_max_length - model_inputs = self( - src_texts, - add_special_tokens=True, - return_tensors=return_tensors, - max_length=max_length, - padding=padding, - truncation=truncation, - **kwargs, - ) - if tgt_texts is None: - return model_inputs - # Process tgt_texts - if max_target_length is None: - max_target_length = max_length - with self.as_target_tokenizer(): - labels = self( - tgt_texts, - add_special_tokens=True, - return_tensors=return_tensors, - padding=padding, - max_length=max_target_length, - truncation=truncation, - **kwargs, - ) - model_inputs["labels"] = labels["input_ids"] - return model_inputs - - -class PreTrainedTokenizer(PreTrainedTokenizerBase): - def __init__(self, **kwargs): - super().__init__(**kwargs) - # Added tokens - We store this for both slow and fast tokenizers - # until the serialization of Fast tokenizers is updated - self.added_tokens_encoder: Dict[str, int] = {} - self.added_tokens_decoder: Dict[int, str] = {} - self.unique_no_split_tokens: List[str] = [] - - @property - def is_fast(self) -> bool: - return False - - @property - def vocab_size(self) -> int: - """ - :obj:`int`: Size of the base vocabulary (without the added tokens). 
- """ - raise NotImplementedError - - def get_added_vocab(self) -> Dict[str, int]: - """ - Returns the added tokens in the vocabulary as a dictionary of token to index. - Returns: - :obj:`Dict[str, int]`: The added tokens. - """ - return self.added_tokens_encoder - - def __len__(self): - """ - Size of the full vocabulary with the added tokens. - """ - return self.vocab_size + len(self.added_tokens_encoder) - - def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int: - """ - Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to - it with indices starting from length of the current vocabulary. - Args: - new_tokens (:obj:`List[str]`or :obj:`List[tokenizers.AddedToken]`): - Token(s) to add in vocabulary. A token is only added if it's not already in the vocabulary (tested by - checking if the tokenizer assign the index of the ``unk_token`` to them). - special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not the tokens should be added as special tokens. - Returns: - :obj:`int`: The number of tokens actually added to the vocabulary. - Examples:: - # Let's see how to increase the vocabulary of Bert model and tokenizer - tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - model = BertModel.from_pretrained('bert-base-uncased') - num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2']) - print('We have added', num_added_toks, 'tokens') - # Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer. - model.resize_token_embeddings(len(tokenizer)) - """ - new_tokens = [str(tok) for tok in new_tokens] - - tokens_to_add = [] - for token in new_tokens: - assert isinstance(token, str) - if not special_tokens and hasattr(self, "do_lower_case") and self.do_lower_case: - token = token.lower() - if ( - token != self.unk_token - and self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token) - and token not in tokens_to_add - ): - tokens_to_add.append(token) - - added_tok_encoder = dict((tok, len(self) + i) for i, tok in enumerate(tokens_to_add)) - added_tok_decoder = {v: k for k, v in added_tok_encoder.items()} - self.added_tokens_encoder.update(added_tok_encoder) - self.added_tokens_decoder.update(added_tok_decoder) - - # Make sure we don't split on any special tokens (even they were already in the vocab before e.g. for Albert) - if special_tokens: - self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(new_tokens))) - else: - # Or on the newly added tokens - self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(tokens_to_add))) - - return len(tokens_to_add) - - def num_special_tokens_to_add(self, pair: bool = False) -> int: - """ - Returns the number of added tokens when encoding a sequence with special tokens. - .. note:: - This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not - put this inside your training loop. - Args: - pair (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether the number of added tokens should be computed in the case of a sequence pair or a single - sequence. - Returns: - :obj:`int`: Number of special tokens added to sequences. 
- """ - token_ids_0 = [] - token_ids_1 = [] - return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) - - def tokenize(self, text: TextInput, **kwargs) -> List[str]: - """ - Converts a string in a sequence of tokens, using the tokenizer. - Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies - (BPE/SentencePieces/WordPieces). Takes care of added tokens. - Args: - text (:obj:`str`): - The sequence to be encoded. - **kwargs (additional keyword arguments): - Passed along to the model-specific ``prepare_for_tokenization`` preprocessing method. - Returns: - :obj:`List[str]`: The list of tokens. - """ - # Simple mapping string => AddedToken for special tokens with specific tokenization behaviors - all_special_tokens_extended = dict( - (str(t), t) for t in self.all_special_tokens_extended if isinstance(t, AddedToken) - ) - - text, kwargs = self.prepare_for_tokenization(text, **kwargs) - - # TODO: should this be in the base class? - if hasattr(self, "do_lower_case") and self.do_lower_case: - # convert non-special tokens to lowercase - escaped_special_toks = [re.escape(s_tok) for s_tok in self.all_special_tokens] - pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)" - text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text) - - def split_on_token(tok, text): - result = [] - tok_extended = all_special_tokens_extended.get(tok, None) - split_text = text.split(tok) - full_word = "" - for i, sub_text in enumerate(split_text): - # AddedToken can control whitespace stripping around them. - # We use them for GPT2 and Roberta to have different behavior depending on the special token - # Cf. https://github.com/huggingface/transformers/pull/2778 - # and https://github.com/huggingface/transformers/issues/3788 - if isinstance(tok_extended, AddedToken): - if tok_extended.single_word: - # Try to avoid splitting on token - if ( - i < len(split_text) - 1 - and not _is_end_of_word(sub_text) - and not _is_start_of_word(split_text[i + 1]) - ): - # Don't extract the special token - full_word += sub_text + tok - elif full_word: - full_word += sub_text - result.append(full_word) - full_word = "" - continue - # Strip white spaces on the right - if tok_extended.rstrip and i > 0: - # A bit counter-intuitive but we strip the left of the string - # since tok_extended.rstrip means the special token is eating all white spaces on its right - sub_text = sub_text.lstrip() - # Strip white spaces on the left - if tok_extended.lstrip and i < len(split_text) - 1: - sub_text = sub_text.rstrip() # Opposite here - else: - # We strip left and right by default - if i < len(split_text) - 1: - sub_text = sub_text.rstrip() - if i > 0: - sub_text = sub_text.lstrip() - - if i == 0 and not sub_text: - result.append(tok) - elif i == len(split_text) - 1: - if sub_text: - result.append(sub_text) - else: - pass - else: - if sub_text: - result.append(sub_text) - result.append(tok) - return result - - def split_on_tokens(tok_list, text): - if not text.strip(): - return [] - if not tok_list: - return self._tokenize(text) - - tokenized_text = [] - text_list = [text] - for tok in tok_list: - tokenized_text = [] - for sub_text in text_list: - if sub_text not in self.unique_no_split_tokens: - tokenized_text.extend(split_on_token(tok, sub_text)) - else: - tokenized_text.append(sub_text) - text_list = tokenized_text - - return list( - itertools.chain.from_iterable( - ( - self._tokenize(token) if token not in self.unique_no_split_tokens else [token] - 
for token in tokenized_text - ) - ) - ) - - no_split_token = self.unique_no_split_tokens - tokenized_text = split_on_tokens(no_split_token, text) - return tokenized_text - - def _tokenize(self, text, **kwargs): - """ - Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based - vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces). - Do NOT take care of added tokens. - """ - raise NotImplementedError - - def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]: - """ - Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the - vocabulary. - Args: - tokens (:obj:`str` or :obj:`List[str]`): One or several token(s) to convert to token id(s). - Returns: - :obj:`int` or :obj:`List[int]`: The token id or list of token ids. - """ - if tokens is None: - return None - - if isinstance(tokens, str): - return self._convert_token_to_id_with_added_voc(tokens) - - ids = [] - for token in tokens: - ids.append(self._convert_token_to_id_with_added_voc(token)) - return ids - - def _convert_token_to_id_with_added_voc(self, token): - if token is None: - return None - - if token in self.added_tokens_encoder: - return self.added_tokens_encoder[token] - return self._convert_token_to_id(token) - - def _convert_token_to_id(self, token): - raise NotImplementedError - - def _encode_plus( - self, - text: Union[TextInput, PreTokenizedInput, EncodedInput], - text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, - add_special_tokens: bool = True, - padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, - truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, - max_length: Optional[int] = None, - stride: int = 0, - is_split_into_words: bool = False, - pad_to_multiple_of: Optional[int] = None, - return_tensors: Optional[Union[str, TensorType]] = None, - return_token_type_ids: Optional[bool] = None, - return_attention_mask: Optional[bool] = None, - return_overflowing_tokens: bool = False, - return_special_tokens_mask: bool = False, - return_offsets_mapping: bool = False, - return_length: bool = False, - verbose: bool = True, - **kwargs - ) -> BatchEncoding: - def get_input_ids(text): - if isinstance(text, str): - tokens = self.tokenize(text, **kwargs) - return self.convert_tokens_to_ids(tokens) - elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str): - if is_split_into_words: - tokens = list( - itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text)) - ) - return self.convert_tokens_to_ids(tokens) - else: - return self.convert_tokens_to_ids(text) - elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int): - return text - else: - if is_split_into_words: - raise ValueError( - f"Input {text} is not valid. Should be a string or a list/tuple of strings when `is_split_into_words=True`." - ) - else: - raise ValueError( - f"Input {text} is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers." - ) - - if return_offsets_mapping: - raise NotImplementedError( - "return_offset_mapping is not available when using Python tokenizers." - "To use this feature, change your tokenizer to one deriving from " - "transformers.PreTrainedTokenizerFast." 
- "More information on available tokenizers at " - "https://github.com/huggingface/transformers/pull/2674" - ) - - first_ids = get_input_ids(text) - second_ids = get_input_ids(text_pair) if text_pair is not None else None - - return self.prepare_for_model( - first_ids, - pair_ids=second_ids, - add_special_tokens=add_special_tokens, - padding=padding_strategy.value, - truncation=truncation_strategy.value, - max_length=max_length, - stride=stride, - pad_to_multiple_of=pad_to_multiple_of, - return_tensors=return_tensors, - prepend_batch_axis=True, - return_attention_mask=return_attention_mask, - return_token_type_ids=return_token_type_ids, - return_overflowing_tokens=return_overflowing_tokens, - return_special_tokens_mask=return_special_tokens_mask, - return_length=return_length, - verbose=verbose, - ) - - def _batch_encode_plus( - self, - batch_text_or_text_pairs: Union[ - List[TextInput], - List[TextInputPair], - List[PreTokenizedInput], - List[PreTokenizedInputPair], - List[EncodedInput], - List[EncodedInputPair], - ], - add_special_tokens: bool = True, - padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, - truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, - max_length: Optional[int] = None, - stride: int = 0, - is_split_into_words: bool = False, - pad_to_multiple_of: Optional[int] = None, - return_tensors: Optional[Union[str, TensorType]] = None, - return_token_type_ids: Optional[bool] = None, - return_attention_mask: Optional[bool] = None, - return_overflowing_tokens: bool = False, - return_special_tokens_mask: bool = False, - return_offsets_mapping: bool = False, - return_length: bool = False, - verbose: bool = True, - **kwargs - ) -> BatchEncoding: - def get_input_ids(text): - if isinstance(text, str): - tokens = self.tokenize(text, **kwargs) - return self.convert_tokens_to_ids(tokens) - elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str): - if is_split_into_words: - tokens = list( - itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text)) - ) - return self.convert_tokens_to_ids(tokens) - else: - return self.convert_tokens_to_ids(text) - elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int): - return text - else: - raise ValueError( - "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers." - ) - - if return_offsets_mapping: - raise NotImplementedError( - "return_offset_mapping is not available when using Python tokenizers." - "To use this feature, change your tokenizer to one deriving from " - "transformers.PreTrainedTokenizerFast." 
- ) - - input_ids = [] - for ids_or_pair_ids in batch_text_or_text_pairs: - if not isinstance(ids_or_pair_ids, (list, tuple)): - ids, pair_ids = ids_or_pair_ids, None - elif is_split_into_words and not isinstance(ids_or_pair_ids[0], (list, tuple)): - ids, pair_ids = ids_or_pair_ids, None - else: - ids, pair_ids = ids_or_pair_ids - - first_ids = get_input_ids(ids) - second_ids = get_input_ids(pair_ids) if pair_ids is not None else None - input_ids.append((first_ids, second_ids)) - - batch_outputs = self._batch_prepare_for_model( - input_ids, - add_special_tokens=add_special_tokens, - padding_strategy=padding_strategy, - truncation_strategy=truncation_strategy, - max_length=max_length, - stride=stride, - pad_to_multiple_of=pad_to_multiple_of, - return_attention_mask=return_attention_mask, - return_token_type_ids=return_token_type_ids, - return_overflowing_tokens=return_overflowing_tokens, - return_special_tokens_mask=return_special_tokens_mask, - return_length=return_length, - return_tensors=return_tensors, - verbose=verbose, - ) - - return BatchEncoding(batch_outputs) - - def _batch_prepare_for_model( - self, - batch_ids_pairs: List[Union[PreTokenizedInputPair, Tuple[List[int], None]]], - add_special_tokens: bool = True, - padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, - truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, - max_length: Optional[int] = None, - stride: int = 0, - pad_to_multiple_of: Optional[int] = None, - return_tensors: Optional[str] = None, - return_token_type_ids: Optional[bool] = None, - return_attention_mask: Optional[bool] = None, - return_overflowing_tokens: bool = False, - return_special_tokens_mask: bool = False, - return_length: bool = False, - verbose: bool = True, - ) -> BatchEncoding: - """ - Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It - adds special tokens, truncates sequences if overflowing while taking into account the special tokens and - manages a moving window (with user defined stride) for overflowing tokens - Args: - batch_ids_pairs: list of tokenized input ids or input ids pairs - """ - - batch_outputs = {} - for first_ids, second_ids in batch_ids_pairs: - outputs = self.prepare_for_model( - first_ids, - second_ids, - add_special_tokens=add_special_tokens, - padding=PaddingStrategy.DO_NOT_PAD.value, # we pad in batch afterward - truncation=truncation_strategy.value, - max_length=max_length, - stride=stride, - pad_to_multiple_of=None, # we pad in batch afterward - return_attention_mask=False, # we pad in batch afterward - return_token_type_ids=return_token_type_ids, - return_overflowing_tokens=return_overflowing_tokens, - return_special_tokens_mask=return_special_tokens_mask, - return_length=return_length, - return_tensors=None, # We convert the whole batch to tensors at the end - prepend_batch_axis=False, - verbose=verbose, - ) - - for key, value in outputs.items(): - if key not in batch_outputs: - batch_outputs[key] = [] - batch_outputs[key].append(value) - - batch_outputs = self.pad( - batch_outputs, - padding=padding_strategy.value, - max_length=max_length, - pad_to_multiple_of=pad_to_multiple_of, - return_attention_mask=return_attention_mask, - ) - - batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors) - - return batch_outputs - - def prepare_for_tokenization( - self, text: str, is_split_into_words: bool = False, **kwargs - ) -> Tuple[str, Dict[str, Any]]: - """ - Performs any necessary transformations before tokenization. 
-        This method should pop the arguments from kwargs and return the remaining :obj:`kwargs` as well. We test the
-        :obj:`kwargs` at the end of the encoding process to be sure all the arguments have been used.
-        Args:
-            text (:obj:`str`):
-                The text to prepare.
-            is_split_into_words (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether or not the text has been pretokenized.
-            kwargs:
-                Keyword arguments to use for the tokenization.
-        Returns:
-            :obj:`Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs.
-        """
-        return (text, kwargs)
-
-    def get_special_tokens_mask(
-        self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False
-    ) -> List[int]:
-        """
-        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
-        Args:
-            token_ids_0 (:obj:`List[int]`):
-                List of ids of the first sequence.
-            token_ids_1 (:obj:`List[int]`, `optional`):
-                List of ids of the second sequence.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-        Returns:
-            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-        return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))
-
-    @overload
-    def convert_ids_to_tokens(self, ids: int, skip_special_tokens: bool = False) -> str:
-        ...
-
-    @overload
-    def convert_ids_to_tokens(self, ids: List[int], skip_special_tokens: bool = False) -> List[str]:
-        ...
-
-    def convert_ids_to_tokens(
-        self, ids: Union[int, List[int]], skip_special_tokens: bool = False
-    ) -> Union[str, List[str]]:
-        """
-        Converts a single index or a sequence of indices into a token or a sequence of tokens, using the vocabulary
-        and added tokens.
-        Args:
-            ids (:obj:`int` or :obj:`List[int]`):
-                The token id (or token ids) to convert to tokens.
-            skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether or not to remove special tokens in the decoding.
-        Returns:
-            :obj:`str` or :obj:`List[str]`: The decoded token(s).
-        """
-        if isinstance(ids, int):
-            if ids in self.added_tokens_decoder:
-                return self.added_tokens_decoder[ids]
-            else:
-                return self._convert_id_to_token(ids)
-        tokens = []
-        for index in ids:
-            index = int(index)
-            if skip_special_tokens and index in self.all_special_ids:
-                continue
-            if index in self.added_tokens_decoder:
-                tokens.append(self.added_tokens_decoder[index])
-            else:
-                tokens.append(self._convert_id_to_token(index))
-        return tokens
-
-    def _convert_id_to_token(self, index: int) -> str:
-        raise NotImplementedError
-
-    def convert_tokens_to_string(self, tokens: List[str]) -> str:
-        return " ".join(tokens)
-
-    def _decode(
-        self,
-        token_ids: List[int],
-        skip_special_tokens: bool = False,
-        clean_up_tokenization_spaces: bool = True,
-        spaces_between_special_tokens: bool = True,
-    ) -> str:
-        filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
-
-        # To avoid mixing byte-level and unicode for byte-level BPE
-        # we need to build the string separately for added tokens and byte-level tokens
-        # cf. https://github.com/huggingface/transformers/issues/1133
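-        #
-        # Illustrative example (assumed, not taken from a real run): if "<new>"
-        # is registered in added_tokens_encoder, then ["hello", "<new>", "world"]
-        # is joined below as convert_tokens_to_string(["hello"]) + "<new>" +
-        # convert_tokens_to_string(["world"]) rather than in a single pass.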
-        sub_texts = []
-        current_sub_text = []
-        for token in filtered_tokens:
-            # `filtered_tokens` holds token strings, so the comparison must be
-            # against the special token strings, not their integer ids.
-            if skip_special_tokens and token in self.all_special_tokens:
-                continue
-            if token in self.added_tokens_encoder:
-                if current_sub_text:
-                    sub_texts.append(self.convert_tokens_to_string(current_sub_text))
-                    current_sub_text = []
-                sub_texts.append(token)
-            else:
-                current_sub_text.append(token)
-        if current_sub_text:
-            sub_texts.append(self.convert_tokens_to_string(current_sub_text))
-
-        if spaces_between_special_tokens:
-            text = " ".join(sub_texts)
-        else:
-            text = "".join(sub_texts)
-
-        if clean_up_tokenization_spaces:
-            clean_text = self.clean_up_tokenization(text)
-            return clean_text
-        else:
-            return text
-
-
-class BertTokenizer(PreTrainedTokenizer):
-    vocab_files_names = VOCAB_FILES_NAMES
-    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
-    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
-    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
-
-    def __init__(
-        self,
-        vocab_file,
-        do_lower_case=True,
-        do_basic_tokenize=True,
-        never_split=None,
-        unk_token="[UNK]",
-        sep_token="[SEP]",
-        pad_token="[PAD]",
-        cls_token="[CLS]",
-        mask_token="[MASK]",
-        tokenize_chinese_chars=True,
-        strip_accents=None,
-        **kwargs
-    ):
-        super().__init__(
-            do_lower_case=do_lower_case,
-            do_basic_tokenize=do_basic_tokenize,
-            never_split=never_split,
-            unk_token=unk_token,
-            sep_token=sep_token,
-            pad_token=pad_token,
-            cls_token=cls_token,
-            mask_token=mask_token,
-            tokenize_chinese_chars=tokenize_chinese_chars,
-            strip_accents=strip_accents,
-            **kwargs,
-        )
-        self.vocab = load_vocab(vocab_file)
-        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
-        self.do_basic_tokenize = do_basic_tokenize
-        if do_basic_tokenize:
-            self.basic_tokenizer = BasicTokenizer(
-                do_lower_case=do_lower_case,
-                never_split=never_split,
-                tokenize_chinese_chars=tokenize_chinese_chars,
-                strip_accents=strip_accents,
-            )
-        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
-
-    @property
-    def do_lower_case(self):
-        return self.basic_tokenizer.do_lower_case
-
-    @property
-    def vocab_size(self):
-        return len(self.vocab)
-
-    def get_vocab(self):
-        return dict(self.vocab, **self.added_tokens_encoder)
-
-    def _tokenize(self, text):
-        split_tokens = []
-        if self.do_basic_tokenize:
-            for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
-                # If the token is part of the never_split set
-                if token in self.basic_tokenizer.never_split:
-                    split_tokens.append(token)
-                else:
-                    split_tokens += self.wordpiece_tokenizer.tokenize(token)
-        else:
-            split_tokens = self.wordpiece_tokenizer.tokenize(text)
-        return split_tokens
-
-    def _convert_token_to_id(self, token):
-        return self.vocab.get(token, self.vocab.get(self.unk_token))
-
-    def _convert_id_to_token(self, index):
-        return self.ids_to_tokens.get(index, self.unk_token)
-
-    def convert_tokens_to_string(self, tokens):
-        out_string = " ".join(tokens).replace(" ##", "").strip()
-        return out_string
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        if token_ids_1 is None:
-            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-        cls = [self.cls_token_id]
-        sep = [self.sep_token_id]
-        return cls + token_ids_0 + sep + token_ids_1 + sep
-
-    def get_special_tokens_mask(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] =
None, already_has_special_tokens: bool = False - ) -> List[int]: - if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) - - if token_ids_1 is not None: - return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] - return [1] + ([0] * len(token_ids_0)) + [1] - - def create_token_type_ids_from_sequences( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None - ) -> List[int]: - sep = [self.sep_token_id] - cls = [self.cls_token_id] - if token_ids_1 is None: - return len(cls + token_ids_0 + sep) * [0] - return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] - - def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: - index = 0 - if os.path.isdir(save_directory): - vocab_file = os.path.join( - save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] - ) - else: - vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory - with open(vocab_file, "w", encoding="utf-8") as writer: - for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): - if index != token_index: - index = token_index - writer.write(token + "\n") - index += 1 - return (vocab_file,) - - -class BasicTokenizer(object): - def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): - if never_split is None: - never_split = [] - self.do_lower_case = do_lower_case - self.never_split = set(never_split) - self.tokenize_chinese_chars = tokenize_chinese_chars - self.strip_accents = strip_accents - - def tokenize(self, text, never_split=None): - # union() returns a new set by concatenating the two sets. - never_split = self.never_split.union(set(never_split)) if never_split else self.never_split - text = self._clean_text(text) - - # This was added on November 1st, 2018 for the multilingual and Chinese - # models. This is also applied to the English models now, but it doesn't - # matter since the English models were not trained on any Chinese data - # and generally don't have any Chinese data in them (there are Chinese - # characters in the vocabulary because Wikipedia does have some Chinese - # words in the English Wikipedia.). 
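-        #
-        # Illustrative example (assumed, not from a real run): with
-        # tokenize_chinese_chars=True, "ab中文" becomes "ab 中  文 ", so the
-        # whitespace tokenization below yields ["ab", "中", "文"], one token
-        # per CJK character.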
-        if self.tokenize_chinese_chars:
-            text = self._tokenize_chinese_chars(text)
-        orig_tokens = whitespace_tokenize(text)
-        split_tokens = []
-        for token in orig_tokens:
-            if token not in never_split:
-                if self.do_lower_case:
-                    token = token.lower()
-                    if self.strip_accents is not False:
-                        token = self._run_strip_accents(token)
-                elif self.strip_accents:
-                    token = self._run_strip_accents(token)
-            split_tokens.extend(self._run_split_on_punc(token, never_split))
-
-        output_tokens = whitespace_tokenize(" ".join(split_tokens))
-        return output_tokens
-
-    def _run_strip_accents(self, text):
-        text = unicodedata.normalize("NFD", text)
-        output = []
-        for char in text:
-            cat = unicodedata.category(char)
-            if cat == "Mn":
-                continue
-            output.append(char)
-        return "".join(output)
-
-    def _run_split_on_punc(self, text, never_split=None):
-        if never_split is not None and text in never_split:
-            return [text]
-        chars = list(text)
-        i = 0
-        start_new_word = True
-        output = []
-        while i < len(chars):
-            char = chars[i]
-            if _is_punctuation(char):
-                output.append([char])
-                start_new_word = True
-            else:
-                if start_new_word:
-                    output.append([])
-                start_new_word = False
-                output[-1].append(char)
-            i += 1
-
-        return ["".join(x) for x in output]
-
-    def _tokenize_chinese_chars(self, text):
-        output = []
-        for char in text:
-            cp = ord(char)
-            if self._is_chinese_char(cp):
-                output.append(" ")
-                output.append(char)
-                output.append(" ")
-            else:
-                output.append(char)
-        return "".join(output)
-
-    def _is_chinese_char(self, cp):
-        # This defines a "Chinese character" as anything in the CJK Unicode block:
-        # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
-        #
-        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
-        # despite its name. The modern Korean Hangul alphabet is a different block,
-        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
-        # space-separated words, so they are not treated specially and are handled
-        # like all of the other languages.
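-        #
-        # Illustrative check (assumed values): ord("中") == 0x4E2D falls inside
-        # the first range below, so it returns True; Hiragana ord("あ") == 0x3042
-        # matches no range and returns False.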
-        if (
-            (cp >= 0x4E00 and cp <= 0x9FFF)  # CJK Unified Ideographs
-            or (cp >= 0x3400 and cp <= 0x4DBF)  # CJK Extension A
-            or (cp >= 0x20000 and cp <= 0x2A6DF)  # CJK Extension B
-            or (cp >= 0x2A700 and cp <= 0x2B73F)  # CJK Extension C
-            or (cp >= 0x2B740 and cp <= 0x2B81F)  # CJK Extension D
-            or (cp >= 0x2B820 and cp <= 0x2CEAF)  # CJK Extension E
-            or (cp >= 0xF900 and cp <= 0xFAFF)  # CJK Compatibility Ideographs
-            or (cp >= 0x2F800 and cp <= 0x2FA1F)  # CJK Compatibility Ideographs Supplement
-        ):
-            return True
-
-        return False
-
-    def _clean_text(self, text):
-        output = []
-        for char in text:
-            cp = ord(char)
-            if cp == 0 or cp == 0xFFFD or _is_control(char):
-                continue
-            if _is_whitespace(char):
-                output.append(" ")
-            else:
-                output.append(char)
-        return "".join(output)
-
-
-class WordpieceTokenizer(object):
-    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
-        self.vocab = vocab
-        self.unk_token = unk_token
-        self.max_input_chars_per_word = max_input_chars_per_word
-
-    def tokenize(self, text):
-        # Greedy longest-match-first WordPiece: for each whitespace-separated
-        # token, emit the longest vocabulary prefix, then continue on the
-        # remainder (prefixed with "##"). If no prefix ever matches, the whole
-        # word is replaced by unk_token.
-        output_tokens = []
-        for token in whitespace_tokenize(text):
-            chars = list(token)
-            if len(chars) > self.max_input_chars_per_word:
-                output_tokens.append(self.unk_token)
-                continue
-
-            is_bad = False
-            start = 0
-            sub_tokens = []
-            while start < len(chars):
-                end = len(chars)
-                cur_substr = None
-                while start < end:
-                    substr = "".join(chars[start:end])
-                    if start > 0:
-                        substr = "##" + substr
-                    if substr in self.vocab:
-                        cur_substr = substr
-                        break
-                    end -= 1
-                if cur_substr is None:
-                    is_bad = True
-                    break
-                sub_tokens.append(cur_substr)
-                start = end
-
-            if is_bad:
-                output_tokens.append(self.unk_token)
-            else:
-                output_tokens.extend(sub_tokens)
-        return output_tokens
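-
-
-# ----------------------------------------------------------------------------
-# Illustrative usage sketch (not part of the original module): a rough sketch of
-# how the classes above compose, assuming a local WordPiece vocabulary file
-# named "vocab.txt". The file name, example text, and resulting tokens are
-# assumptions for illustration, not verified outputs.
-#
-#   tokenizer = BertTokenizer("vocab.txt")
-#   tokens = tokenizer.tokenize("Hello, world!")   # e.g. ["hello", ",", "world", "!"]
-#   ids = tokenizer.convert_tokens_to_ids(tokens)
-#   enc = tokenizer.prepare_for_model(ids, padding="max_length", max_length=8)
-#   enc["input_ids"]       # [CLS] + ids + [SEP], right-padded with [PAD] ids
-#   enc["attention_mask"]  # 1 for real tokens, 0 for padding positions
-#   tokenizer.decode(enc["input_ids"], skip_special_tokens=True)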