# coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Utilities to convert slow tokenizers in their fast tokenizers counterparts. All the conversions are grouped here to gather SentencePiece dependencies outside of the fast tokenizers files and allow to make our dependency on SentencePiece optional. """ import warnings from typing import Dict, List, Tuple from packaging import version from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors from tokenizers.models import BPE, Unigram, WordPiece from .utils import is_protobuf_available, requires_backends from .utils.import_utils import PROTOBUF_IMPORT_ERROR def import_protobuf(error_message=""): if is_protobuf_available(): import google.protobuf if version.parse(google.protobuf.__version__) < version.parse("4.0.0"): from transformers.utils import sentencepiece_model_pb2 else: from transformers.utils import sentencepiece_model_pb2_new as sentencepiece_model_pb2 return sentencepiece_model_pb2 else: raise ImportError(PROTOBUF_IMPORT_ERROR.format(error_message)) def _get_prepend_scheme(add_prefix_space: bool, original_tokenizer) -> str: if add_prefix_space: prepend_scheme = "always" if not getattr(original_tokenizer, "legacy", True): prepend_scheme = "first" else: prepend_scheme = "never" return prepend_scheme def generate_merges(vocab, vocab_scores): reverse = vocab_scores is not None vocab_scores = dict(vocab_scores) if reverse else vocab merges = [] for merge, piece_score in vocab_scores.items(): local = [] for index in range(1, len(merge)): piece_l, piece_r = merge[:index], merge[index:] if piece_l in vocab and piece_r in vocab: local.append((piece_l, piece_r, piece_score)) local = sorted(local, key=lambda x: (vocab[x[0]], vocab[x[1]])) merges.extend(local) merges = sorted(merges, key=lambda val: (val[2], len(val[0]), len(val[1])), reverse=reverse) merges = [(val[0], val[1]) for val in merges] return merges class SentencePieceExtractor: """ Extractor implementation for SentencePiece trained models. https://github.com/google/sentencepiece """ def __init__(self, model: str): requires_backends(self, "sentencepiece") from sentencepiece import SentencePieceProcessor self.sp = SentencePieceProcessor() self.sp.Load(model) def extract(self, vocab_scores=None) -> Tuple[Dict[str, int], List[Tuple]]: """ By default will return vocab and merges with respect to their order, by sending `vocab_scores` we're going to order the merges with respect to the piece scores instead. """ sp = self.sp vocab = {sp.id_to_piece(index): index for index in range(sp.GetPieceSize())} merges = generate_merges(vocab, vocab_scores) return vocab, merges class GemmaSentencePieceExtractor(SentencePieceExtractor): def extract(self, vocab_scores=None) -> Tuple[Dict[str, int], List[Tuple]]: """ By default will return vocab and merges with respect to their order, by sending `vocab_scores` we're going to order the merges with respect to the piece scores instead. """ sp = self.sp vocab = {sp.id_to_piece(index): index for index in range(sp.GetPieceSize())} # there is a missing token in the vocab. We have to do this to support merges # "<0x09>" is the bytefallback for `\t` vocab["\t"] = vocab.get("<0x09>") merges = generate_merges(vocab, vocab_scores) return vocab, merges def check_number_comma(piece: str) -> bool: return len(piece) < 2 or piece[-1] != "," or not piece[-2].isdigit() class Converter: def __init__(self, original_tokenizer): self.original_tokenizer = original_tokenizer def converted(self) -> Tokenizer: raise NotImplementedError() class BertConverter(Converter): def converted(self) -> Tokenizer: vocab = self.original_tokenizer.vocab tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(self.original_tokenizer.unk_token))) tokenize_chinese_chars = False strip_accents = False do_lower_case = False if hasattr(self.original_tokenizer, "basic_tokenizer"): tokenize_chinese_chars = self.original_tokenizer.basic_tokenizer.tokenize_chinese_chars strip_accents = self.original_tokenizer.basic_tokenizer.strip_accents do_lower_case = self.original_tokenizer.basic_tokenizer.do_lower_case tokenizer.normalizer = normalizers.BertNormalizer( clean_text=True, handle_chinese_chars=tokenize_chinese_chars, strip_accents=strip_accents, lowercase=do_lower_case, ) tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer() cls = str(self.original_tokenizer.cls_token) sep = str(self.original_tokenizer.sep_token) cls_token_id = self.original_tokenizer.cls_token_id sep_token_id = self.original_tokenizer.sep_token_id tokenizer.post_processor = processors.TemplateProcessing( single=f"{cls}:0 $A:0 {sep}:0", pair=f"{cls}:0 $A:0 {sep}:0 $B:1 {sep}:1", special_tokens=[ (cls, cls_token_id), (sep, sep_token_id), ], ) tokenizer.decoder = decoders.WordPiece(prefix="##") return tokenizer class SplinterConverter(Converter): def converted(self) -> Tokenizer: vocab = self.original_tokenizer.vocab tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(self.original_tokenizer.unk_token))) tokenize_chinese_chars = False strip_accents = False do_lower_case = False if hasattr(self.original_tokenizer, "basic_tokenizer"): tokenize_chinese_chars = self.original_tokenizer.basic_tokenizer.tokenize_chinese_chars strip_accents = self.original_tokenizer.basic_tokenizer.strip_accents do_lower_case = self.original_tokenizer.basic_tokenizer.do_lower_case tokenizer.normalizer = normalizers.BertNormalizer( clean_text=True, handle_chinese_chars=tokenize_chinese_chars, strip_accents=strip_accents, lowercase=do_lower_case, ) tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer() cls = str(self.original_tokenizer.cls_token) sep = str(self.original_tokenizer.sep_token) question = str(self.original_tokenizer.question_token) dot = "." cls_token_id = self.original_tokenizer.cls_token_id sep_token_id = self.original_tokenizer.sep_token_id question_token_id = self.original_tokenizer.question_token_id dot_token_id = self.original_tokenizer.convert_tokens_to_ids(".") if self.original_tokenizer.padding_side == "right": pair = f"{cls}:0 $A:0 {question} {dot} {sep}:0 $B:1 {sep}:1" else: pair = f"{cls}:0 $A:0 {sep}:0 $B:1 {question} {dot} {sep}:1" tokenizer.post_processor = processors.TemplateProcessing( single=f"{cls}:0 $A:0 {sep}:0", pair=pair, special_tokens=[ (cls, cls_token_id), (sep, sep_token_id), (question, question_token_id), (dot, dot_token_id), ], ) tokenizer.decoder = decoders.WordPiece(prefix="##") return tokenizer class FunnelConverter(Converter): def converted(self) -> Tokenizer: vocab = self.original_tokenizer.vocab tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(self.original_tokenizer.unk_token))) tokenize_chinese_chars = False strip_accents = False do_lower_case = False if hasattr(self.original_tokenizer, "basic_tokenizer"): tokenize_chinese_chars = self.original_tokenizer.basic_tokenizer.tokenize_chinese_chars strip_accents = self.original_tokenizer.basic_tokenizer.strip_accents do_lower_case = self.original_tokenizer.basic_tokenizer.do_lower_case tokenizer.normalizer = normalizers.BertNormalizer( clean_text=True, handle_chinese_chars=tokenize_chinese_chars, strip_accents=strip_accents, lowercase=do_lower_case, ) tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer() cls = str(self.original_tokenizer.cls_token) sep = str(self.original_tokenizer.sep_token) cls_token_id = self.original_tokenizer.cls_token_id sep_token_id = self.original_tokenizer.sep_token_id tokenizer.post_processor = processors.TemplateProcessing( single=f"{cls}:2 $A:0 {sep}:0", # token_type_id is 2 for Funnel transformer pair=f"{cls}:2 $A:0 {sep}:0 $B:1 {sep}:1", special_tokens=[ (cls, cls_token_id), (sep, sep_token_id), ], ) tokenizer.decoder = decoders.WordPiece(prefix="##") return tokenizer class MPNetConverter(Converter): def converted(self) -> Tokenizer: vocab = self.original_tokenizer.vocab tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(self.original_tokenizer.unk_token))) tokenize_chinese_chars = False strip_accents = False do_lower_case = False if hasattr(self.original_tokenizer, "basic_tokenizer"): tokenize_chinese_chars = self.original_tokenizer.basic_tokenizer.tokenize_chinese_chars strip_accents = self.original_tokenizer.basic_tokenizer.strip_accents do_lower_case = self.original_tokenizer.basic_tokenizer.do_lower_case tokenizer.normalizer = normalizers.BertNormalizer( clean_text=True, handle_chinese_chars=tokenize_chinese_chars, strip_accents=strip_accents, lowercase=do_lower_case, ) tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer() cls = str(self.original_tokenizer.cls_token) sep = str(self.original_tokenizer.sep_token) cls_token_id = self.original_tokenizer.cls_token_id sep_token_id = self.original_tokenizer.sep_token_id tokenizer.post_processor = processors.TemplateProcessing( single=f"{cls}:0 $A:0 {sep}:0", pair=f"{cls}:0 $A:0 {sep}:0 {sep}:0 $B:1 {sep}:1", # MPNet uses two [SEP] tokens special_tokens=[ (cls, cls_token_id), (sep, sep_token_id), ], ) tokenizer.decoder = decoders.WordPiece(prefix="##") return tokenizer class OpenAIGPTConverter(Converter): def converted(self) -> Tokenizer: vocab = self.original_tokenizer.encoder merges = list(self.original_tokenizer.bpe_ranks.keys()) unk_token = self.original_tokenizer.unk_token tokenizer = Tokenizer( BPE( vocab=vocab, merges=merges, dropout=None, unk_token=str(unk_token), end_of_word_suffix="", fuse_unk=False, ) ) if tokenizer.token_to_id(str(unk_token)) is not None: tokenizer.add_special_tokens([str(unk_token)]) tokenizer.normalizer = normalizers.BertNormalizer(lowercase=True) tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer() tokenizer.decoder = decoders.BPEDecoder(suffix="") return tokenizer class GPT2Converter(Converter): def converted(self) -> Tokenizer: vocab = self.original_tokenizer.encoder merges = list(self.original_tokenizer.bpe_ranks.keys()) tokenizer = Tokenizer( BPE( vocab=vocab, merges=merges, dropout=None, continuing_subword_prefix="", end_of_word_suffix="", fuse_unk=False, ) ) tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=self.original_tokenizer.add_prefix_space) tokenizer.decoder = decoders.ByteLevel() if self.original_tokenizer.add_bos_token: bos = self.original_tokenizer.bos_token bos_token_id = self.original_tokenizer.bos_token_id tokenizer.post_processor = processors.TemplateProcessing( single=f"{bos}:0 $A:0", pair=f"{bos}:0 $A:0 $B:1", special_tokens=[ (bos, bos_token_id), ], ) else: # XXX trim_offsets=False actually means this post_processor doesn't # really do anything. tokenizer.post_processor = processors.ByteLevel(trim_offsets=False) return tokenizer class HerbertConverter(Converter): def converted(self) -> Tokenizer: tokenizer_info_str = "#version:" token_suffix = "" vocab = self.original_tokenizer.encoder merges = list(self.original_tokenizer.bpe_ranks.keys()) if tokenizer_info_str in merges[0][0]: merges = merges[1:] tokenizer = Tokenizer( BPE( vocab, merges, dropout=None, unk_token=self.original_tokenizer.unk_token, end_of_word_suffix=token_suffix, ) ) tokenizer.normalizer = normalizers.BertNormalizer(lowercase=False, strip_accents=False) tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer() tokenizer.decoder = decoders.BPEDecoder(suffix=token_suffix) tokenizer.post_processor = processors.BertProcessing( sep=(self.original_tokenizer.sep_token, self.original_tokenizer.sep_token_id), cls=(self.original_tokenizer.cls_token, self.original_tokenizer.cls_token_id), ) return tokenizer class Qwen2Converter(Converter): def converted(self, vocab: Dict[str, int] = None, merges: List[Tuple[str, str]] = None) -> Tokenizer: if not vocab: vocab = self.original_tokenizer.encoder if not merges: merges = list(self.original_tokenizer.bpe_ranks.keys()) tokenizer = Tokenizer( BPE( vocab=vocab, merges=merges, dropout=None, unk_token=None, continuing_subword_prefix="", end_of_word_suffix="", fuse_unk=False, byte_fallback=False, ) ) tokenizer.normalizer = normalizers.NFC() tokenizer.pre_tokenizer = pre_tokenizers.Sequence( [ pre_tokenizers.Split( Regex( r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""" ), behavior="isolated", invert=False, ), pre_tokenizers.ByteLevel( add_prefix_space=getattr(self.original_tokenizer, "add_prefix_space", False), use_regex=False, ), ] ) tokenizer.decoder = decoders.ByteLevel() tokenizer.post_processor = processors.ByteLevel(trim_offsets=False) return tokenizer class RobertaConverter(Converter): def converted(self) -> Tokenizer: ot = self.original_tokenizer vocab = ot.encoder merges = list(ot.bpe_ranks.keys()) tokenizer = Tokenizer( BPE( vocab=vocab, merges=merges, dropout=None, continuing_subword_prefix="", end_of_word_suffix="", fuse_unk=False, ) ) tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=ot.add_prefix_space) tokenizer.decoder = decoders.ByteLevel() tokenizer.post_processor = processors.RobertaProcessing( sep=(ot.sep_token, ot.sep_token_id), cls=(ot.cls_token, ot.cls_token_id), add_prefix_space=ot.add_prefix_space, trim_offsets=True, # True by default on Roberta (historical) ) return tokenizer class RoFormerConverter(Converter): def converted(self) -> Tokenizer: from .models.roformer.tokenization_utils import JiebaPreTokenizer vocab = self.original_tokenizer.vocab tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(self.original_tokenizer.unk_token))) strip_accents = False do_lower_case = False if hasattr(self.original_tokenizer, "basic_tokenizer"): strip_accents = self.original_tokenizer.basic_tokenizer.strip_accents do_lower_case = self.original_tokenizer.basic_tokenizer.do_lower_case tokenizer.normalizer = normalizers.BertNormalizer( clean_text=True, handle_chinese_chars=False, strip_accents=strip_accents, lowercase=do_lower_case, ) tokenizer.pre_tokenizer = pre_tokenizers.PreTokenizer.custom(JiebaPreTokenizer(vocab)) cls = str(self.original_tokenizer.cls_token) sep = str(self.original_tokenizer.sep_token) cls_token_id = self.original_tokenizer.cls_token_id sep_token_id = self.original_tokenizer.sep_token_id tokenizer.post_processor = processors.TemplateProcessing( single=f"{cls}:0 $A:0 {sep}:0", pair=f"{cls}:0 $A:0 {sep}:0 $B:1 {sep}:1", special_tokens=[ (cls, cls_token_id), (sep, sep_token_id), ], ) tokenizer.decoder = decoders.WordPiece(prefix="##") return tokenizer class DebertaConverter(Converter): def converted(self) -> Tokenizer: ot = self.original_tokenizer vocab = ot.encoder merges = list(ot.bpe_ranks.keys()) tokenizer = Tokenizer( BPE( vocab=vocab, merges=merges, dropout=None, continuing_subword_prefix="", end_of_word_suffix="", fuse_unk=False, ) ) tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=ot.add_prefix_space) tokenizer.decoder = decoders.ByteLevel() tokenizer.post_processor = processors.TemplateProcessing( single="[CLS]:0 $A:0 [SEP]:0", pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1", special_tokens=[ ("[CLS]", self.original_tokenizer.convert_tokens_to_ids("[CLS]")), ("[SEP]", self.original_tokenizer.convert_tokens_to_ids("[SEP]")), ], ) return tokenizer class SpmConverter(Converter): handle_byte_fallback = False SpmExtractor = SentencePieceExtractor special_tokens = {} def __init__(self, *args): requires_backends(self, "protobuf") super().__init__(*args) # from .utils import sentencepiece_model_pb2 as model_pb2 model_pb2 = import_protobuf() m = model_pb2.ModelProto() with open(self.original_tokenizer.vocab_file, "rb") as f: m.ParseFromString(f.read()) self.proto = m if self.proto.trainer_spec.byte_fallback and not self.handle_byte_fallback: warnings.warn( "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option" " which is not implemented in the fast tokenizers. In practice this means that the fast version of the" " tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these " "unknown tokens into a sequence of byte tokens matching the original piece of text." ) def vocab(self, proto): return [(piece.piece, piece.score) for piece in proto.pieces] def unk_id(self, proto): return proto.trainer_spec.unk_id def tokenizer(self, proto): model_type = proto.trainer_spec.model_type vocab_scores = self.vocab(proto) if model_type == 1: tokenizer = Tokenizer( Unigram( vocab_scores, unk_id=self.unk_id(proto), byte_fallback=self.handle_byte_fallback, ) ) elif model_type == 2: _, merges = self.SpmExtractor(self.original_tokenizer.vocab_file).extract(vocab_scores) bpe_vocab = {word: i for i, (word, score) in enumerate(vocab_scores)} tokenizer = Tokenizer( BPE( bpe_vocab, merges, unk_token=proto.trainer_spec.unk_piece, fuse_unk=True, byte_fallback=self.handle_byte_fallback, dropout=None, ) ) else: raise Exception( "You're trying to run a `Unigram` model but you're file was trained with a different algorithm" ) # control tokens are special # user defined symbols are not # both user and control tokens are AddedTokens # Add user defined symbols (type == 4) from sentencepiece (https://github.com/google/sentencepiece/blob/6225e08edb2577757163b3f5dbba4c0b670ef445/src/sentencepiece_model.proto#L299C29-L299C33) spm_added_tokens = [ (id, p.piece, p.type == 3 or p.piece in self.special_tokens) for id, p in enumerate(proto.pieces) if p.type in [3, 4] ] tokens_to_add = [ AddedToken(token, normalized=False, special=special) for id, token, special in sorted(spm_added_tokens, key=lambda x: x[0]) ] if len(tokens_to_add) > 0: # super hack: if a token.special is set, tokenizer ignores it for now so FIXME @ArthurZ # Accumulate added tokens into batches of special/non-special tokens, because calling add_tokens() for # individual tokens would repeatedly rebuild a trie, which can be slow. is_last_special = None tokens = [] for token in tokens_to_add: is_special = token.special if is_last_special is None or is_last_special == is_special: tokens.append(token) else: if is_last_special: tokenizer.add_special_tokens(tokens) else: tokenizer.add_tokens(tokens) tokens = [token] is_last_special = is_special if tokens: if is_last_special: tokenizer.add_special_tokens(tokens) else: tokenizer.add_tokens(tokens) return tokenizer def normalizer(self, proto): precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap _normalizers = [ normalizers.Strip(left=False, right=True), # stripping is important normalizers.Replace(Regex(" {2,}"), "▁"), ] if not precompiled_charsmap: return normalizers.Sequence(_normalizers) else: return normalizers.Sequence([normalizers.Precompiled(precompiled_charsmap)] + _normalizers) def pre_tokenizer(self, replacement, add_prefix_space): prepend_scheme = _get_prepend_scheme(add_prefix_space, self.original_tokenizer) return pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme) def post_processor(self): return None def decoder(self, replacement, add_prefix_space): prepend_scheme = _get_prepend_scheme(add_prefix_space, self.original_tokenizer) return decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme) def converted(self) -> Tokenizer: tokenizer = self.tokenizer(self.proto) # Tokenizer assemble normalizer = self.normalizer(self.proto) if normalizer is not None: tokenizer.normalizer = normalizer replacement = "▁" add_prefix_space = True if hasattr(self.original_tokenizer, "add_prefix_space"): add_prefix_space = self.original_tokenizer.add_prefix_space pre_tokenizer = self.pre_tokenizer(replacement, add_prefix_space) if pre_tokenizer is not None: tokenizer.pre_tokenizer = pre_tokenizer tokenizer.decoder = self.decoder(replacement, add_prefix_space) post_processor = self.post_processor() if post_processor: tokenizer.post_processor = post_processor return tokenizer class AlbertConverter(SpmConverter): def vocab(self, proto): return [ (piece.piece, piece.score) if check_number_comma(piece.piece) else (piece.piece, piece.score - 100) for piece in proto.pieces ] def normalizer(self, proto): list_normalizers = [ normalizers.Replace("``", '"'), normalizers.Replace("''", '"'), ] if not self.original_tokenizer.keep_accents: list_normalizers.append(normalizers.NFKD()) list_normalizers.append(normalizers.StripAccents()) if self.original_tokenizer.do_lower_case: list_normalizers.append(normalizers.Lowercase()) precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap if precompiled_charsmap: list_normalizers.append(normalizers.Precompiled(precompiled_charsmap)) list_normalizers.append(normalizers.Replace(Regex(" {2,}"), " ")) return normalizers.Sequence(list_normalizers) def post_processor(self): return processors.TemplateProcessing( single="[CLS]:0 $A:0 [SEP]:0", pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1", special_tokens=[ ("[CLS]", self.original_tokenizer.convert_tokens_to_ids("[CLS]")), ("[SEP]", self.original_tokenizer.convert_tokens_to_ids("[SEP]")), ], ) class BarthezConverter(SpmConverter): def unk_id(self, proto): unk_id = 3 return unk_id def post_processor(self): return processors.TemplateProcessing( single=" $A ", pair=" $A $B ", special_tokens=[ ("", self.original_tokenizer.convert_tokens_to_ids("")), ("", self.original_tokenizer.convert_tokens_to_ids("")), ], ) class CamembertConverter(SpmConverter): def vocab(self, proto): vocab = [ ("NOTUSED", 0.0), ("", 0.0), ("NOTUSED", 0.0), ("", 0.0), ("NOTUSED", -100), ] # We down-grade the original SentencePiece by -100 to avoid using it and use our added token instead vocab += [(piece.piece, piece.score) for piece in proto.pieces[1:]] vocab += [("", 0.0)] return vocab def unk_id(self, proto): # See vocab unk position return 3 def post_processor(self): return processors.TemplateProcessing( single=" $A ", pair=" $A $B ", special_tokens=[ ("", self.original_tokenizer.convert_tokens_to_ids("")), ("", self.original_tokenizer.convert_tokens_to_ids("")), ], ) class DebertaV2Converter(SpmConverter): def pre_tokenizer(self, replacement, add_prefix_space): list_pretokenizers = [] if self.original_tokenizer.split_by_punct: list_pretokenizers.append(pre_tokenizers.Punctuation(behavior="isolated")) prepend_scheme = _get_prepend_scheme(add_prefix_space, self.original_tokenizer) list_pretokenizers.append(pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)) return pre_tokenizers.Sequence(list_pretokenizers) def normalizer(self, proto): list_normalizers = [] if self.original_tokenizer.do_lower_case: list_normalizers.append(normalizers.Lowercase()) list_normalizers.append(normalizers.Strip()) precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap if precompiled_charsmap: list_normalizers.append(normalizers.Precompiled(precompiled_charsmap)) list_normalizers.append(normalizers.Replace(Regex(" {2,}"), " ")) return normalizers.Sequence(list_normalizers) def post_processor(self): return processors.TemplateProcessing( single="[CLS]:0 $A:0 [SEP]:0", pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1", special_tokens=[ ("[CLS]", self.original_tokenizer.convert_tokens_to_ids("[CLS]")), ("[SEP]", self.original_tokenizer.convert_tokens_to_ids("[SEP]")), ], ) class MBartConverter(SpmConverter): def vocab(self, proto): vocab = [ ("", 0.0), ("", 0.0), ("", 0.0), ("", 0.0), ] vocab += [(piece.piece, piece.score) for piece in proto.pieces[3:]] vocab += [ ("ar_AR", 0.0), ("cs_CZ", 0.0), ("de_DE", 0.0), ("en_XX", 0.0), ("es_XX", 0.0), ("et_EE", 0.0), ("fi_FI", 0.0), ("fr_XX", 0.0), ("gu_IN", 0.0), ("hi_IN", 0.0), ("it_IT", 0.0), ("ja_XX", 0.0), ("kk_KZ", 0.0), ("ko_KR", 0.0), ("lt_LT", 0.0), ("lv_LV", 0.0), ("my_MM", 0.0), ("ne_NP", 0.0), ("nl_XX", 0.0), ("ro_RO", 0.0), ("ru_RU", 0.0), ("si_LK", 0.0), ("tr_TR", 0.0), ("vi_VN", 0.0), ("zh_CN", 0.0), ] vocab += [("", 0.0)] return vocab def unk_id(self, proto): return 3 def post_processor(self): return processors.TemplateProcessing( single="$A en_XX", pair="$A $B en_XX", special_tokens=[ ("en_XX", self.original_tokenizer.convert_tokens_to_ids("en_XX")), ("", self.original_tokenizer.convert_tokens_to_ids("")), ], ) class MBart50Converter(SpmConverter): def vocab(self, proto): vocab = [ ("", 0.0), ("", 0.0), ("", 0.0), ("", 0.0), ] vocab += [(piece.piece, piece.score) for piece in proto.pieces[3:]] vocab += [("ar_AR", 0.0), ("cs_CZ", 0.0), ("de_DE", 0.0), ("en_XX", 0.0), ("es_XX", 0.0), ("et_EE", 0.0), ("fi_FI", 0.0), ("fr_XX", 0.0), ("gu_IN", 0.0), ("hi_IN", 0.0), ("it_IT", 0.0), ("ja_XX", 0.0), ("kk_KZ", 0.0), ("ko_KR", 0.0), ("lt_LT", 0.0), ("lv_LV", 0.0), ("my_MM", 0.0), ("ne_NP", 0.0), ("nl_XX", 0.0), ("ro_RO", 0.0), ("ru_RU", 0.0), ("si_LK", 0.0), ("tr_TR", 0.0), ("vi_VN", 0.0), ("zh_CN", 0.0), ("af_ZA", 0.0), ("az_AZ", 0.0), ("bn_IN", 0.0), ("fa_IR", 0.0), ("he_IL", 0.0), ("hr_HR", 0.0), ("id_ID", 0.0), ("ka_GE", 0.0), ("km_KH", 0.0), ("mk_MK", 0.0), ("ml_IN", 0.0), ("mn_MN", 0.0), ("mr_IN", 0.0), ("pl_PL", 0.0), ("ps_AF", 0.0), ("pt_XX", 0.0), ("sv_SE", 0.0), ("sw_KE", 0.0), ("ta_IN", 0.0), ("te_IN", 0.0), ("th_TH", 0.0), ("tl_XX", 0.0), ("uk_UA", 0.0), ("ur_PK", 0.0), ("xh_ZA", 0.0), ("gl_ES", 0.0), ("sl_SI", 0.0)] # fmt: skip vocab += [("", 0.0)] return vocab def unk_id(self, proto): return 3 def post_processor(self): return processors.TemplateProcessing( single="en_XX $A ", pair="en_XX $A $B ", special_tokens=[ ("en_XX", self.original_tokenizer.convert_tokens_to_ids("en_XX")), ("", self.original_tokenizer.convert_tokens_to_ids("")), ], ) class NllbConverter(SpmConverter): def vocab(self, proto): vocab = [ ("", 0.0), ("", 0.0), ("", 0.0), ("", 0.0), ] vocab += [(piece.piece, piece.score) for piece in proto.pieces[3:]] return vocab def unk_id(self, proto): return 3 def post_processor(self): return processors.TemplateProcessing( single="eng_Latn $A ", pair="eng_Latn $A $B ", special_tokens=[ ("eng_Latn", self.original_tokenizer.convert_tokens_to_ids("eng_Latn")), ("", self.original_tokenizer.convert_tokens_to_ids("")), ], ) class SeamlessM4TConverter(SpmConverter): def vocab(self, proto): vocab = [ ("", 0.0), ("", 0.0), ("", 0.0), ("", 0.0), ] vocab += [(piece.piece, piece.score) for piece in proto.pieces[3:]] return vocab def unk_id(self, proto): return self.original_tokenizer.unk_token_id def post_processor(self): return processors.TemplateProcessing( single="__eng__ $A ", pair="__eng__ $A $B ", special_tokens=[ ("__eng__", self.original_tokenizer.convert_tokens_to_ids("__eng__")), ("", self.original_tokenizer.convert_tokens_to_ids("")), ], ) class XLMRobertaConverter(SpmConverter): def vocab(self, proto): vocab = [ ("", 0.0), ("", 0.0), ("", 0.0), ("", 0.0), ] vocab += [(piece.piece, piece.score) for piece in proto.pieces[3:]] vocab += [("", 0.0)] return vocab def unk_id(self, proto): unk_id = 3 return unk_id def post_processor(self): return processors.TemplateProcessing( single=" $A ", pair=" $A $B ", special_tokens=[ ("", self.original_tokenizer.convert_tokens_to_ids("")), ("", self.original_tokenizer.convert_tokens_to_ids("")), ], ) class XLNetConverter(SpmConverter): def vocab(self, proto): return [ (piece.piece, piece.score) if check_number_comma(piece.piece) else (piece.piece, piece.score - 100) for piece in proto.pieces ] def normalizer(self, proto): list_normalizers = [ normalizers.Replace("``", '"'), normalizers.Replace("''", '"'), ] if not self.original_tokenizer.keep_accents: list_normalizers.append(normalizers.NFKD()) list_normalizers.append(normalizers.StripAccents()) if self.original_tokenizer.do_lower_case: list_normalizers.append(normalizers.Lowercase()) precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap if precompiled_charsmap: list_normalizers.append(normalizers.Precompiled(precompiled_charsmap)) list_normalizers.append(normalizers.Replace(Regex(" {2,}"), " ")) return normalizers.Sequence(list_normalizers) def post_processor(self): return processors.TemplateProcessing( single="$A:0 :0 :2", pair="$A:0 :0 $B:1 :1 :2", special_tokens=[ ("", self.original_tokenizer.convert_tokens_to_ids("")), ("", self.original_tokenizer.convert_tokens_to_ids("")), ], ) class ReformerConverter(SpmConverter): pass class RemBertConverter(SpmConverter): # Inspired from AlbertConverter def normalizer(self, proto): list_normalizers = [ normalizers.Replace("``", '"'), normalizers.Replace("''", '"'), normalizers.Replace(Regex(" {2,}"), " "), ] if not self.original_tokenizer.keep_accents: list_normalizers.append(normalizers.NFKD()) list_normalizers.append(normalizers.StripAccents()) if self.original_tokenizer.do_lower_case: list_normalizers.append(normalizers.Lowercase()) precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap if precompiled_charsmap: list_normalizers.append(normalizers.Precompiled(precompiled_charsmap)) return normalizers.Sequence(list_normalizers) def post_processor(self): return processors.TemplateProcessing( single="[CLS]:0 $A:0 [SEP]:0", pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1", special_tokens=[ ("[CLS]", self.original_tokenizer.convert_tokens_to_ids("[CLS]")), ("[SEP]", self.original_tokenizer.convert_tokens_to_ids("[SEP]")), ], ) class BertGenerationConverter(SpmConverter): pass class PegasusConverter(SpmConverter): def vocab(self, proto): vocab = [ (self.original_tokenizer.pad_token, 0.0), (self.original_tokenizer.eos_token, 0.0), ] if self.original_tokenizer.mask_token_sent is not None: vocab += [(self.original_tokenizer.mask_token_sent, 0.0)] if ( self.original_tokenizer.mask_token is not None and self.original_tokenizer.mask_token_id < self.original_tokenizer.offset ): vocab += [(self.original_tokenizer.mask_token, 0.0)] vocab += [(f"", -100.0) for i in range(2, self.original_tokenizer.offset)] vocab += [(piece.piece, piece.score) for piece in proto.pieces[2:]] return vocab def unk_id(self, proto): return proto.trainer_spec.unk_id + self.original_tokenizer.offset def pre_tokenizer(self, replacement, add_prefix_space): prepend_scheme = _get_prepend_scheme(add_prefix_space, self.original_tokenizer) return pre_tokenizers.Sequence( [ pre_tokenizers.WhitespaceSplit(), pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme), ] ) def post_processor(self): eos = self.original_tokenizer.eos_token special_tokens = [ (eos, self.original_tokenizer.eos_token_id), ] return processors.TemplateProcessing(single=["$A", eos], pair=["$A", "$B", eos], special_tokens=special_tokens) class T5Converter(SpmConverter): def vocab(self, proto): num_extra_ids = self.original_tokenizer._extra_ids vocab = [(piece.piece, piece.score) for piece in proto.pieces] vocab += [(f"", 0.0) for i in range(num_extra_ids - 1, -1, -1)] return vocab def post_processor(self): return processors.TemplateProcessing( single=["$A", ""], pair=["$A", "", "$B", ""], special_tokens=[ ("", self.original_tokenizer.convert_tokens_to_ids("")), ], ) class UdopConverter(SpmConverter): def post_processor(self): return processors.TemplateProcessing( single=["$A", ""], pair=["$A", "", "$B", ""], special_tokens=[ ("", self.original_tokenizer.convert_tokens_to_ids("")), ], ) class WhisperConverter(Converter): def converted(self) -> Tokenizer: vocab = self.original_tokenizer.encoder merges = list(self.original_tokenizer.bpe_ranks.keys()) tokenizer = Tokenizer( BPE( vocab=vocab, merges=merges, dropout=None, continuing_subword_prefix="", end_of_word_suffix="", fuse_unk=False, ) ) tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=self.original_tokenizer.add_prefix_space) tokenizer.decoder = decoders.ByteLevel() prefix_token_ids = self.original_tokenizer.prefix_tokens prefixes = self.original_tokenizer.convert_ids_to_tokens(prefix_token_ids) eos = self.original_tokenizer.eos_token eos_token_id = self.original_tokenizer.eos_token_id prefix_template = " ".join([f"{token}:0" for token in prefixes]) tokenizer.post_processor = processors.TemplateProcessing( single=f"{prefix_template} $A:0 {eos}:0", pair=f"{prefix_template} $A:0 $B:1 {eos}:1", special_tokens=[ (eos, eos_token_id), *zip(prefixes, prefix_token_ids), ], ) return tokenizer class BigBirdConverter(SpmConverter): def post_processor(self): return processors.TemplateProcessing( single="[CLS]:0 $A:0 [SEP]:0", pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1", special_tokens=[ ("[CLS]", self.original_tokenizer.convert_tokens_to_ids("[CLS]")), ("[SEP]", self.original_tokenizer.convert_tokens_to_ids("[SEP]")), ], ) class CLIPConverter(Converter): def converted(self) -> Tokenizer: vocab = self.original_tokenizer.encoder merges = list(self.original_tokenizer.bpe_ranks.keys()) unk_token = self.original_tokenizer.unk_token tokenizer = Tokenizer( BPE( vocab=vocab, merges=merges, dropout=None, continuing_subword_prefix="", end_of_word_suffix="", fuse_unk=False, unk_token=str(unk_token), ) ) tokenizer.normalizer = normalizers.Sequence( [normalizers.NFC(), normalizers.Replace(Regex(r"\s+"), " "), normalizers.Lowercase()] ) tokenizer.pre_tokenizer = pre_tokenizers.Sequence( [ pre_tokenizers.Split( Regex(r"""'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+"""), behavior="removed", invert=True, ), pre_tokenizers.ByteLevel(add_prefix_space=False), ] ) tokenizer.decoder = decoders.ByteLevel() # Hack to have a ByteLevel and TemplaceProcessor tokenizer.post_processor = processors.RobertaProcessing( sep=(self.original_tokenizer.eos_token, self.original_tokenizer.eos_token_id), cls=(self.original_tokenizer.bos_token, self.original_tokenizer.bos_token_id), add_prefix_space=False, trim_offsets=False, ) return tokenizer class LayoutLMv2Converter(Converter): def converted(self) -> Tokenizer: vocab = self.original_tokenizer.vocab tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(self.original_tokenizer.unk_token))) tokenize_chinese_chars = False strip_accents = False do_lower_case = True if hasattr(self.original_tokenizer, "basic_tokenizer"): tokenize_chinese_chars = self.original_tokenizer.basic_tokenizer.tokenize_chinese_chars strip_accents = self.original_tokenizer.basic_tokenizer.strip_accents do_lower_case = self.original_tokenizer.basic_tokenizer.do_lower_case tokenizer.normalizer = normalizers.BertNormalizer( clean_text=True, handle_chinese_chars=tokenize_chinese_chars, strip_accents=strip_accents, lowercase=do_lower_case, ) tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer() cls = str(self.original_tokenizer.cls_token) sep = str(self.original_tokenizer.sep_token) cls_token_id = self.original_tokenizer.cls_token_id sep_token_id = self.original_tokenizer.sep_token_id tokenizer.post_processor = processors.TemplateProcessing( single=f"{cls}:0 $A:0 {sep}:0", pair=f"{cls}:0 $A:0 {sep}:0 $B:1 {sep}:1", special_tokens=[ (cls, cls_token_id), (sep, sep_token_id), ], ) tokenizer.decoder = decoders.WordPiece(prefix="##") return tokenizer class BlenderbotConverter(Converter): def converted(self) -> Tokenizer: ot = self.original_tokenizer vocab = ot.encoder merges = list(ot.bpe_ranks.keys()) tokenizer = Tokenizer( BPE( vocab=vocab, merges=merges, dropout=None, continuing_subword_prefix="", end_of_word_suffix="", fuse_unk=False, ) ) tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=ot.add_prefix_space) tokenizer.decoder = decoders.ByteLevel() tokenizer.post_processor = processors.TemplateProcessing( single=f"$A:0 {ot.eos_token}:0", special_tokens=[ (ot.eos_token, ot.eos_token_id), ], ) return tokenizer class XGLMConverter(SpmConverter): def vocab(self, proto): vocab = [ ("", 0.0), ("", 0.0), ("", 0.0), ("", 0.0), ] vocab += [(piece.piece, piece.score) for piece in proto.pieces[3:]] vocab += [("", 0.0), ("", 0.0), ("", 0.0), ("", 0.0), ("", 0.0), ("", 0.0), ("", 0.0)] # fmt: skip return vocab def unk_id(self, proto): unk_id = 3 return unk_id def post_processor(self): return processors.TemplateProcessing( single=" $A", pair=" $A $B", special_tokens=[ ("", self.original_tokenizer.convert_tokens_to_ids("")), ("", self.original_tokenizer.convert_tokens_to_ids("")), ], ) class GemmaConvert(SpmConverter): handle_byte_fallback = True SpmExtractor = GemmaSentencePieceExtractor # start and end of turn tokens must be marked as special special_tokens = {"", ""} """" split_by_unicode_script: true split_by_number: true split_by_whitespace: true treat_whitespace_as_suffix: false allow_whitespace_only_pieces: true split_digits: true byte_fallback: true """ def normalizer(self, proto): return normalizers.Replace(" ", "▁") def vocab(self, proto): vocab = [ (self.original_tokenizer.pad_token, 0.0), (self.original_tokenizer.eos_token, 0.0), (self.original_tokenizer.bos_token, 0.0), ] for piece in proto.pieces[3:]: if piece.piece == "<0x09>": vocab += [("\t", piece.score)] else: vocab += [(piece.piece, piece.score)] # vocab += [(piece.piece, piece.score) for piece in proto.pieces[3:]] return vocab def pre_tokenizer(self, replacement, add_prefix_space): return pre_tokenizers.Split(" ", "merged_with_previous") def unk_id(self, proto): unk_id = 3 return unk_id def decoder(self, replacement, add_prefix_space): return decoders.Sequence( [ decoders.Replace("▁", " "), decoders.ByteFallback(), decoders.Fuse(), ] ) class LlamaConverter(SpmConverter): handle_byte_fallback = True def vocab(self, proto): vocab = [ (self.original_tokenizer.convert_ids_to_tokens(0), 0.0), (self.original_tokenizer.convert_ids_to_tokens(1), 0.0), (self.original_tokenizer.convert_ids_to_tokens(2), 0.0), ] vocab += [(piece.piece, piece.score) for piece in proto.pieces[3:]] return vocab def unk_id(self, proto): unk_id = 0 return unk_id def decoder(self, replacement, add_prefix_space): sequence = [ decoders.Replace("▁", " "), decoders.ByteFallback(), decoders.Fuse(), ] if add_prefix_space: sequence += [decoders.Strip(content=" ", left=1)] return decoders.Sequence(sequence) def normalizer(self, proto): if getattr(self.original_tokenizer, "legacy", True): sequence = [] if getattr(self.original_tokenizer, "add_prefix_space", True): sequence += [normalizers.Prepend(prepend="▁")] sequence += [normalizers.Replace(pattern=" ", content="▁")] return normalizers.Sequence(sequence) return None # non-legacy, no normalizer def pre_tokenizer(self, replacement, add_prefix_space): if not getattr(self.original_tokenizer, "legacy", True): # non-legacy, we need a replace prepend_scheme = _get_prepend_scheme(add_prefix_space, self.original_tokenizer) return pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme, split=False) return None def post_processor(self): # the processor is defined in the LlamaTokenizerFast class. return None class MarkupLMConverter(Converter): def converted(self) -> Tokenizer: ot = self.original_tokenizer vocab = ot.encoder merges = list(ot.bpe_ranks.keys()) tokenizer = Tokenizer( BPE( vocab=vocab, merges=merges, dropout=None, continuing_subword_prefix="", end_of_word_suffix="", fuse_unk=False, unk_token=self.original_tokenizer.unk_token, ) ) tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=ot.add_prefix_space) tokenizer.decoder = decoders.ByteLevel() cls = str(self.original_tokenizer.cls_token) sep = str(self.original_tokenizer.sep_token) cls_token_id = self.original_tokenizer.cls_token_id sep_token_id = self.original_tokenizer.sep_token_id tokenizer.post_processor = processors.TemplateProcessing( single=f"{cls} $A {sep}", pair=f"{cls} $A {sep} $B {sep}", special_tokens=[ (cls, cls_token_id), (sep, sep_token_id), ], ) return tokenizer # Copied from transformers.models.gpt2.tokenization_gpt2.bytes_to_unicode def bytes_to_unicode(): """ Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control characters the bpe code barfs on. The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup tables between utf-8 bytes and unicode strings. """ bs = ( list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) ) cs = bs[:] n = 0 for b in range(2**8): if b not in bs: bs.append(b) cs.append(2**8 + n) n += 1 cs = [chr(n) for n in cs] return dict(zip(bs, cs)) class TikTokenConverter: """ A general tiktoken converter. """ def __init__( self, vocab_file=None, pattern=r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""", add_prefix_space=False, *args, ): super().__init__(*args) self.vocab_file = vocab_file self.pattern = pattern self.add_prefix_space = add_prefix_space def extract_vocab_merges_from_model(self, tiktoken_url: str): try: from tiktoken.load import load_tiktoken_bpe except Exception: raise ValueError( "`tiktoken` is required to read a `tiktoken` file. Install it with " "`pip install tiktoken`." ) bpe_ranks = load_tiktoken_bpe(tiktoken_url) byte_encoder = bytes_to_unicode() def token_bytes_to_string(b): return "".join([byte_encoder[ord(char)] for char in b.decode("latin-1")]) merges = [] vocab = {} for token, rank in bpe_ranks.items(): vocab[token_bytes_to_string(token)] = rank if len(token) == 1: continue local = [] for index in range(1, len(token)): piece_l, piece_r = token[:index], token[index:] if piece_l in bpe_ranks and piece_r in bpe_ranks and (piece_l + piece_r) in bpe_ranks: local.append((piece_l, piece_r, rank)) local = sorted(local, key=lambda x: (bpe_ranks[x[0]], bpe_ranks[x[1]]), reverse=False) merges.extend(local) merges = sorted(merges, key=lambda val: val[2], reverse=False) merges = [(token_bytes_to_string(val[0]), token_bytes_to_string(val[1])) for val in merges] return vocab, merges def tokenizer(self): vocab_scores, merges = self.extract_vocab_merges_from_model(self.vocab_file) tokenizer = Tokenizer(BPE(vocab_scores, merges, fuse_unk=False)) if hasattr(tokenizer.model, "ignore_merges"): tokenizer.model.ignore_merges = True return tokenizer def converted(self) -> Tokenizer: tokenizer = self.tokenizer() tokenizer.pre_tokenizer = pre_tokenizers.Sequence( [ pre_tokenizers.Split(Regex(self.pattern), behavior="isolated", invert=False), pre_tokenizers.ByteLevel(add_prefix_space=self.add_prefix_space, use_regex=False), ] ) tokenizer.decoder = decoders.ByteLevel() tokenizer.post_processor = processors.ByteLevel(trim_offsets=False) return tokenizer SLOW_TO_FAST_CONVERTERS = { "AlbertTokenizer": AlbertConverter, "BartTokenizer": RobertaConverter, "BarthezTokenizer": BarthezConverter, "BertTokenizer": BertConverter, "BigBirdTokenizer": BigBirdConverter, "BlenderbotTokenizer": BlenderbotConverter, "CamembertTokenizer": CamembertConverter, "CLIPTokenizer": CLIPConverter, "CodeGenTokenizer": GPT2Converter, "ConvBertTokenizer": BertConverter, "DebertaTokenizer": DebertaConverter, "DebertaV2Tokenizer": DebertaV2Converter, "DistilBertTokenizer": BertConverter, "DPRReaderTokenizer": BertConverter, "DPRQuestionEncoderTokenizer": BertConverter, "DPRContextEncoderTokenizer": BertConverter, "ElectraTokenizer": BertConverter, "FNetTokenizer": AlbertConverter, "FunnelTokenizer": FunnelConverter, "GPT2Tokenizer": GPT2Converter, "HerbertTokenizer": HerbertConverter, "LayoutLMTokenizer": BertConverter, "LayoutLMv2Tokenizer": BertConverter, "LayoutLMv3Tokenizer": RobertaConverter, "LayoutXLMTokenizer": XLMRobertaConverter, "LongformerTokenizer": RobertaConverter, "LEDTokenizer": RobertaConverter, "LxmertTokenizer": BertConverter, "MarkupLMTokenizer": MarkupLMConverter, "MBartTokenizer": MBartConverter, "MBart50Tokenizer": MBart50Converter, "MPNetTokenizer": MPNetConverter, "MobileBertTokenizer": BertConverter, "MvpTokenizer": RobertaConverter, "NllbTokenizer": NllbConverter, "OpenAIGPTTokenizer": OpenAIGPTConverter, "PegasusTokenizer": PegasusConverter, "Qwen2Tokenizer": Qwen2Converter, "RealmTokenizer": BertConverter, "ReformerTokenizer": ReformerConverter, "RemBertTokenizer": RemBertConverter, "RetriBertTokenizer": BertConverter, "RobertaTokenizer": RobertaConverter, "RoFormerTokenizer": RoFormerConverter, "SeamlessM4TTokenizer": SeamlessM4TConverter, "SqueezeBertTokenizer": BertConverter, "T5Tokenizer": T5Converter, "UdopTokenizer": UdopConverter, "WhisperTokenizer": WhisperConverter, "XLMRobertaTokenizer": XLMRobertaConverter, "XLNetTokenizer": XLNetConverter, "SplinterTokenizer": SplinterConverter, "XGLMTokenizer": XGLMConverter, "LlamaTokenizer": LlamaConverter, "CodeLlamaTokenizer": LlamaConverter, "GemmaTokenizer": GemmaConvert, } def convert_slow_tokenizer(transformer_tokenizer) -> Tokenizer: """ Utilities to convert a slow tokenizer instance in a fast tokenizer instance. Args: transformer_tokenizer ([`~tokenization_utils_base.PreTrainedTokenizer`]): Instance of a slow tokenizer to convert in the backend tokenizer for [`~tokenization_utils_base.PreTrainedTokenizerFast`]. Return: A instance of [`~tokenizers.Tokenizer`] to be used as the backend tokenizer of a [`~tokenization_utils_base.PreTrainedTokenizerFast`] """ tokenizer_class_name = transformer_tokenizer.__class__.__name__ if tokenizer_class_name not in SLOW_TO_FAST_CONVERTERS: raise ValueError( f"An instance of tokenizer class {tokenizer_class_name} cannot be converted in a Fast tokenizer instance." " No converter was found. Currently available slow->fast convertors:" f" {list(SLOW_TO_FAST_CONVERTERS.keys())}" ) converter_class = SLOW_TO_FAST_CONVERTERS[tokenizer_class_name] return converter_class(transformer_tokenizer).converted()