# Copyright (c) OpenMMLab. All rights reserved.
import collections
import os

from mmengine.fileio import list_from_file
from transformers import (AutoTokenizer, BartTokenizer, BasicTokenizer,
                          BertTokenizer, BertTokenizerFast, LlamaTokenizer,
                          WordpieceTokenizer)

from mmpretrain.registry import TOKENIZER
from .huggingface import register_hf_tokenizer

register_hf_tokenizer(AutoTokenizer)
register_hf_tokenizer(LlamaTokenizer)


@register_hf_tokenizer()
class BlipTokenizer(BertTokenizerFast):
    """BlipTokenizer inherits BertTokenizerFast (fast, Rust-based)."""

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path,
        *init_inputs,
        **kwargs,
    ):
        os.environ['TOKENIZERS_PARALLELISM'] = 'true'
        tokenizer = super().from_pretrained(
            pretrained_model_name_or_path,
            *init_inputs,
            **kwargs,
        )
        # Register BLIP's extra special tokens: '[DEC]' as the BOS token and
        # '[ENC]' as an additional special token.
        tokenizer.add_special_tokens({'bos_token': '[DEC]'})
        tokenizer.add_special_tokens({'additional_special_tokens': ['[ENC]']})
        return tokenizer


@register_hf_tokenizer()
class Blip2Tokenizer(BertTokenizer):
    """BertTokenizer for BLIP-2, with '[DEC]' registered as the BOS token."""

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path,
        *init_inputs,
        **kwargs,
    ):
        tokenizer = super().from_pretrained(
            pretrained_model_name_or_path,
            *init_inputs,
            **kwargs,
        )
        tokenizer.add_special_tokens({'bos_token': '[DEC]'})
        return tokenizer


@register_hf_tokenizer()
class OFATokenizer(BartTokenizer):
    """BartTokenizer for OFA, extended with extra ``<code_*>`` and ``<bin_*>``
    tokens appended after the original vocabulary."""

    vocab_files_names = {
        'vocab_file': 'vocab.json',
        'merges_file': 'merges.txt'
    }

    pretrained_vocab_files_map = {
        'vocab_file': {
            'OFA-Sys/OFA-tiny':
            'https://huggingface.co/OFA-Sys/OFA-tiny/blob/main/vocab.json',
            'OFA-Sys/OFA-medium':
            'https://huggingface.co/OFA-Sys/OFA-medium/blob/main/vocab.json',
            'OFA-Sys/OFA-base':
            'https://huggingface.co/OFA-Sys/OFA-base/blob/main/vocab.json',
            'OFA-Sys/OFA-large':
            'https://huggingface.co/OFA-Sys/OFA-large/blob/main/vocab.json',
        },
        'merges_file': {
            'OFA-Sys/OFA-tiny':
            'https://huggingface.co/OFA-Sys/OFA-tiny/blob/main/merges.txt',
            'OFA-Sys/OFA-medium':
            'https://huggingface.co/OFA-Sys/OFA-medium/blob/main/merges.txt',
            'OFA-Sys/OFA-base':
            'https://huggingface.co/OFA-Sys/OFA-base/blob/main/merges.txt',
            'OFA-Sys/OFA-large':
            'https://huggingface.co/OFA-Sys/OFA-large/blob/main/merges.txt',
        },
    }

    max_model_input_sizes = {
        'OFA-Sys/OFA-tiny': 1024,
        'OFA-Sys/OFA-medium': 1024,
        'OFA-Sys/OFA-base': 1024,
        'OFA-Sys/OFA-large': 1024,
    }

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path,
        *init_inputs,
        **kwargs,
    ):
        num_bins = kwargs.pop('num_bins', 1000)
        tokenizer = super().from_pretrained(
            pretrained_model_name_or_path,
            *init_inputs,
            **kwargs,
        )
        # Append 8192 ``<code_*>`` tokens followed by ``num_bins`` ``<bin_*>``
        # tokens, and record the offset at which each group starts.
        length = len(tokenizer)
        tokenizer.add_tokens(['<code_{}>'.format(i) for i in range(8192)])
        tokenizer.code_offset = length
        tokenizer.add_tokens(['<bin_{}>'.format(i) for i in range(num_bins)])
        tokenizer.bin_offset = length + 8192
        tokenizer.num_bins = num_bins
        return tokenizer


@TOKENIZER.register_module()
class FullTokenizer(BertTokenizer):
    """Runs end-to-end tokenization."""

    def __init__(self, vocab_file, do_lower_case=True):
        self.vocab = self.load_vocab(vocab_file)
        self.inv_vocab = {v: k for k, v in self.vocab.items()}
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
        self.wordpiece_tokenizer = WordpieceTokenizer(
            vocab=self.vocab, unk_token='[UNK]', max_input_chars_per_word=200)

    def load_vocab(self, vocab_file):
        """Loads a vocabulary file into a dictionary."""
        vocab = collections.OrderedDict()
        index = 0
        vocab_list = list_from_file(vocab_file)
        for token in vocab_list:
            if not token:
                break
            token = token.strip()
            vocab[token] = index
            index += 1
        return vocab

    def tokenize(self, text):
        split_tokens = []
        for token in self.basic_tokenizer.tokenize(text):
            for sub_token in self.wordpiece_tokenizer.tokenize(token):
                split_tokens.append(sub_token)
        return split_tokens

    def convert_by_vocab(self, vocab, items):
        """Converts a sequence of [tokens|ids] using the vocab."""
        output = []
        for item in items:
            output.append(vocab[item])
        return output

    def convert_tokens_to_ids(self, tokens):
        return self.convert_by_vocab(self.vocab, tokens)

    def convert_ids_to_tokens(self, ids):
        return self.convert_by_vocab(self.inv_vocab, ids)

    @staticmethod
    def convert_tokens_to_string(tokens, clean_up_tokenization_spaces=True):
        """Converts a sequence of tokens (string) into a single string."""

        def clean_up_tokenization(out_string):
            """Clean up a list of simple English tokenization artifacts like
            spaces before punctuation and abbreviated forms."""
            out_string = (
                out_string.replace(' .', '.').replace(' ?', '?').replace(
                    ' !', '!').replace(' ,', ',').replace(" ' ", "'").replace(
                        " n't", "n't").replace(" 'm", "'m").replace(
                            " 's", "'s").replace(" 've", "'ve").replace(
                                " 're", "'re"))
            return out_string

        text = ' '.join(tokens).replace(' ##', '').strip()
        if clean_up_tokenization_spaces:
            clean_text = clean_up_tokenization(text)
            return clean_text
        else:
            return text

    def vocab_size(self):
        return len(self.vocab)
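
# Usage sketch of the classes above (illustrative only): 'bert-base-uncased'
# and 'path/to/vocab.txt' are assumed inputs, not values required by this
# module; the OFA checkpoint name comes from `pretrained_vocab_files_map`.
#
#   blip_tokenizer = BlipTokenizer.from_pretrained('bert-base-uncased')
#   ofa_tokenizer = OFATokenizer.from_pretrained('OFA-Sys/OFA-base',
#                                                num_bins=1000)
#   full_tokenizer = FullTokenizer('path/to/vocab.txt', do_lower_case=True)
#   ids = full_tokenizer.convert_tokens_to_ids(
#       full_tokenizer.tokenize('a photo of a cat'))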