import os
import shutil

from transformers import BertTokenizerFast
from transformers.models.bert_japanese.tokenization_bert_japanese import MecabTokenizer


class MecabPreTokenizer(MecabTokenizer):
    """Wraps MecabTokenizer so it can act as a custom `tokenizers` pre-tokenizer."""

    def mecab_split(self, i, normalized_string):
        """Split a NormalizedString at the word boundaries MeCab finds.

        `i` is the split index supplied by `pretok.split()`; it is unused here.
        """
        text = str(normalized_string)
        end = 0
        spans = []
        for token in self.tokenize(text):
            start = text.find(token, end)
            if start < 0:
                # Token not found in the surface string (e.g. MeCab altered it);
                # record an empty span so it is dropped below.
                spans.append((0, 0))
            else:
                end = start + len(token)
                spans.append((start, end))
        # Slicing a NormalizedString preserves alignment with the original text.
        return [normalized_string[s:e] for s, e in spans if e > 0]

    def pre_tokenize(self, pretok):
        pretok.split(self.mecab_split)


class BertMecabTokenizerFast(BertTokenizerFast):
    def __init__(self, vocab_file, do_lower_case=False, tokenize_chinese_chars=False, **kwargs):
        from tokenizers import normalizers, pre_tokenizers

        super().__init__(
            vocab_file=vocab_file,
            do_lower_case=do_lower_case,
            tokenize_chinese_chars=tokenize_chinese_chars,
            **kwargs,
        )
        mecab_kwargs = kwargs.get("mecab_kwargs", {"mecab_dic": "ipadic"})
        self._tokenizer.normalizer = normalizers.Sequence(
            [normalizers.Nmt(), normalizers.NFKC()]
        )
        # MeCab splits the text into words first; BertPreTokenizer then handles
        # punctuation and whitespace within each word.
        self.custom_pre_tokenizer = pre_tokenizers.Sequence(
            [
                pre_tokenizers.PreTokenizer.custom(MecabPreTokenizer(**mecab_kwargs)),
                pre_tokenizers.BertPreTokenizer(),
            ]
        )
        self._tokenizer.pre_tokenizer = self.custom_pre_tokenizer

    def save_pretrained(self, save_directory, **kwargs):
        from tokenizers.pre_tokenizers import Metaspace

        # Custom pre-tokenizers cannot be serialized into tokenizer.json, so a
        # stock pre-tokenizer is swapped in while saving, then the MeCab one is
        # restored afterwards.
        self._auto_map = {"AutoTokenizer": [None, "mecab.BertMecabTokenizerFast"]}
        self._tokenizer.pre_tokenizer = Metaspace()
        super().save_pretrained(save_directory, **kwargs)
        self._tokenizer.pre_tokenizer = self.custom_pre_tokenizer
        # Ship this module alongside the saved tokenizer so AutoTokenizer can
        # import "mecab.BertMecabTokenizerFast" via the auto_map above.
        shutil.copy(os.path.abspath(__file__), os.path.join(save_directory, "mecab.py"))
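

# --- Usage sketch (assumptions: a WordPiece "vocab.txt" exists in the working
# directory and fugashi/ipadic are installed; the paths here are hypothetical,
# not part of the original module) ---
if __name__ == "__main__":
    tokenizer = BertMecabTokenizerFast("vocab.txt")
    # MeCab word segmentation runs before WordPiece, so Japanese text is split
    # at morpheme boundaries rather than per character.
    print(tokenizer.tokenize("これはテストです。"))
    # save_pretrained() also copies this file into the target directory as
    # "mecab.py"; reloading then needs trust_remote_code=True, e.g.
    # AutoTokenizer.from_pretrained("bert-mecab-tokenizer", trust_remote_code=True)
    tokenizer.save_pretrained("bert-mecab-tokenizer")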