from transformers import DebertaV2TokenizerFast
from transformers.models.bert_japanese.tokenization_bert_japanese import JumanppTokenizer

class JumanppPreTokenizer(JumanppTokenizer):
    def jumanpp_split(self,i,normalized_string):
        # Segment the text with Juman++, then map each morpheme back to a span
        # of the original NormalizedString so that character offsets are preserved.
        import textspan
        t=str(normalized_string)
        k=self.tokenize(t)
        return [normalized_string[s:e] for c in textspan.get_original_spans(k,t) for s,e in c]
    def pre_tokenize(self,pretok):
        pretok.split(self.jumanpp_split)

class JumanppDebertaV2TokenizerFast(DebertaV2TokenizerFast):
    def __init__(self,**kwargs):
        from tokenizers.pre_tokenizers import PreTokenizer,Metaspace,Sequence
        super().__init__(**kwargs)
        # Run Juman++ word segmentation before the usual Metaspace pre-tokenizer.
        self._tokenizer.pre_tokenizer=Sequence([PreTokenizer.custom(JumanppPreTokenizer()),Metaspace()])
    def save_pretrained(self,save_directory,**kwargs):
        import os
        import shutil
        from tokenizers.pre_tokenizers import PreTokenizer,Metaspace,Sequence
        # Register this class in auto_map so that AutoTokenizer can load it
        # from the copied tokenizer.py with trust_remote_code=True.
        self._auto_map={"AutoTokenizer":[None,"tokenizer.JumanppDebertaV2TokenizerFast"]}
        # A custom (Python-side) pre-tokenizer cannot be serialized into
        # tokenizer.json, so fall back to plain Metaspace while saving.
        self._tokenizer.pre_tokenizer=Metaspace()
        super().save_pretrained(save_directory,**kwargs)
        # Restore the Juman++ pre-tokenizer for continued use of this instance.
        self._tokenizer.pre_tokenizer=Sequence([PreTokenizer.custom(JumanppPreTokenizer()),Metaspace()])
        # Ship this module alongside the saved files so the auto_map entry resolves.
        shutil.copy(os.path.abspath(__file__),os.path.join(save_directory,"tokenizer.py"))
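
# --- Usage sketch (addition, not part of the original listing) ---------------
# A minimal, hedged example of how the class above might be used.  The paths
# "spm.model" and "deberta-v2-jumanpp" are placeholder names, not files that
# ship with this module; Juman++, textspan and sentencepiece must be installed.
if __name__=="__main__":
    from transformers import AutoTokenizer
    tkz=JumanppDebertaV2TokenizerFast(vocab_file="spm.model")  # hypothetical SentencePiece model
    tkz.save_pretrained("deberta-v2-jumanpp")  # also copies this file as tokenizer.py
    # Reloading resolves the auto_map entry and re-attaches the Juman++ pre-tokenizer.
    tkz=AutoTokenizer.from_pretrained("deberta-v2-jumanpp",trust_remote_code=True)
    print(tkz.tokenize("外国人参政権"))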