|
import os |
|
from transformers import DebertaV2TokenizerFast |
|
from transformers.models.bert_japanese.tokenization_bert_japanese import MecabTokenizer |
|
try:
    from transformers.utils import cached_file
except ImportError:
    # Fallback for transformers versions whose `transformers.utils` does not
    # export `cached_file`: rebuild it from the `file_utils` helpers.
    from transformers.file_utils import cached_path, hf_bucket_url

    def cached_file(x, y):
        """Return a local path for file ``y`` belonging to model ``x``.

        Args:
            x: Either a local directory or an identifier that is handed to
                ``hf_bucket_url``.
            y: Name of the file to look up.

        Returns:
            ``os.path.join(x, y)`` when ``x`` is an existing directory,
            otherwise the result of ``cached_path(hf_bucket_url(x, y))``.
        """
        if os.path.isdir(x):
            return os.path.join(x, y)
        return cached_path(hf_bucket_url(x, y))
|
|
|
class MecabPreTokenizer(MecabTokenizer):
    """MecabTokenizer subclass usable as a custom pre-tokenizer object."""

    def mecab_split(self, i, normalized_string):
        """Cut ``normalized_string`` into the pieces found by ``self.tokenize``.

        Every token is searched for in the text, starting at the end of the
        previously located token. Tokens that cannot be located are skipped,
        and text lying between located tokens is not returned.

        Args:
            i: Not used by this method.
            normalized_string: Object that supports ``str()`` and slicing
                with ``[start:end]``.

        Returns:
            A list with one slice of ``normalized_string`` per located token.
        """
        text = str(normalized_string)
        spans = []
        cursor = 0
        for token in self.tokenize(text):
            start = text.find(token, cursor)
            if start < 0:
                # Token not present after the cursor: keep the cursor as is.
                continue
            cursor = start + len(token)
            if cursor > 0:
                spans.append((start, cursor))
        return [normalized_string[start:end] for start, end in spans]

    def pre_tokenize(self, pretok):
        """Split ``pretok`` in place, using :meth:`mecab_split` as callback."""
        pretok.split(self.mecab_split)
|
|
|
class JumanDebertaV2TokenizerFast(DebertaV2TokenizerFast):
    """DebertaV2TokenizerFast whose pre-tokenizer runs MeCab before Metaspace.

    MeCab is pointed at the system dictionary
    ``/var/lib/mecab/dic/juman-utf8`` (with ``/etc/mecabrc``) when both
    exist; otherwise the ``mecab-jumandic-utf8.zip`` archive resolved through
    ``cached_file(self.name_or_path, ...)`` is unpacked into a temporary
    directory and used instead.
    """

    def __init__(self, **kwargs):
        """Build the base tokenizer, then install the MeCab pre-tokenizer.

        Args:
            **kwargs: Forwarded unchanged to ``DebertaV2TokenizerFast``.
        """
        from tokenizers.pre_tokenizers import PreTokenizer, Metaspace, Sequence

        super().__init__(**kwargs)
        dic_path, rc_path = "/var/lib/mecab/dic/juman-utf8", "/etc/mecabrc"
        if not (os.path.isdir(dic_path) and os.path.isfile(rc_path)):
            import zipfile
            import tempfile

            # Held on the instance: a TemporaryDirectory removes its
            # directory once the object is finalized.
            self.dicdir = tempfile.TemporaryDirectory()
            dic_path = self.dicdir.name
            archive = cached_file(self.name_or_path, "mecab-jumandic-utf8.zip")
            with zipfile.ZipFile(archive) as z:
                z.extractall(dic_path)
            # Write a minimal mecabrc that points at the extracted dictionary.
            rc_path = os.path.join(dic_path, "mecabrc")
            with open(rc_path, "w", encoding="utf-8") as w:
                print("dicdir =", dic_path, file=w)
        mecab = MecabPreTokenizer(
            mecab_dic=None,
            mecab_option="-d " + dic_path + " -r " + rc_path,
        )
        self.custom_pre_tokenizer = Sequence([PreTokenizer.custom(mecab), Metaspace()])
        self._tokenizer.pre_tokenizer = self.custom_pre_tokenizer

    def save_pretrained(self, save_directory, **kwargs):
        """Save the tokenizer plus the files needed to reload this class.

        Besides what the parent class writes, this source file is copied to
        ``juman.py`` and the dictionary archive to
        ``mecab-jumandic-utf8.zip`` inside ``save_directory``.

        Args:
            save_directory: Destination directory.
            **kwargs: Forwarded unchanged to the parent ``save_pretrained``.
        """
        import shutil
        from tokenizers.pre_tokenizers import Metaspace

        def copy_into_save_directory(source, filename):
            # Saving back into the directory the tokenizer was loaded from
            # makes source and destination identical; the file is already in
            # place then, so that case is not an error.
            try:
                shutil.copy(source, os.path.join(save_directory, filename))
            except shutil.SameFileError:
                pass

        self._auto_map = {"AutoTokenizer": [None, "juman.JumanDebertaV2TokenizerFast"]}
        # Save with a plain Metaspace pre-tokenizer instead of the custom
        # pipeline (presumably because the custom Python pre-tokenizer cannot
        # be serialized -- TODO confirm).
        self._tokenizer.pre_tokenizer = Metaspace()
        try:
            super().save_pretrained(save_directory, **kwargs)
        finally:
            # Restore the MeCab pipeline even when saving fails, so the
            # tokenizer keeps working afterwards.
            self._tokenizer.pre_tokenizer = self.custom_pre_tokenizer
        copy_into_save_directory(os.path.abspath(__file__), "juman.py")
        copy_into_save_directory(
            cached_file(self.name_or_path, "mecab-jumandic-utf8.zip"),
            "mecab-jumandic-utf8.zip",
        )
|
|
|
|