|
from transformers import BertTokenizer, WordpieceTokenizer |
|
from unicodedata import normalize |
|
|
|
def whitespace_tokenize(text):
    """Split *text* into tokens on runs of whitespace.

    Leading and trailing whitespace is removed first. If nothing remains
    after stripping, an empty list is returned.
    """
    stripped = text.strip()
    return stripped.split() if stripped else []
|
|
|
|
|
class KorWordpieceTokenizer(WordpieceTokenizer):
    """WordPiece tokenizer that NFC-normalizes each word before matching.

    Pieces are emitted exactly as they appear in the vocabulary; no prefix
    is added to pieces that continue a word.

    NOTE(review): ``self.vocab``, ``self.unk_token`` and
    ``self.max_input_chars_per_word`` are presumably set by the base class
    constructor -- confirm against the installed ``transformers`` version.
    """

    def tokenize(self, text):
        """Tokenize *text* into vocabulary pieces.

        The text is split on whitespace. Each word is NFC-normalized and then
        segmented greedily, always taking the longest vocabulary entry that
        starts at the current position. A word is replaced by a single
        ``self.unk_token`` when it has more characters than
        ``self.max_input_chars_per_word`` or when some position has no
        matching vocabulary entry.

        Returns:
            A list of piece strings.
        """
        pieces = []
        for word in whitespace_tokenize(text):
            symbols = list(normalize('NFC', word))
            total = len(symbols)

            # Overlong words are not segmented at all.
            if total > self.max_input_chars_per_word:
                pieces.append(self.unk_token)
                continue

            word_pieces = []
            fully_matched = True
            begin = 0
            while begin < total:
                # Try the longest candidate first, shrinking from the right.
                for stop in range(total, begin, -1):
                    candidate = "".join(symbols[begin:stop])
                    if candidate in self.vocab:
                        word_pieces.append(candidate)
                        begin = stop
                        break
                else:
                    # Nothing in the vocabulary starts at ``begin``.
                    fully_matched = False
                    break

            if fully_matched:
                pieces.extend(word_pieces)
            else:
                pieces.append(self.unk_token)
        return pieces
|
|
|
|
|
|
|
class KorBertTokenizer(BertTokenizer):
    """BERT tokenizer that marks word ends with ``_`` before WordPiece.

    The stock WordPiece tokenizer is replaced by ``KorWordpieceTokenizer``,
    and every basic token gets a trailing ``'_'`` appended before it is
    segmented.
    """

    def __init__(self,
                 vocab_file,
                 do_lower_case=True,
                 do_basic_tokenize=True,
                 never_split=None,
                 unk_token="[UNK]",
                 sep_token="[SEP]",
                 pad_token="[PAD]",
                 cls_token="[CLS]",
                 mask_token="[MASK]",
                 tokenize_chinese_chars=True,
                 strip_accents=None,
                 **kwargs):
        """Build the tokenizer and install the Korean WordPiece tokenizer.

        All named arguments are forwarded unchanged to
        ``BertTokenizer.__init__``; see its documentation for their meaning.
        """
        # Forward the caller's values. The constructor previously passed
        # hard-coded literals here, so every option was silently ignored.
        super().__init__(vocab_file,
                         do_lower_case=do_lower_case,
                         do_basic_tokenize=do_basic_tokenize,
                         never_split=never_split,
                         unk_token=unk_token,
                         sep_token=sep_token,
                         pad_token=pad_token,
                         cls_token=cls_token,
                         mask_token=mask_token,
                         tokenize_chinese_chars=tokenize_chinese_chars,
                         strip_accents=strip_accents,
                         **kwargs)
        self.wordpiece_tokenizer = KorWordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)

    def _tokenize(self, text):
        """Tokenize *text* into a list of vocabulary pieces.

        With basic tokenization enabled, each basic token found in
        ``self.basic_tokenizer.never_split`` is kept verbatim. Every other
        token gets ``'_'`` appended and is then segmented by the WordPiece
        tokenizer. With basic tokenization disabled, *text* goes straight to
        the WordPiece tokenizer and no ``'_'`` suffix is added.
        """
        split_tokens = []
        if self.do_basic_tokenize:
            for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
                # Test membership on the raw token: once '_' is appended the
                # token can no longer match a never_split entry.
                if token in self.basic_tokenizer.never_split:
                    split_tokens.append(token)
                else:
                    split_tokens += self.wordpiece_tokenizer.tokenize(token + '_')
        else:
            split_tokens = self.wordpiece_tokenizer.tokenize(text)
        return split_tokens
|
|