Spaces:
Sleeping
Sleeping
File size: 700 Bytes
45311fe |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 |
# -*- coding: utf-8 -*-
# @Time : 2021/12/8 12:07 a.m.
# @Author : JianingWang
# @File : JiebaTokenizer
import jieba
from transformers import BertTokenizer
class JiebaTokenizer(BertTokenizer):
def __init__(
self, pre_tokenizer=lambda x: jieba.cut(x, HMM=False), *args, **kwargs
):
super().__init__(*args, **kwargs)
self.pre_tokenizer = pre_tokenizer
def _tokenize(self, text, *arg, **kwargs):
split_tokens = []
for text in self.pre_tokenizer(text):
if text in self.vocab:
split_tokens.append(text)
else:
split_tokens.extend(super()._tokenize(text))
return split_tokens
|