Update distilbert_japanese_tokenizer.py
As [discussed in the community](https://huggingface.co/line-corporation/line-distilbert-base-japanese/discussions/3), the current tokenizer code does not work with `transformers>=4.34` because of the [tokenizer refactoring](https://github.com/huggingface/transformers/pull/23909) introduced in that version.
With that refactoring, `PreTrainedTokenizer.__init__()` now accesses `get_vocab()`, so `self.subword_tokenizer_type` needs to be initialized before `super().__init__()` is called in `DistilBertJapaneseTokenizer`.
This issue is already fixed in `transformers` with [2da8853](https://github.com/huggingface/transformers/commit/2da8853775b61cde0894dee17c6c713aba711688). This PR basically follows that change.
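To illustrate the ordering constraint in isolation, here is a minimal standalone sketch with hypothetical `Base`/`Broken`/`Fixed` classes (not the actual `transformers` code): the base `__init__` calls `get_vocab()`, so the subclass must set the attributes `get_vocab()` depends on before calling `super().__init__()`.

```python
class Base:
    def __init__(self):
        # Stand-in for PreTrainedTokenizer.__init__ in transformers>=4.34:
        # it accesses the subclass's vocabulary during construction.
        self.vocab_size = len(self.get_vocab())

    def get_vocab(self):
        raise NotImplementedError


class Broken(Base):
    def __init__(self):
        super().__init__()                             # get_vocab() runs here ...
        self.subword_tokenizer_type = "sentencepiece"  # ... before this is set

    def get_vocab(self):
        if self.subword_tokenizer_type == "sentencepiece":  # AttributeError
            return {"<unk>": 0}
        return {}


class Fixed(Base):
    def __init__(self):
        self.subword_tokenizer_type = "sentencepiece"  # set required attribute first
        super().__init__()                             # now get_vocab() works

    def get_vocab(self):
        if self.subword_tokenizer_type == "sentencepiece":
            return {"<unk>": 0}
        return {}


Fixed()      # ok
# Broken()   # would raise AttributeError
```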
I confirmed that it works with [my repository](https://huggingface.co/liwii/line-distilbert-base-japanese-fork) forked from `line-corporation/line-distilbert-base-japanese`.
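For reference, a quick check along the following lines exercises the updated tokenizer code (assuming the Japanese tokenizer dependencies, e.g. `sentencepiece` and `fugashi`, are installed):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "liwii/line-distilbert-base-japanese-fork",
    trust_remote_code=True,  # the tokenizer class is defined inside the repo
)
print(tokenizer.tokenize("こんにちは、世界。"))
```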
- `distilbert_japanese_tokenizer.py` (+22 −22)
```diff
@@ -170,25 +170,6 @@ class DistilBertJapaneseTokenizer(PreTrainedTokenizer):
         jumanpp_kwargs=None,
         **kwargs
     ):
-        super().__init__(
-            spm_file=spm_file,
-            unk_token=unk_token,
-            sep_token=sep_token,
-            pad_token=pad_token,
-            cls_token=cls_token,
-            mask_token=mask_token,
-            do_lower_case=do_lower_case,
-            do_word_tokenize=do_word_tokenize,
-            do_subword_tokenize=do_subword_tokenize,
-            word_tokenizer_type=word_tokenizer_type,
-            subword_tokenizer_type=subword_tokenizer_type,
-            never_split=never_split,
-            mecab_kwargs=mecab_kwargs,
-            sudachi_kwargs=sudachi_kwargs,
-            jumanpp_kwargs=jumanpp_kwargs,
-            **kwargs,
-        )
-
         if subword_tokenizer_type == "sentencepiece":
             if not os.path.isfile(spm_file):
                 raise ValueError(
@@ -236,14 +217,33 @@ class DistilBertJapaneseTokenizer(PreTrainedTokenizer):
         self.subword_tokenizer_type = subword_tokenizer_type
         if do_subword_tokenize:
             if subword_tokenizer_type == "wordpiece":
-                self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
+                self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
             elif subword_tokenizer_type == "character":
-                self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab, unk_token=self.unk_token)
+                self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab, unk_token=str(unk_token))
             elif subword_tokenizer_type == "sentencepiece":
-                self.subword_tokenizer = SentencepieceTokenizer(vocab=self.spm_file, unk_token=self.unk_token)
+                self.subword_tokenizer = SentencepieceTokenizer(vocab=self.spm_file, unk_token=str(unk_token))
             else:
                 raise ValueError(f"Invalid subword_tokenizer_type '{subword_tokenizer_type}' is specified.")
 
+        super().__init__(
+            spm_file=spm_file,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            do_lower_case=do_lower_case,
+            do_word_tokenize=do_word_tokenize,
+            do_subword_tokenize=do_subword_tokenize,
+            word_tokenizer_type=word_tokenizer_type,
+            subword_tokenizer_type=subword_tokenizer_type,
+            never_split=never_split,
+            mecab_kwargs=mecab_kwargs,
+            sudachi_kwargs=sudachi_kwargs,
+            jumanpp_kwargs=jumanpp_kwargs,
+            **kwargs,
+        )
+
     @property
     def do_lower_case(self):
         return self.lower_case
```