load from cache

sentencepiece_ja.py (+10 -4)
@@ -1,8 +1,8 @@
 import os
 from typing import Union, List, Optional, Tuple
 
-from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
-
+from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast, AutoTokenizer
+from transformers.utils.hub import cached_file
 class SentencePieceJA(PreTrainedTokenizer):
     def __init__(self,
                  model_path = "./tokenizer.json",
@@ -11,9 +11,15 @@ class SentencePieceJA(PreTrainedTokenizer):
                  eos = "<EOS>",
                  unk = "<UNK>",
                  mask = "<MASK>",
-                 **kwargs):
+                 **kwargs):
         from tokenizers import Tokenizer
-
+        try:
+            self._tokenizer = Tokenizer.from_file(model_path)
+        except Exception as e:
+            print('exception: ', e)
+            print('load from cache...')
+            model_path = cached_file('if001/sentencepiece_ja', 'tokenizer.json')
+            self._tokenizer = Tokenizer.from_file(model_path)
         super().__init__(**kwargs)
         self.add_special_tokens({
             'pad_token': pad,
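In short, the commit wraps the local load in a try/except: if `./tokenizer.json` is missing or unreadable, `cached_file` fetches `tokenizer.json` from the `if001/sentencepiece_ja` repo on the Hugging Face Hub (reusing the local Hub cache on later runs) and the tokenizer is loaded from that copy instead. A minimal sketch of the same fallback pattern, pulled out of the class for illustration; the helper name `load_tokenizer` is hypothetical, not part of the commit:

    # Sketch of the fallback added by this commit (load_tokenizer is a
    # hypothetical name used only for this example).
    from tokenizers import Tokenizer
    from transformers.utils.hub import cached_file

    def load_tokenizer(model_path: str = "./tokenizer.json") -> Tokenizer:
        try:
            # First try the local file next to the script.
            return Tokenizer.from_file(model_path)
        except Exception as e:
            # Fall back to the copy hosted on the Hugging Face Hub;
            # cached_file downloads it once and serves it from the
            # local cache on subsequent calls.
            print('exception: ', e)
            print('load from cache...')
            model_path = cached_file('if001/sentencepiece_ja', 'tokenizer.json')
            return Tokenizer.from_file(model_path)

On a machine without a local tokenizer.json, this prints the exception and "load from cache..." once, then loads the Hub copy, which is exactly the behavior the commit message describes.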