Upload bert_tokenizer.py
bert_tokenizer.py CHANGED (+5, -12)
@@ -19,13 +19,6 @@ from transformers import BertTokenizerFast
 
 cache_path = Path(os.path.abspath(__file__)).parent
 
-SOURCE_FILES_URL = {
-    "vocab.txt": "https://huggingface.co/iioSnail/chinesebert-base/resolve/main/vocab.txt",
-    "pinyin_map.json": "https://huggingface.co/iioSnail/chinesebert-base/resolve/main/config/pinyin_map.json",
-    "id2pinyin.json": "https://huggingface.co/iioSnail/chinesebert-base/resolve/main/config/id2pinyin.json",
-    "pinyin2tensor.json": "https://huggingface.co/iioSnail/chinesebert-base/resolve/main/config/id2pinyin.json",
-}
-
 
 def download_file(filename: str):
     if os.path.exists(cache_path / filename):
@@ -45,8 +38,8 @@ class ChineseBertTokenizer(BertTokenizerFast):
     def __init__(self, **kwargs):
         super(ChineseBertTokenizer, self).__init__(**kwargs)
 
-        vocab_file =
-        config_path =
+        vocab_file = cache_path / 'vocab.txt'
+        config_path = cache_path / 'config'
         self.max_length = 512
 
         download_file('vocab.txt')
@@ -54,17 +47,17 @@ class ChineseBertTokenizer(BertTokenizerFast):
 
         # load pinyin map dict
         download_file('config/pinyin_map.json')
-        with open(
+        with open(config_path / 'pinyin_map.json', encoding='utf8') as fin:
             self.pinyin_dict = json.load(fin)
 
         # load char id map tensor
         download_file('config/id2pinyin.json')
-        with open(
+        with open(config_path / 'id2pinyin.json', encoding='utf8') as fin:
             self.id2pinyin = json.load(fin)
 
         # load pinyin map tensor
         download_file('config/pinyin2tensor.json')
-        with open(
+        with open(config_path / 'pinyin2tensor.json', encoding='utf8') as fin:
             self.pinyin2tensor = json.load(fin)
 
     def tokenize_sentence(self, sentence):
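Taken together, the commit drops the hardcoded SOURCE_FILES_URL download map and resolves vocab.txt and the three pinyin JSON files relative to the module itself. For reference, here is the resulting loading pattern in isolation. This is a minimal sketch, not the file's full code: load_json is a hypothetical helper introduced here for brevity, and it assumes download_file has already placed the files next to the module.

import json
import os
from pathlib import Path

# Resolve everything relative to this module, as the patched file does,
# so a local snapshot works without the removed SOURCE_FILES_URL map.
cache_path = Path(os.path.abspath(__file__)).parent
config_path = cache_path / 'config'


def load_json(path: Path) -> dict:
    # Hypothetical helper (not in the diff). encoding='utf8' mirrors the
    # patched open() calls: the pinyin maps contain non-ASCII content, and
    # the platform default encoding (e.g. on Windows) could fail on them.
    with open(path, encoding='utf8') as fin:
        return json.load(fin)


pinyin_dict = load_json(config_path / 'pinyin_map.json')
id2pinyin = load_json(config_path / 'id2pinyin.json')
pinyin2tensor = load_json(config_path / 'pinyin2tensor.json')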
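And a usage sketch for the class the diff touches. The repo id is taken from the removed URLs; that the Hub repo exposes ChineseBertTokenizer through trust_remote_code is an assumption, and the sample sentence is purely illustrative.

from transformers import AutoTokenizer

# Assumption: the Hub repo registers ChineseBertTokenizer as custom code,
# so trust_remote_code=True is needed to instantiate it.
tokenizer = AutoTokenizer.from_pretrained(
    "iioSnail/chinesebert-base",
    trust_remote_code=True,
)

# Plain BertTokenizerFast-style call; max_length matches the 512 set in __init__.
encoded = tokenizer("我喜欢猫", return_tensors="pt", max_length=512, truncation=True)
print(encoded["input_ids"].shape)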