Upload bert_tokenizer.py
bert_tokenizer.py CHANGED (+5, -12)
@@ -19,13 +19,6 @@ from transformers import BertTokenizerFast
 
 cache_path = Path(os.path.abspath(__file__)).parent
 
-SOURCE_FILES_URL = {
-    "vocab.txt": "https://huggingface.co/iioSnail/chinesebert-base/resolve/main/vocab.txt",
-    "pinyin_map.json": "https://huggingface.co/iioSnail/chinesebert-base/resolve/main/config/pinyin_map.json",
-    "id2pinyin.json": "https://huggingface.co/iioSnail/chinesebert-base/resolve/main/config/id2pinyin.json",
-    "pinyin2tensor.json": "https://huggingface.co/iioSnail/chinesebert-base/resolve/main/config/id2pinyin.json",
-}
-
 
 def download_file(filename: str):
     if os.path.exists(cache_path / filename):
@@ -45,8 +38,8 @@ class ChineseBertTokenizer(BertTokenizerFast):
     def __init__(self, **kwargs):
         super(ChineseBertTokenizer, self).__init__(**kwargs)
 
-        vocab_file =
-        config_path =
+        vocab_file = cache_path / 'vocab.txt'
+        config_path = cache_path / 'config'
         self.max_length = 512
 
         download_file('vocab.txt')
@@ -54,17 +47,17 @@ class ChineseBertTokenizer(BertTokenizerFast):
 
         # load pinyin map dict
         download_file('config/pinyin_map.json')
-        with open(
+        with open(config_path / 'pinyin_map.json', encoding='utf8') as fin:
             self.pinyin_dict = json.load(fin)
 
         # load char id map tensor
         download_file('config/id2pinyin.json')
-        with open(
+        with open(config_path / 'id2pinyin.json', encoding='utf8') as fin:
             self.id2pinyin = json.load(fin)
 
         # load pinyin map tensor
         download_file('config/pinyin2tensor.json')
-        with open(
+        with open(config_path / 'pinyin2tensor.json', encoding='utf8') as fin:
             self.pinyin2tensor = json.load(fin)
 
     def tokenize_sentence(self, sentence):
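Taken together, the commit drops the hardcoded SOURCE_FILES_URL download map and resolves vocab.txt and the three pinyin JSON files relative to the module itself. For reference, here is the resulting loading pattern in isolation. This is a minimal sketch, not the file's full code: load_json is a hypothetical helper introduced here for brevity, and it assumes download_file has already placed the files next to the module.

import json
import os
from pathlib import Path

# Resolve everything relative to this module, as the patched file does,
# so a local snapshot works without the removed SOURCE_FILES_URL map.
cache_path = Path(os.path.abspath(__file__)).parent
config_path = cache_path / 'config'


def load_json(path: Path) -> dict:
    # Hypothetical helper (not in the diff). encoding='utf8' mirrors the
    # patched open() calls: the pinyin maps contain non-ASCII content, and
    # the platform default encoding (e.g. on Windows) could fail on them.
    with open(path, encoding='utf8') as fin:
        return json.load(fin)


pinyin_dict = load_json(config_path / 'pinyin_map.json')
id2pinyin = load_json(config_path / 'id2pinyin.json')
pinyin2tensor = load_json(config_path / 'pinyin2tensor.json')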
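And a usage sketch for the class the diff touches. The repo id is taken from the removed URLs; that the Hub repo exposes ChineseBertTokenizer through trust_remote_code is an assumption, and the sample sentence is purely illustrative.

from transformers import AutoTokenizer

# Assumption: the Hub repo registers ChineseBertTokenizer as custom code,
# so trust_remote_code=True is needed to instantiate it.
tokenizer = AutoTokenizer.from_pretrained(
    "iioSnail/chinesebert-base",
    trust_remote_code=True,
)

# Plain BertTokenizerFast-style call; max_length matches the 512 set in __init__.
encoded = tokenizer("我喜欢猫", return_tensors="pt", max_length=512, truncation=True)
print(encoded["input_ids"].shape)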