Upload bert_tokenizer.py
bert_tokenizer.py CHANGED (+16 -17)
```diff
@@ -1,11 +1,13 @@
 import json
 import os
+import time
 from pathlib import Path
 from typing import List
 
-import requests
 import tokenizers
 import torch
+from huggingface_hub import hf_hub_download
+from huggingface_hub.file_download import http_user_agent
 from pypinyin import pinyin, Style
 
 try:
@@ -25,20 +27,17 @@ SOURCE_FILES_URL = {
 }
 
 
-def download_file(
-    if os.path.exists(filename):
+def download_file(filename: str):
+    if os.path.exists(cache_path / filename):
         return
 
-
-    "
-
-
-
-
-        file.write(res.content)
-    except:
-        raise RuntimeError("Error download file from '" + url)
+    hf_hub_download(
+        "iioSnail/chinesebert-base",
+        filename,
+        cache_dir=cache_path,
+        user_agent=http_user_agent(None),
+    )
+    time.sleep(0.2)
 
 
 class ChineseBertTokenizer(BertTokenizerFast):
@@ -50,21 +49,21 @@ class ChineseBertTokenizer(BertTokenizerFast):
         config_path = os.path.join(cache_path, 'config')
         self.max_length = 512
 
-        download_file(
+        download_file('vocab.txt')
         self.tokenizer = BertWordPieceTokenizer(vocab_file)
 
         # load pinyin map dict
-        download_file(
+        download_file('config/pinyin_map.json')
         with open(os.path.join(config_path, 'pinyin_map.json'), encoding='utf8') as fin:
             self.pinyin_dict = json.load(fin)
 
         # load char id map tensor
-        download_file(
+        download_file('config/id2pinyin.json')
         with open(os.path.join(config_path, 'id2pinyin.json'), encoding='utf8') as fin:
             self.id2pinyin = json.load(fin)
 
         # load pinyin map tensor
-        download_file(
+        download_file('config/pinyin2tensor.json')
         with open(os.path.join(config_path, 'pinyin2tensor.json'), encoding='utf8') as fin:
             self.pinyin2tensor = json.load(fin)
 
```
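The rewritten helper replaces the manual `requests` download with `huggingface_hub`. For reference, here is a minimal, self-contained sketch of the same call outside the class: the repo id and filename come from the diff above, while the `cache_path` value is an assumption for illustration (its definition sits outside the hunks shown).

```python
# Minimal sketch of the new download path. cache_path is an assumed
# local Path; only the repo id and filename are taken from the diff.
from pathlib import Path

from huggingface_hub import hf_hub_download

cache_path = Path.home() / ".cache" / "chinesebert"  # assumed value

# Same call shape as download_file() above: fetch one file from the
# iioSnail/chinesebert-base repo into the local cache directory.
local_file = hf_hub_download(
    "iioSnail/chinesebert-base",  # repo_id on the Hugging Face Hub
    "config/pinyin_map.json",     # path of the file inside the repo
    cache_dir=cache_path,
)
print(local_file)  # absolute path of the cached copy
```

`hf_hub_download` skips the network request when the file is already present in `cache_dir`, and the trailing `time.sleep(0.2)` in the helper appears to throttle the consecutive Hub requests issued during `__init__`.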