Upload bert_tokenizer.py
bert_tokenizer.py  CHANGED  (+17 -7)
@@ -1,5 +1,6 @@
 import json
 import os
+from pathlib import Path
 from typing import List

 import requests
@@ -14,6 +15,8 @@ except:

 from transformers import BertTokenizerFast

+cache_path = Path(os.path.abspath(__file__)).parent
+
 SOURCE_FILES_URL = {
     "vocab.txt": "https://huggingface.co/iioSnail/chinesebert-base/blob/main/vocab.txt",
     "pinyin_map.json": "https://huggingface.co/iioSnail/chinesebert-base/blob/main/config/pinyin_map.json",
@@ -22,13 +25,21 @@ SOURCE_FILES_URL = {
 }


-def download_file(url, filename):
+def download_file(url, filename: str):
     if os.path.exists(filename):
         return

-
-
-
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36"
+    }
+    try:
+        res = requests.get(url, headers=headers)
+        res.raise_for_status()
+        with open(filename, 'wb') as file:
+            file.write(res.content)
+    except:
+        raise RuntimeError("Error download the file of '" + filename +
+                           "'. You can download the model file into the current directory and rerun it.")


 class ChineseBertTokenizer(BertTokenizerFast):
@@ -36,9 +47,8 @@ class ChineseBertTokenizer(BertTokenizerFast):
     def __init__(self, **kwargs):
         super(ChineseBertTokenizer, self).__init__(**kwargs)

-
-
-        config_path = os.path.join(bert_path, 'config')
+        vocab_file = os.path.join(cache_path, 'vocab.txt')
+        config_path = os.path.join(cache_path, 'config')
         self.max_length = 512

         download_file(SOURCE_FILES_URL["vocab.txt"], vocab_file)
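For context, here is a minimal sketch of how the download_file helper added in this commit could be exercised on its own. Two caveats: the import path bert_tokenizer is hypothetical (it assumes the file sits on the import path), and the resolve/main URL is my assumption, not part of the commit — Hugging Face blob/main URLs (the form stored in SOURCE_FILES_URL) point at the HTML file viewer, while raw file bytes are served under resolve/main.

# Hypothetical standalone use of download_file from this commit's bert_tokenizer.py.
# The resolve/main URL is an assumption: it returns raw file bytes, whereas the
# blob/main URLs kept in SOURCE_FILES_URL return the HTML viewer page.
from bert_tokenizer import download_file

download_file(
    "https://huggingface.co/iioSnail/chinesebert-base/resolve/main/vocab.txt",
    "vocab.txt",
)
# Calling it again is a no-op: download_file returns early when the file already exists,
# which is also why ChineseBertTokenizer.__init__ can call it unconditionally.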