iioSnail committed on
Commit
a30567c
·
1 Parent(s): 20bc2dd

Upload bert_tokenizer.py

Browse files
Files changed (1) hide show
  1. bert_tokenizer.py +16 -17
bert_tokenizer.py CHANGED
@@ -1,11 +1,13 @@
1
  import json
2
  import os
 
3
  from pathlib import Path
4
  from typing import List
5
 
6
- import requests
7
  import tokenizers
8
  import torch
 
 
9
  from pypinyin import pinyin, Style
10
 
11
  try:
@@ -25,20 +27,17 @@ SOURCE_FILES_URL = {
25
  }
26
 
27
 
28
- def download_file(url, filename: str):
29
- if os.path.exists(filename):
30
  return
31
 
32
- headers = {
33
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36"
34
- }
35
- try:
36
- res = requests.get(url, headers=headers)
37
- res.raise_for_status()
38
- with open(filename, 'wb') as file:
39
- file.write(res.content)
40
- except:
41
- raise RuntimeError("Error download file from '" + url)
42
 
43
 
44
  class ChineseBertTokenizer(BertTokenizerFast):
@@ -50,21 +49,21 @@ class ChineseBertTokenizer(BertTokenizerFast):
50
  config_path = os.path.join(cache_path, 'config')
51
  self.max_length = 512
52
 
53
- download_file(SOURCE_FILES_URL["vocab.txt"], vocab_file)
54
  self.tokenizer = BertWordPieceTokenizer(vocab_file)
55
 
56
  # load pinyin map dict
57
- download_file(SOURCE_FILES_URL["pinyin_map.json"], os.path.join(config_path, 'pinyin_map.json'))
58
  with open(os.path.join(config_path, 'pinyin_map.json'), encoding='utf8') as fin:
59
  self.pinyin_dict = json.load(fin)
60
 
61
  # load char id map tensor
62
- download_file(SOURCE_FILES_URL["id2pinyin.json"], os.path.join(config_path, 'id2pinyin.json'))
63
  with open(os.path.join(config_path, 'id2pinyin.json'), encoding='utf8') as fin:
64
  self.id2pinyin = json.load(fin)
65
 
66
  # load pinyin map tensor
67
- download_file(SOURCE_FILES_URL["pinyin2tensor.json"], os.path.join(config_path, 'pinyin2tensor.json'))
68
  with open(os.path.join(config_path, 'pinyin2tensor.json'), encoding='utf8') as fin:
69
  self.pinyin2tensor = json.load(fin)
70
 
 
1
  import json
2
  import os
3
+ import time
4
  from pathlib import Path
5
  from typing import List
6
 
 
7
  import tokenizers
8
  import torch
9
+ from huggingface_hub import hf_hub_download
10
+ from huggingface_hub.file_download import http_user_agent
11
  from pypinyin import pinyin, Style
12
 
13
  try:
 
27
  }
28
 
29
 
30
def download_file(filename: str):
    """Ensure *filename* from the ``iioSnail/chinesebert-base`` Hub repo is
    available locally, downloading it if it is not already cached.

    NOTE(review): relies on a module-level ``cache_path`` (a ``pathlib.Path``)
    defined elsewhere in this file — confirm it is initialised before the
    first call.
    """
    target = cache_path / filename
    if os.path.exists(target):
        # Already present locally — skip the network round trip.
        return

    # NOTE(review): assumes hf_hub_download with cache_dir=cache_path makes
    # the file reachable at cache_path / filename — verify against the
    # installed huggingface_hub version's cache layout.
    hf_hub_download(
        "iioSnail/chinesebert-base",
        filename,
        cache_dir=cache_path,
        user_agent=http_user_agent(None),
    )
    # Short pause between consecutive downloads — presumably to avoid
    # hammering the Hub / rate limiting; TODO confirm intent.
    time.sleep(0.2)
 
 
 
41
 
42
 
43
  class ChineseBertTokenizer(BertTokenizerFast):
 
49
  config_path = os.path.join(cache_path, 'config')
50
  self.max_length = 512
51
 
52
+ download_file('vocab.txt')
53
  self.tokenizer = BertWordPieceTokenizer(vocab_file)
54
 
55
  # load pinyin map dict
56
+ download_file('config/pinyin_map.json')
57
  with open(os.path.join(config_path, 'pinyin_map.json'), encoding='utf8') as fin:
58
  self.pinyin_dict = json.load(fin)
59
 
60
  # load char id map tensor
61
+ download_file('config/id2pinyin.json')
62
  with open(os.path.join(config_path, 'id2pinyin.json'), encoding='utf8') as fin:
63
  self.id2pinyin = json.load(fin)
64
 
65
  # load pinyin map tensor
66
+ download_file('config/pinyin2tensor.json')
67
  with open(os.path.join(config_path, 'pinyin2tensor.json'), encoding='utf8') as fin:
68
  self.pinyin2tensor = json.load(fin)
69