iioSnail committed
Commit af43a53 · 1 parent: f3a7c66

Upload bert_tokenizer.py

Files changed (1):
  1. bert_tokenizer.py (+5 -12)
bert_tokenizer.py CHANGED
@@ -19,13 +19,6 @@ from transformers import BertTokenizerFast
 
 cache_path = Path(os.path.abspath(__file__)).parent
 
-SOURCE_FILES_URL = {
-    "vocab.txt": "https://huggingface.co/iioSnail/chinesebert-base/resolve/main/vocab.txt",
-    "pinyin_map.json": "https://huggingface.co/iioSnail/chinesebert-base/resolve/main/config/pinyin_map.json",
-    "id2pinyin.json": "https://huggingface.co/iioSnail/chinesebert-base/resolve/main/config/id2pinyin.json",
-    "pinyin2tensor.json": "https://huggingface.co/iioSnail/chinesebert-base/resolve/main/config/id2pinyin.json",
-}
-
 
 def download_file(filename: str):
     if os.path.exists(cache_path / filename):
@@ -45,8 +38,8 @@ class ChineseBertTokenizer(BertTokenizerFast):
     def __init__(self, **kwargs):
         super(ChineseBertTokenizer, self).__init__(**kwargs)
 
-        vocab_file = os.path.join(cache_path, 'vocab.txt')
-        config_path = os.path.join(cache_path, 'config')
+        vocab_file = cache_path / 'vocab.txt'
+        config_path = cache_path / 'config'
         self.max_length = 512
 
         download_file('vocab.txt')
@@ -54,17 +47,17 @@ class ChineseBertTokenizer(BertTokenizerFast):
 
         # load pinyin map dict
         download_file('config/pinyin_map.json')
-        with open(os.path.join(config_path, 'pinyin_map.json'), encoding='utf8') as fin:
+        with open(config_path / 'pinyin_map.json', encoding='utf8') as fin:
             self.pinyin_dict = json.load(fin)
 
         # load char id map tensor
         download_file('config/id2pinyin.json')
-        with open(os.path.join(config_path, 'id2pinyin.json'), encoding='utf8') as fin:
+        with open(config_path / 'id2pinyin.json', encoding='utf8') as fin:
             self.id2pinyin = json.load(fin)
 
         # load pinyin map tensor
         download_file('config/pinyin2tensor.json')
-        with open(os.path.join(config_path, 'pinyin2tensor.json'), encoding='utf8') as fin:
+        with open(config_path / 'pinyin2tensor.json', encoding='utf8') as fin:
             self.pinyin2tensor = json.load(fin)
 
     def tokenize_sentence(self, sentence):
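The commit drops the unused SOURCE_FILES_URL table and switches path handling from os.path.join string concatenation to pathlib's / operator. Since cache_path is already a Path (Path(os.path.abspath(__file__)).parent), both spellings resolve to the same location; the pathlib form just yields a Path object instead of a str. A minimal sketch of the equivalence (illustrative only, not part of the commit):

    import os
    from pathlib import Path

    cache_path = Path(os.path.abspath(__file__)).parent

    # Both expressions point at the same file; os.path.join accepts
    # a path-like first argument and returns a str, while the / operator
    # returns a Path that composes cleanly with open(), exists(), etc.
    joined = os.path.join(cache_path, 'vocab.txt')   # str
    slashed = cache_path / 'vocab.txt'               # pathlib.Path

    assert str(slashed) == joined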
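The hunks only show the first two lines of download_file. Judging from the removed SOURCE_FILES_URL table, the helper fetches a missing file from the iioSnail/chinesebert-base repo and skips the download when a cached copy exists. (Note that the removed table pointed "pinyin2tensor.json" at the id2pinyin.json URL, an apparent copy-paste slip that deleting the table also retires.) A hypothetical reconstruction under those assumptions, since the real body is not shown in the diff:

    import os
    import urllib.request
    from pathlib import Path

    cache_path = Path(os.path.abspath(__file__)).parent

    # Hypothetical sketch: the actual helper's body is not visible here.
    def download_file(filename: str):
        target = cache_path / filename
        if os.path.exists(target):
            return  # cached copy already present, skip the download
        target.parent.mkdir(parents=True, exist_ok=True)
        url = f"https://huggingface.co/iioSnail/chinesebert-base/resolve/main/{filename}"
        urllib.request.urlretrieve(url, str(target))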
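Since bert_tokenizer.py ships inside the Hub repo as custom code, the usual way to pick up ChineseBertTokenizer would be AutoTokenizer with trust_remote_code=True, assuming the repo's tokenizer config registers the class:

    from transformers import AutoTokenizer

    # Custom code on the Hub requires an explicit opt-in.
    tokenizer = AutoTokenizer.from_pretrained(
        "iioSnail/chinesebert-base", trust_remote_code=True
    )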