MagedSaeed committed on
Commit
e013219
·
verified ·
1 Parent(s): ae541cd

Upload tokenizer

Browse files
Files changed (1) hide show
  1. tokenizer_script.py +12 -0
tokenizer_script.py CHANGED
@@ -109,6 +109,18 @@ class CharacterTokenizer(PreTrainedTokenizer):
109
 
110
  def convert_tokens_to_string(self, tokens):
111
  return "".join(tokens)
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
  @classmethod
114
  def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
 
109
 
110
  def convert_tokens_to_string(self, tokens):
111
  return "".join(tokens)
112
+
113
+
114
+ @classmethod
115
+ def from_json(cls, vocab_file, **kwargs):
116
+ with open(vocab_file, 'r', encoding='utf-8') as f:
117
+ vocab = json.load(f)
118
+
119
+ return cls(vocab=vocab, **kwargs)
120
+
121
+ @classmethod
122
+ def from_vocab(cls, vocab, **kwargs):
123
+ return cls(vocab=vocab, **kwargs)
124
 
125
  @classmethod
126
  def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):