orkidea committed on
Commit
efc040a
·
1 Parent(s): 58830a8

config tokenizer

Browse files
Files changed (1) hide show
  1. tokenizer_config.json +15 -0
tokenizer_config.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "bpe",
3
+ "tokenizer_type": "bpe",
4
+ "unk_token": "[UNK]",
5
+ "bos_token": "[CLS]",
6
+ "eos_token": "[SEP]",
7
+ "pad_token": "[PAD]",
8
+ "mask_token": "[MASK]",
9
+ "normalizer_type": "Sequence",
10
+ "normalizers": ["NFKC", "Lowercase"],
11
+ "pre_tokenizer_type": "Sequence",
12
+ "pre_tokenizers": ["Whitespace", "Digits", "Punctuation"],
13
+ "vocab_size": 3000
14
+ }
15
+