nepp1d0 committed
Commit 3b725aa · Parent(s): 1b52988

add tokenizer

Files changed (3)
  1. special_tokens_map.json +1 -1
  2. tokenizer.json +13 -13
  3. tokenizer_config.json +1 -1
special_tokens_map.json CHANGED
@@ -1 +1 @@
- {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
+ {}
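
Since special_tokens_map.json is now empty, a tokenizer built from these files no longer maps [UNK], [SEP], [PAD], [CLS], or [MASK]. A minimal sketch of checking this with transformers, assuming the committed files sit in the working directory:

```python
# Sketch only: the file name comes from this commit; the local path is an assumption.
from transformers import PreTrainedTokenizerFast

tok = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json")
print(tok.unk_token, tok.sep_token)  # None None -- no special tokens are defined
```
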
tokenizer.json CHANGED
@@ -2,7 +2,7 @@
  "version": "1.0",
  "truncation": {
  "direction": "Right",
- "max_length": 1000,
+ "max_length": 512,
  "strategy": "LongestFirst",
  "stride": 0
  },
@@ -66,7 +66,7 @@
  "clean_text": true,
  "handle_chinese_chars": true,
  "strip_accents": null,
- "lowercase": true
+ "lowercase": false
  },
  "pre_tokenizer": {
  "type": "WhitespaceSplit"
@@ -183,17 +183,17 @@
  "r": 56,
  "s": 57,
  "##\"": 58,
- "##N": 59,
- "##B": 60,
- "##[": 61,
- "##F": 62,
- "##O": 63,
- "##S": 64,
- "##E": 65,
- "##P": 66,
- "##]": 67,
- "##c": 68,
- "##C": 69
+ "##S": 59,
+ "##E": 60,
+ "##P": 61,
+ "##]": 62,
+ "##C": 63,
+ "##O": 64,
+ "##F": 65,
+ "##[": 66,
+ "##N": 67,
+ "##B": 68,
+ "##c": 69
  }
  }
  }
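
The tokenizer.json changes (truncation at 512 and lowercasing disabled) can be inspected directly with the tokenizers library; a sketch, assuming the file is available locally:

```python
from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer.json")
print(tok.truncation)  # expect max_length=512, direction 'right', strategy 'longest_first'
print(tok.normalizer)  # the BERT-style normalizer, now with lowercase disabled
```
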
tokenizer_config.json CHANGED
@@ -1 +1 @@
- {"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "tokenizer_class": "BertTokenizer"}
+ {"tokenizer_class": "PreTrainedTokenizerFast"}