add tokenizer
- special_tokens_map.json +1 -1
- tokenizer.json +13 -13
- tokenizer_config.json +1 -1
special_tokens_map.json
@@ -1 +1 @@
-{
+{}
tokenizer.json
@@ -2,7 +2,7 @@
   "version": "1.0",
   "truncation": {
     "direction": "Right",
-    "max_length":
+    "max_length": 512,
     "strategy": "LongestFirst",
     "stride": 0
   },
@@ -66,7 +66,7 @@
     "clean_text": true,
     "handle_chinese_chars": true,
     "strip_accents": null,
-    "lowercase":
+    "lowercase": false
   },
   "pre_tokenizer": {
     "type": "WhitespaceSplit"
@@ -183,17 +183,17 @@
       "r": 56,
       "s": 57,
       "##\"": 58,
-      "##
-      "##
-      "##
-      "##
-      "##
-      "##
-      "##
-      "##
-      "##
-      "##
-      "##
+      "##S": 59,
+      "##E": 60,
+      "##P": 61,
+      "##]": 62,
+      "##C": 63,
+      "##O": 64,
+      "##F": 65,
+      "##[": 66,
+      "##N": 67,
+      "##B": 68,
+      "##c": 69
     }
   }
 }
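For reference, the committed tokenizer.json can be loaded directly with the tokenizers library. A minimal sketch, assuming the file has been downloaded locally as tokenizer.json; the sample input only reuses tokens visible in the diff context ("r", "s") and is purely illustrative:

from tokenizers import Tokenizer

# Load the serialized tokenizer committed in this diff.
tok = Tokenizer.from_file("tokenizer.json")

# The highest vocab id added in this diff is 69 ("##c").
print(tok.get_vocab_size())

# Per this diff: inputs are pre-tokenized with WhitespaceSplit, no longer
# lowercased, and truncated from the right at max_length 512.
enc = tok.encode("r s")
print(enc.tokens, enc.ids)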
tokenizer_config.json
@@ -1 +1 @@
-{"
+{"tokenizer_class": "PreTrainedTokenizerFast"}
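With tokenizer_config.json now declaring "tokenizer_class": "PreTrainedTokenizerFast", the files can also be consumed through transformers. A minimal sketch; the repository id is a placeholder, and the emptied special_tokens_map.json means no special tokens are registered on top of the vocabulary:

from transformers import AutoTokenizer, PreTrainedTokenizerFast

# Load from the Hub; "<repo_id>" is a placeholder for this repository.
tok = AutoTokenizer.from_pretrained("<repo_id>")
print(type(tok).__name__)       # PreTrainedTokenizerFast
print(tok.special_tokens_map)   # {} after this commit: no special tokens mapped

# Alternatively, wrap the raw tokenizer.json directly (local path assumed).
fast_tok = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json")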