georgiyozhegov committed on
Commit
f222399
·
verified ·
1 Parent(s): 85b48ee

Upload tokenizer

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +30 -0
  2. tokenizer.json +591 -0
  3. tokenizer_config.json +75 -0
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<bos>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<eos>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<pad>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
tokenizer.json ADDED
@@ -0,0 +1,591 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "<sos>",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "<eos>",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "<pad>",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 3,
35
+ "content": "<unk>",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ },
42
+ {
43
+ "id": 4,
44
+ "content": "find",
45
+ "single_word": false,
46
+ "lstrip": false,
47
+ "rstrip": false,
48
+ "normalized": false,
49
+ "special": true
50
+ },
51
+ {
52
+ "id": 5,
53
+ "content": "step",
54
+ "single_word": false,
55
+ "lstrip": false,
56
+ "rstrip": false,
57
+ "normalized": false,
58
+ "special": true
59
+ },
60
+ {
61
+ "id": 6,
62
+ "content": "answer",
63
+ "single_word": false,
64
+ "lstrip": false,
65
+ "rstrip": false,
66
+ "normalized": false,
67
+ "special": true
68
+ },
69
+ {
70
+ "id": 128,
71
+ "content": "<bos>",
72
+ "single_word": false,
73
+ "lstrip": false,
74
+ "rstrip": false,
75
+ "normalized": false,
76
+ "special": true
77
+ }
78
+ ],
79
+ "normalizer": null,
80
+ "pre_tokenizer": {
81
+ "type": "Metaspace",
82
+ "replacement": "▁",
83
+ "prepend_scheme": "always",
84
+ "split": true
85
+ },
86
+ "post_processor": null,
87
+ "decoder": {
88
+ "type": "Metaspace",
89
+ "replacement": "▁",
90
+ "prepend_scheme": "always",
91
+ "split": true
92
+ },
93
+ "model": {
94
+ "type": "BPE",
95
+ "dropout": null,
96
+ "unk_token": null,
97
+ "continuing_subword_prefix": null,
98
+ "end_of_word_suffix": null,
99
+ "fuse_unk": false,
100
+ "byte_fallback": false,
101
+ "ignore_merges": false,
102
+ "vocab": {
103
+ "<sos>": 0,
104
+ "<eos>": 1,
105
+ "<pad>": 2,
106
+ "<unk>": 3,
107
+ "find": 4,
108
+ "step": 5,
109
+ "answer": 6,
110
+ "\n": 7,
111
+ "(": 8,
112
+ ")": 9,
113
+ "*": 10,
114
+ "+": 11,
115
+ "-": 12,
116
+ ".": 13,
117
+ "/": 14,
118
+ "0": 15,
119
+ "1": 16,
120
+ "2": 17,
121
+ "3": 18,
122
+ "4": 19,
123
+ "5": 20,
124
+ "6": 21,
125
+ "7": 22,
126
+ "8": 23,
127
+ "9": 24,
128
+ "=": 25,
129
+ "a": 26,
130
+ "d": 27,
131
+ "e": 28,
132
+ "f": 29,
133
+ "i": 30,
134
+ "n": 31,
135
+ "o": 32,
136
+ "p": 33,
137
+ "r": 34,
138
+ "s": 35,
139
+ "t": 36,
140
+ "w": 37,
141
+ "▁": 38,
142
+ "▁-": 39,
143
+ "\ns": 40,
144
+ "ep": 41,
145
+ "tep": 42,
146
+ "▁=": 43,
147
+ "\nstep": 44,
148
+ "0.": 45,
149
+ "▁1": 46,
150
+ "er": 47,
151
+ "▁/": 48,
152
+ "▁*": 49,
153
+ "\na": 50,
154
+ "fi": 51,
155
+ "nd": 52,
156
+ "ns": 53,
157
+ "wer": 54,
158
+ "▁fi": 55,
159
+ "\nans": 56,
160
+ "▁find": 57,
161
+ "\nanswer": 58,
162
+ "▁+": 59,
163
+ "▁-1": 60,
164
+ "▁2": 61,
165
+ "▁0.": 62,
166
+ "▁3": 63,
167
+ "▁4": 64,
168
+ "▁5": 65,
169
+ "▁6": 66,
170
+ "▁7": 67,
171
+ "▁8": 68,
172
+ "▁9": 69,
173
+ "▁-2": 70,
174
+ "▁-0.": 71,
175
+ "▁-3": 72,
176
+ "▁-4": 73,
177
+ "▁-5": 74,
178
+ "▁-6": 75,
179
+ "▁-7": 76,
180
+ ".5": 77,
181
+ ".2": 78,
182
+ ".1": 79,
183
+ "▁-8": 80,
184
+ "▁-9": 81,
185
+ ".3": 82,
186
+ ".6": 83,
187
+ ".8": 84,
188
+ ".4": 85,
189
+ ".7": 86,
190
+ ".9": 87,
191
+ "33": 88,
192
+ "66": 89,
193
+ "5\nstep": 90,
194
+ "0\nstep": 91,
195
+ "2\nstep": 92,
196
+ "11": 93,
197
+ "4\nstep": 94,
198
+ "8\nstep": 95,
199
+ "6\nstep": 96,
200
+ "▁10": 97,
201
+ "25": 98,
202
+ "28": 99,
203
+ "14": 100,
204
+ "7\nstep": 101,
205
+ "99": 102,
206
+ "75": 103,
207
+ "3\nstep": 104,
208
+ "1\nstep": 105,
209
+ "▁(": 106,
210
+ "18": 107,
211
+ "29": 108,
212
+ "15": 109,
213
+ "44": 110,
214
+ "16": 111,
215
+ "19": 112,
216
+ "23": 113,
217
+ "78": 114,
218
+ "76": 115,
219
+ "13": 116,
220
+ "26": 117,
221
+ "24": 118,
222
+ "08": 119,
223
+ "79": 120,
224
+ "05": 121,
225
+ "09": 122,
226
+ "04": 123,
227
+ "74": 124,
228
+ "73": 125,
229
+ "06": 126,
230
+ "88": 127
231
+ },
232
+ "merges": [
233
+ [
234
+ "▁",
235
+ "-"
236
+ ],
237
+ [
238
+ "\n",
239
+ "s"
240
+ ],
241
+ [
242
+ "e",
243
+ "p"
244
+ ],
245
+ [
246
+ "t",
247
+ "ep"
248
+ ],
249
+ [
250
+ "▁",
251
+ "="
252
+ ],
253
+ [
254
+ "\ns",
255
+ "tep"
256
+ ],
257
+ [
258
+ "0",
259
+ "."
260
+ ],
261
+ [
262
+ "▁",
263
+ "1"
264
+ ],
265
+ [
266
+ "e",
267
+ "r"
268
+ ],
269
+ [
270
+ "▁",
271
+ "/"
272
+ ],
273
+ [
274
+ "▁",
275
+ "*"
276
+ ],
277
+ [
278
+ "\n",
279
+ "a"
280
+ ],
281
+ [
282
+ "f",
283
+ "i"
284
+ ],
285
+ [
286
+ "n",
287
+ "d"
288
+ ],
289
+ [
290
+ "n",
291
+ "s"
292
+ ],
293
+ [
294
+ "w",
295
+ "er"
296
+ ],
297
+ [
298
+ "▁",
299
+ "fi"
300
+ ],
301
+ [
302
+ "\na",
303
+ "ns"
304
+ ],
305
+ [
306
+ "▁fi",
307
+ "nd"
308
+ ],
309
+ [
310
+ "\nans",
311
+ "wer"
312
+ ],
313
+ [
314
+ "▁",
315
+ "+"
316
+ ],
317
+ [
318
+ "▁-",
319
+ "1"
320
+ ],
321
+ [
322
+ "▁",
323
+ "2"
324
+ ],
325
+ [
326
+ "▁",
327
+ "0."
328
+ ],
329
+ [
330
+ "▁",
331
+ "3"
332
+ ],
333
+ [
334
+ "▁",
335
+ "4"
336
+ ],
337
+ [
338
+ "▁",
339
+ "5"
340
+ ],
341
+ [
342
+ "▁",
343
+ "6"
344
+ ],
345
+ [
346
+ "▁",
347
+ "7"
348
+ ],
349
+ [
350
+ "▁",
351
+ "8"
352
+ ],
353
+ [
354
+ "▁",
355
+ "9"
356
+ ],
357
+ [
358
+ "▁-",
359
+ "2"
360
+ ],
361
+ [
362
+ "▁-",
363
+ "0."
364
+ ],
365
+ [
366
+ "▁-",
367
+ "3"
368
+ ],
369
+ [
370
+ "▁-",
371
+ "4"
372
+ ],
373
+ [
374
+ "▁-",
375
+ "5"
376
+ ],
377
+ [
378
+ "▁-",
379
+ "6"
380
+ ],
381
+ [
382
+ "▁-",
383
+ "7"
384
+ ],
385
+ [
386
+ ".",
387
+ "5"
388
+ ],
389
+ [
390
+ ".",
391
+ "2"
392
+ ],
393
+ [
394
+ ".",
395
+ "1"
396
+ ],
397
+ [
398
+ "▁-",
399
+ "8"
400
+ ],
401
+ [
402
+ "▁-",
403
+ "9"
404
+ ],
405
+ [
406
+ ".",
407
+ "3"
408
+ ],
409
+ [
410
+ ".",
411
+ "6"
412
+ ],
413
+ [
414
+ ".",
415
+ "8"
416
+ ],
417
+ [
418
+ ".",
419
+ "4"
420
+ ],
421
+ [
422
+ ".",
423
+ "7"
424
+ ],
425
+ [
426
+ ".",
427
+ "9"
428
+ ],
429
+ [
430
+ "3",
431
+ "3"
432
+ ],
433
+ [
434
+ "6",
435
+ "6"
436
+ ],
437
+ [
438
+ "5",
439
+ "\nstep"
440
+ ],
441
+ [
442
+ "0",
443
+ "\nstep"
444
+ ],
445
+ [
446
+ "2",
447
+ "\nstep"
448
+ ],
449
+ [
450
+ "1",
451
+ "1"
452
+ ],
453
+ [
454
+ "4",
455
+ "\nstep"
456
+ ],
457
+ [
458
+ "8",
459
+ "\nstep"
460
+ ],
461
+ [
462
+ "6",
463
+ "\nstep"
464
+ ],
465
+ [
466
+ "▁1",
467
+ "0"
468
+ ],
469
+ [
470
+ "2",
471
+ "5"
472
+ ],
473
+ [
474
+ "2",
475
+ "8"
476
+ ],
477
+ [
478
+ "1",
479
+ "4"
480
+ ],
481
+ [
482
+ "7",
483
+ "\nstep"
484
+ ],
485
+ [
486
+ "9",
487
+ "9"
488
+ ],
489
+ [
490
+ "7",
491
+ "5"
492
+ ],
493
+ [
494
+ "3",
495
+ "\nstep"
496
+ ],
497
+ [
498
+ "1",
499
+ "\nstep"
500
+ ],
501
+ [
502
+ "▁",
503
+ "("
504
+ ],
505
+ [
506
+ "1",
507
+ "8"
508
+ ],
509
+ [
510
+ "2",
511
+ "9"
512
+ ],
513
+ [
514
+ "1",
515
+ "5"
516
+ ],
517
+ [
518
+ "4",
519
+ "4"
520
+ ],
521
+ [
522
+ "1",
523
+ "6"
524
+ ],
525
+ [
526
+ "1",
527
+ "9"
528
+ ],
529
+ [
530
+ "2",
531
+ "3"
532
+ ],
533
+ [
534
+ "7",
535
+ "8"
536
+ ],
537
+ [
538
+ "7",
539
+ "6"
540
+ ],
541
+ [
542
+ "1",
543
+ "3"
544
+ ],
545
+ [
546
+ "2",
547
+ "6"
548
+ ],
549
+ [
550
+ "2",
551
+ "4"
552
+ ],
553
+ [
554
+ "0",
555
+ "8"
556
+ ],
557
+ [
558
+ "7",
559
+ "9"
560
+ ],
561
+ [
562
+ "0",
563
+ "5"
564
+ ],
565
+ [
566
+ "0",
567
+ "9"
568
+ ],
569
+ [
570
+ "0",
571
+ "4"
572
+ ],
573
+ [
574
+ "7",
575
+ "4"
576
+ ],
577
+ [
578
+ "7",
579
+ "3"
580
+ ],
581
+ [
582
+ "0",
583
+ "6"
584
+ ],
585
+ [
586
+ "8",
587
+ "8"
588
+ ]
589
+ ]
590
+ }
591
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<sos>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<eos>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "find",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "5": {
44
+ "content": "step",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "6": {
52
+ "content": "answer",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ },
59
+ "128": {
60
+ "content": "<bos>",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": true
66
+ }
67
+ },
68
+ "bos_token": "<bos>",
69
+ "clean_up_tokenization_spaces": false,
70
+ "eos_token": "<eos>",
71
+ "model_max_length": 1000000000000000019884624838656,
72
+ "pad_token": "<pad>",
73
+ "tokenizer_class": "PreTrainedTokenizerFast",
74
+ "unk_token": "<unk>"
75
+ }