maykcaldas
committed on
Commit
·
24ddeaf
1
Parent(s):
a027702
Upload tokenizer
Browse files
- tokenizer.json +244 -2
- tokenizer_config.json +3 -1
tokenizer.json
CHANGED
@@ -2,7 +2,53 @@
|
|
2 |
"version": "1.0",
|
3 |
"truncation": null,
|
4 |
"padding": null,
|
5 |
-
"added_tokens": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
"normalizer": null,
|
7 |
"pre_tokenizer": {
|
8 |
"type": "WhitespaceSplit"
|
@@ -291,7 +337,203 @@
|
|
291 |
"[=Branch3]": 273,
|
292 |
"[\\C@]": 274,
|
293 |
"[#N+1]": 275,
|
294 |
-
"[=Ring3]": 276
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
295 |
}
|
296 |
}
|
297 |
}
|
|
|
2 |
"version": "1.0",
|
3 |
"truncation": null,
|
4 |
"padding": null,
|
5 |
+
"added_tokens": [
|
6 |
+
{
|
7 |
+
"id": 0,
|
8 |
+
"content": "[mask]",
|
9 |
+
"single_word": false,
|
10 |
+
"lstrip": false,
|
11 |
+
"rstrip": false,
|
12 |
+
"normalized": false,
|
13 |
+
"special": true
|
14 |
+
},
|
15 |
+
{
|
16 |
+
"id": 1,
|
17 |
+
"content": "[bos]",
|
18 |
+
"single_word": false,
|
19 |
+
"lstrip": false,
|
20 |
+
"rstrip": false,
|
21 |
+
"normalized": false,
|
22 |
+
"special": true
|
23 |
+
},
|
24 |
+
{
|
25 |
+
"id": 2,
|
26 |
+
"content": "[eos]",
|
27 |
+
"single_word": false,
|
28 |
+
"lstrip": false,
|
29 |
+
"rstrip": false,
|
30 |
+
"normalized": false,
|
31 |
+
"special": true
|
32 |
+
},
|
33 |
+
{
|
34 |
+
"id": 3,
|
35 |
+
"content": "[unk]",
|
36 |
+
"single_word": false,
|
37 |
+
"lstrip": false,
|
38 |
+
"rstrip": false,
|
39 |
+
"normalized": false,
|
40 |
+
"special": true
|
41 |
+
},
|
42 |
+
{
|
43 |
+
"id": 4,
|
44 |
+
"content": "[nop]",
|
45 |
+
"single_word": false,
|
46 |
+
"lstrip": false,
|
47 |
+
"rstrip": false,
|
48 |
+
"normalized": false,
|
49 |
+
"special": true
|
50 |
+
}
|
51 |
+
],
|
52 |
"normalizer": null,
|
53 |
"pre_tokenizer": {
|
54 |
"type": "WhitespaceSplit"
|
|
|
337 |
"[=Branch3]": 273,
|
338 |
"[\\C@]": 274,
|
339 |
"[#N+1]": 275,
|
340 |
+
"[=Ring3]": 276,
|
341 |
+
"[19F]": 277,
|
342 |
+
"[As-1]": 278,
|
343 |
+
"[#14C]": 279,
|
344 |
+
"[\\O-1]": 280,
|
345 |
+
"[KH1]": 281,
|
346 |
+
"[AsH3]": 282,
|
347 |
+
"[127Xe]": 283,
|
348 |
+
"[S@+1]": 284,
|
349 |
+
"[I+3]": 285,
|
350 |
+
"[82Rb]": 286,
|
351 |
+
"[\\S+1]": 287,
|
352 |
+
"[10B]": 288,
|
353 |
+
"[Cl+1]": 289,
|
354 |
+
"[=11C]": 290,
|
355 |
+
"[SiH1-1]": 291,
|
356 |
+
"[125I-1]": 292,
|
357 |
+
"[=Al]": 293,
|
358 |
+
"[=Se+1]": 294,
|
359 |
+
"[82Rb+1]": 295,
|
360 |
+
"[129Xe]": 296,
|
361 |
+
"[18F]": 297,
|
362 |
+
"[123Te]": 298,
|
363 |
+
"[OH1+1]": 299,
|
364 |
+
"[127I]": 300,
|
365 |
+
"[15NH1]": 301,
|
366 |
+
"[Zn-2]": 302,
|
367 |
+
"[\\Si]": 303,
|
368 |
+
"[NH1-1]": 304,
|
369 |
+
"[\\131I]": 305,
|
370 |
+
"[#Al]": 306,
|
371 |
+
"[81Kr]": 307,
|
372 |
+
"[Br+2]": 308,
|
373 |
+
"[/131I]": 309,
|
374 |
+
"[Mg+1]": 310,
|
375 |
+
"[P@@+1]": 311,
|
376 |
+
"[PH2]": 312,
|
377 |
+
"[\\C-1]": 313,
|
378 |
+
"[123IH1]": 314,
|
379 |
+
"[\\B]": 315,
|
380 |
+
"[=13CH1]": 316,
|
381 |
+
"[Cl+2]": 317,
|
382 |
+
"[SiH4]": 318,
|
383 |
+
"[Ba]": 319,
|
384 |
+
"[=PH2]": 320,
|
385 |
+
"[Ag-4]": 321,
|
386 |
+
"[73Se]": 322,
|
387 |
+
"[OH1]": 323,
|
388 |
+
"[SrH2]": 324,
|
389 |
+
"[223Ra]": 325,
|
390 |
+
"[15OH2]": 326,
|
391 |
+
"[13CH1]": 327,
|
392 |
+
"[/123I]": 328,
|
393 |
+
"[\\C@@]": 329,
|
394 |
+
"[18FH1]": 330,
|
395 |
+
"[B@-1]": 331,
|
396 |
+
"[=SH1]": 332,
|
397 |
+
"[14CH2]": 333,
|
398 |
+
"[Se-2]": 334,
|
399 |
+
"[=P@@]": 335,
|
400 |
+
"[SH2]": 336,
|
401 |
+
"[133Xe]": 337,
|
402 |
+
"[#Ring2]": 338,
|
403 |
+
"[AsH1]": 339,
|
404 |
+
"[47Ca+2]": 340,
|
405 |
+
"[=P@]": 341,
|
406 |
+
"[14C@H1]": 342,
|
407 |
+
"[15N]": 343,
|
408 |
+
"[Te+1]": 344,
|
409 |
+
"[Al-3]": 345,
|
410 |
+
"[14CH1]": 346,
|
411 |
+
"[B@@-1]": 347,
|
412 |
+
"[Te-1]": 348,
|
413 |
+
"[Si-1]": 349,
|
414 |
+
"[\\S-1]": 350,
|
415 |
+
"[Se-1]": 351,
|
416 |
+
"[18OH1]": 352,
|
417 |
+
"[=NH2+1]": 353,
|
418 |
+
"[11CH1]": 354,
|
419 |
+
"[=B-1]": 355,
|
420 |
+
"[11CH3]": 356,
|
421 |
+
"[S@@+1]": 357,
|
422 |
+
"[\\3H]": 358,
|
423 |
+
"[17F]": 359,
|
424 |
+
"[3H]": 360,
|
425 |
+
"[S@]": 361,
|
426 |
+
"[He]": 362,
|
427 |
+
"[/N-1]": 363,
|
428 |
+
"[42K+1]": 364,
|
429 |
+
"[11C]": 365,
|
430 |
+
"[\\NH1-1]": 366,
|
431 |
+
"[13CH3]": 367,
|
432 |
+
"[BH2-1]": 368,
|
433 |
+
"[/S-1]": 369,
|
434 |
+
"[11C@H1]": 370,
|
435 |
+
"[\\123I]": 371,
|
436 |
+
"[Be+2]": 372,
|
437 |
+
"[/13CH1]": 373,
|
438 |
+
"[135I]": 374,
|
439 |
+
"[14C@@H1]": 375,
|
440 |
+
"[/Te]": 376,
|
441 |
+
"[BH1-1]": 377,
|
442 |
+
"[Kr]": 378,
|
443 |
+
"[13NH3]": 379,
|
444 |
+
"[/13C]": 380,
|
445 |
+
"[13C]": 381,
|
446 |
+
"[=Mg]": 382,
|
447 |
+
"[/14CH1]": 383,
|
448 |
+
"[N@@+1]": 384,
|
449 |
+
"[SeH1]": 385,
|
450 |
+
"[-\\Ring2]": 386,
|
451 |
+
"[SiH3-1]": 387,
|
452 |
+
"[N@@]": 388,
|
453 |
+
"[123I-1]": 389,
|
454 |
+
"[I+1]": 390,
|
455 |
+
"[32PH1]": 391,
|
456 |
+
"[SeH2]": 392,
|
457 |
+
"[45Ca+2]": 393,
|
458 |
+
"[\\P]": 394,
|
459 |
+
"[22Na+1]": 395,
|
460 |
+
"[11CH2]": 396,
|
461 |
+
"[76BrH1]": 397,
|
462 |
+
"[/O-1]": 398,
|
463 |
+
"[\\P+1]": 399,
|
464 |
+
"[LiH1]": 400,
|
465 |
+
"[/P@]": 401,
|
466 |
+
"[=13C]": 402,
|
467 |
+
"[/B]": 403,
|
468 |
+
"[35S]": 404,
|
469 |
+
"[Xe]": 405,
|
470 |
+
"[=Te+1]": 406,
|
471 |
+
"[#Ring1]": 407,
|
472 |
+
"[Rb]": 408,
|
473 |
+
"[=S@@]": 409,
|
474 |
+
"[HH1]": 410,
|
475 |
+
"[124I]": 411,
|
476 |
+
"[/Si]": 412,
|
477 |
+
"[S@@]": 413,
|
478 |
+
"[Se+1]": 414,
|
479 |
+
"[/P]": 415,
|
480 |
+
"[85SrH2]": 416,
|
481 |
+
"[I+2]": 417,
|
482 |
+
"[32P]": 418,
|
483 |
+
"[/125I]": 419,
|
484 |
+
"[85Sr+2]": 420,
|
485 |
+
"[4H]": 421,
|
486 |
+
"[\\SeH1]": 422,
|
487 |
+
"[14CH3]": 423,
|
488 |
+
"[SH1]": 424,
|
489 |
+
"[124I-1]": 425,
|
490 |
+
"[=18O]": 426,
|
491 |
+
"[Zn+1]": 427,
|
492 |
+
"[N@+1]": 428,
|
493 |
+
"[125I]": 429,
|
494 |
+
"[/S+1]": 430,
|
495 |
+
"[SH1+1]": 431,
|
496 |
+
"[131I-1]": 432,
|
497 |
+
"[P@+1]": 433,
|
498 |
+
"[\\CH1-1]": 434,
|
499 |
+
"[/11CH3]": 435,
|
500 |
+
"[131Cs]": 436,
|
501 |
+
"[131I]": 437,
|
502 |
+
"[18F-1]": 438,
|
503 |
+
"[\\Se]": 439,
|
504 |
+
"[=CH1]": 440,
|
505 |
+
"[/F]": 441,
|
506 |
+
"[PH2+1]": 442,
|
507 |
+
"[TeH1]": 443,
|
508 |
+
"[Ra]": 444,
|
509 |
+
"[123I]": 445,
|
510 |
+
"[13CH2]": 446,
|
511 |
+
"[Rb+1]": 447,
|
512 |
+
"[/C-1]": 448,
|
513 |
+
"[=14C]": 449,
|
514 |
+
"[BH3-1]": 450,
|
515 |
+
"[125IH1]": 451,
|
516 |
+
"[/Se]": 452,
|
517 |
+
"[75Se]": 453,
|
518 |
+
"[/14C]": 454,
|
519 |
+
"[=S@]": 455,
|
520 |
+
"[\\PH1]": 456,
|
521 |
+
"[SiH2]": 457,
|
522 |
+
"[Ra+2]": 458,
|
523 |
+
"[NaH1]": 459,
|
524 |
+
"[14C]": 460,
|
525 |
+
"[76Br]": 461,
|
526 |
+
"[=14CH1]": 462,
|
527 |
+
"[223Ra+2]": 463,
|
528 |
+
"[/CH1-1]": 464,
|
529 |
+
"[As+1]": 465,
|
530 |
+
"[\\125I]": 466,
|
531 |
+
"[TeH2]": 467,
|
532 |
+
"[\\F]": 468,
|
533 |
+
"[14C@]": 469,
|
534 |
+
"[\\Te]": 470,
|
535 |
+
"[89Sr+2]": 471,
|
536 |
+
"[#11C-1]": 472
|
537 |
}
|
538 |
}
|
539 |
}
|
tokenizer_config.json
CHANGED
@@ -1,10 +1,12 @@
|
|
1 |
{
|
2 |
"cls_token": "[bos]",
|
3 |
"mask_token": "[mask]",
|
4 |
-
"model_max_length":
|
|
|
5 |
"pad_token": "[nop]",
|
6 |
"padding_side": "right",
|
7 |
"sep_token": "[eos]",
|
|
|
8 |
"tokenizer_class": "PreTrainedTokenizerFast",
|
9 |
"truncation_side": "right",
|
10 |
"unk_token": "[unk]"
|
|
|
1 |
{
|
2 |
"cls_token": "[bos]",
|
3 |
"mask_token": "[mask]",
|
4 |
+
"model_max_length": 1325,
|
5 |
+
"name_or_path": "/content/drive/MyDrive/WhiteLab/selfies-encoder/tokenizer",
|
6 |
"pad_token": "[nop]",
|
7 |
"padding_side": "right",
|
8 |
"sep_token": "[eos]",
|
9 |
+
"special_tokens_map_file": "/content/drive/MyDrive/WhiteLab/selfies-encoder/tokenizer/special_tokens_map.json",
|
10 |
"tokenizer_class": "PreTrainedTokenizerFast",
|
11 |
"truncation_side": "right",
|
12 |
"unk_token": "[unk]"
|