maykcaldas committed on
Commit
24ddeaf
·
1 Parent(s): a027702

Upload tokenizer

Browse files
Files changed (2) hide show
  1. tokenizer.json +244 -2
  2. tokenizer_config.json +3 -1
tokenizer.json CHANGED
@@ -2,7 +2,53 @@
2
  "version": "1.0",
3
  "truncation": null,
4
  "padding": null,
5
- "added_tokens": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  "normalizer": null,
7
  "pre_tokenizer": {
8
  "type": "WhitespaceSplit"
@@ -291,7 +337,203 @@
291
  "[=Branch3]": 273,
292
  "[\\C@]": 274,
293
  "[#N+1]": 275,
294
- "[=Ring3]": 276
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
295
  }
296
  }
297
  }
 
2
  "version": "1.0",
3
  "truncation": null,
4
  "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "[mask]",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "[bos]",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "[eos]",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 3,
35
+ "content": "[unk]",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ },
42
+ {
43
+ "id": 4,
44
+ "content": "[nop]",
45
+ "single_word": false,
46
+ "lstrip": false,
47
+ "rstrip": false,
48
+ "normalized": false,
49
+ "special": true
50
+ }
51
+ ],
52
  "normalizer": null,
53
  "pre_tokenizer": {
54
  "type": "WhitespaceSplit"
 
337
  "[=Branch3]": 273,
338
  "[\\C@]": 274,
339
  "[#N+1]": 275,
340
+ "[=Ring3]": 276,
341
+ "[19F]": 277,
342
+ "[As-1]": 278,
343
+ "[#14C]": 279,
344
+ "[\\O-1]": 280,
345
+ "[KH1]": 281,
346
+ "[AsH3]": 282,
347
+ "[127Xe]": 283,
348
+ "[S@+1]": 284,
349
+ "[I+3]": 285,
350
+ "[82Rb]": 286,
351
+ "[\\S+1]": 287,
352
+ "[10B]": 288,
353
+ "[Cl+1]": 289,
354
+ "[=11C]": 290,
355
+ "[SiH1-1]": 291,
356
+ "[125I-1]": 292,
357
+ "[=Al]": 293,
358
+ "[=Se+1]": 294,
359
+ "[82Rb+1]": 295,
360
+ "[129Xe]": 296,
361
+ "[18F]": 297,
362
+ "[123Te]": 298,
363
+ "[OH1+1]": 299,
364
+ "[127I]": 300,
365
+ "[15NH1]": 301,
366
+ "[Zn-2]": 302,
367
+ "[\\Si]": 303,
368
+ "[NH1-1]": 304,
369
+ "[\\131I]": 305,
370
+ "[#Al]": 306,
371
+ "[81Kr]": 307,
372
+ "[Br+2]": 308,
373
+ "[/131I]": 309,
374
+ "[Mg+1]": 310,
375
+ "[P@@+1]": 311,
376
+ "[PH2]": 312,
377
+ "[\\C-1]": 313,
378
+ "[123IH1]": 314,
379
+ "[\\B]": 315,
380
+ "[=13CH1]": 316,
381
+ "[Cl+2]": 317,
382
+ "[SiH4]": 318,
383
+ "[Ba]": 319,
384
+ "[=PH2]": 320,
385
+ "[Ag-4]": 321,
386
+ "[73Se]": 322,
387
+ "[OH1]": 323,
388
+ "[SrH2]": 324,
389
+ "[223Ra]": 325,
390
+ "[15OH2]": 326,
391
+ "[13CH1]": 327,
392
+ "[/123I]": 328,
393
+ "[\\C@@]": 329,
394
+ "[18FH1]": 330,
395
+ "[B@-1]": 331,
396
+ "[=SH1]": 332,
397
+ "[14CH2]": 333,
398
+ "[Se-2]": 334,
399
+ "[=P@@]": 335,
400
+ "[SH2]": 336,
401
+ "[133Xe]": 337,
402
+ "[#Ring2]": 338,
403
+ "[AsH1]": 339,
404
+ "[47Ca+2]": 340,
405
+ "[=P@]": 341,
406
+ "[14C@H1]": 342,
407
+ "[15N]": 343,
408
+ "[Te+1]": 344,
409
+ "[Al-3]": 345,
410
+ "[14CH1]": 346,
411
+ "[B@@-1]": 347,
412
+ "[Te-1]": 348,
413
+ "[Si-1]": 349,
414
+ "[\\S-1]": 350,
415
+ "[Se-1]": 351,
416
+ "[18OH1]": 352,
417
+ "[=NH2+1]": 353,
418
+ "[11CH1]": 354,
419
+ "[=B-1]": 355,
420
+ "[11CH3]": 356,
421
+ "[S@@+1]": 357,
422
+ "[\\3H]": 358,
423
+ "[17F]": 359,
424
+ "[3H]": 360,
425
+ "[S@]": 361,
426
+ "[He]": 362,
427
+ "[/N-1]": 363,
428
+ "[42K+1]": 364,
429
+ "[11C]": 365,
430
+ "[\\NH1-1]": 366,
431
+ "[13CH3]": 367,
432
+ "[BH2-1]": 368,
433
+ "[/S-1]": 369,
434
+ "[11C@H1]": 370,
435
+ "[\\123I]": 371,
436
+ "[Be+2]": 372,
437
+ "[/13CH1]": 373,
438
+ "[135I]": 374,
439
+ "[14C@@H1]": 375,
440
+ "[/Te]": 376,
441
+ "[BH1-1]": 377,
442
+ "[Kr]": 378,
443
+ "[13NH3]": 379,
444
+ "[/13C]": 380,
445
+ "[13C]": 381,
446
+ "[=Mg]": 382,
447
+ "[/14CH1]": 383,
448
+ "[N@@+1]": 384,
449
+ "[SeH1]": 385,
450
+ "[-\\Ring2]": 386,
451
+ "[SiH3-1]": 387,
452
+ "[N@@]": 388,
453
+ "[123I-1]": 389,
454
+ "[I+1]": 390,
455
+ "[32PH1]": 391,
456
+ "[SeH2]": 392,
457
+ "[45Ca+2]": 393,
458
+ "[\\P]": 394,
459
+ "[22Na+1]": 395,
460
+ "[11CH2]": 396,
461
+ "[76BrH1]": 397,
462
+ "[/O-1]": 398,
463
+ "[\\P+1]": 399,
464
+ "[LiH1]": 400,
465
+ "[/P@]": 401,
466
+ "[=13C]": 402,
467
+ "[/B]": 403,
468
+ "[35S]": 404,
469
+ "[Xe]": 405,
470
+ "[=Te+1]": 406,
471
+ "[#Ring1]": 407,
472
+ "[Rb]": 408,
473
+ "[=S@@]": 409,
474
+ "[HH1]": 410,
475
+ "[124I]": 411,
476
+ "[/Si]": 412,
477
+ "[S@@]": 413,
478
+ "[Se+1]": 414,
479
+ "[/P]": 415,
480
+ "[85SrH2]": 416,
481
+ "[I+2]": 417,
482
+ "[32P]": 418,
483
+ "[/125I]": 419,
484
+ "[85Sr+2]": 420,
485
+ "[4H]": 421,
486
+ "[\\SeH1]": 422,
487
+ "[14CH3]": 423,
488
+ "[SH1]": 424,
489
+ "[124I-1]": 425,
490
+ "[=18O]": 426,
491
+ "[Zn+1]": 427,
492
+ "[N@+1]": 428,
493
+ "[125I]": 429,
494
+ "[/S+1]": 430,
495
+ "[SH1+1]": 431,
496
+ "[131I-1]": 432,
497
+ "[P@+1]": 433,
498
+ "[\\CH1-1]": 434,
499
+ "[/11CH3]": 435,
500
+ "[131Cs]": 436,
501
+ "[131I]": 437,
502
+ "[18F-1]": 438,
503
+ "[\\Se]": 439,
504
+ "[=CH1]": 440,
505
+ "[/F]": 441,
506
+ "[PH2+1]": 442,
507
+ "[TeH1]": 443,
508
+ "[Ra]": 444,
509
+ "[123I]": 445,
510
+ "[13CH2]": 446,
511
+ "[Rb+1]": 447,
512
+ "[/C-1]": 448,
513
+ "[=14C]": 449,
514
+ "[BH3-1]": 450,
515
+ "[125IH1]": 451,
516
+ "[/Se]": 452,
517
+ "[75Se]": 453,
518
+ "[/14C]": 454,
519
+ "[=S@]": 455,
520
+ "[\\PH1]": 456,
521
+ "[SiH2]": 457,
522
+ "[Ra+2]": 458,
523
+ "[NaH1]": 459,
524
+ "[14C]": 460,
525
+ "[76Br]": 461,
526
+ "[=14CH1]": 462,
527
+ "[223Ra+2]": 463,
528
+ "[/CH1-1]": 464,
529
+ "[As+1]": 465,
530
+ "[\\125I]": 466,
531
+ "[TeH2]": 467,
532
+ "[\\F]": 468,
533
+ "[14C@]": 469,
534
+ "[\\Te]": 470,
535
+ "[89Sr+2]": 471,
536
+ "[#11C-1]": 472
537
  }
538
  }
539
  }
tokenizer_config.json CHANGED
@@ -1,10 +1,12 @@
1
  {
2
  "cls_token": "[bos]",
3
  "mask_token": "[mask]",
4
- "model_max_length": 135,
 
5
  "pad_token": "[nop]",
6
  "padding_side": "right",
7
  "sep_token": "[eos]",
 
8
  "tokenizer_class": "PreTrainedTokenizerFast",
9
  "truncation_side": "right",
10
  "unk_token": "[unk]"
 
1
  {
2
  "cls_token": "[bos]",
3
  "mask_token": "[mask]",
4
+ "model_max_length": 1325,
5
+ "name_or_path": "/content/drive/MyDrive/WhiteLab/selfies-encoder/tokenizer",
6
  "pad_token": "[nop]",
7
  "padding_side": "right",
8
  "sep_token": "[eos]",
9
+ "special_tokens_map_file": "/content/drive/MyDrive/WhiteLab/selfies-encoder/tokenizer/special_tokens_map.json",
10
  "tokenizer_class": "PreTrainedTokenizerFast",
11
  "truncation_side": "right",
12
  "unk_token": "[unk]"