hyunjongkimmath committed on
Commit
57d6255
1 Parent(s): 8988d6a

Upload tokenizer

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +21 -3
  2. tokenizer.json +60 -3
  3. tokenizer_config.json +0 -0
special_tokens_map.json CHANGED
@@ -101,7 +101,25 @@
101
  "<extra_id_98>",
102
  "<extra_id_99>"
103
  ],
104
- "eos_token": "</s>",
105
- "pad_token": "<pad>",
106
- "unk_token": "<unk>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  }
 
101
  "<extra_id_98>",
102
  "<extra_id_99>"
103
  ],
104
+ "eos_token": {
105
+ "content": "</s>",
106
+ "lstrip": false,
107
+ "normalized": false,
108
+ "rstrip": false,
109
+ "single_word": false
110
+ },
111
+ "pad_token": {
112
+ "content": "<pad>",
113
+ "lstrip": false,
114
+ "normalized": false,
115
+ "rstrip": false,
116
+ "single_word": false
117
+ },
118
+ "unk_token": {
119
+ "content": "<unk>",
120
+ "lstrip": false,
121
+ "normalized": false,
122
+ "rstrip": false,
123
+ "single_word": false
124
+ }
125
  }
tokenizer.json CHANGED
@@ -35,6 +35,24 @@
35
  "normalized": false,
36
  "special": true
37
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  {
39
  "id": 32000,
40
  "content": "<extra_id_99>",
@@ -5776,6 +5794,42 @@
5776
  "rstrip": false,
5777
  "normalized": true,
5778
  "special": false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5779
  }
5780
  ],
5781
  "normalizer": {
@@ -5791,7 +5845,8 @@
5791
  {
5792
  "type": "Metaspace",
5793
  "replacement": "▁",
5794
- "add_prefix_space": true
 
5795
  }
5796
  ]
5797
  },
@@ -5852,7 +5907,8 @@
5852
  "decoder": {
5853
  "type": "Metaspace",
5854
  "replacement": "▁",
5855
- "add_prefix_space": true
 
5856
  },
5857
  "model": {
5858
  "type": "Unigram",
@@ -134258,6 +134314,7 @@
134258
  "<extra_id_0>",
134259
  0.0
134260
  ]
134261
- ]
 
134262
  }
134263
  }
 
35
  "normalized": false,
36
  "special": true
37
  },
38
+ {
39
+ "id": 834,
40
+ "content": "_",
41
+ "single_word": false,
42
+ "lstrip": false,
43
+ "rstrip": false,
44
+ "normalized": true,
45
+ "special": false
46
+ },
47
+ {
48
+ "id": 3229,
49
+ "content": "$",
50
+ "single_word": false,
51
+ "lstrip": false,
52
+ "rstrip": false,
53
+ "normalized": true,
54
+ "special": false
55
+ },
56
  {
57
  "id": 32000,
58
  "content": "<extra_id_99>",
 
5794
  "rstrip": false,
5795
  "normalized": true,
5796
  "special": false
5797
+ },
5798
+ {
5799
+ "id": 32638,
5800
+ "content": "\\",
5801
+ "single_word": false,
5802
+ "lstrip": false,
5803
+ "rstrip": false,
5804
+ "normalized": true,
5805
+ "special": false
5806
+ },
5807
+ {
5808
+ "id": 32639,
5809
+ "content": "^",
5810
+ "single_word": false,
5811
+ "lstrip": false,
5812
+ "rstrip": false,
5813
+ "normalized": true,
5814
+ "special": false
5815
+ },
5816
+ {
5817
+ "id": 32640,
5818
+ "content": "{",
5819
+ "single_word": false,
5820
+ "lstrip": false,
5821
+ "rstrip": false,
5822
+ "normalized": true,
5823
+ "special": false
5824
+ },
5825
+ {
5826
+ "id": 32641,
5827
+ "content": "}",
5828
+ "single_word": false,
5829
+ "lstrip": false,
5830
+ "rstrip": false,
5831
+ "normalized": true,
5832
+ "special": false
5833
  }
5834
  ],
5835
  "normalizer": {
 
5845
  {
5846
  "type": "Metaspace",
5847
  "replacement": "▁",
5848
+ "prepend_scheme": "always",
5849
+ "split": true
5850
  }
5851
  ]
5852
  },
 
5907
  "decoder": {
5908
  "type": "Metaspace",
5909
  "replacement": "▁",
5910
+ "prepend_scheme": "always",
5911
+ "split": true
5912
  },
5913
  "model": {
5914
  "type": "Unigram",
 
134314
  "<extra_id_0>",
134315
  0.0
134316
  ]
134317
+ ],
134318
+ "byte_fallback": false
134319
  }
134320
  }
tokenizer_config.json CHANGED
The diff for this file is too large to render. See raw diff