hyunjongkimmath
committed on
Commit
•
57d6255
1
Parent(s):
8988d6a
Upload tokenizer
Browse files
- special_tokens_map.json +21 -3
- tokenizer.json +60 -3
- tokenizer_config.json +0 -0
special_tokens_map.json
CHANGED
@@ -101,7 +101,25 @@
|
|
101 |
"<extra_id_98>",
|
102 |
"<extra_id_99>"
|
103 |
],
|
104 |
-
"eos_token": "</s>",
|
105 |
-
"pad_token": "<pad>",
|
106 |
-
"unk_token": "<unk>"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
107 |
}
|
|
|
101 |
"<extra_id_98>",
|
102 |
"<extra_id_99>"
|
103 |
],
|
104 |
+
"eos_token": {
|
105 |
+
"content": "</s>",
|
106 |
+
"lstrip": false,
|
107 |
+
"normalized": false,
|
108 |
+
"rstrip": false,
|
109 |
+
"single_word": false
|
110 |
+
},
|
111 |
+
"pad_token": {
|
112 |
+
"content": "<pad>",
|
113 |
+
"lstrip": false,
|
114 |
+
"normalized": false,
|
115 |
+
"rstrip": false,
|
116 |
+
"single_word": false
|
117 |
+
},
|
118 |
+
"unk_token": {
|
119 |
+
"content": "<unk>",
|
120 |
+
"lstrip": false,
|
121 |
+
"normalized": false,
|
122 |
+
"rstrip": false,
|
123 |
+
"single_word": false
|
124 |
+
}
|
125 |
}
|
tokenizer.json
CHANGED
@@ -35,6 +35,24 @@
|
|
35 |
"normalized": false,
|
36 |
"special": true
|
37 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
{
|
39 |
"id": 32000,
|
40 |
"content": "<extra_id_99>",
|
@@ -5776,6 +5794,42 @@
|
|
5776 |
"rstrip": false,
|
5777 |
"normalized": true,
|
5778 |
"special": false
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5779 |
}
|
5780 |
],
|
5781 |
"normalizer": {
|
@@ -5791,7 +5845,8 @@
|
|
5791 |
{
|
5792 |
"type": "Metaspace",
|
5793 |
"replacement": "▁",
|
5794 |
-
"add_prefix_space": true
|
|
|
5795 |
}
|
5796 |
]
|
5797 |
},
|
@@ -5852,7 +5907,8 @@
|
|
5852 |
"decoder": {
|
5853 |
"type": "Metaspace",
|
5854 |
"replacement": "▁",
|
5855 |
-
"add_prefix_space": true
|
|
|
5856 |
},
|
5857 |
"model": {
|
5858 |
"type": "Unigram",
|
@@ -134258,6 +134314,7 @@
|
|
134258 |
"<extra_id_0>",
|
134259 |
0.0
|
134260 |
]
|
134261 |
-
]
|
|
|
134262 |
}
|
134263 |
}
|
|
|
35 |
"normalized": false,
|
36 |
"special": true
|
37 |
},
|
38 |
+
{
|
39 |
+
"id": 834,
|
40 |
+
"content": "_",
|
41 |
+
"single_word": false,
|
42 |
+
"lstrip": false,
|
43 |
+
"rstrip": false,
|
44 |
+
"normalized": true,
|
45 |
+
"special": false
|
46 |
+
},
|
47 |
+
{
|
48 |
+
"id": 3229,
|
49 |
+
"content": "$",
|
50 |
+
"single_word": false,
|
51 |
+
"lstrip": false,
|
52 |
+
"rstrip": false,
|
53 |
+
"normalized": true,
|
54 |
+
"special": false
|
55 |
+
},
|
56 |
{
|
57 |
"id": 32000,
|
58 |
"content": "<extra_id_99>",
|
|
|
5794 |
"rstrip": false,
|
5795 |
"normalized": true,
|
5796 |
"special": false
|
5797 |
+
},
|
5798 |
+
{
|
5799 |
+
"id": 32638,
|
5800 |
+
"content": "\\",
|
5801 |
+
"single_word": false,
|
5802 |
+
"lstrip": false,
|
5803 |
+
"rstrip": false,
|
5804 |
+
"normalized": true,
|
5805 |
+
"special": false
|
5806 |
+
},
|
5807 |
+
{
|
5808 |
+
"id": 32639,
|
5809 |
+
"content": "^",
|
5810 |
+
"single_word": false,
|
5811 |
+
"lstrip": false,
|
5812 |
+
"rstrip": false,
|
5813 |
+
"normalized": true,
|
5814 |
+
"special": false
|
5815 |
+
},
|
5816 |
+
{
|
5817 |
+
"id": 32640,
|
5818 |
+
"content": "{",
|
5819 |
+
"single_word": false,
|
5820 |
+
"lstrip": false,
|
5821 |
+
"rstrip": false,
|
5822 |
+
"normalized": true,
|
5823 |
+
"special": false
|
5824 |
+
},
|
5825 |
+
{
|
5826 |
+
"id": 32641,
|
5827 |
+
"content": "}",
|
5828 |
+
"single_word": false,
|
5829 |
+
"lstrip": false,
|
5830 |
+
"rstrip": false,
|
5831 |
+
"normalized": true,
|
5832 |
+
"special": false
|
5833 |
}
|
5834 |
],
|
5835 |
"normalizer": {
|
|
|
5845 |
{
|
5846 |
"type": "Metaspace",
|
5847 |
"replacement": "▁",
|
5848 |
+
"prepend_scheme": "always",
|
5849 |
+
"split": true
|
5850 |
}
|
5851 |
]
|
5852 |
},
|
|
|
5907 |
"decoder": {
|
5908 |
"type": "Metaspace",
|
5909 |
"replacement": "▁",
|
5910 |
+
"prepend_scheme": "always",
|
5911 |
+
"split": true
|
5912 |
},
|
5913 |
"model": {
|
5914 |
"type": "Unigram",
|
|
|
134314 |
"<extra_id_0>",
|
134315 |
0.0
|
134316 |
]
|
134317 |
+
],
|
134318 |
+
"byte_fallback": false
|
134319 |
}
|
134320 |
}
|
tokenizer_config.json
CHANGED
The diff for this file is too large to render.
See raw diff
|
|