Fixed tokenizer (hopefully)
[SYSTEM_PROMPT] and [/SYSTEM_PROMPT] moved from token IDs 131072/131073 to 17/18, as was originally intended. Extraneous tokens removed from config. Embedding weights and lm_head altered with the Python Transformers toolkit to accommodate these changes.
- config.json +1 -1
- model-00001-of-00005.safetensors +2 -2
- model-00005-of-00005.safetensors +2 -2
- tokenizer.json +2 -2
- tokenizer_config.json +3 -19
config.json
CHANGED
@@ -23,5 +23,5 @@
|
|
23 |
"torch_dtype": "bfloat16",
|
24 |
"transformers_version": "4.49.0.dev0",
|
25 |
"use_cache": false,
|
26 |
-
"vocab_size":
|
27 |
}
|
|
|
23 |
"torch_dtype": "bfloat16",
|
24 |
"transformers_version": "4.49.0.dev0",
|
25 |
"use_cache": false,
|
26 |
+
"vocab_size": 131072
|
27 |
}
|
model-00001-of-00005.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:741eb08fd622a4cc8331a59fc7262f9e61f1a53f24d439e6a820bb2ea3c22422
|
3 |
+
size 4865522464
|
model-00005-of-00005.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:34e82af09f94e8a3f8c9959f7623fe404660e2e6eb56a72ddffbf1d859e031dd
|
3 |
+
size 4907496240
|
tokenizer.json
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:38b3cc4f6ed44a0c1e023d21fcb148b555d288cb74dac642eb49bdfe58a3a70e
|
3 |
+
size 21525621
|
tokenizer_config.json
CHANGED
@@ -140,7 +140,7 @@
|
|
140 |
"special": true
|
141 |
},
|
142 |
"17": {
|
143 |
-
"content": "
|
144 |
"lstrip": false,
|
145 |
"normalized": false,
|
146 |
"rstrip": false,
|
@@ -148,7 +148,7 @@
|
|
148 |
"special": true
|
149 |
},
|
150 |
"18": {
|
151 |
-
"content": "
|
152 |
"lstrip": false,
|
153 |
"normalized": false,
|
154 |
"rstrip": false,
|
@@ -8002,22 +8002,6 @@
|
|
8002 |
"rstrip": false,
|
8003 |
"single_word": false,
|
8004 |
"special": true
|
8005 |
-
},
|
8006 |
-
"131072": {
|
8007 |
-
"content": "[SYSTEM_PROMPT]",
|
8008 |
-
"lstrip": false,
|
8009 |
-
"normalized": false,
|
8010 |
-
"rstrip": false,
|
8011 |
-
"single_word": false,
|
8012 |
-
"special": false
|
8013 |
-
},
|
8014 |
-
"131073": {
|
8015 |
-
"content": "[/SYSTEM_PROMPT]",
|
8016 |
-
"lstrip": false,
|
8017 |
-
"normalized": false,
|
8018 |
-
"rstrip": false,
|
8019 |
-
"single_word": false,
|
8020 |
-
"special": false
|
8021 |
}
|
8022 |
},
|
8023 |
"bos_token": "<s>",
|
@@ -8031,6 +8015,6 @@
|
|
8031 |
],
|
8032 |
"model_max_length": 1000000000000000019884624838656,
|
8033 |
"pad_token": "<pad>",
|
8034 |
-
"tokenizer_class": "
|
8035 |
"unk_token": "<unk>"
|
8036 |
}
|
|
|
140 |
"special": true
|
141 |
},
|
142 |
"17": {
|
143 |
+
"content": "[SYSTEM_PROMPT]",
|
144 |
"lstrip": false,
|
145 |
"normalized": false,
|
146 |
"rstrip": false,
|
|
|
148 |
"special": true
|
149 |
},
|
150 |
"18": {
|
151 |
+
"content": "[/SYSTEM_PROMPT]",
|
152 |
"lstrip": false,
|
153 |
"normalized": false,
|
154 |
"rstrip": false,
|
|
|
8002 |
"rstrip": false,
|
8003 |
"single_word": false,
|
8004 |
"special": true
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8005 |
}
|
8006 |
},
|
8007 |
"bos_token": "<s>",
|
|
|
8015 |
],
|
8016 |
"model_max_length": 1000000000000000019884624838656,
|
8017 |
"pad_token": "<pad>",
|
8018 |
+
"tokenizer_class": "PreTrainedTokenizerFast",
|
8019 |
"unk_token": "<unk>"
|
8020 |
}
|