inflatebot committed on
Commit
e7b469d
·
verified ·
1 Parent(s): a8035f8

Fixed tokenizer (hopefully)

Browse files

[SYSTEM_PROMPT] and [/SYSTEM_PROMPT] moved from token IDs 131072/131073 to 17/18, as was originally intended. Extraneous tokens removed from config. Embedding weights and lm_head altered with the Python Transformers toolkit to accommodate these changes.

config.json CHANGED
@@ -23,5 +23,5 @@
23
  "torch_dtype": "bfloat16",
24
  "transformers_version": "4.49.0.dev0",
25
  "use_cache": false,
26
- "vocab_size": 131074
27
  }
 
23
  "torch_dtype": "bfloat16",
24
  "transformers_version": "4.49.0.dev0",
25
  "use_cache": false,
26
+ "vocab_size": 131072
27
  }
model-00001-of-00005.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a131f81d2b31fefbe344c904e613ef8161baee5036a59888d2b0ab523ad165ec
3
- size 4865542976
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:741eb08fd622a4cc8331a59fc7262f9e61f1a53f24d439e6a820bb2ea3c22422
3
+ size 4865522464
model-00005-of-00005.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4794b542a546b181bcc7708a04c4c70d278e13cec5abd20cd74fe12e9b228369
3
- size 4907516752
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34e82af09f94e8a3f8c9959f7623fe404660e2e6eb56a72ddffbf1d859e031dd
3
+ size 4907496240
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a26d19ba141138e260796b82a35d1e4a783d007706820702ba03ec160c3b2efc
3
- size 17078679
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38b3cc4f6ed44a0c1e023d21fcb148b555d288cb74dac642eb49bdfe58a3a70e
3
+ size 21525621
tokenizer_config.json CHANGED
@@ -140,7 +140,7 @@
140
  "special": true
141
  },
142
  "17": {
143
- "content": "<SPECIAL_17>",
144
  "lstrip": false,
145
  "normalized": false,
146
  "rstrip": false,
@@ -148,7 +148,7 @@
148
  "special": true
149
  },
150
  "18": {
151
- "content": "<SPECIAL_18>",
152
  "lstrip": false,
153
  "normalized": false,
154
  "rstrip": false,
@@ -8002,22 +8002,6 @@
8002
  "rstrip": false,
8003
  "single_word": false,
8004
  "special": true
8005
- },
8006
- "131072": {
8007
- "content": "[SYSTEM_PROMPT]",
8008
- "lstrip": false,
8009
- "normalized": false,
8010
- "rstrip": false,
8011
- "single_word": false,
8012
- "special": false
8013
- },
8014
- "131073": {
8015
- "content": "[/SYSTEM_PROMPT]",
8016
- "lstrip": false,
8017
- "normalized": false,
8018
- "rstrip": false,
8019
- "single_word": false,
8020
- "special": false
8021
  }
8022
  },
8023
  "bos_token": "<s>",
@@ -8031,6 +8015,6 @@
8031
  ],
8032
  "model_max_length": 1000000000000000019884624838656,
8033
  "pad_token": "<pad>",
8034
- "tokenizer_class": "PreTrainedTokenizer",
8035
  "unk_token": "<unk>"
8036
  }
 
140
  "special": true
141
  },
142
  "17": {
143
+ "content": "[SYSTEM_PROMPT]",
144
  "lstrip": false,
145
  "normalized": false,
146
  "rstrip": false,
 
148
  "special": true
149
  },
150
  "18": {
151
+ "content": "[/SYSTEM_PROMPT]",
152
  "lstrip": false,
153
  "normalized": false,
154
  "rstrip": false,
 
8002
  "rstrip": false,
8003
  "single_word": false,
8004
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8005
  }
8006
  },
8007
  "bos_token": "<s>",
 
8015
  ],
8016
  "model_max_length": 1000000000000000019884624838656,
8017
  "pad_token": "<pad>",
8018
+ "tokenizer_class": "PreTrainedTokenizerFast",
8019
  "unk_token": "<unk>"
8020
  }