diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000000000000000000000000000000000000..3bdeb4b6404a7a78bc342f435ec4dbb781a72f9d
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,36 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+media/Passing[[:space:]]the[[:space:]]Torch[[:space:]](SorcererLM[[:space:]]Theme).mp3 filter=lfs diff=lfs merge=lfs -text
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..f9a03753d5d88624ad825ba39a935825073444a9
--- /dev/null
+++ b/README.md
@@ -0,0 +1,42 @@
+---
+license: apache-2.0
+base_model: alpindale/WizardLM-2-8x22B
+---
+
+# SorcererLM-8x22b-bf16
+
+
+
+
+Oh boy, here we go. Low-rank (`r=16, alpha=32`) 16-bit LoRA on top of [WizardLM-2-8x22B](https://huggingface.co/alpindale/WizardLM-2-8x22B), trained for 2 epochs on (cleaned & deduped) c2-logs. As far as I can tell, this is an upgrade from `WizardLM-2-8x22B` for RP purposes.
+
+Alongside this ready-to-use release, I'm also releasing [the LoRA itself](https://huggingface.co/rAIfle/SorcererLM-8x22b-epoch2-LoRA) as well as [the earlier `epoch1` checkpoint of the LoRA](https://huggingface.co/rAIfle/SorcererLM-8x22b-epoch1-LoRA).
+
+## Why A LoRA?
+
+The choice was fully intentional. I briefly considered an FFT, but for this particular use case a LoRA seemed a better fit. `WizardLM-2-8x22B` is smart by itself, but the vocabulary it uses leaves much to be desired when it comes to RP. Training a low-rank LoRA on top of it to teach it some of Claude's writing style remedies that.
+
+## Prompting
+
+- Use the templates in [Quant-Cartel/Recommended-Settings](https://huggingface.co/Quant-Cartel/Recommended-Settings) under the `SorcererLM` folder.
+- Or Vicuna 1.1 and a sane context template. It's somewhat sensitive to samplers; I'd recommend Temperature 1, MinP 0.05 and a dash of DRY, but YMMV. Shorter prompts seem to work better, too. See the sketch below for what these settings look like in code.
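+
+For reference, here's a rough sketch of those settings with `transformers`. The repo id and prompt are just placeholders, and DRY is left out since it's a backend-specific sampler (llama.cpp, text-generation-webui and friends) rather than part of stock `generate()`:
+
+```python
+# Minimal sketch, assuming a recent transformers release (for min_p support) and
+# enough GPU memory for the bf16 weights (~280 GB spread across devices).
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+repo = "rAIfle/SorcererLM-8x22b-bf16"  # placeholder repo id
+tokenizer = AutoTokenizer.from_pretrained(repo)
+model = AutoModelForCausalLM.from_pretrained(
+    repo, torch_dtype=torch.bfloat16, device_map="auto"
+)
+
+# Vicuna 1.1 format: a system line followed by USER/ASSISTANT turns.
+prompt = (
+    "A chat between a curious user and an artificial intelligence assistant. "
+    "The assistant gives helpful, detailed, and polite answers to the user's questions. "
+    "USER: Write a short scene introducing a weary court sorcerer. ASSISTANT:"
+)
+inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+
+out = model.generate(
+    **inputs,
+    max_new_tokens=512,
+    do_sample=True,
+    temperature=1.0,  # Temperature 1
+    min_p=0.05,       # MinP 0.05
+)
+print(tokenizer.decode(out[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True))
+```
+
+The same numbers carry over to frontend sampler presets; if your backend exposes DRY, add a light touch of it on top.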
+
+## Quantized Versions
+
+- [iMat GGUFs](https://huggingface.co/Quant-Cartel/SorcererLM-8x22b-iMat-GGUF)
+- [longcal exl2s](https://huggingface.co/Quant-Cartel/SorcererLM-8x22b-exl2-longcal)
+
+## Acknowledgments
+
+The main shoutout I want to make is to my [Cartel](https://huggingface.co/Quant-Cartel) bros, [Envoid](https://huggingface.co/Envoid) and particularly [I^2](https://huggingface.co/InferenceIllusionist), for being amazing. I count this as a team effort, so if you like this, they deserve kudos too.
+
+
+## Training
+
+Trained using [qlora-pipe](https://github.com/tdrussell/qlora-pipe). Configs are included in the `train` subfolder.
+
+## Safety
+
+... n/a
+
diff --git a/config.json b/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..e8de4eaf4fb84698ba6cee6e468e1658274da6a2
--- /dev/null
+++ b/config.json
@@ -0,0 +1,31 @@
+{
+  "_name_or_path": "alpindale/WizardLM-2-8x22B",
+  "architectures": [
+    "MixtralForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "hidden_act": "silu",
+  "hidden_size": 6144,
+  "initializer_range": 0.02,
+  "intermediate_size": 16384,
+  "max_position_embeddings": 65536,
+  "model_type": "mixtral",
+  "num_attention_heads": 48,
+  "num_experts_per_tok": 2,
+  "num_hidden_layers": 56,
+  "num_key_value_heads": 8,
+  "num_local_experts": 8,
+  "output_router_logits": false,
+  "rms_norm_eps": 1e-05,
+  "rope_theta": 1000000,
+  "router_aux_loss_coef": 0.001,
+  "router_jitter_noise": 0.0,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.45.0.dev0",
+  "use_cache": false,
+  "vocab_size": 32000
+}
diff --git a/generation_config.json b/generation_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..5d0dd04cdb8df77be89256e848431284ed853f2a
--- /dev/null
+++ b/generation_config.json
@@ -0,0 +1,6 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "transformers_version": "4.45.0.dev0"
+}
diff --git a/media/Passing the Torch (SorcererLM Theme).mp3 b/media/Passing the Torch (SorcererLM Theme).mp3
new file mode 100644
index 0000000000000000000000000000000000000000..14e245845dcad91266e6963892df3a598967aac7
--- /dev/null
+++ b/media/Passing the Torch (SorcererLM Theme).mp3
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:72fe4d2f099ddc5c3b3f4969c98c7ff04995cf9eb6ea9bda08fb7618aac2097e
+size 3283087
diff --git a/model-00001-of-00059.safetensors b/model-00001-of-00059.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..d4b1e55b16b54451f62033febc81e9e6107c462b
--- /dev/null
+++ b/model-00001-of-00059.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d7d3d5d04aa64ba71d25d250f327c727c74c41a2bfe2059473d5da08ae0562f
+size 4998663696
diff --git a/model-00002-of-00059.safetensors b/model-00002-of-00059.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..0d410db7242f88d81a7d1b2f1a175d1546aebca5
--- /dev/null
+++ b/model-00002-of-00059.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f5f291ecb2ca6bda09f238b552d94df1de8bde2573873fa34ed4b6db825044a7
+size 4806799120
diff --git a/model-00003-of-00059.safetensors b/model-00003-of-00059.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..6f66c9b2d86b61f5b09ff13b8eaa4715161ed7fe
--- /dev/null
+++ b/model-00003-of-00059.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5bad12c7affc5bc8fa540e18f1e215473f7bc178fd037bbabc87fd0a1e5cb2eb +size 4806799120 diff --git a/model-00004-of-00059.safetensors b/model-00004-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e6bbb1321a10897e0dbc56c375e5860981ac4afc --- /dev/null +++ b/model-00004-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:620fe0f510fa465bc06946c275673640e341ce7ea7912ecaffabd7afed88e544 +size 4806799120 diff --git a/model-00005-of-00059.safetensors b/model-00005-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..10c392ee7de4cf9f0ffa727dcb1a997066c57d5d --- /dev/null +++ b/model-00005-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5d552b8f44a2d0fc8956e60a2a33cf0679954158f9546b6c14425377d09eb8e +size 4806799120 diff --git a/model-00006-of-00059.safetensors b/model-00006-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b846e78959b9c51926efa68a59dc171590b72d59 --- /dev/null +++ b/model-00006-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:398e1bfda69967ada4d27ff86001a05817fd9d48fc4d97b872e37bcad099e608 +size 4806799120 diff --git a/model-00007-of-00059.safetensors b/model-00007-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1a762117dd3cf0ad2091c83ab72c2640a8dacb3c --- /dev/null +++ b/model-00007-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67f65f83fd01d575921e79232d0cd7c43130339b0f057a34894b2e2f2ea229d4 +size 4806799120 diff --git a/model-00008-of-00059.safetensors b/model-00008-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9547d053d7b865e2c647120c53d1db632ef22b43 --- /dev/null +++ b/model-00008-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7eef0aced50f483e3977002608dbe69742e50083a2223ef02a893efef2e1fc1 +size 4806799120 diff --git a/model-00009-of-00059.safetensors b/model-00009-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a9293d0aa6a6216cc987f9926046156a62d28623 --- /dev/null +++ b/model-00009-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab2455033a4a602265b276eb3d201ab446eab1f21aafa45971c84357ab858aaf +size 4806799120 diff --git a/model-00010-of-00059.safetensors b/model-00010-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b87887bcb8526d11921aa6fe3cd36ba18b07b6da --- /dev/null +++ b/model-00010-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e6a1ae7c53dd1ab4785435403c78f5c195619dbeae62dd59a9a3b8256d49db4 +size 4806799120 diff --git a/model-00011-of-00059.safetensors b/model-00011-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d705d0796178b46492f26d1cb4e68875c053976d --- /dev/null +++ b/model-00011-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da7b3b0f4ff5c5b240d412424b9df5a4bd12dfb77847b84987d51204e682a407 +size 4806799136 diff --git a/model-00012-of-00059.safetensors b/model-00012-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..982a1eb62241bf20b4ea358a06f132c914a4c820 --- /dev/null +++ b/model-00012-of-00059.safetensors @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:893b06d532e9c451e08bafa8e9f54776a0d5f939806108468e459ea9f6999552 +size 4806799152 diff --git a/model-00013-of-00059.safetensors b/model-00013-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a747738e2b29830e65ae8035876b90df8c952b05 --- /dev/null +++ b/model-00013-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a59f9785587ced21ef498f73d20b26334b24b1701978fdb616c8293e122b4ca +size 4806799152 diff --git a/model-00014-of-00059.safetensors b/model-00014-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e250b01edaaacb1751ea10ef41475a1410d4a51d --- /dev/null +++ b/model-00014-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1cb9d52f4f4ab5f3740f7a82acc7c82b5ed5b757f9cad1abb0ad4350cf9f41e +size 4806799152 diff --git a/model-00015-of-00059.safetensors b/model-00015-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48257cfec2056a93be7082e0d4e6e58b0919711f --- /dev/null +++ b/model-00015-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efbbf3c0039a1ebf7e26aab96e776c540e886533ba746568eab15f96ce44ee38 +size 4806799152 diff --git a/model-00016-of-00059.safetensors b/model-00016-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..404714ebc39a66f1ff204da8a844d42852c922a5 --- /dev/null +++ b/model-00016-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9819d0aa6cdfabad7b86c42ff9777a674f969019f4fe2aaa6a9e98e2929c1637 +size 4806799152 diff --git a/model-00017-of-00059.safetensors b/model-00017-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..012dbd70a026ca7003b24d03868212fb151274fd --- /dev/null +++ b/model-00017-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67fdf0216749a459671dbdfefa6c8beee468acd78ada1e952d563aeacc8fec8b +size 4806799152 diff --git a/model-00018-of-00059.safetensors b/model-00018-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ffaf78ce9c1466b64758dad29d4ae1a3d9d5ba98 --- /dev/null +++ b/model-00018-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:935812c05f6aa6e36c015b7ed675d72e81bc2d4f95a09e94bca7c5593b1b7955 +size 4806799152 diff --git a/model-00019-of-00059.safetensors b/model-00019-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ec983f085b7fcc3d7a4415b3d3f7e5e139a8e778 --- /dev/null +++ b/model-00019-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a31cd3be7d421893dd586fd8220a7be95af0d58a9d14d1f395f40531eff0a10 +size 4806799152 diff --git a/model-00020-of-00059.safetensors b/model-00020-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..80dd0c9e0b0b0fd3552823cf318d9226cf5a14e5 --- /dev/null +++ b/model-00020-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55e0d6f28661de4ad8ba8e6e0bf3f4bbe9565ffbff703af265b66c04b0baf69c +size 4806799152 diff --git a/model-00021-of-00059.safetensors b/model-00021-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ddbe15feceeac19fe4a8ea91be7475e5384cd0d9 --- /dev/null +++ 
b/model-00021-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8dd1595aac673b69ee361fe700b999896862f792d5630659649a572e1bb43711 +size 4806799152 diff --git a/model-00022-of-00059.safetensors b/model-00022-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..988152aa0c789b4984857aab5c00baecfec2e233 --- /dev/null +++ b/model-00022-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b5f1a40a2942e0cd72775397947feb7bddbda182985eaf79daa90da9c0e20e7 +size 4806799152 diff --git a/model-00023-of-00059.safetensors b/model-00023-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3bec6647e6bfe0fa97d380bd392006a38eee4c55 --- /dev/null +++ b/model-00023-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcd53346c96de33ccb9387f4b173c34004512226c34c06323ef0fe7578877d8d +size 4806799152 diff --git a/model-00024-of-00059.safetensors b/model-00024-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2c8624dee80339bf917bec9ffd19a01a94d0b674 --- /dev/null +++ b/model-00024-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33a12a9db9323cc44b0a3101f461da26cd23e0721c15f0351349632be5ff1e36 +size 4932529864 diff --git a/model-00025-of-00059.safetensors b/model-00025-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9ab10ef4747689ee014d8b8b3250b35fa1ecdd20 --- /dev/null +++ b/model-00025-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1d4ecbad71ff079612a1c5fb0e7ea1f98b712f1bcced164f6b53c580c3dcdf4 +size 4995542848 diff --git a/model-00026-of-00059.safetensors b/model-00026-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b9f8df1514382542568c8e81a0e0e1643f058628 --- /dev/null +++ b/model-00026-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e83a92085f9325957d80aab4c2b27d7519e637b62a9e2591a91526e3924b6e1c +size 4995542848 diff --git a/model-00027-of-00059.safetensors b/model-00027-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..17e78d140e850eb7925fc6da38c77f3b7be91821 --- /dev/null +++ b/model-00027-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97900e8e9522545497945fc89d5f488ce70869c30e32f286236b2c46b6d36249 +size 4932628288 diff --git a/model-00028-of-00059.safetensors b/model-00028-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e29379029b65742363dbf5cece13fb45b564d26c --- /dev/null +++ b/model-00028-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49156a048d247a4aec7b8f7ea8ec497c836edd0c5b599a37a5c606806dfc9ae0 +size 4806774344 diff --git a/model-00029-of-00059.safetensors b/model-00029-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f4e8405f67e236b0a38f8202800c8de2a340166d --- /dev/null +++ b/model-00029-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f120d5f53081bc8ce8339e5aaaae226f00eec5909414c296f16dd42a0c3865c8 +size 4806799144 diff --git a/model-00030-of-00059.safetensors b/model-00030-of-00059.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..4240a349e28d385a4b92d2e5dc61867908bb4f0d --- /dev/null +++ b/model-00030-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c73d242b90c426bc66dfe827b441deb73d68d86fbfd55a9370a1ddb6aa769c9e +size 4806799144 diff --git a/model-00031-of-00059.safetensors b/model-00031-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bba7286d6b8907758a42bb0a3214c9ff87ce7af0 --- /dev/null +++ b/model-00031-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83030c28c568a05b5dd8b17e1bd870672921acf1e3ff6a4978e8f336db382aff +size 4806799144 diff --git a/model-00032-of-00059.safetensors b/model-00032-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bc7fb1ea471278a5c02e76c8c9cf261318b039e4 --- /dev/null +++ b/model-00032-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4389108be53523985433dcd171b7e22fe9e51ee7add36be8c4df100b5ad4eed1 +size 4806799144 diff --git a/model-00033-of-00059.safetensors b/model-00033-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..93286cc975a7742d205540f4d692e7a44ac89b3f --- /dev/null +++ b/model-00033-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c131596bfd971e605de7aed55495bf9be723a159e03d619c091738df46edbea +size 4806799152 diff --git a/model-00034-of-00059.safetensors b/model-00034-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b6b879afdb1c262b145be20f407e883e378f61d1 --- /dev/null +++ b/model-00034-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee114be8ba934f9aced075356dbde02b7bcd5adc36cb6ab0c8de35b343a42577 +size 4806799152 diff --git a/model-00035-of-00059.safetensors b/model-00035-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..43afc3735eba0a480208e9cec64a7dca92462e8b --- /dev/null +++ b/model-00035-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c723d67bdedb824af4d246f122cc6e14c645ef1bb7960fd8c25d02f596beb18 +size 4806799152 diff --git a/model-00036-of-00059.safetensors b/model-00036-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..01a61dc7ddd71ef816f33ed0a841c130f4633f1e --- /dev/null +++ b/model-00036-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:508228cda4960dac196b058125cfbd8fc50a50520e42174680a532fea441e496 +size 4806799152 diff --git a/model-00037-of-00059.safetensors b/model-00037-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5e31adfc5ba482df1c708c09fa3d7c49f2bef3e5 --- /dev/null +++ b/model-00037-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d145a7fbf7cedf87fb1c17cddd0a740399777bed27d6861fb0bb1de12b54a05c +size 4806799152 diff --git a/model-00038-of-00059.safetensors b/model-00038-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..55bc159b658b8621c0afbe076c844899dd183aa8 --- /dev/null +++ b/model-00038-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15d801e05a669b53971cf6a69e058c6c5d9e59bf7b399195ab89421f12f84b85 +size 4806799152 diff --git a/model-00039-of-00059.safetensors b/model-00039-of-00059.safetensors 
new file mode 100644 index 0000000000000000000000000000000000000000..8a8d346015954226eecd9d4175fc740c79b81793 --- /dev/null +++ b/model-00039-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1249dee208fb3fded28c70c825e7bc9e3fa7cfc91f38139cf265b73fdaf3bd4 +size 4806799152 diff --git a/model-00040-of-00059.safetensors b/model-00040-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..05ed11f86eb54f9bb89e36b3472322478043a5cc --- /dev/null +++ b/model-00040-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38cf573512843fd554388c2b451ae05fdde0ce35746ebcacda6fae7d9e5e3f7a +size 4806799152 diff --git a/model-00041-of-00059.safetensors b/model-00041-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6fe8017e7bbf8a16a2c0b8a3fda5a3bd6cb694be --- /dev/null +++ b/model-00041-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4656908e119cd5fc1898b3547e28778e3b6b21108d2d1de0a22aa458aa8d5ee1 +size 4806799152 diff --git a/model-00042-of-00059.safetensors b/model-00042-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f1c15fa26d60eeeeb40fd82c39a6308ae865b98f --- /dev/null +++ b/model-00042-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d13e13fb51c1934eed90f40bf310d5d430d3fe9ece7fe0557a15daf94407a8f +size 4806799152 diff --git a/model-00043-of-00059.safetensors b/model-00043-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b72dd7516910cf13c133b95d0c04e0e2a1b6f371 --- /dev/null +++ b/model-00043-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b42523511ba6fcf7231469b15bae6af3cdccd0ac5f6a50bc780dc2321c7258e +size 4806799152 diff --git a/model-00044-of-00059.safetensors b/model-00044-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4200f219e152fa76c82f234123b158f955cc393a --- /dev/null +++ b/model-00044-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e723e9cdc7d276df62775929952b1b25b38c21b75a63b4beb0a6d02cd617e3d9 +size 4806799152 diff --git a/model-00045-of-00059.safetensors b/model-00045-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f45179f7fc4d7604ab51c32866ac5130ae9e430d --- /dev/null +++ b/model-00045-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01c14d1c87da6128abfb084937d2d11b04925ea133bd28b3adb114d11a33ceb3 +size 4806799152 diff --git a/model-00046-of-00059.safetensors b/model-00046-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7032e5a7101b0fa21a0a6af5e7313a3fae150337 --- /dev/null +++ b/model-00046-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6923f08ff8247b9755da04268038025ddb4a1eef8fe50caf8c78e5b0ec17547 +size 4806799152 diff --git a/model-00047-of-00059.safetensors b/model-00047-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2954220391752e24704837393602b8e4a65d8b5f --- /dev/null +++ b/model-00047-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18be23ebc7836331f3692e252a34a78aa003c952437b8ba483738bcc0d08371a +size 4806799152 diff --git a/model-00048-of-00059.safetensors 
b/model-00048-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9775bd304371edfba80c68459c9c283e3c6ee17b --- /dev/null +++ b/model-00048-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16ef6a2d7efaf15246159ee851786cb270a7b78f49c1878dcda74619a5bd8b87 +size 4806799152 diff --git a/model-00049-of-00059.safetensors b/model-00049-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cd6f255dee46783626c062ca535446b720ec8259 --- /dev/null +++ b/model-00049-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd008956b3e4c77cd755139afe5c4ec37a63e7db5c84feafc71e88835310e507 +size 4806799152 diff --git a/model-00050-of-00059.safetensors b/model-00050-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0dda087abd6c57e53460832aac90fce6bf7fd4de --- /dev/null +++ b/model-00050-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa7eb425d2d6133cde5be41b263138e1efbfad682fd56b5ff174ea2173bad9e4 +size 4806799152 diff --git a/model-00051-of-00059.safetensors b/model-00051-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ed51a57fc5fd989eac585123de9b9ed08332f811 --- /dev/null +++ b/model-00051-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b082bd764f1e99de20b9eb059ff5a9079f70520319d9e3b1ff564b552af18324 +size 4806799152 diff --git a/model-00052-of-00059.safetensors b/model-00052-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e956c6fb8f347cd9cde046a8e125f051da572727 --- /dev/null +++ b/model-00052-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3c7d0077e9b198308ac6ef6f57a1f9d84446a68e09a4929d7446d3f668e24b +size 4932529864 diff --git a/model-00053-of-00059.safetensors b/model-00053-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..79ebaeeb065152f8eb12144f14a87a97f5f52f1a --- /dev/null +++ b/model-00053-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb113ec4883a5a66329bda3abacdda57baa56c7585d1b4bdce168763c63e3582 +size 4995542848 diff --git a/model-00054-of-00059.safetensors b/model-00054-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..83217d858025f82201937798ceff8f62ce299ee2 --- /dev/null +++ b/model-00054-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:266264b68c5a1447a01c40ae4673cd7194ec262abdf1e40b703785f260e9fc3a +size 4995542848 diff --git a/model-00055-of-00059.safetensors b/model-00055-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..12e9e8d2f66d4a29d6eb9261bef760ccf14aa20f --- /dev/null +++ b/model-00055-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3d2c796c64e5e3df6022e9b7dd06c220c1d8645da60ccac68762ea53f2c3c31 +size 4932628288 diff --git a/model-00056-of-00059.safetensors b/model-00056-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a9f338ccfd75af81bf5066a7867bb18c4c5e320c --- /dev/null +++ b/model-00056-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87e791c107c68821450316a8ed07d1949a942f77686f2bb466028198fed02e63 +size 4806774344 diff --git 
a/model-00057-of-00059.safetensors b/model-00057-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c0cc6a272d3b7d4195ac38f9fa1fe205313edec8 --- /dev/null +++ b/model-00057-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f2ce018efb32fcc79ef0a3646b560fc06e0cfd02cbe1f83c149ce39ad490d25 +size 4806799144 diff --git a/model-00058-of-00059.safetensors b/model-00058-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..59ba51605435a11e0575c9bd3a57721e6db82391 --- /dev/null +++ b/model-00058-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15aa2bdab964fda786fd56d233dba306570f5fb77f0b632f8cd408b02958ad89 +size 4806799144 diff --git a/model-00059-of-00059.safetensors b/model-00059-of-00059.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6a1a92ea4b485bafd31a1303aa4fe68aaff54052 --- /dev/null +++ b/model-00059-of-00059.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d252e46ddc98799062ead170128a24260ea63b53144ae38da2ec816251ab099 +size 997233472 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..805c18819ee3c4a51164b634df4f4033c74583ad --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,1746 @@ +{ + "metadata": { + "total_size": 281241268224 + }, + "weight_map": { + "lm_head.weight": "model-00059-of-00059.safetensors", + "model.embed_tokens.weight": "model-00001-of-00059.safetensors", + "model.layers.0.block_sparse_moe.experts.0.w1.weight": "model-00001-of-00059.safetensors", + "model.layers.0.block_sparse_moe.experts.0.w2.weight": "model-00001-of-00059.safetensors", + "model.layers.0.block_sparse_moe.experts.0.w3.weight": "model-00001-of-00059.safetensors", + "model.layers.0.block_sparse_moe.experts.1.w1.weight": "model-00001-of-00059.safetensors", + "model.layers.0.block_sparse_moe.experts.1.w2.weight": "model-00001-of-00059.safetensors", + "model.layers.0.block_sparse_moe.experts.1.w3.weight": "model-00001-of-00059.safetensors", + "model.layers.0.block_sparse_moe.experts.2.w1.weight": "model-00001-of-00059.safetensors", + "model.layers.0.block_sparse_moe.experts.2.w2.weight": "model-00001-of-00059.safetensors", + "model.layers.0.block_sparse_moe.experts.2.w3.weight": "model-00001-of-00059.safetensors", + "model.layers.0.block_sparse_moe.experts.3.w1.weight": "model-00001-of-00059.safetensors", + "model.layers.0.block_sparse_moe.experts.3.w2.weight": "model-00001-of-00059.safetensors", + "model.layers.0.block_sparse_moe.experts.3.w3.weight": "model-00001-of-00059.safetensors", + "model.layers.0.block_sparse_moe.experts.4.w1.weight": "model-00001-of-00059.safetensors", + "model.layers.0.block_sparse_moe.experts.4.w2.weight": "model-00001-of-00059.safetensors", + "model.layers.0.block_sparse_moe.experts.4.w3.weight": "model-00001-of-00059.safetensors", + "model.layers.0.block_sparse_moe.experts.5.w1.weight": "model-00001-of-00059.safetensors", + "model.layers.0.block_sparse_moe.experts.5.w2.weight": "model-00001-of-00059.safetensors", + "model.layers.0.block_sparse_moe.experts.5.w3.weight": "model-00001-of-00059.safetensors", + "model.layers.0.block_sparse_moe.experts.6.w1.weight": "model-00001-of-00059.safetensors", + "model.layers.0.block_sparse_moe.experts.6.w2.weight": "model-00001-of-00059.safetensors", + "model.layers.0.block_sparse_moe.experts.6.w3.weight": 
"model-00001-of-00059.safetensors", + "model.layers.0.block_sparse_moe.experts.7.w1.weight": "model-00001-of-00059.safetensors", + "model.layers.0.block_sparse_moe.experts.7.w2.weight": "model-00002-of-00059.safetensors", + "model.layers.0.block_sparse_moe.experts.7.w3.weight": "model-00002-of-00059.safetensors", + "model.layers.0.block_sparse_moe.gate.weight": "model-00001-of-00059.safetensors", + "model.layers.0.input_layernorm.weight": "model-00002-of-00059.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00002-of-00059.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00059.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00059.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00059.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00059.safetensors", + "model.layers.1.block_sparse_moe.experts.0.w1.weight": "model-00002-of-00059.safetensors", + "model.layers.1.block_sparse_moe.experts.0.w2.weight": "model-00002-of-00059.safetensors", + "model.layers.1.block_sparse_moe.experts.0.w3.weight": "model-00002-of-00059.safetensors", + "model.layers.1.block_sparse_moe.experts.1.w1.weight": "model-00002-of-00059.safetensors", + "model.layers.1.block_sparse_moe.experts.1.w2.weight": "model-00002-of-00059.safetensors", + "model.layers.1.block_sparse_moe.experts.1.w3.weight": "model-00002-of-00059.safetensors", + "model.layers.1.block_sparse_moe.experts.2.w1.weight": "model-00002-of-00059.safetensors", + "model.layers.1.block_sparse_moe.experts.2.w2.weight": "model-00002-of-00059.safetensors", + "model.layers.1.block_sparse_moe.experts.2.w3.weight": "model-00002-of-00059.safetensors", + "model.layers.1.block_sparse_moe.experts.3.w1.weight": "model-00002-of-00059.safetensors", + "model.layers.1.block_sparse_moe.experts.3.w2.weight": "model-00002-of-00059.safetensors", + "model.layers.1.block_sparse_moe.experts.3.w3.weight": "model-00002-of-00059.safetensors", + "model.layers.1.block_sparse_moe.experts.4.w1.weight": "model-00002-of-00059.safetensors", + "model.layers.1.block_sparse_moe.experts.4.w2.weight": "model-00002-of-00059.safetensors", + "model.layers.1.block_sparse_moe.experts.4.w3.weight": "model-00002-of-00059.safetensors", + "model.layers.1.block_sparse_moe.experts.5.w1.weight": "model-00002-of-00059.safetensors", + "model.layers.1.block_sparse_moe.experts.5.w2.weight": "model-00002-of-00059.safetensors", + "model.layers.1.block_sparse_moe.experts.5.w3.weight": "model-00002-of-00059.safetensors", + "model.layers.1.block_sparse_moe.experts.6.w1.weight": "model-00002-of-00059.safetensors", + "model.layers.1.block_sparse_moe.experts.6.w2.weight": "model-00002-of-00059.safetensors", + "model.layers.1.block_sparse_moe.experts.6.w3.weight": "model-00002-of-00059.safetensors", + "model.layers.1.block_sparse_moe.experts.7.w1.weight": "model-00003-of-00059.safetensors", + "model.layers.1.block_sparse_moe.experts.7.w2.weight": "model-00003-of-00059.safetensors", + "model.layers.1.block_sparse_moe.experts.7.w3.weight": "model-00003-of-00059.safetensors", + "model.layers.1.block_sparse_moe.gate.weight": "model-00002-of-00059.safetensors", + "model.layers.1.input_layernorm.weight": "model-00003-of-00059.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00003-of-00059.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00002-of-00059.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00002-of-00059.safetensors", + 
"model.layers.1.self_attn.q_proj.weight": "model-00002-of-00059.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00002-of-00059.safetensors", + "model.layers.10.block_sparse_moe.experts.0.w1.weight": "model-00011-of-00059.safetensors", + "model.layers.10.block_sparse_moe.experts.0.w2.weight": "model-00011-of-00059.safetensors", + "model.layers.10.block_sparse_moe.experts.0.w3.weight": "model-00011-of-00059.safetensors", + "model.layers.10.block_sparse_moe.experts.1.w1.weight": "model-00011-of-00059.safetensors", + "model.layers.10.block_sparse_moe.experts.1.w2.weight": "model-00011-of-00059.safetensors", + "model.layers.10.block_sparse_moe.experts.1.w3.weight": "model-00011-of-00059.safetensors", + "model.layers.10.block_sparse_moe.experts.2.w1.weight": "model-00011-of-00059.safetensors", + "model.layers.10.block_sparse_moe.experts.2.w2.weight": "model-00011-of-00059.safetensors", + "model.layers.10.block_sparse_moe.experts.2.w3.weight": "model-00011-of-00059.safetensors", + "model.layers.10.block_sparse_moe.experts.3.w1.weight": "model-00011-of-00059.safetensors", + "model.layers.10.block_sparse_moe.experts.3.w2.weight": "model-00011-of-00059.safetensors", + "model.layers.10.block_sparse_moe.experts.3.w3.weight": "model-00011-of-00059.safetensors", + "model.layers.10.block_sparse_moe.experts.4.w1.weight": "model-00012-of-00059.safetensors", + "model.layers.10.block_sparse_moe.experts.4.w2.weight": "model-00012-of-00059.safetensors", + "model.layers.10.block_sparse_moe.experts.4.w3.weight": "model-00012-of-00059.safetensors", + "model.layers.10.block_sparse_moe.experts.5.w1.weight": "model-00012-of-00059.safetensors", + "model.layers.10.block_sparse_moe.experts.5.w2.weight": "model-00012-of-00059.safetensors", + "model.layers.10.block_sparse_moe.experts.5.w3.weight": "model-00012-of-00059.safetensors", + "model.layers.10.block_sparse_moe.experts.6.w1.weight": "model-00012-of-00059.safetensors", + "model.layers.10.block_sparse_moe.experts.6.w2.weight": "model-00012-of-00059.safetensors", + "model.layers.10.block_sparse_moe.experts.6.w3.weight": "model-00012-of-00059.safetensors", + "model.layers.10.block_sparse_moe.experts.7.w1.weight": "model-00012-of-00059.safetensors", + "model.layers.10.block_sparse_moe.experts.7.w2.weight": "model-00012-of-00059.safetensors", + "model.layers.10.block_sparse_moe.experts.7.w3.weight": "model-00012-of-00059.safetensors", + "model.layers.10.block_sparse_moe.gate.weight": "model-00011-of-00059.safetensors", + "model.layers.10.input_layernorm.weight": "model-00012-of-00059.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00012-of-00059.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00011-of-00059.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00011-of-00059.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00011-of-00059.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00011-of-00059.safetensors", + "model.layers.11.block_sparse_moe.experts.0.w1.weight": "model-00012-of-00059.safetensors", + "model.layers.11.block_sparse_moe.experts.0.w2.weight": "model-00012-of-00059.safetensors", + "model.layers.11.block_sparse_moe.experts.0.w3.weight": "model-00012-of-00059.safetensors", + "model.layers.11.block_sparse_moe.experts.1.w1.weight": "model-00012-of-00059.safetensors", + "model.layers.11.block_sparse_moe.experts.1.w2.weight": "model-00012-of-00059.safetensors", + "model.layers.11.block_sparse_moe.experts.1.w3.weight": 
"model-00012-of-00059.safetensors", + "model.layers.11.block_sparse_moe.experts.2.w1.weight": "model-00012-of-00059.safetensors", + "model.layers.11.block_sparse_moe.experts.2.w2.weight": "model-00012-of-00059.safetensors", + "model.layers.11.block_sparse_moe.experts.2.w3.weight": "model-00012-of-00059.safetensors", + "model.layers.11.block_sparse_moe.experts.3.w1.weight": "model-00012-of-00059.safetensors", + "model.layers.11.block_sparse_moe.experts.3.w2.weight": "model-00012-of-00059.safetensors", + "model.layers.11.block_sparse_moe.experts.3.w3.weight": "model-00013-of-00059.safetensors", + "model.layers.11.block_sparse_moe.experts.4.w1.weight": "model-00013-of-00059.safetensors", + "model.layers.11.block_sparse_moe.experts.4.w2.weight": "model-00013-of-00059.safetensors", + "model.layers.11.block_sparse_moe.experts.4.w3.weight": "model-00013-of-00059.safetensors", + "model.layers.11.block_sparse_moe.experts.5.w1.weight": "model-00013-of-00059.safetensors", + "model.layers.11.block_sparse_moe.experts.5.w2.weight": "model-00013-of-00059.safetensors", + "model.layers.11.block_sparse_moe.experts.5.w3.weight": "model-00013-of-00059.safetensors", + "model.layers.11.block_sparse_moe.experts.6.w1.weight": "model-00013-of-00059.safetensors", + "model.layers.11.block_sparse_moe.experts.6.w2.weight": "model-00013-of-00059.safetensors", + "model.layers.11.block_sparse_moe.experts.6.w3.weight": "model-00013-of-00059.safetensors", + "model.layers.11.block_sparse_moe.experts.7.w1.weight": "model-00013-of-00059.safetensors", + "model.layers.11.block_sparse_moe.experts.7.w2.weight": "model-00013-of-00059.safetensors", + "model.layers.11.block_sparse_moe.experts.7.w3.weight": "model-00013-of-00059.safetensors", + "model.layers.11.block_sparse_moe.gate.weight": "model-00012-of-00059.safetensors", + "model.layers.11.input_layernorm.weight": "model-00013-of-00059.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00013-of-00059.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00012-of-00059.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00012-of-00059.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00012-of-00059.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00012-of-00059.safetensors", + "model.layers.12.block_sparse_moe.experts.0.w1.weight": "model-00013-of-00059.safetensors", + "model.layers.12.block_sparse_moe.experts.0.w2.weight": "model-00013-of-00059.safetensors", + "model.layers.12.block_sparse_moe.experts.0.w3.weight": "model-00013-of-00059.safetensors", + "model.layers.12.block_sparse_moe.experts.1.w1.weight": "model-00013-of-00059.safetensors", + "model.layers.12.block_sparse_moe.experts.1.w2.weight": "model-00013-of-00059.safetensors", + "model.layers.12.block_sparse_moe.experts.1.w3.weight": "model-00013-of-00059.safetensors", + "model.layers.12.block_sparse_moe.experts.2.w1.weight": "model-00013-of-00059.safetensors", + "model.layers.12.block_sparse_moe.experts.2.w2.weight": "model-00013-of-00059.safetensors", + "model.layers.12.block_sparse_moe.experts.2.w3.weight": "model-00013-of-00059.safetensors", + "model.layers.12.block_sparse_moe.experts.3.w1.weight": "model-00013-of-00059.safetensors", + "model.layers.12.block_sparse_moe.experts.3.w2.weight": "model-00014-of-00059.safetensors", + "model.layers.12.block_sparse_moe.experts.3.w3.weight": "model-00014-of-00059.safetensors", + "model.layers.12.block_sparse_moe.experts.4.w1.weight": "model-00014-of-00059.safetensors", + 
"model.layers.12.block_sparse_moe.experts.4.w2.weight": "model-00014-of-00059.safetensors", + "model.layers.12.block_sparse_moe.experts.4.w3.weight": "model-00014-of-00059.safetensors", + "model.layers.12.block_sparse_moe.experts.5.w1.weight": "model-00014-of-00059.safetensors", + "model.layers.12.block_sparse_moe.experts.5.w2.weight": "model-00014-of-00059.safetensors", + "model.layers.12.block_sparse_moe.experts.5.w3.weight": "model-00014-of-00059.safetensors", + "model.layers.12.block_sparse_moe.experts.6.w1.weight": "model-00014-of-00059.safetensors", + "model.layers.12.block_sparse_moe.experts.6.w2.weight": "model-00014-of-00059.safetensors", + "model.layers.12.block_sparse_moe.experts.6.w3.weight": "model-00014-of-00059.safetensors", + "model.layers.12.block_sparse_moe.experts.7.w1.weight": "model-00014-of-00059.safetensors", + "model.layers.12.block_sparse_moe.experts.7.w2.weight": "model-00014-of-00059.safetensors", + "model.layers.12.block_sparse_moe.experts.7.w3.weight": "model-00014-of-00059.safetensors", + "model.layers.12.block_sparse_moe.gate.weight": "model-00013-of-00059.safetensors", + "model.layers.12.input_layernorm.weight": "model-00014-of-00059.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00014-of-00059.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00013-of-00059.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00013-of-00059.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00013-of-00059.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00013-of-00059.safetensors", + "model.layers.13.block_sparse_moe.experts.0.w1.weight": "model-00014-of-00059.safetensors", + "model.layers.13.block_sparse_moe.experts.0.w2.weight": "model-00014-of-00059.safetensors", + "model.layers.13.block_sparse_moe.experts.0.w3.weight": "model-00014-of-00059.safetensors", + "model.layers.13.block_sparse_moe.experts.1.w1.weight": "model-00014-of-00059.safetensors", + "model.layers.13.block_sparse_moe.experts.1.w2.weight": "model-00014-of-00059.safetensors", + "model.layers.13.block_sparse_moe.experts.1.w3.weight": "model-00014-of-00059.safetensors", + "model.layers.13.block_sparse_moe.experts.2.w1.weight": "model-00014-of-00059.safetensors", + "model.layers.13.block_sparse_moe.experts.2.w2.weight": "model-00014-of-00059.safetensors", + "model.layers.13.block_sparse_moe.experts.2.w3.weight": "model-00014-of-00059.safetensors", + "model.layers.13.block_sparse_moe.experts.3.w1.weight": "model-00015-of-00059.safetensors", + "model.layers.13.block_sparse_moe.experts.3.w2.weight": "model-00015-of-00059.safetensors", + "model.layers.13.block_sparse_moe.experts.3.w3.weight": "model-00015-of-00059.safetensors", + "model.layers.13.block_sparse_moe.experts.4.w1.weight": "model-00015-of-00059.safetensors", + "model.layers.13.block_sparse_moe.experts.4.w2.weight": "model-00015-of-00059.safetensors", + "model.layers.13.block_sparse_moe.experts.4.w3.weight": "model-00015-of-00059.safetensors", + "model.layers.13.block_sparse_moe.experts.5.w1.weight": "model-00015-of-00059.safetensors", + "model.layers.13.block_sparse_moe.experts.5.w2.weight": "model-00015-of-00059.safetensors", + "model.layers.13.block_sparse_moe.experts.5.w3.weight": "model-00015-of-00059.safetensors", + "model.layers.13.block_sparse_moe.experts.6.w1.weight": "model-00015-of-00059.safetensors", + "model.layers.13.block_sparse_moe.experts.6.w2.weight": "model-00015-of-00059.safetensors", + "model.layers.13.block_sparse_moe.experts.6.w3.weight": 
"model-00015-of-00059.safetensors", + "model.layers.13.block_sparse_moe.experts.7.w1.weight": "model-00015-of-00059.safetensors", + "model.layers.13.block_sparse_moe.experts.7.w2.weight": "model-00015-of-00059.safetensors", + "model.layers.13.block_sparse_moe.experts.7.w3.weight": "model-00015-of-00059.safetensors", + "model.layers.13.block_sparse_moe.gate.weight": "model-00014-of-00059.safetensors", + "model.layers.13.input_layernorm.weight": "model-00015-of-00059.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00015-of-00059.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00014-of-00059.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00014-of-00059.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00014-of-00059.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00014-of-00059.safetensors", + "model.layers.14.block_sparse_moe.experts.0.w1.weight": "model-00015-of-00059.safetensors", + "model.layers.14.block_sparse_moe.experts.0.w2.weight": "model-00015-of-00059.safetensors", + "model.layers.14.block_sparse_moe.experts.0.w3.weight": "model-00015-of-00059.safetensors", + "model.layers.14.block_sparse_moe.experts.1.w1.weight": "model-00015-of-00059.safetensors", + "model.layers.14.block_sparse_moe.experts.1.w2.weight": "model-00015-of-00059.safetensors", + "model.layers.14.block_sparse_moe.experts.1.w3.weight": "model-00015-of-00059.safetensors", + "model.layers.14.block_sparse_moe.experts.2.w1.weight": "model-00015-of-00059.safetensors", + "model.layers.14.block_sparse_moe.experts.2.w2.weight": "model-00015-of-00059.safetensors", + "model.layers.14.block_sparse_moe.experts.2.w3.weight": "model-00016-of-00059.safetensors", + "model.layers.14.block_sparse_moe.experts.3.w1.weight": "model-00016-of-00059.safetensors", + "model.layers.14.block_sparse_moe.experts.3.w2.weight": "model-00016-of-00059.safetensors", + "model.layers.14.block_sparse_moe.experts.3.w3.weight": "model-00016-of-00059.safetensors", + "model.layers.14.block_sparse_moe.experts.4.w1.weight": "model-00016-of-00059.safetensors", + "model.layers.14.block_sparse_moe.experts.4.w2.weight": "model-00016-of-00059.safetensors", + "model.layers.14.block_sparse_moe.experts.4.w3.weight": "model-00016-of-00059.safetensors", + "model.layers.14.block_sparse_moe.experts.5.w1.weight": "model-00016-of-00059.safetensors", + "model.layers.14.block_sparse_moe.experts.5.w2.weight": "model-00016-of-00059.safetensors", + "model.layers.14.block_sparse_moe.experts.5.w3.weight": "model-00016-of-00059.safetensors", + "model.layers.14.block_sparse_moe.experts.6.w1.weight": "model-00016-of-00059.safetensors", + "model.layers.14.block_sparse_moe.experts.6.w2.weight": "model-00016-of-00059.safetensors", + "model.layers.14.block_sparse_moe.experts.6.w3.weight": "model-00016-of-00059.safetensors", + "model.layers.14.block_sparse_moe.experts.7.w1.weight": "model-00016-of-00059.safetensors", + "model.layers.14.block_sparse_moe.experts.7.w2.weight": "model-00016-of-00059.safetensors", + "model.layers.14.block_sparse_moe.experts.7.w3.weight": "model-00016-of-00059.safetensors", + "model.layers.14.block_sparse_moe.gate.weight": "model-00015-of-00059.safetensors", + "model.layers.14.input_layernorm.weight": "model-00016-of-00059.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00016-of-00059.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00015-of-00059.safetensors", + "model.layers.14.self_attn.o_proj.weight": 
"model-00015-of-00059.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00015-of-00059.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00015-of-00059.safetensors", + "model.layers.15.block_sparse_moe.experts.0.w1.weight": "model-00016-of-00059.safetensors", + "model.layers.15.block_sparse_moe.experts.0.w2.weight": "model-00016-of-00059.safetensors", + "model.layers.15.block_sparse_moe.experts.0.w3.weight": "model-00016-of-00059.safetensors", + "model.layers.15.block_sparse_moe.experts.1.w1.weight": "model-00016-of-00059.safetensors", + "model.layers.15.block_sparse_moe.experts.1.w2.weight": "model-00016-of-00059.safetensors", + "model.layers.15.block_sparse_moe.experts.1.w3.weight": "model-00016-of-00059.safetensors", + "model.layers.15.block_sparse_moe.experts.2.w1.weight": "model-00016-of-00059.safetensors", + "model.layers.15.block_sparse_moe.experts.2.w2.weight": "model-00017-of-00059.safetensors", + "model.layers.15.block_sparse_moe.experts.2.w3.weight": "model-00017-of-00059.safetensors", + "model.layers.15.block_sparse_moe.experts.3.w1.weight": "model-00017-of-00059.safetensors", + "model.layers.15.block_sparse_moe.experts.3.w2.weight": "model-00017-of-00059.safetensors", + "model.layers.15.block_sparse_moe.experts.3.w3.weight": "model-00017-of-00059.safetensors", + "model.layers.15.block_sparse_moe.experts.4.w1.weight": "model-00017-of-00059.safetensors", + "model.layers.15.block_sparse_moe.experts.4.w2.weight": "model-00017-of-00059.safetensors", + "model.layers.15.block_sparse_moe.experts.4.w3.weight": "model-00017-of-00059.safetensors", + "model.layers.15.block_sparse_moe.experts.5.w1.weight": "model-00017-of-00059.safetensors", + "model.layers.15.block_sparse_moe.experts.5.w2.weight": "model-00017-of-00059.safetensors", + "model.layers.15.block_sparse_moe.experts.5.w3.weight": "model-00017-of-00059.safetensors", + "model.layers.15.block_sparse_moe.experts.6.w1.weight": "model-00017-of-00059.safetensors", + "model.layers.15.block_sparse_moe.experts.6.w2.weight": "model-00017-of-00059.safetensors", + "model.layers.15.block_sparse_moe.experts.6.w3.weight": "model-00017-of-00059.safetensors", + "model.layers.15.block_sparse_moe.experts.7.w1.weight": "model-00017-of-00059.safetensors", + "model.layers.15.block_sparse_moe.experts.7.w2.weight": "model-00017-of-00059.safetensors", + "model.layers.15.block_sparse_moe.experts.7.w3.weight": "model-00017-of-00059.safetensors", + "model.layers.15.block_sparse_moe.gate.weight": "model-00016-of-00059.safetensors", + "model.layers.15.input_layernorm.weight": "model-00017-of-00059.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00017-of-00059.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00016-of-00059.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00016-of-00059.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00016-of-00059.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00016-of-00059.safetensors", + "model.layers.16.block_sparse_moe.experts.0.w1.weight": "model-00017-of-00059.safetensors", + "model.layers.16.block_sparse_moe.experts.0.w2.weight": "model-00017-of-00059.safetensors", + "model.layers.16.block_sparse_moe.experts.0.w3.weight": "model-00017-of-00059.safetensors", + "model.layers.16.block_sparse_moe.experts.1.w1.weight": "model-00017-of-00059.safetensors", + "model.layers.16.block_sparse_moe.experts.1.w2.weight": "model-00017-of-00059.safetensors", + 
"model.layers.16.block_sparse_moe.experts.1.w3.weight": "model-00017-of-00059.safetensors", + "model.layers.16.block_sparse_moe.experts.2.w1.weight": "model-00018-of-00059.safetensors", + "model.layers.16.block_sparse_moe.experts.2.w2.weight": "model-00018-of-00059.safetensors", + "model.layers.16.block_sparse_moe.experts.2.w3.weight": "model-00018-of-00059.safetensors", + "model.layers.16.block_sparse_moe.experts.3.w1.weight": "model-00018-of-00059.safetensors", + "model.layers.16.block_sparse_moe.experts.3.w2.weight": "model-00018-of-00059.safetensors", + "model.layers.16.block_sparse_moe.experts.3.w3.weight": "model-00018-of-00059.safetensors", + "model.layers.16.block_sparse_moe.experts.4.w1.weight": "model-00018-of-00059.safetensors", + "model.layers.16.block_sparse_moe.experts.4.w2.weight": "model-00018-of-00059.safetensors", + "model.layers.16.block_sparse_moe.experts.4.w3.weight": "model-00018-of-00059.safetensors", + "model.layers.16.block_sparse_moe.experts.5.w1.weight": "model-00018-of-00059.safetensors", + "model.layers.16.block_sparse_moe.experts.5.w2.weight": "model-00018-of-00059.safetensors", + "model.layers.16.block_sparse_moe.experts.5.w3.weight": "model-00018-of-00059.safetensors", + "model.layers.16.block_sparse_moe.experts.6.w1.weight": "model-00018-of-00059.safetensors", + "model.layers.16.block_sparse_moe.experts.6.w2.weight": "model-00018-of-00059.safetensors", + "model.layers.16.block_sparse_moe.experts.6.w3.weight": "model-00018-of-00059.safetensors", + "model.layers.16.block_sparse_moe.experts.7.w1.weight": "model-00018-of-00059.safetensors", + "model.layers.16.block_sparse_moe.experts.7.w2.weight": "model-00018-of-00059.safetensors", + "model.layers.16.block_sparse_moe.experts.7.w3.weight": "model-00018-of-00059.safetensors", + "model.layers.16.block_sparse_moe.gate.weight": "model-00017-of-00059.safetensors", + "model.layers.16.input_layernorm.weight": "model-00018-of-00059.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00018-of-00059.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00017-of-00059.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00017-of-00059.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00017-of-00059.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00017-of-00059.safetensors", + "model.layers.17.block_sparse_moe.experts.0.w1.weight": "model-00018-of-00059.safetensors", + "model.layers.17.block_sparse_moe.experts.0.w2.weight": "model-00018-of-00059.safetensors", + "model.layers.17.block_sparse_moe.experts.0.w3.weight": "model-00018-of-00059.safetensors", + "model.layers.17.block_sparse_moe.experts.1.w1.weight": "model-00018-of-00059.safetensors", + "model.layers.17.block_sparse_moe.experts.1.w2.weight": "model-00018-of-00059.safetensors", + "model.layers.17.block_sparse_moe.experts.1.w3.weight": "model-00019-of-00059.safetensors", + "model.layers.17.block_sparse_moe.experts.2.w1.weight": "model-00019-of-00059.safetensors", + "model.layers.17.block_sparse_moe.experts.2.w2.weight": "model-00019-of-00059.safetensors", + "model.layers.17.block_sparse_moe.experts.2.w3.weight": "model-00019-of-00059.safetensors", + "model.layers.17.block_sparse_moe.experts.3.w1.weight": "model-00019-of-00059.safetensors", + "model.layers.17.block_sparse_moe.experts.3.w2.weight": "model-00019-of-00059.safetensors", + "model.layers.17.block_sparse_moe.experts.3.w3.weight": "model-00019-of-00059.safetensors", + "model.layers.17.block_sparse_moe.experts.4.w1.weight": 
"model-00019-of-00059.safetensors", + "model.layers.17.block_sparse_moe.experts.4.w2.weight": "model-00019-of-00059.safetensors", + "model.layers.17.block_sparse_moe.experts.4.w3.weight": "model-00019-of-00059.safetensors", + "model.layers.17.block_sparse_moe.experts.5.w1.weight": "model-00019-of-00059.safetensors", + "model.layers.17.block_sparse_moe.experts.5.w2.weight": "model-00019-of-00059.safetensors", + "model.layers.17.block_sparse_moe.experts.5.w3.weight": "model-00019-of-00059.safetensors", + "model.layers.17.block_sparse_moe.experts.6.w1.weight": "model-00019-of-00059.safetensors", + "model.layers.17.block_sparse_moe.experts.6.w2.weight": "model-00019-of-00059.safetensors", + "model.layers.17.block_sparse_moe.experts.6.w3.weight": "model-00019-of-00059.safetensors", + "model.layers.17.block_sparse_moe.experts.7.w1.weight": "model-00019-of-00059.safetensors", + "model.layers.17.block_sparse_moe.experts.7.w2.weight": "model-00019-of-00059.safetensors", + "model.layers.17.block_sparse_moe.experts.7.w3.weight": "model-00019-of-00059.safetensors", + "model.layers.17.block_sparse_moe.gate.weight": "model-00018-of-00059.safetensors", + "model.layers.17.input_layernorm.weight": "model-00019-of-00059.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00019-of-00059.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00018-of-00059.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00018-of-00059.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00018-of-00059.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00018-of-00059.safetensors", + "model.layers.18.block_sparse_moe.experts.0.w1.weight": "model-00019-of-00059.safetensors", + "model.layers.18.block_sparse_moe.experts.0.w2.weight": "model-00019-of-00059.safetensors", + "model.layers.18.block_sparse_moe.experts.0.w3.weight": "model-00019-of-00059.safetensors", + "model.layers.18.block_sparse_moe.experts.1.w1.weight": "model-00019-of-00059.safetensors", + "model.layers.18.block_sparse_moe.experts.1.w2.weight": "model-00020-of-00059.safetensors", + "model.layers.18.block_sparse_moe.experts.1.w3.weight": "model-00020-of-00059.safetensors", + "model.layers.18.block_sparse_moe.experts.2.w1.weight": "model-00020-of-00059.safetensors", + "model.layers.18.block_sparse_moe.experts.2.w2.weight": "model-00020-of-00059.safetensors", + "model.layers.18.block_sparse_moe.experts.2.w3.weight": "model-00020-of-00059.safetensors", + "model.layers.18.block_sparse_moe.experts.3.w1.weight": "model-00020-of-00059.safetensors", + "model.layers.18.block_sparse_moe.experts.3.w2.weight": "model-00020-of-00059.safetensors", + "model.layers.18.block_sparse_moe.experts.3.w3.weight": "model-00020-of-00059.safetensors", + "model.layers.18.block_sparse_moe.experts.4.w1.weight": "model-00020-of-00059.safetensors", + "model.layers.18.block_sparse_moe.experts.4.w2.weight": "model-00020-of-00059.safetensors", + "model.layers.18.block_sparse_moe.experts.4.w3.weight": "model-00020-of-00059.safetensors", + "model.layers.18.block_sparse_moe.experts.5.w1.weight": "model-00020-of-00059.safetensors", + "model.layers.18.block_sparse_moe.experts.5.w2.weight": "model-00020-of-00059.safetensors", + "model.layers.18.block_sparse_moe.experts.5.w3.weight": "model-00020-of-00059.safetensors", + "model.layers.18.block_sparse_moe.experts.6.w1.weight": "model-00020-of-00059.safetensors", + "model.layers.18.block_sparse_moe.experts.6.w2.weight": "model-00020-of-00059.safetensors", + 
"model.layers.18.block_sparse_moe.experts.6.w3.weight": "model-00020-of-00059.safetensors", + "model.layers.18.block_sparse_moe.experts.7.w1.weight": "model-00020-of-00059.safetensors", + "model.layers.18.block_sparse_moe.experts.7.w2.weight": "model-00020-of-00059.safetensors", + "model.layers.18.block_sparse_moe.experts.7.w3.weight": "model-00020-of-00059.safetensors", + "model.layers.18.block_sparse_moe.gate.weight": "model-00019-of-00059.safetensors", + "model.layers.18.input_layernorm.weight": "model-00020-of-00059.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00020-of-00059.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00019-of-00059.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00019-of-00059.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00019-of-00059.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00019-of-00059.safetensors", + "model.layers.19.block_sparse_moe.experts.0.w1.weight": "model-00020-of-00059.safetensors", + "model.layers.19.block_sparse_moe.experts.0.w2.weight": "model-00020-of-00059.safetensors", + "model.layers.19.block_sparse_moe.experts.0.w3.weight": "model-00020-of-00059.safetensors", + "model.layers.19.block_sparse_moe.experts.1.w1.weight": "model-00021-of-00059.safetensors", + "model.layers.19.block_sparse_moe.experts.1.w2.weight": "model-00021-of-00059.safetensors", + "model.layers.19.block_sparse_moe.experts.1.w3.weight": "model-00021-of-00059.safetensors", + "model.layers.19.block_sparse_moe.experts.2.w1.weight": "model-00021-of-00059.safetensors", + "model.layers.19.block_sparse_moe.experts.2.w2.weight": "model-00021-of-00059.safetensors", + "model.layers.19.block_sparse_moe.experts.2.w3.weight": "model-00021-of-00059.safetensors", + "model.layers.19.block_sparse_moe.experts.3.w1.weight": "model-00021-of-00059.safetensors", + "model.layers.19.block_sparse_moe.experts.3.w2.weight": "model-00021-of-00059.safetensors", + "model.layers.19.block_sparse_moe.experts.3.w3.weight": "model-00021-of-00059.safetensors", + "model.layers.19.block_sparse_moe.experts.4.w1.weight": "model-00021-of-00059.safetensors", + "model.layers.19.block_sparse_moe.experts.4.w2.weight": "model-00021-of-00059.safetensors", + "model.layers.19.block_sparse_moe.experts.4.w3.weight": "model-00021-of-00059.safetensors", + "model.layers.19.block_sparse_moe.experts.5.w1.weight": "model-00021-of-00059.safetensors", + "model.layers.19.block_sparse_moe.experts.5.w2.weight": "model-00021-of-00059.safetensors", + "model.layers.19.block_sparse_moe.experts.5.w3.weight": "model-00021-of-00059.safetensors", + "model.layers.19.block_sparse_moe.experts.6.w1.weight": "model-00021-of-00059.safetensors", + "model.layers.19.block_sparse_moe.experts.6.w2.weight": "model-00021-of-00059.safetensors", + "model.layers.19.block_sparse_moe.experts.6.w3.weight": "model-00021-of-00059.safetensors", + "model.layers.19.block_sparse_moe.experts.7.w1.weight": "model-00021-of-00059.safetensors", + "model.layers.19.block_sparse_moe.experts.7.w2.weight": "model-00021-of-00059.safetensors", + "model.layers.19.block_sparse_moe.experts.7.w3.weight": "model-00021-of-00059.safetensors", + "model.layers.19.block_sparse_moe.gate.weight": "model-00020-of-00059.safetensors", + "model.layers.19.input_layernorm.weight": "model-00021-of-00059.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00021-of-00059.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00020-of-00059.safetensors", + 
"model.layers.19.self_attn.o_proj.weight": "model-00020-of-00059.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00020-of-00059.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00020-of-00059.safetensors", + "model.layers.2.block_sparse_moe.experts.0.w1.weight": "model-00003-of-00059.safetensors", + "model.layers.2.block_sparse_moe.experts.0.w2.weight": "model-00003-of-00059.safetensors", + "model.layers.2.block_sparse_moe.experts.0.w3.weight": "model-00003-of-00059.safetensors", + "model.layers.2.block_sparse_moe.experts.1.w1.weight": "model-00003-of-00059.safetensors", + "model.layers.2.block_sparse_moe.experts.1.w2.weight": "model-00003-of-00059.safetensors", + "model.layers.2.block_sparse_moe.experts.1.w3.weight": "model-00003-of-00059.safetensors", + "model.layers.2.block_sparse_moe.experts.2.w1.weight": "model-00003-of-00059.safetensors", + "model.layers.2.block_sparse_moe.experts.2.w2.weight": "model-00003-of-00059.safetensors", + "model.layers.2.block_sparse_moe.experts.2.w3.weight": "model-00003-of-00059.safetensors", + "model.layers.2.block_sparse_moe.experts.3.w1.weight": "model-00003-of-00059.safetensors", + "model.layers.2.block_sparse_moe.experts.3.w2.weight": "model-00003-of-00059.safetensors", + "model.layers.2.block_sparse_moe.experts.3.w3.weight": "model-00003-of-00059.safetensors", + "model.layers.2.block_sparse_moe.experts.4.w1.weight": "model-00003-of-00059.safetensors", + "model.layers.2.block_sparse_moe.experts.4.w2.weight": "model-00003-of-00059.safetensors", + "model.layers.2.block_sparse_moe.experts.4.w3.weight": "model-00003-of-00059.safetensors", + "model.layers.2.block_sparse_moe.experts.5.w1.weight": "model-00003-of-00059.safetensors", + "model.layers.2.block_sparse_moe.experts.5.w2.weight": "model-00003-of-00059.safetensors", + "model.layers.2.block_sparse_moe.experts.5.w3.weight": "model-00003-of-00059.safetensors", + "model.layers.2.block_sparse_moe.experts.6.w1.weight": "model-00003-of-00059.safetensors", + "model.layers.2.block_sparse_moe.experts.6.w2.weight": "model-00003-of-00059.safetensors", + "model.layers.2.block_sparse_moe.experts.6.w3.weight": "model-00004-of-00059.safetensors", + "model.layers.2.block_sparse_moe.experts.7.w1.weight": "model-00004-of-00059.safetensors", + "model.layers.2.block_sparse_moe.experts.7.w2.weight": "model-00004-of-00059.safetensors", + "model.layers.2.block_sparse_moe.experts.7.w3.weight": "model-00004-of-00059.safetensors", + "model.layers.2.block_sparse_moe.gate.weight": "model-00003-of-00059.safetensors", + "model.layers.2.input_layernorm.weight": "model-00004-of-00059.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00004-of-00059.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00003-of-00059.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00003-of-00059.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00003-of-00059.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00003-of-00059.safetensors", + "model.layers.20.block_sparse_moe.experts.0.w1.weight": "model-00021-of-00059.safetensors", + "model.layers.20.block_sparse_moe.experts.0.w2.weight": "model-00021-of-00059.safetensors", + "model.layers.20.block_sparse_moe.experts.0.w3.weight": "model-00022-of-00059.safetensors", + "model.layers.20.block_sparse_moe.experts.1.w1.weight": "model-00022-of-00059.safetensors", + "model.layers.20.block_sparse_moe.experts.1.w2.weight": "model-00022-of-00059.safetensors", + 
"model.layers.20.block_sparse_moe.experts.1.w3.weight": "model-00022-of-00059.safetensors", + "model.layers.20.block_sparse_moe.experts.2.w1.weight": "model-00022-of-00059.safetensors", + "model.layers.20.block_sparse_moe.experts.2.w2.weight": "model-00022-of-00059.safetensors", + "model.layers.20.block_sparse_moe.experts.2.w3.weight": "model-00022-of-00059.safetensors", + "model.layers.20.block_sparse_moe.experts.3.w1.weight": "model-00022-of-00059.safetensors", + "model.layers.20.block_sparse_moe.experts.3.w2.weight": "model-00022-of-00059.safetensors", + "model.layers.20.block_sparse_moe.experts.3.w3.weight": "model-00022-of-00059.safetensors", + "model.layers.20.block_sparse_moe.experts.4.w1.weight": "model-00022-of-00059.safetensors", + "model.layers.20.block_sparse_moe.experts.4.w2.weight": "model-00022-of-00059.safetensors", + "model.layers.20.block_sparse_moe.experts.4.w3.weight": "model-00022-of-00059.safetensors", + "model.layers.20.block_sparse_moe.experts.5.w1.weight": "model-00022-of-00059.safetensors", + "model.layers.20.block_sparse_moe.experts.5.w2.weight": "model-00022-of-00059.safetensors", + "model.layers.20.block_sparse_moe.experts.5.w3.weight": "model-00022-of-00059.safetensors", + "model.layers.20.block_sparse_moe.experts.6.w1.weight": "model-00022-of-00059.safetensors", + "model.layers.20.block_sparse_moe.experts.6.w2.weight": "model-00022-of-00059.safetensors", + "model.layers.20.block_sparse_moe.experts.6.w3.weight": "model-00022-of-00059.safetensors", + "model.layers.20.block_sparse_moe.experts.7.w1.weight": "model-00022-of-00059.safetensors", + "model.layers.20.block_sparse_moe.experts.7.w2.weight": "model-00022-of-00059.safetensors", + "model.layers.20.block_sparse_moe.experts.7.w3.weight": "model-00022-of-00059.safetensors", + "model.layers.20.block_sparse_moe.gate.weight": "model-00021-of-00059.safetensors", + "model.layers.20.input_layernorm.weight": "model-00022-of-00059.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00022-of-00059.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00021-of-00059.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00021-of-00059.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00021-of-00059.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00021-of-00059.safetensors", + "model.layers.21.block_sparse_moe.experts.0.w1.weight": "model-00022-of-00059.safetensors", + "model.layers.21.block_sparse_moe.experts.0.w2.weight": "model-00023-of-00059.safetensors", + "model.layers.21.block_sparse_moe.experts.0.w3.weight": "model-00023-of-00059.safetensors", + "model.layers.21.block_sparse_moe.experts.1.w1.weight": "model-00023-of-00059.safetensors", + "model.layers.21.block_sparse_moe.experts.1.w2.weight": "model-00023-of-00059.safetensors", + "model.layers.21.block_sparse_moe.experts.1.w3.weight": "model-00023-of-00059.safetensors", + "model.layers.21.block_sparse_moe.experts.2.w1.weight": "model-00023-of-00059.safetensors", + "model.layers.21.block_sparse_moe.experts.2.w2.weight": "model-00023-of-00059.safetensors", + "model.layers.21.block_sparse_moe.experts.2.w3.weight": "model-00023-of-00059.safetensors", + "model.layers.21.block_sparse_moe.experts.3.w1.weight": "model-00023-of-00059.safetensors", + "model.layers.21.block_sparse_moe.experts.3.w2.weight": "model-00023-of-00059.safetensors", + "model.layers.21.block_sparse_moe.experts.3.w3.weight": "model-00023-of-00059.safetensors", + "model.layers.21.block_sparse_moe.experts.4.w1.weight": 
"model-00023-of-00059.safetensors", + "model.layers.21.block_sparse_moe.experts.4.w2.weight": "model-00023-of-00059.safetensors", + "model.layers.21.block_sparse_moe.experts.4.w3.weight": "model-00023-of-00059.safetensors", + "model.layers.21.block_sparse_moe.experts.5.w1.weight": "model-00023-of-00059.safetensors", + "model.layers.21.block_sparse_moe.experts.5.w2.weight": "model-00023-of-00059.safetensors", + "model.layers.21.block_sparse_moe.experts.5.w3.weight": "model-00023-of-00059.safetensors", + "model.layers.21.block_sparse_moe.experts.6.w1.weight": "model-00023-of-00059.safetensors", + "model.layers.21.block_sparse_moe.experts.6.w2.weight": "model-00023-of-00059.safetensors", + "model.layers.21.block_sparse_moe.experts.6.w3.weight": "model-00023-of-00059.safetensors", + "model.layers.21.block_sparse_moe.experts.7.w1.weight": "model-00023-of-00059.safetensors", + "model.layers.21.block_sparse_moe.experts.7.w2.weight": "model-00023-of-00059.safetensors", + "model.layers.21.block_sparse_moe.experts.7.w3.weight": "model-00023-of-00059.safetensors", + "model.layers.21.block_sparse_moe.gate.weight": "model-00022-of-00059.safetensors", + "model.layers.21.input_layernorm.weight": "model-00023-of-00059.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00023-of-00059.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00022-of-00059.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00022-of-00059.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00022-of-00059.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00022-of-00059.safetensors", + "model.layers.22.block_sparse_moe.experts.0.w1.weight": "model-00024-of-00059.safetensors", + "model.layers.22.block_sparse_moe.experts.0.w2.weight": "model-00024-of-00059.safetensors", + "model.layers.22.block_sparse_moe.experts.0.w3.weight": "model-00024-of-00059.safetensors", + "model.layers.22.block_sparse_moe.experts.1.w1.weight": "model-00024-of-00059.safetensors", + "model.layers.22.block_sparse_moe.experts.1.w2.weight": "model-00024-of-00059.safetensors", + "model.layers.22.block_sparse_moe.experts.1.w3.weight": "model-00024-of-00059.safetensors", + "model.layers.22.block_sparse_moe.experts.2.w1.weight": "model-00024-of-00059.safetensors", + "model.layers.22.block_sparse_moe.experts.2.w2.weight": "model-00024-of-00059.safetensors", + "model.layers.22.block_sparse_moe.experts.2.w3.weight": "model-00024-of-00059.safetensors", + "model.layers.22.block_sparse_moe.experts.3.w1.weight": "model-00024-of-00059.safetensors", + "model.layers.22.block_sparse_moe.experts.3.w2.weight": "model-00024-of-00059.safetensors", + "model.layers.22.block_sparse_moe.experts.3.w3.weight": "model-00024-of-00059.safetensors", + "model.layers.22.block_sparse_moe.experts.4.w1.weight": "model-00024-of-00059.safetensors", + "model.layers.22.block_sparse_moe.experts.4.w2.weight": "model-00024-of-00059.safetensors", + "model.layers.22.block_sparse_moe.experts.4.w3.weight": "model-00024-of-00059.safetensors", + "model.layers.22.block_sparse_moe.experts.5.w1.weight": "model-00024-of-00059.safetensors", + "model.layers.22.block_sparse_moe.experts.5.w2.weight": "model-00024-of-00059.safetensors", + "model.layers.22.block_sparse_moe.experts.5.w3.weight": "model-00024-of-00059.safetensors", + "model.layers.22.block_sparse_moe.experts.6.w1.weight": "model-00024-of-00059.safetensors", + "model.layers.22.block_sparse_moe.experts.6.w2.weight": "model-00024-of-00059.safetensors", + 
"model.layers.22.block_sparse_moe.experts.6.w3.weight": "model-00024-of-00059.safetensors", + "model.layers.22.block_sparse_moe.experts.7.w1.weight": "model-00024-of-00059.safetensors", + "model.layers.22.block_sparse_moe.experts.7.w2.weight": "model-00024-of-00059.safetensors", + "model.layers.22.block_sparse_moe.experts.7.w3.weight": "model-00024-of-00059.safetensors", + "model.layers.22.block_sparse_moe.gate.weight": "model-00023-of-00059.safetensors", + "model.layers.22.input_layernorm.weight": "model-00024-of-00059.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00024-of-00059.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00023-of-00059.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00023-of-00059.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00023-of-00059.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00023-of-00059.safetensors", + "model.layers.23.block_sparse_moe.experts.0.w1.weight": "model-00025-of-00059.safetensors", + "model.layers.23.block_sparse_moe.experts.0.w2.weight": "model-00025-of-00059.safetensors", + "model.layers.23.block_sparse_moe.experts.0.w3.weight": "model-00025-of-00059.safetensors", + "model.layers.23.block_sparse_moe.experts.1.w1.weight": "model-00025-of-00059.safetensors", + "model.layers.23.block_sparse_moe.experts.1.w2.weight": "model-00025-of-00059.safetensors", + "model.layers.23.block_sparse_moe.experts.1.w3.weight": "model-00025-of-00059.safetensors", + "model.layers.23.block_sparse_moe.experts.2.w1.weight": "model-00025-of-00059.safetensors", + "model.layers.23.block_sparse_moe.experts.2.w2.weight": "model-00025-of-00059.safetensors", + "model.layers.23.block_sparse_moe.experts.2.w3.weight": "model-00025-of-00059.safetensors", + "model.layers.23.block_sparse_moe.experts.3.w1.weight": "model-00025-of-00059.safetensors", + "model.layers.23.block_sparse_moe.experts.3.w2.weight": "model-00025-of-00059.safetensors", + "model.layers.23.block_sparse_moe.experts.3.w3.weight": "model-00025-of-00059.safetensors", + "model.layers.23.block_sparse_moe.experts.4.w1.weight": "model-00025-of-00059.safetensors", + "model.layers.23.block_sparse_moe.experts.4.w2.weight": "model-00025-of-00059.safetensors", + "model.layers.23.block_sparse_moe.experts.4.w3.weight": "model-00025-of-00059.safetensors", + "model.layers.23.block_sparse_moe.experts.5.w1.weight": "model-00025-of-00059.safetensors", + "model.layers.23.block_sparse_moe.experts.5.w2.weight": "model-00025-of-00059.safetensors", + "model.layers.23.block_sparse_moe.experts.5.w3.weight": "model-00025-of-00059.safetensors", + "model.layers.23.block_sparse_moe.experts.6.w1.weight": "model-00025-of-00059.safetensors", + "model.layers.23.block_sparse_moe.experts.6.w2.weight": "model-00025-of-00059.safetensors", + "model.layers.23.block_sparse_moe.experts.6.w3.weight": "model-00025-of-00059.safetensors", + "model.layers.23.block_sparse_moe.experts.7.w1.weight": "model-00025-of-00059.safetensors", + "model.layers.23.block_sparse_moe.experts.7.w2.weight": "model-00025-of-00059.safetensors", + "model.layers.23.block_sparse_moe.experts.7.w3.weight": "model-00025-of-00059.safetensors", + "model.layers.23.block_sparse_moe.gate.weight": "model-00025-of-00059.safetensors", + "model.layers.23.input_layernorm.weight": "model-00025-of-00059.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00025-of-00059.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00024-of-00059.safetensors", + 
"model.layers.23.self_attn.o_proj.weight": "model-00025-of-00059.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00024-of-00059.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00024-of-00059.safetensors", + "model.layers.24.block_sparse_moe.experts.0.w1.weight": "model-00026-of-00059.safetensors", + "model.layers.24.block_sparse_moe.experts.0.w2.weight": "model-00026-of-00059.safetensors", + "model.layers.24.block_sparse_moe.experts.0.w3.weight": "model-00026-of-00059.safetensors", + "model.layers.24.block_sparse_moe.experts.1.w1.weight": "model-00026-of-00059.safetensors", + "model.layers.24.block_sparse_moe.experts.1.w2.weight": "model-00026-of-00059.safetensors", + "model.layers.24.block_sparse_moe.experts.1.w3.weight": "model-00026-of-00059.safetensors", + "model.layers.24.block_sparse_moe.experts.2.w1.weight": "model-00026-of-00059.safetensors", + "model.layers.24.block_sparse_moe.experts.2.w2.weight": "model-00026-of-00059.safetensors", + "model.layers.24.block_sparse_moe.experts.2.w3.weight": "model-00026-of-00059.safetensors", + "model.layers.24.block_sparse_moe.experts.3.w1.weight": "model-00026-of-00059.safetensors", + "model.layers.24.block_sparse_moe.experts.3.w2.weight": "model-00026-of-00059.safetensors", + "model.layers.24.block_sparse_moe.experts.3.w3.weight": "model-00026-of-00059.safetensors", + "model.layers.24.block_sparse_moe.experts.4.w1.weight": "model-00026-of-00059.safetensors", + "model.layers.24.block_sparse_moe.experts.4.w2.weight": "model-00026-of-00059.safetensors", + "model.layers.24.block_sparse_moe.experts.4.w3.weight": "model-00026-of-00059.safetensors", + "model.layers.24.block_sparse_moe.experts.5.w1.weight": "model-00026-of-00059.safetensors", + "model.layers.24.block_sparse_moe.experts.5.w2.weight": "model-00026-of-00059.safetensors", + "model.layers.24.block_sparse_moe.experts.5.w3.weight": "model-00026-of-00059.safetensors", + "model.layers.24.block_sparse_moe.experts.6.w1.weight": "model-00026-of-00059.safetensors", + "model.layers.24.block_sparse_moe.experts.6.w2.weight": "model-00026-of-00059.safetensors", + "model.layers.24.block_sparse_moe.experts.6.w3.weight": "model-00026-of-00059.safetensors", + "model.layers.24.block_sparse_moe.experts.7.w1.weight": "model-00026-of-00059.safetensors", + "model.layers.24.block_sparse_moe.experts.7.w2.weight": "model-00026-of-00059.safetensors", + "model.layers.24.block_sparse_moe.experts.7.w3.weight": "model-00026-of-00059.safetensors", + "model.layers.24.block_sparse_moe.gate.weight": "model-00026-of-00059.safetensors", + "model.layers.24.input_layernorm.weight": "model-00026-of-00059.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00026-of-00059.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00025-of-00059.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00026-of-00059.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00025-of-00059.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00026-of-00059.safetensors", + "model.layers.25.block_sparse_moe.experts.0.w1.weight": "model-00027-of-00059.safetensors", + "model.layers.25.block_sparse_moe.experts.0.w2.weight": "model-00027-of-00059.safetensors", + "model.layers.25.block_sparse_moe.experts.0.w3.weight": "model-00027-of-00059.safetensors", + "model.layers.25.block_sparse_moe.experts.1.w1.weight": "model-00027-of-00059.safetensors", + "model.layers.25.block_sparse_moe.experts.1.w2.weight": "model-00027-of-00059.safetensors", + 
"model.layers.25.block_sparse_moe.experts.1.w3.weight": "model-00027-of-00059.safetensors", + "model.layers.25.block_sparse_moe.experts.2.w1.weight": "model-00027-of-00059.safetensors", + "model.layers.25.block_sparse_moe.experts.2.w2.weight": "model-00027-of-00059.safetensors", + "model.layers.25.block_sparse_moe.experts.2.w3.weight": "model-00027-of-00059.safetensors", + "model.layers.25.block_sparse_moe.experts.3.w1.weight": "model-00027-of-00059.safetensors", + "model.layers.25.block_sparse_moe.experts.3.w2.weight": "model-00027-of-00059.safetensors", + "model.layers.25.block_sparse_moe.experts.3.w3.weight": "model-00027-of-00059.safetensors", + "model.layers.25.block_sparse_moe.experts.4.w1.weight": "model-00027-of-00059.safetensors", + "model.layers.25.block_sparse_moe.experts.4.w2.weight": "model-00027-of-00059.safetensors", + "model.layers.25.block_sparse_moe.experts.4.w3.weight": "model-00027-of-00059.safetensors", + "model.layers.25.block_sparse_moe.experts.5.w1.weight": "model-00027-of-00059.safetensors", + "model.layers.25.block_sparse_moe.experts.5.w2.weight": "model-00027-of-00059.safetensors", + "model.layers.25.block_sparse_moe.experts.5.w3.weight": "model-00027-of-00059.safetensors", + "model.layers.25.block_sparse_moe.experts.6.w1.weight": "model-00027-of-00059.safetensors", + "model.layers.25.block_sparse_moe.experts.6.w2.weight": "model-00027-of-00059.safetensors", + "model.layers.25.block_sparse_moe.experts.6.w3.weight": "model-00027-of-00059.safetensors", + "model.layers.25.block_sparse_moe.experts.7.w1.weight": "model-00027-of-00059.safetensors", + "model.layers.25.block_sparse_moe.experts.7.w2.weight": "model-00027-of-00059.safetensors", + "model.layers.25.block_sparse_moe.experts.7.w3.weight": "model-00027-of-00059.safetensors", + "model.layers.25.block_sparse_moe.gate.weight": "model-00027-of-00059.safetensors", + "model.layers.25.input_layernorm.weight": "model-00027-of-00059.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00027-of-00059.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00027-of-00059.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00027-of-00059.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00026-of-00059.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00027-of-00059.safetensors", + "model.layers.26.block_sparse_moe.experts.0.w1.weight": "model-00028-of-00059.safetensors", + "model.layers.26.block_sparse_moe.experts.0.w2.weight": "model-00028-of-00059.safetensors", + "model.layers.26.block_sparse_moe.experts.0.w3.weight": "model-00028-of-00059.safetensors", + "model.layers.26.block_sparse_moe.experts.1.w1.weight": "model-00028-of-00059.safetensors", + "model.layers.26.block_sparse_moe.experts.1.w2.weight": "model-00028-of-00059.safetensors", + "model.layers.26.block_sparse_moe.experts.1.w3.weight": "model-00028-of-00059.safetensors", + "model.layers.26.block_sparse_moe.experts.2.w1.weight": "model-00028-of-00059.safetensors", + "model.layers.26.block_sparse_moe.experts.2.w2.weight": "model-00028-of-00059.safetensors", + "model.layers.26.block_sparse_moe.experts.2.w3.weight": "model-00028-of-00059.safetensors", + "model.layers.26.block_sparse_moe.experts.3.w1.weight": "model-00028-of-00059.safetensors", + "model.layers.26.block_sparse_moe.experts.3.w2.weight": "model-00028-of-00059.safetensors", + "model.layers.26.block_sparse_moe.experts.3.w3.weight": "model-00028-of-00059.safetensors", + "model.layers.26.block_sparse_moe.experts.4.w1.weight": 
"model-00028-of-00059.safetensors", + "model.layers.26.block_sparse_moe.experts.4.w2.weight": "model-00028-of-00059.safetensors", + "model.layers.26.block_sparse_moe.experts.4.w3.weight": "model-00028-of-00059.safetensors", + "model.layers.26.block_sparse_moe.experts.5.w1.weight": "model-00028-of-00059.safetensors", + "model.layers.26.block_sparse_moe.experts.5.w2.weight": "model-00028-of-00059.safetensors", + "model.layers.26.block_sparse_moe.experts.5.w3.weight": "model-00028-of-00059.safetensors", + "model.layers.26.block_sparse_moe.experts.6.w1.weight": "model-00028-of-00059.safetensors", + "model.layers.26.block_sparse_moe.experts.6.w2.weight": "model-00028-of-00059.safetensors", + "model.layers.26.block_sparse_moe.experts.6.w3.weight": "model-00028-of-00059.safetensors", + "model.layers.26.block_sparse_moe.experts.7.w1.weight": "model-00028-of-00059.safetensors", + "model.layers.26.block_sparse_moe.experts.7.w2.weight": "model-00028-of-00059.safetensors", + "model.layers.26.block_sparse_moe.experts.7.w3.weight": "model-00029-of-00059.safetensors", + "model.layers.26.block_sparse_moe.gate.weight": "model-00028-of-00059.safetensors", + "model.layers.26.input_layernorm.weight": "model-00029-of-00059.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00029-of-00059.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00028-of-00059.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00028-of-00059.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00028-of-00059.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00028-of-00059.safetensors", + "model.layers.27.block_sparse_moe.experts.0.w1.weight": "model-00029-of-00059.safetensors", + "model.layers.27.block_sparse_moe.experts.0.w2.weight": "model-00029-of-00059.safetensors", + "model.layers.27.block_sparse_moe.experts.0.w3.weight": "model-00029-of-00059.safetensors", + "model.layers.27.block_sparse_moe.experts.1.w1.weight": "model-00029-of-00059.safetensors", + "model.layers.27.block_sparse_moe.experts.1.w2.weight": "model-00029-of-00059.safetensors", + "model.layers.27.block_sparse_moe.experts.1.w3.weight": "model-00029-of-00059.safetensors", + "model.layers.27.block_sparse_moe.experts.2.w1.weight": "model-00029-of-00059.safetensors", + "model.layers.27.block_sparse_moe.experts.2.w2.weight": "model-00029-of-00059.safetensors", + "model.layers.27.block_sparse_moe.experts.2.w3.weight": "model-00029-of-00059.safetensors", + "model.layers.27.block_sparse_moe.experts.3.w1.weight": "model-00029-of-00059.safetensors", + "model.layers.27.block_sparse_moe.experts.3.w2.weight": "model-00029-of-00059.safetensors", + "model.layers.27.block_sparse_moe.experts.3.w3.weight": "model-00029-of-00059.safetensors", + "model.layers.27.block_sparse_moe.experts.4.w1.weight": "model-00029-of-00059.safetensors", + "model.layers.27.block_sparse_moe.experts.4.w2.weight": "model-00029-of-00059.safetensors", + "model.layers.27.block_sparse_moe.experts.4.w3.weight": "model-00029-of-00059.safetensors", + "model.layers.27.block_sparse_moe.experts.5.w1.weight": "model-00029-of-00059.safetensors", + "model.layers.27.block_sparse_moe.experts.5.w2.weight": "model-00029-of-00059.safetensors", + "model.layers.27.block_sparse_moe.experts.5.w3.weight": "model-00029-of-00059.safetensors", + "model.layers.27.block_sparse_moe.experts.6.w1.weight": "model-00029-of-00059.safetensors", + "model.layers.27.block_sparse_moe.experts.6.w2.weight": "model-00029-of-00059.safetensors", + 
"model.layers.27.block_sparse_moe.experts.6.w3.weight": "model-00029-of-00059.safetensors", + "model.layers.27.block_sparse_moe.experts.7.w1.weight": "model-00029-of-00059.safetensors", + "model.layers.27.block_sparse_moe.experts.7.w2.weight": "model-00030-of-00059.safetensors", + "model.layers.27.block_sparse_moe.experts.7.w3.weight": "model-00030-of-00059.safetensors", + "model.layers.27.block_sparse_moe.gate.weight": "model-00029-of-00059.safetensors", + "model.layers.27.input_layernorm.weight": "model-00030-of-00059.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00030-of-00059.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00029-of-00059.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00029-of-00059.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00029-of-00059.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00029-of-00059.safetensors", + "model.layers.28.block_sparse_moe.experts.0.w1.weight": "model-00030-of-00059.safetensors", + "model.layers.28.block_sparse_moe.experts.0.w2.weight": "model-00030-of-00059.safetensors", + "model.layers.28.block_sparse_moe.experts.0.w3.weight": "model-00030-of-00059.safetensors", + "model.layers.28.block_sparse_moe.experts.1.w1.weight": "model-00030-of-00059.safetensors", + "model.layers.28.block_sparse_moe.experts.1.w2.weight": "model-00030-of-00059.safetensors", + "model.layers.28.block_sparse_moe.experts.1.w3.weight": "model-00030-of-00059.safetensors", + "model.layers.28.block_sparse_moe.experts.2.w1.weight": "model-00030-of-00059.safetensors", + "model.layers.28.block_sparse_moe.experts.2.w2.weight": "model-00030-of-00059.safetensors", + "model.layers.28.block_sparse_moe.experts.2.w3.weight": "model-00030-of-00059.safetensors", + "model.layers.28.block_sparse_moe.experts.3.w1.weight": "model-00030-of-00059.safetensors", + "model.layers.28.block_sparse_moe.experts.3.w2.weight": "model-00030-of-00059.safetensors", + "model.layers.28.block_sparse_moe.experts.3.w3.weight": "model-00030-of-00059.safetensors", + "model.layers.28.block_sparse_moe.experts.4.w1.weight": "model-00030-of-00059.safetensors", + "model.layers.28.block_sparse_moe.experts.4.w2.weight": "model-00030-of-00059.safetensors", + "model.layers.28.block_sparse_moe.experts.4.w3.weight": "model-00030-of-00059.safetensors", + "model.layers.28.block_sparse_moe.experts.5.w1.weight": "model-00030-of-00059.safetensors", + "model.layers.28.block_sparse_moe.experts.5.w2.weight": "model-00030-of-00059.safetensors", + "model.layers.28.block_sparse_moe.experts.5.w3.weight": "model-00030-of-00059.safetensors", + "model.layers.28.block_sparse_moe.experts.6.w1.weight": "model-00030-of-00059.safetensors", + "model.layers.28.block_sparse_moe.experts.6.w2.weight": "model-00030-of-00059.safetensors", + "model.layers.28.block_sparse_moe.experts.6.w3.weight": "model-00030-of-00059.safetensors", + "model.layers.28.block_sparse_moe.experts.7.w1.weight": "model-00031-of-00059.safetensors", + "model.layers.28.block_sparse_moe.experts.7.w2.weight": "model-00031-of-00059.safetensors", + "model.layers.28.block_sparse_moe.experts.7.w3.weight": "model-00031-of-00059.safetensors", + "model.layers.28.block_sparse_moe.gate.weight": "model-00030-of-00059.safetensors", + "model.layers.28.input_layernorm.weight": "model-00031-of-00059.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00031-of-00059.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00030-of-00059.safetensors", + 
"model.layers.28.self_attn.o_proj.weight": "model-00030-of-00059.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00030-of-00059.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00030-of-00059.safetensors", + "model.layers.29.block_sparse_moe.experts.0.w1.weight": "model-00031-of-00059.safetensors", + "model.layers.29.block_sparse_moe.experts.0.w2.weight": "model-00031-of-00059.safetensors", + "model.layers.29.block_sparse_moe.experts.0.w3.weight": "model-00031-of-00059.safetensors", + "model.layers.29.block_sparse_moe.experts.1.w1.weight": "model-00031-of-00059.safetensors", + "model.layers.29.block_sparse_moe.experts.1.w2.weight": "model-00031-of-00059.safetensors", + "model.layers.29.block_sparse_moe.experts.1.w3.weight": "model-00031-of-00059.safetensors", + "model.layers.29.block_sparse_moe.experts.2.w1.weight": "model-00031-of-00059.safetensors", + "model.layers.29.block_sparse_moe.experts.2.w2.weight": "model-00031-of-00059.safetensors", + "model.layers.29.block_sparse_moe.experts.2.w3.weight": "model-00031-of-00059.safetensors", + "model.layers.29.block_sparse_moe.experts.3.w1.weight": "model-00031-of-00059.safetensors", + "model.layers.29.block_sparse_moe.experts.3.w2.weight": "model-00031-of-00059.safetensors", + "model.layers.29.block_sparse_moe.experts.3.w3.weight": "model-00031-of-00059.safetensors", + "model.layers.29.block_sparse_moe.experts.4.w1.weight": "model-00031-of-00059.safetensors", + "model.layers.29.block_sparse_moe.experts.4.w2.weight": "model-00031-of-00059.safetensors", + "model.layers.29.block_sparse_moe.experts.4.w3.weight": "model-00031-of-00059.safetensors", + "model.layers.29.block_sparse_moe.experts.5.w1.weight": "model-00031-of-00059.safetensors", + "model.layers.29.block_sparse_moe.experts.5.w2.weight": "model-00031-of-00059.safetensors", + "model.layers.29.block_sparse_moe.experts.5.w3.weight": "model-00031-of-00059.safetensors", + "model.layers.29.block_sparse_moe.experts.6.w1.weight": "model-00031-of-00059.safetensors", + "model.layers.29.block_sparse_moe.experts.6.w2.weight": "model-00031-of-00059.safetensors", + "model.layers.29.block_sparse_moe.experts.6.w3.weight": "model-00032-of-00059.safetensors", + "model.layers.29.block_sparse_moe.experts.7.w1.weight": "model-00032-of-00059.safetensors", + "model.layers.29.block_sparse_moe.experts.7.w2.weight": "model-00032-of-00059.safetensors", + "model.layers.29.block_sparse_moe.experts.7.w3.weight": "model-00032-of-00059.safetensors", + "model.layers.29.block_sparse_moe.gate.weight": "model-00031-of-00059.safetensors", + "model.layers.29.input_layernorm.weight": "model-00032-of-00059.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00032-of-00059.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00031-of-00059.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00031-of-00059.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00031-of-00059.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00031-of-00059.safetensors", + "model.layers.3.block_sparse_moe.experts.0.w1.weight": "model-00004-of-00059.safetensors", + "model.layers.3.block_sparse_moe.experts.0.w2.weight": "model-00004-of-00059.safetensors", + "model.layers.3.block_sparse_moe.experts.0.w3.weight": "model-00004-of-00059.safetensors", + "model.layers.3.block_sparse_moe.experts.1.w1.weight": "model-00004-of-00059.safetensors", + "model.layers.3.block_sparse_moe.experts.1.w2.weight": "model-00004-of-00059.safetensors", + 
"model.layers.3.block_sparse_moe.experts.1.w3.weight": "model-00004-of-00059.safetensors", + "model.layers.3.block_sparse_moe.experts.2.w1.weight": "model-00004-of-00059.safetensors", + "model.layers.3.block_sparse_moe.experts.2.w2.weight": "model-00004-of-00059.safetensors", + "model.layers.3.block_sparse_moe.experts.2.w3.weight": "model-00004-of-00059.safetensors", + "model.layers.3.block_sparse_moe.experts.3.w1.weight": "model-00004-of-00059.safetensors", + "model.layers.3.block_sparse_moe.experts.3.w2.weight": "model-00004-of-00059.safetensors", + "model.layers.3.block_sparse_moe.experts.3.w3.weight": "model-00004-of-00059.safetensors", + "model.layers.3.block_sparse_moe.experts.4.w1.weight": "model-00004-of-00059.safetensors", + "model.layers.3.block_sparse_moe.experts.4.w2.weight": "model-00004-of-00059.safetensors", + "model.layers.3.block_sparse_moe.experts.4.w3.weight": "model-00004-of-00059.safetensors", + "model.layers.3.block_sparse_moe.experts.5.w1.weight": "model-00004-of-00059.safetensors", + "model.layers.3.block_sparse_moe.experts.5.w2.weight": "model-00004-of-00059.safetensors", + "model.layers.3.block_sparse_moe.experts.5.w3.weight": "model-00004-of-00059.safetensors", + "model.layers.3.block_sparse_moe.experts.6.w1.weight": "model-00004-of-00059.safetensors", + "model.layers.3.block_sparse_moe.experts.6.w2.weight": "model-00005-of-00059.safetensors", + "model.layers.3.block_sparse_moe.experts.6.w3.weight": "model-00005-of-00059.safetensors", + "model.layers.3.block_sparse_moe.experts.7.w1.weight": "model-00005-of-00059.safetensors", + "model.layers.3.block_sparse_moe.experts.7.w2.weight": "model-00005-of-00059.safetensors", + "model.layers.3.block_sparse_moe.experts.7.w3.weight": "model-00005-of-00059.safetensors", + "model.layers.3.block_sparse_moe.gate.weight": "model-00004-of-00059.safetensors", + "model.layers.3.input_layernorm.weight": "model-00005-of-00059.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00005-of-00059.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00004-of-00059.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00004-of-00059.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00004-of-00059.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00004-of-00059.safetensors", + "model.layers.30.block_sparse_moe.experts.0.w1.weight": "model-00032-of-00059.safetensors", + "model.layers.30.block_sparse_moe.experts.0.w2.weight": "model-00032-of-00059.safetensors", + "model.layers.30.block_sparse_moe.experts.0.w3.weight": "model-00032-of-00059.safetensors", + "model.layers.30.block_sparse_moe.experts.1.w1.weight": "model-00032-of-00059.safetensors", + "model.layers.30.block_sparse_moe.experts.1.w2.weight": "model-00032-of-00059.safetensors", + "model.layers.30.block_sparse_moe.experts.1.w3.weight": "model-00032-of-00059.safetensors", + "model.layers.30.block_sparse_moe.experts.2.w1.weight": "model-00032-of-00059.safetensors", + "model.layers.30.block_sparse_moe.experts.2.w2.weight": "model-00032-of-00059.safetensors", + "model.layers.30.block_sparse_moe.experts.2.w3.weight": "model-00032-of-00059.safetensors", + "model.layers.30.block_sparse_moe.experts.3.w1.weight": "model-00032-of-00059.safetensors", + "model.layers.30.block_sparse_moe.experts.3.w2.weight": "model-00032-of-00059.safetensors", + "model.layers.30.block_sparse_moe.experts.3.w3.weight": "model-00032-of-00059.safetensors", + "model.layers.30.block_sparse_moe.experts.4.w1.weight": 
"model-00032-of-00059.safetensors", + "model.layers.30.block_sparse_moe.experts.4.w2.weight": "model-00032-of-00059.safetensors", + "model.layers.30.block_sparse_moe.experts.4.w3.weight": "model-00032-of-00059.safetensors", + "model.layers.30.block_sparse_moe.experts.5.w1.weight": "model-00032-of-00059.safetensors", + "model.layers.30.block_sparse_moe.experts.5.w2.weight": "model-00032-of-00059.safetensors", + "model.layers.30.block_sparse_moe.experts.5.w3.weight": "model-00032-of-00059.safetensors", + "model.layers.30.block_sparse_moe.experts.6.w1.weight": "model-00032-of-00059.safetensors", + "model.layers.30.block_sparse_moe.experts.6.w2.weight": "model-00033-of-00059.safetensors", + "model.layers.30.block_sparse_moe.experts.6.w3.weight": "model-00033-of-00059.safetensors", + "model.layers.30.block_sparse_moe.experts.7.w1.weight": "model-00033-of-00059.safetensors", + "model.layers.30.block_sparse_moe.experts.7.w2.weight": "model-00033-of-00059.safetensors", + "model.layers.30.block_sparse_moe.experts.7.w3.weight": "model-00033-of-00059.safetensors", + "model.layers.30.block_sparse_moe.gate.weight": "model-00032-of-00059.safetensors", + "model.layers.30.input_layernorm.weight": "model-00033-of-00059.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00033-of-00059.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00032-of-00059.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00032-of-00059.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00032-of-00059.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00032-of-00059.safetensors", + "model.layers.31.block_sparse_moe.experts.0.w1.weight": "model-00033-of-00059.safetensors", + "model.layers.31.block_sparse_moe.experts.0.w2.weight": "model-00033-of-00059.safetensors", + "model.layers.31.block_sparse_moe.experts.0.w3.weight": "model-00033-of-00059.safetensors", + "model.layers.31.block_sparse_moe.experts.1.w1.weight": "model-00033-of-00059.safetensors", + "model.layers.31.block_sparse_moe.experts.1.w2.weight": "model-00033-of-00059.safetensors", + "model.layers.31.block_sparse_moe.experts.1.w3.weight": "model-00033-of-00059.safetensors", + "model.layers.31.block_sparse_moe.experts.2.w1.weight": "model-00033-of-00059.safetensors", + "model.layers.31.block_sparse_moe.experts.2.w2.weight": "model-00033-of-00059.safetensors", + "model.layers.31.block_sparse_moe.experts.2.w3.weight": "model-00033-of-00059.safetensors", + "model.layers.31.block_sparse_moe.experts.3.w1.weight": "model-00033-of-00059.safetensors", + "model.layers.31.block_sparse_moe.experts.3.w2.weight": "model-00033-of-00059.safetensors", + "model.layers.31.block_sparse_moe.experts.3.w3.weight": "model-00033-of-00059.safetensors", + "model.layers.31.block_sparse_moe.experts.4.w1.weight": "model-00033-of-00059.safetensors", + "model.layers.31.block_sparse_moe.experts.4.w2.weight": "model-00033-of-00059.safetensors", + "model.layers.31.block_sparse_moe.experts.4.w3.weight": "model-00033-of-00059.safetensors", + "model.layers.31.block_sparse_moe.experts.5.w1.weight": "model-00033-of-00059.safetensors", + "model.layers.31.block_sparse_moe.experts.5.w2.weight": "model-00033-of-00059.safetensors", + "model.layers.31.block_sparse_moe.experts.5.w3.weight": "model-00033-of-00059.safetensors", + "model.layers.31.block_sparse_moe.experts.6.w1.weight": "model-00034-of-00059.safetensors", + "model.layers.31.block_sparse_moe.experts.6.w2.weight": "model-00034-of-00059.safetensors", + 
"model.layers.31.block_sparse_moe.experts.6.w3.weight": "model-00034-of-00059.safetensors", + "model.layers.31.block_sparse_moe.experts.7.w1.weight": "model-00034-of-00059.safetensors", + "model.layers.31.block_sparse_moe.experts.7.w2.weight": "model-00034-of-00059.safetensors", + "model.layers.31.block_sparse_moe.experts.7.w3.weight": "model-00034-of-00059.safetensors", + "model.layers.31.block_sparse_moe.gate.weight": "model-00033-of-00059.safetensors", + "model.layers.31.input_layernorm.weight": "model-00034-of-00059.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00034-of-00059.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00033-of-00059.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00033-of-00059.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00033-of-00059.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00033-of-00059.safetensors", + "model.layers.32.block_sparse_moe.experts.0.w1.weight": "model-00034-of-00059.safetensors", + "model.layers.32.block_sparse_moe.experts.0.w2.weight": "model-00034-of-00059.safetensors", + "model.layers.32.block_sparse_moe.experts.0.w3.weight": "model-00034-of-00059.safetensors", + "model.layers.32.block_sparse_moe.experts.1.w1.weight": "model-00034-of-00059.safetensors", + "model.layers.32.block_sparse_moe.experts.1.w2.weight": "model-00034-of-00059.safetensors", + "model.layers.32.block_sparse_moe.experts.1.w3.weight": "model-00034-of-00059.safetensors", + "model.layers.32.block_sparse_moe.experts.2.w1.weight": "model-00034-of-00059.safetensors", + "model.layers.32.block_sparse_moe.experts.2.w2.weight": "model-00034-of-00059.safetensors", + "model.layers.32.block_sparse_moe.experts.2.w3.weight": "model-00034-of-00059.safetensors", + "model.layers.32.block_sparse_moe.experts.3.w1.weight": "model-00034-of-00059.safetensors", + "model.layers.32.block_sparse_moe.experts.3.w2.weight": "model-00034-of-00059.safetensors", + "model.layers.32.block_sparse_moe.experts.3.w3.weight": "model-00034-of-00059.safetensors", + "model.layers.32.block_sparse_moe.experts.4.w1.weight": "model-00034-of-00059.safetensors", + "model.layers.32.block_sparse_moe.experts.4.w2.weight": "model-00034-of-00059.safetensors", + "model.layers.32.block_sparse_moe.experts.4.w3.weight": "model-00034-of-00059.safetensors", + "model.layers.32.block_sparse_moe.experts.5.w1.weight": "model-00034-of-00059.safetensors", + "model.layers.32.block_sparse_moe.experts.5.w2.weight": "model-00034-of-00059.safetensors", + "model.layers.32.block_sparse_moe.experts.5.w3.weight": "model-00035-of-00059.safetensors", + "model.layers.32.block_sparse_moe.experts.6.w1.weight": "model-00035-of-00059.safetensors", + "model.layers.32.block_sparse_moe.experts.6.w2.weight": "model-00035-of-00059.safetensors", + "model.layers.32.block_sparse_moe.experts.6.w3.weight": "model-00035-of-00059.safetensors", + "model.layers.32.block_sparse_moe.experts.7.w1.weight": "model-00035-of-00059.safetensors", + "model.layers.32.block_sparse_moe.experts.7.w2.weight": "model-00035-of-00059.safetensors", + "model.layers.32.block_sparse_moe.experts.7.w3.weight": "model-00035-of-00059.safetensors", + "model.layers.32.block_sparse_moe.gate.weight": "model-00034-of-00059.safetensors", + "model.layers.32.input_layernorm.weight": "model-00035-of-00059.safetensors", + "model.layers.32.post_attention_layernorm.weight": "model-00035-of-00059.safetensors", + "model.layers.32.self_attn.k_proj.weight": "model-00034-of-00059.safetensors", + 
"model.layers.32.self_attn.o_proj.weight": "model-00034-of-00059.safetensors", + "model.layers.32.self_attn.q_proj.weight": "model-00034-of-00059.safetensors", + "model.layers.32.self_attn.v_proj.weight": "model-00034-of-00059.safetensors", + "model.layers.33.block_sparse_moe.experts.0.w1.weight": "model-00035-of-00059.safetensors", + "model.layers.33.block_sparse_moe.experts.0.w2.weight": "model-00035-of-00059.safetensors", + "model.layers.33.block_sparse_moe.experts.0.w3.weight": "model-00035-of-00059.safetensors", + "model.layers.33.block_sparse_moe.experts.1.w1.weight": "model-00035-of-00059.safetensors", + "model.layers.33.block_sparse_moe.experts.1.w2.weight": "model-00035-of-00059.safetensors", + "model.layers.33.block_sparse_moe.experts.1.w3.weight": "model-00035-of-00059.safetensors", + "model.layers.33.block_sparse_moe.experts.2.w1.weight": "model-00035-of-00059.safetensors", + "model.layers.33.block_sparse_moe.experts.2.w2.weight": "model-00035-of-00059.safetensors", + "model.layers.33.block_sparse_moe.experts.2.w3.weight": "model-00035-of-00059.safetensors", + "model.layers.33.block_sparse_moe.experts.3.w1.weight": "model-00035-of-00059.safetensors", + "model.layers.33.block_sparse_moe.experts.3.w2.weight": "model-00035-of-00059.safetensors", + "model.layers.33.block_sparse_moe.experts.3.w3.weight": "model-00035-of-00059.safetensors", + "model.layers.33.block_sparse_moe.experts.4.w1.weight": "model-00035-of-00059.safetensors", + "model.layers.33.block_sparse_moe.experts.4.w2.weight": "model-00035-of-00059.safetensors", + "model.layers.33.block_sparse_moe.experts.4.w3.weight": "model-00035-of-00059.safetensors", + "model.layers.33.block_sparse_moe.experts.5.w1.weight": "model-00035-of-00059.safetensors", + "model.layers.33.block_sparse_moe.experts.5.w2.weight": "model-00036-of-00059.safetensors", + "model.layers.33.block_sparse_moe.experts.5.w3.weight": "model-00036-of-00059.safetensors", + "model.layers.33.block_sparse_moe.experts.6.w1.weight": "model-00036-of-00059.safetensors", + "model.layers.33.block_sparse_moe.experts.6.w2.weight": "model-00036-of-00059.safetensors", + "model.layers.33.block_sparse_moe.experts.6.w3.weight": "model-00036-of-00059.safetensors", + "model.layers.33.block_sparse_moe.experts.7.w1.weight": "model-00036-of-00059.safetensors", + "model.layers.33.block_sparse_moe.experts.7.w2.weight": "model-00036-of-00059.safetensors", + "model.layers.33.block_sparse_moe.experts.7.w3.weight": "model-00036-of-00059.safetensors", + "model.layers.33.block_sparse_moe.gate.weight": "model-00035-of-00059.safetensors", + "model.layers.33.input_layernorm.weight": "model-00036-of-00059.safetensors", + "model.layers.33.post_attention_layernorm.weight": "model-00036-of-00059.safetensors", + "model.layers.33.self_attn.k_proj.weight": "model-00035-of-00059.safetensors", + "model.layers.33.self_attn.o_proj.weight": "model-00035-of-00059.safetensors", + "model.layers.33.self_attn.q_proj.weight": "model-00035-of-00059.safetensors", + "model.layers.33.self_attn.v_proj.weight": "model-00035-of-00059.safetensors", + "model.layers.34.block_sparse_moe.experts.0.w1.weight": "model-00036-of-00059.safetensors", + "model.layers.34.block_sparse_moe.experts.0.w2.weight": "model-00036-of-00059.safetensors", + "model.layers.34.block_sparse_moe.experts.0.w3.weight": "model-00036-of-00059.safetensors", + "model.layers.34.block_sparse_moe.experts.1.w1.weight": "model-00036-of-00059.safetensors", + "model.layers.34.block_sparse_moe.experts.1.w2.weight": "model-00036-of-00059.safetensors", + 
"model.layers.34.block_sparse_moe.experts.1.w3.weight": "model-00036-of-00059.safetensors", + "model.layers.34.block_sparse_moe.experts.2.w1.weight": "model-00036-of-00059.safetensors", + "model.layers.34.block_sparse_moe.experts.2.w2.weight": "model-00036-of-00059.safetensors", + "model.layers.34.block_sparse_moe.experts.2.w3.weight": "model-00036-of-00059.safetensors", + "model.layers.34.block_sparse_moe.experts.3.w1.weight": "model-00036-of-00059.safetensors", + "model.layers.34.block_sparse_moe.experts.3.w2.weight": "model-00036-of-00059.safetensors", + "model.layers.34.block_sparse_moe.experts.3.w3.weight": "model-00036-of-00059.safetensors", + "model.layers.34.block_sparse_moe.experts.4.w1.weight": "model-00036-of-00059.safetensors", + "model.layers.34.block_sparse_moe.experts.4.w2.weight": "model-00036-of-00059.safetensors", + "model.layers.34.block_sparse_moe.experts.4.w3.weight": "model-00036-of-00059.safetensors", + "model.layers.34.block_sparse_moe.experts.5.w1.weight": "model-00037-of-00059.safetensors", + "model.layers.34.block_sparse_moe.experts.5.w2.weight": "model-00037-of-00059.safetensors", + "model.layers.34.block_sparse_moe.experts.5.w3.weight": "model-00037-of-00059.safetensors", + "model.layers.34.block_sparse_moe.experts.6.w1.weight": "model-00037-of-00059.safetensors", + "model.layers.34.block_sparse_moe.experts.6.w2.weight": "model-00037-of-00059.safetensors", + "model.layers.34.block_sparse_moe.experts.6.w3.weight": "model-00037-of-00059.safetensors", + "model.layers.34.block_sparse_moe.experts.7.w1.weight": "model-00037-of-00059.safetensors", + "model.layers.34.block_sparse_moe.experts.7.w2.weight": "model-00037-of-00059.safetensors", + "model.layers.34.block_sparse_moe.experts.7.w3.weight": "model-00037-of-00059.safetensors", + "model.layers.34.block_sparse_moe.gate.weight": "model-00036-of-00059.safetensors", + "model.layers.34.input_layernorm.weight": "model-00037-of-00059.safetensors", + "model.layers.34.post_attention_layernorm.weight": "model-00037-of-00059.safetensors", + "model.layers.34.self_attn.k_proj.weight": "model-00036-of-00059.safetensors", + "model.layers.34.self_attn.o_proj.weight": "model-00036-of-00059.safetensors", + "model.layers.34.self_attn.q_proj.weight": "model-00036-of-00059.safetensors", + "model.layers.34.self_attn.v_proj.weight": "model-00036-of-00059.safetensors", + "model.layers.35.block_sparse_moe.experts.0.w1.weight": "model-00037-of-00059.safetensors", + "model.layers.35.block_sparse_moe.experts.0.w2.weight": "model-00037-of-00059.safetensors", + "model.layers.35.block_sparse_moe.experts.0.w3.weight": "model-00037-of-00059.safetensors", + "model.layers.35.block_sparse_moe.experts.1.w1.weight": "model-00037-of-00059.safetensors", + "model.layers.35.block_sparse_moe.experts.1.w2.weight": "model-00037-of-00059.safetensors", + "model.layers.35.block_sparse_moe.experts.1.w3.weight": "model-00037-of-00059.safetensors", + "model.layers.35.block_sparse_moe.experts.2.w1.weight": "model-00037-of-00059.safetensors", + "model.layers.35.block_sparse_moe.experts.2.w2.weight": "model-00037-of-00059.safetensors", + "model.layers.35.block_sparse_moe.experts.2.w3.weight": "model-00037-of-00059.safetensors", + "model.layers.35.block_sparse_moe.experts.3.w1.weight": "model-00037-of-00059.safetensors", + "model.layers.35.block_sparse_moe.experts.3.w2.weight": "model-00037-of-00059.safetensors", + "model.layers.35.block_sparse_moe.experts.3.w3.weight": "model-00037-of-00059.safetensors", + "model.layers.35.block_sparse_moe.experts.4.w1.weight": 
"model-00037-of-00059.safetensors", + "model.layers.35.block_sparse_moe.experts.4.w2.weight": "model-00037-of-00059.safetensors", + "model.layers.35.block_sparse_moe.experts.4.w3.weight": "model-00038-of-00059.safetensors", + "model.layers.35.block_sparse_moe.experts.5.w1.weight": "model-00038-of-00059.safetensors", + "model.layers.35.block_sparse_moe.experts.5.w2.weight": "model-00038-of-00059.safetensors", + "model.layers.35.block_sparse_moe.experts.5.w3.weight": "model-00038-of-00059.safetensors", + "model.layers.35.block_sparse_moe.experts.6.w1.weight": "model-00038-of-00059.safetensors", + "model.layers.35.block_sparse_moe.experts.6.w2.weight": "model-00038-of-00059.safetensors", + "model.layers.35.block_sparse_moe.experts.6.w3.weight": "model-00038-of-00059.safetensors", + "model.layers.35.block_sparse_moe.experts.7.w1.weight": "model-00038-of-00059.safetensors", + "model.layers.35.block_sparse_moe.experts.7.w2.weight": "model-00038-of-00059.safetensors", + "model.layers.35.block_sparse_moe.experts.7.w3.weight": "model-00038-of-00059.safetensors", + "model.layers.35.block_sparse_moe.gate.weight": "model-00037-of-00059.safetensors", + "model.layers.35.input_layernorm.weight": "model-00038-of-00059.safetensors", + "model.layers.35.post_attention_layernorm.weight": "model-00038-of-00059.safetensors", + "model.layers.35.self_attn.k_proj.weight": "model-00037-of-00059.safetensors", + "model.layers.35.self_attn.o_proj.weight": "model-00037-of-00059.safetensors", + "model.layers.35.self_attn.q_proj.weight": "model-00037-of-00059.safetensors", + "model.layers.35.self_attn.v_proj.weight": "model-00037-of-00059.safetensors", + "model.layers.36.block_sparse_moe.experts.0.w1.weight": "model-00038-of-00059.safetensors", + "model.layers.36.block_sparse_moe.experts.0.w2.weight": "model-00038-of-00059.safetensors", + "model.layers.36.block_sparse_moe.experts.0.w3.weight": "model-00038-of-00059.safetensors", + "model.layers.36.block_sparse_moe.experts.1.w1.weight": "model-00038-of-00059.safetensors", + "model.layers.36.block_sparse_moe.experts.1.w2.weight": "model-00038-of-00059.safetensors", + "model.layers.36.block_sparse_moe.experts.1.w3.weight": "model-00038-of-00059.safetensors", + "model.layers.36.block_sparse_moe.experts.2.w1.weight": "model-00038-of-00059.safetensors", + "model.layers.36.block_sparse_moe.experts.2.w2.weight": "model-00038-of-00059.safetensors", + "model.layers.36.block_sparse_moe.experts.2.w3.weight": "model-00038-of-00059.safetensors", + "model.layers.36.block_sparse_moe.experts.3.w1.weight": "model-00038-of-00059.safetensors", + "model.layers.36.block_sparse_moe.experts.3.w2.weight": "model-00038-of-00059.safetensors", + "model.layers.36.block_sparse_moe.experts.3.w3.weight": "model-00038-of-00059.safetensors", + "model.layers.36.block_sparse_moe.experts.4.w1.weight": "model-00038-of-00059.safetensors", + "model.layers.36.block_sparse_moe.experts.4.w2.weight": "model-00039-of-00059.safetensors", + "model.layers.36.block_sparse_moe.experts.4.w3.weight": "model-00039-of-00059.safetensors", + "model.layers.36.block_sparse_moe.experts.5.w1.weight": "model-00039-of-00059.safetensors", + "model.layers.36.block_sparse_moe.experts.5.w2.weight": "model-00039-of-00059.safetensors", + "model.layers.36.block_sparse_moe.experts.5.w3.weight": "model-00039-of-00059.safetensors", + "model.layers.36.block_sparse_moe.experts.6.w1.weight": "model-00039-of-00059.safetensors", + "model.layers.36.block_sparse_moe.experts.6.w2.weight": "model-00039-of-00059.safetensors", + 
"model.layers.36.block_sparse_moe.experts.6.w3.weight": "model-00039-of-00059.safetensors", + "model.layers.36.block_sparse_moe.experts.7.w1.weight": "model-00039-of-00059.safetensors", + "model.layers.36.block_sparse_moe.experts.7.w2.weight": "model-00039-of-00059.safetensors", + "model.layers.36.block_sparse_moe.experts.7.w3.weight": "model-00039-of-00059.safetensors", + "model.layers.36.block_sparse_moe.gate.weight": "model-00038-of-00059.safetensors", + "model.layers.36.input_layernorm.weight": "model-00039-of-00059.safetensors", + "model.layers.36.post_attention_layernorm.weight": "model-00039-of-00059.safetensors", + "model.layers.36.self_attn.k_proj.weight": "model-00038-of-00059.safetensors", + "model.layers.36.self_attn.o_proj.weight": "model-00038-of-00059.safetensors", + "model.layers.36.self_attn.q_proj.weight": "model-00038-of-00059.safetensors", + "model.layers.36.self_attn.v_proj.weight": "model-00038-of-00059.safetensors", + "model.layers.37.block_sparse_moe.experts.0.w1.weight": "model-00039-of-00059.safetensors", + "model.layers.37.block_sparse_moe.experts.0.w2.weight": "model-00039-of-00059.safetensors", + "model.layers.37.block_sparse_moe.experts.0.w3.weight": "model-00039-of-00059.safetensors", + "model.layers.37.block_sparse_moe.experts.1.w1.weight": "model-00039-of-00059.safetensors", + "model.layers.37.block_sparse_moe.experts.1.w2.weight": "model-00039-of-00059.safetensors", + "model.layers.37.block_sparse_moe.experts.1.w3.weight": "model-00039-of-00059.safetensors", + "model.layers.37.block_sparse_moe.experts.2.w1.weight": "model-00039-of-00059.safetensors", + "model.layers.37.block_sparse_moe.experts.2.w2.weight": "model-00039-of-00059.safetensors", + "model.layers.37.block_sparse_moe.experts.2.w3.weight": "model-00039-of-00059.safetensors", + "model.layers.37.block_sparse_moe.experts.3.w1.weight": "model-00039-of-00059.safetensors", + "model.layers.37.block_sparse_moe.experts.3.w2.weight": "model-00039-of-00059.safetensors", + "model.layers.37.block_sparse_moe.experts.3.w3.weight": "model-00039-of-00059.safetensors", + "model.layers.37.block_sparse_moe.experts.4.w1.weight": "model-00040-of-00059.safetensors", + "model.layers.37.block_sparse_moe.experts.4.w2.weight": "model-00040-of-00059.safetensors", + "model.layers.37.block_sparse_moe.experts.4.w3.weight": "model-00040-of-00059.safetensors", + "model.layers.37.block_sparse_moe.experts.5.w1.weight": "model-00040-of-00059.safetensors", + "model.layers.37.block_sparse_moe.experts.5.w2.weight": "model-00040-of-00059.safetensors", + "model.layers.37.block_sparse_moe.experts.5.w3.weight": "model-00040-of-00059.safetensors", + "model.layers.37.block_sparse_moe.experts.6.w1.weight": "model-00040-of-00059.safetensors", + "model.layers.37.block_sparse_moe.experts.6.w2.weight": "model-00040-of-00059.safetensors", + "model.layers.37.block_sparse_moe.experts.6.w3.weight": "model-00040-of-00059.safetensors", + "model.layers.37.block_sparse_moe.experts.7.w1.weight": "model-00040-of-00059.safetensors", + "model.layers.37.block_sparse_moe.experts.7.w2.weight": "model-00040-of-00059.safetensors", + "model.layers.37.block_sparse_moe.experts.7.w3.weight": "model-00040-of-00059.safetensors", + "model.layers.37.block_sparse_moe.gate.weight": "model-00039-of-00059.safetensors", + "model.layers.37.input_layernorm.weight": "model-00040-of-00059.safetensors", + "model.layers.37.post_attention_layernorm.weight": "model-00040-of-00059.safetensors", + "model.layers.37.self_attn.k_proj.weight": "model-00039-of-00059.safetensors", + 
"model.layers.37.self_attn.o_proj.weight": "model-00039-of-00059.safetensors", + "model.layers.37.self_attn.q_proj.weight": "model-00039-of-00059.safetensors", + "model.layers.37.self_attn.v_proj.weight": "model-00039-of-00059.safetensors", + "model.layers.38.block_sparse_moe.experts.0.w1.weight": "model-00040-of-00059.safetensors", + "model.layers.38.block_sparse_moe.experts.0.w2.weight": "model-00040-of-00059.safetensors", + "model.layers.38.block_sparse_moe.experts.0.w3.weight": "model-00040-of-00059.safetensors", + "model.layers.38.block_sparse_moe.experts.1.w1.weight": "model-00040-of-00059.safetensors", + "model.layers.38.block_sparse_moe.experts.1.w2.weight": "model-00040-of-00059.safetensors", + "model.layers.38.block_sparse_moe.experts.1.w3.weight": "model-00040-of-00059.safetensors", + "model.layers.38.block_sparse_moe.experts.2.w1.weight": "model-00040-of-00059.safetensors", + "model.layers.38.block_sparse_moe.experts.2.w2.weight": "model-00040-of-00059.safetensors", + "model.layers.38.block_sparse_moe.experts.2.w3.weight": "model-00040-of-00059.safetensors", + "model.layers.38.block_sparse_moe.experts.3.w1.weight": "model-00040-of-00059.safetensors", + "model.layers.38.block_sparse_moe.experts.3.w2.weight": "model-00040-of-00059.safetensors", + "model.layers.38.block_sparse_moe.experts.3.w3.weight": "model-00041-of-00059.safetensors", + "model.layers.38.block_sparse_moe.experts.4.w1.weight": "model-00041-of-00059.safetensors", + "model.layers.38.block_sparse_moe.experts.4.w2.weight": "model-00041-of-00059.safetensors", + "model.layers.38.block_sparse_moe.experts.4.w3.weight": "model-00041-of-00059.safetensors", + "model.layers.38.block_sparse_moe.experts.5.w1.weight": "model-00041-of-00059.safetensors", + "model.layers.38.block_sparse_moe.experts.5.w2.weight": "model-00041-of-00059.safetensors", + "model.layers.38.block_sparse_moe.experts.5.w3.weight": "model-00041-of-00059.safetensors", + "model.layers.38.block_sparse_moe.experts.6.w1.weight": "model-00041-of-00059.safetensors", + "model.layers.38.block_sparse_moe.experts.6.w2.weight": "model-00041-of-00059.safetensors", + "model.layers.38.block_sparse_moe.experts.6.w3.weight": "model-00041-of-00059.safetensors", + "model.layers.38.block_sparse_moe.experts.7.w1.weight": "model-00041-of-00059.safetensors", + "model.layers.38.block_sparse_moe.experts.7.w2.weight": "model-00041-of-00059.safetensors", + "model.layers.38.block_sparse_moe.experts.7.w3.weight": "model-00041-of-00059.safetensors", + "model.layers.38.block_sparse_moe.gate.weight": "model-00040-of-00059.safetensors", + "model.layers.38.input_layernorm.weight": "model-00041-of-00059.safetensors", + "model.layers.38.post_attention_layernorm.weight": "model-00041-of-00059.safetensors", + "model.layers.38.self_attn.k_proj.weight": "model-00040-of-00059.safetensors", + "model.layers.38.self_attn.o_proj.weight": "model-00040-of-00059.safetensors", + "model.layers.38.self_attn.q_proj.weight": "model-00040-of-00059.safetensors", + "model.layers.38.self_attn.v_proj.weight": "model-00040-of-00059.safetensors", + "model.layers.39.block_sparse_moe.experts.0.w1.weight": "model-00041-of-00059.safetensors", + "model.layers.39.block_sparse_moe.experts.0.w2.weight": "model-00041-of-00059.safetensors", + "model.layers.39.block_sparse_moe.experts.0.w3.weight": "model-00041-of-00059.safetensors", + "model.layers.39.block_sparse_moe.experts.1.w1.weight": "model-00041-of-00059.safetensors", + "model.layers.39.block_sparse_moe.experts.1.w2.weight": "model-00041-of-00059.safetensors", + 
"model.layers.39.block_sparse_moe.experts.1.w3.weight": "model-00041-of-00059.safetensors", + "model.layers.39.block_sparse_moe.experts.2.w1.weight": "model-00041-of-00059.safetensors", + "model.layers.39.block_sparse_moe.experts.2.w2.weight": "model-00041-of-00059.safetensors", + "model.layers.39.block_sparse_moe.experts.2.w3.weight": "model-00041-of-00059.safetensors", + "model.layers.39.block_sparse_moe.experts.3.w1.weight": "model-00041-of-00059.safetensors", + "model.layers.39.block_sparse_moe.experts.3.w2.weight": "model-00042-of-00059.safetensors", + "model.layers.39.block_sparse_moe.experts.3.w3.weight": "model-00042-of-00059.safetensors", + "model.layers.39.block_sparse_moe.experts.4.w1.weight": "model-00042-of-00059.safetensors", + "model.layers.39.block_sparse_moe.experts.4.w2.weight": "model-00042-of-00059.safetensors", + "model.layers.39.block_sparse_moe.experts.4.w3.weight": "model-00042-of-00059.safetensors", + "model.layers.39.block_sparse_moe.experts.5.w1.weight": "model-00042-of-00059.safetensors", + "model.layers.39.block_sparse_moe.experts.5.w2.weight": "model-00042-of-00059.safetensors", + "model.layers.39.block_sparse_moe.experts.5.w3.weight": "model-00042-of-00059.safetensors", + "model.layers.39.block_sparse_moe.experts.6.w1.weight": "model-00042-of-00059.safetensors", + "model.layers.39.block_sparse_moe.experts.6.w2.weight": "model-00042-of-00059.safetensors", + "model.layers.39.block_sparse_moe.experts.6.w3.weight": "model-00042-of-00059.safetensors", + "model.layers.39.block_sparse_moe.experts.7.w1.weight": "model-00042-of-00059.safetensors", + "model.layers.39.block_sparse_moe.experts.7.w2.weight": "model-00042-of-00059.safetensors", + "model.layers.39.block_sparse_moe.experts.7.w3.weight": "model-00042-of-00059.safetensors", + "model.layers.39.block_sparse_moe.gate.weight": "model-00041-of-00059.safetensors", + "model.layers.39.input_layernorm.weight": "model-00042-of-00059.safetensors", + "model.layers.39.post_attention_layernorm.weight": "model-00042-of-00059.safetensors", + "model.layers.39.self_attn.k_proj.weight": "model-00041-of-00059.safetensors", + "model.layers.39.self_attn.o_proj.weight": "model-00041-of-00059.safetensors", + "model.layers.39.self_attn.q_proj.weight": "model-00041-of-00059.safetensors", + "model.layers.39.self_attn.v_proj.weight": "model-00041-of-00059.safetensors", + "model.layers.4.block_sparse_moe.experts.0.w1.weight": "model-00005-of-00059.safetensors", + "model.layers.4.block_sparse_moe.experts.0.w2.weight": "model-00005-of-00059.safetensors", + "model.layers.4.block_sparse_moe.experts.0.w3.weight": "model-00005-of-00059.safetensors", + "model.layers.4.block_sparse_moe.experts.1.w1.weight": "model-00005-of-00059.safetensors", + "model.layers.4.block_sparse_moe.experts.1.w2.weight": "model-00005-of-00059.safetensors", + "model.layers.4.block_sparse_moe.experts.1.w3.weight": "model-00005-of-00059.safetensors", + "model.layers.4.block_sparse_moe.experts.2.w1.weight": "model-00005-of-00059.safetensors", + "model.layers.4.block_sparse_moe.experts.2.w2.weight": "model-00005-of-00059.safetensors", + "model.layers.4.block_sparse_moe.experts.2.w3.weight": "model-00005-of-00059.safetensors", + "model.layers.4.block_sparse_moe.experts.3.w1.weight": "model-00005-of-00059.safetensors", + "model.layers.4.block_sparse_moe.experts.3.w2.weight": "model-00005-of-00059.safetensors", + "model.layers.4.block_sparse_moe.experts.3.w3.weight": "model-00005-of-00059.safetensors", + "model.layers.4.block_sparse_moe.experts.4.w1.weight": 
"model-00005-of-00059.safetensors", + "model.layers.4.block_sparse_moe.experts.4.w2.weight": "model-00005-of-00059.safetensors", + "model.layers.4.block_sparse_moe.experts.4.w3.weight": "model-00005-of-00059.safetensors", + "model.layers.4.block_sparse_moe.experts.5.w1.weight": "model-00005-of-00059.safetensors", + "model.layers.4.block_sparse_moe.experts.5.w2.weight": "model-00005-of-00059.safetensors", + "model.layers.4.block_sparse_moe.experts.5.w3.weight": "model-00005-of-00059.safetensors", + "model.layers.4.block_sparse_moe.experts.6.w1.weight": "model-00006-of-00059.safetensors", + "model.layers.4.block_sparse_moe.experts.6.w2.weight": "model-00006-of-00059.safetensors", + "model.layers.4.block_sparse_moe.experts.6.w3.weight": "model-00006-of-00059.safetensors", + "model.layers.4.block_sparse_moe.experts.7.w1.weight": "model-00006-of-00059.safetensors", + "model.layers.4.block_sparse_moe.experts.7.w2.weight": "model-00006-of-00059.safetensors", + "model.layers.4.block_sparse_moe.experts.7.w3.weight": "model-00006-of-00059.safetensors", + "model.layers.4.block_sparse_moe.gate.weight": "model-00005-of-00059.safetensors", + "model.layers.4.input_layernorm.weight": "model-00006-of-00059.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00006-of-00059.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00005-of-00059.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00005-of-00059.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00005-of-00059.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00005-of-00059.safetensors", + "model.layers.40.block_sparse_moe.experts.0.w1.weight": "model-00042-of-00059.safetensors", + "model.layers.40.block_sparse_moe.experts.0.w2.weight": "model-00042-of-00059.safetensors", + "model.layers.40.block_sparse_moe.experts.0.w3.weight": "model-00042-of-00059.safetensors", + "model.layers.40.block_sparse_moe.experts.1.w1.weight": "model-00042-of-00059.safetensors", + "model.layers.40.block_sparse_moe.experts.1.w2.weight": "model-00042-of-00059.safetensors", + "model.layers.40.block_sparse_moe.experts.1.w3.weight": "model-00042-of-00059.safetensors", + "model.layers.40.block_sparse_moe.experts.2.w1.weight": "model-00042-of-00059.safetensors", + "model.layers.40.block_sparse_moe.experts.2.w2.weight": "model-00042-of-00059.safetensors", + "model.layers.40.block_sparse_moe.experts.2.w3.weight": "model-00042-of-00059.safetensors", + "model.layers.40.block_sparse_moe.experts.3.w1.weight": "model-00043-of-00059.safetensors", + "model.layers.40.block_sparse_moe.experts.3.w2.weight": "model-00043-of-00059.safetensors", + "model.layers.40.block_sparse_moe.experts.3.w3.weight": "model-00043-of-00059.safetensors", + "model.layers.40.block_sparse_moe.experts.4.w1.weight": "model-00043-of-00059.safetensors", + "model.layers.40.block_sparse_moe.experts.4.w2.weight": "model-00043-of-00059.safetensors", + "model.layers.40.block_sparse_moe.experts.4.w3.weight": "model-00043-of-00059.safetensors", + "model.layers.40.block_sparse_moe.experts.5.w1.weight": "model-00043-of-00059.safetensors", + "model.layers.40.block_sparse_moe.experts.5.w2.weight": "model-00043-of-00059.safetensors", + "model.layers.40.block_sparse_moe.experts.5.w3.weight": "model-00043-of-00059.safetensors", + "model.layers.40.block_sparse_moe.experts.6.w1.weight": "model-00043-of-00059.safetensors", + "model.layers.40.block_sparse_moe.experts.6.w2.weight": "model-00043-of-00059.safetensors", + 
"model.layers.40.block_sparse_moe.experts.6.w3.weight": "model-00043-of-00059.safetensors", + "model.layers.40.block_sparse_moe.experts.7.w1.weight": "model-00043-of-00059.safetensors", + "model.layers.40.block_sparse_moe.experts.7.w2.weight": "model-00043-of-00059.safetensors", + "model.layers.40.block_sparse_moe.experts.7.w3.weight": "model-00043-of-00059.safetensors", + "model.layers.40.block_sparse_moe.gate.weight": "model-00042-of-00059.safetensors", + "model.layers.40.input_layernorm.weight": "model-00043-of-00059.safetensors", + "model.layers.40.post_attention_layernorm.weight": "model-00043-of-00059.safetensors", + "model.layers.40.self_attn.k_proj.weight": "model-00042-of-00059.safetensors", + "model.layers.40.self_attn.o_proj.weight": "model-00042-of-00059.safetensors", + "model.layers.40.self_attn.q_proj.weight": "model-00042-of-00059.safetensors", + "model.layers.40.self_attn.v_proj.weight": "model-00042-of-00059.safetensors", + "model.layers.41.block_sparse_moe.experts.0.w1.weight": "model-00043-of-00059.safetensors", + "model.layers.41.block_sparse_moe.experts.0.w2.weight": "model-00043-of-00059.safetensors", + "model.layers.41.block_sparse_moe.experts.0.w3.weight": "model-00043-of-00059.safetensors", + "model.layers.41.block_sparse_moe.experts.1.w1.weight": "model-00043-of-00059.safetensors", + "model.layers.41.block_sparse_moe.experts.1.w2.weight": "model-00043-of-00059.safetensors", + "model.layers.41.block_sparse_moe.experts.1.w3.weight": "model-00043-of-00059.safetensors", + "model.layers.41.block_sparse_moe.experts.2.w1.weight": "model-00043-of-00059.safetensors", + "model.layers.41.block_sparse_moe.experts.2.w2.weight": "model-00043-of-00059.safetensors", + "model.layers.41.block_sparse_moe.experts.2.w3.weight": "model-00044-of-00059.safetensors", + "model.layers.41.block_sparse_moe.experts.3.w1.weight": "model-00044-of-00059.safetensors", + "model.layers.41.block_sparse_moe.experts.3.w2.weight": "model-00044-of-00059.safetensors", + "model.layers.41.block_sparse_moe.experts.3.w3.weight": "model-00044-of-00059.safetensors", + "model.layers.41.block_sparse_moe.experts.4.w1.weight": "model-00044-of-00059.safetensors", + "model.layers.41.block_sparse_moe.experts.4.w2.weight": "model-00044-of-00059.safetensors", + "model.layers.41.block_sparse_moe.experts.4.w3.weight": "model-00044-of-00059.safetensors", + "model.layers.41.block_sparse_moe.experts.5.w1.weight": "model-00044-of-00059.safetensors", + "model.layers.41.block_sparse_moe.experts.5.w2.weight": "model-00044-of-00059.safetensors", + "model.layers.41.block_sparse_moe.experts.5.w3.weight": "model-00044-of-00059.safetensors", + "model.layers.41.block_sparse_moe.experts.6.w1.weight": "model-00044-of-00059.safetensors", + "model.layers.41.block_sparse_moe.experts.6.w2.weight": "model-00044-of-00059.safetensors", + "model.layers.41.block_sparse_moe.experts.6.w3.weight": "model-00044-of-00059.safetensors", + "model.layers.41.block_sparse_moe.experts.7.w1.weight": "model-00044-of-00059.safetensors", + "model.layers.41.block_sparse_moe.experts.7.w2.weight": "model-00044-of-00059.safetensors", + "model.layers.41.block_sparse_moe.experts.7.w3.weight": "model-00044-of-00059.safetensors", + "model.layers.41.block_sparse_moe.gate.weight": "model-00043-of-00059.safetensors", + "model.layers.41.input_layernorm.weight": "model-00044-of-00059.safetensors", + "model.layers.41.post_attention_layernorm.weight": "model-00044-of-00059.safetensors", + "model.layers.41.self_attn.k_proj.weight": "model-00043-of-00059.safetensors", + 
"model.layers.41.self_attn.o_proj.weight": "model-00043-of-00059.safetensors", + "model.layers.41.self_attn.q_proj.weight": "model-00043-of-00059.safetensors", + "model.layers.41.self_attn.v_proj.weight": "model-00043-of-00059.safetensors", + "model.layers.42.block_sparse_moe.experts.0.w1.weight": "model-00044-of-00059.safetensors", + "model.layers.42.block_sparse_moe.experts.0.w2.weight": "model-00044-of-00059.safetensors", + "model.layers.42.block_sparse_moe.experts.0.w3.weight": "model-00044-of-00059.safetensors", + "model.layers.42.block_sparse_moe.experts.1.w1.weight": "model-00044-of-00059.safetensors", + "model.layers.42.block_sparse_moe.experts.1.w2.weight": "model-00044-of-00059.safetensors", + "model.layers.42.block_sparse_moe.experts.1.w3.weight": "model-00044-of-00059.safetensors", + "model.layers.42.block_sparse_moe.experts.2.w1.weight": "model-00044-of-00059.safetensors", + "model.layers.42.block_sparse_moe.experts.2.w2.weight": "model-00045-of-00059.safetensors", + "model.layers.42.block_sparse_moe.experts.2.w3.weight": "model-00045-of-00059.safetensors", + "model.layers.42.block_sparse_moe.experts.3.w1.weight": "model-00045-of-00059.safetensors", + "model.layers.42.block_sparse_moe.experts.3.w2.weight": "model-00045-of-00059.safetensors", + "model.layers.42.block_sparse_moe.experts.3.w3.weight": "model-00045-of-00059.safetensors", + "model.layers.42.block_sparse_moe.experts.4.w1.weight": "model-00045-of-00059.safetensors", + "model.layers.42.block_sparse_moe.experts.4.w2.weight": "model-00045-of-00059.safetensors", + "model.layers.42.block_sparse_moe.experts.4.w3.weight": "model-00045-of-00059.safetensors", + "model.layers.42.block_sparse_moe.experts.5.w1.weight": "model-00045-of-00059.safetensors", + "model.layers.42.block_sparse_moe.experts.5.w2.weight": "model-00045-of-00059.safetensors", + "model.layers.42.block_sparse_moe.experts.5.w3.weight": "model-00045-of-00059.safetensors", + "model.layers.42.block_sparse_moe.experts.6.w1.weight": "model-00045-of-00059.safetensors", + "model.layers.42.block_sparse_moe.experts.6.w2.weight": "model-00045-of-00059.safetensors", + "model.layers.42.block_sparse_moe.experts.6.w3.weight": "model-00045-of-00059.safetensors", + "model.layers.42.block_sparse_moe.experts.7.w1.weight": "model-00045-of-00059.safetensors", + "model.layers.42.block_sparse_moe.experts.7.w2.weight": "model-00045-of-00059.safetensors", + "model.layers.42.block_sparse_moe.experts.7.w3.weight": "model-00045-of-00059.safetensors", + "model.layers.42.block_sparse_moe.gate.weight": "model-00044-of-00059.safetensors", + "model.layers.42.input_layernorm.weight": "model-00045-of-00059.safetensors", + "model.layers.42.post_attention_layernorm.weight": "model-00045-of-00059.safetensors", + "model.layers.42.self_attn.k_proj.weight": "model-00044-of-00059.safetensors", + "model.layers.42.self_attn.o_proj.weight": "model-00044-of-00059.safetensors", + "model.layers.42.self_attn.q_proj.weight": "model-00044-of-00059.safetensors", + "model.layers.42.self_attn.v_proj.weight": "model-00044-of-00059.safetensors", + "model.layers.43.block_sparse_moe.experts.0.w1.weight": "model-00045-of-00059.safetensors", + "model.layers.43.block_sparse_moe.experts.0.w2.weight": "model-00045-of-00059.safetensors", + "model.layers.43.block_sparse_moe.experts.0.w3.weight": "model-00045-of-00059.safetensors", + "model.layers.43.block_sparse_moe.experts.1.w1.weight": "model-00045-of-00059.safetensors", + "model.layers.43.block_sparse_moe.experts.1.w2.weight": "model-00045-of-00059.safetensors", + 
"model.layers.43.block_sparse_moe.experts.1.w3.weight": "model-00045-of-00059.safetensors", + "model.layers.43.block_sparse_moe.experts.2.w1.weight": "model-00046-of-00059.safetensors", + "model.layers.43.block_sparse_moe.experts.2.w2.weight": "model-00046-of-00059.safetensors", + "model.layers.43.block_sparse_moe.experts.2.w3.weight": "model-00046-of-00059.safetensors", + "model.layers.43.block_sparse_moe.experts.3.w1.weight": "model-00046-of-00059.safetensors", + "model.layers.43.block_sparse_moe.experts.3.w2.weight": "model-00046-of-00059.safetensors", + "model.layers.43.block_sparse_moe.experts.3.w3.weight": "model-00046-of-00059.safetensors", + "model.layers.43.block_sparse_moe.experts.4.w1.weight": "model-00046-of-00059.safetensors", + "model.layers.43.block_sparse_moe.experts.4.w2.weight": "model-00046-of-00059.safetensors", + "model.layers.43.block_sparse_moe.experts.4.w3.weight": "model-00046-of-00059.safetensors", + "model.layers.43.block_sparse_moe.experts.5.w1.weight": "model-00046-of-00059.safetensors", + "model.layers.43.block_sparse_moe.experts.5.w2.weight": "model-00046-of-00059.safetensors", + "model.layers.43.block_sparse_moe.experts.5.w3.weight": "model-00046-of-00059.safetensors", + "model.layers.43.block_sparse_moe.experts.6.w1.weight": "model-00046-of-00059.safetensors", + "model.layers.43.block_sparse_moe.experts.6.w2.weight": "model-00046-of-00059.safetensors", + "model.layers.43.block_sparse_moe.experts.6.w3.weight": "model-00046-of-00059.safetensors", + "model.layers.43.block_sparse_moe.experts.7.w1.weight": "model-00046-of-00059.safetensors", + "model.layers.43.block_sparse_moe.experts.7.w2.weight": "model-00046-of-00059.safetensors", + "model.layers.43.block_sparse_moe.experts.7.w3.weight": "model-00046-of-00059.safetensors", + "model.layers.43.block_sparse_moe.gate.weight": "model-00045-of-00059.safetensors", + "model.layers.43.input_layernorm.weight": "model-00046-of-00059.safetensors", + "model.layers.43.post_attention_layernorm.weight": "model-00046-of-00059.safetensors", + "model.layers.43.self_attn.k_proj.weight": "model-00045-of-00059.safetensors", + "model.layers.43.self_attn.o_proj.weight": "model-00045-of-00059.safetensors", + "model.layers.43.self_attn.q_proj.weight": "model-00045-of-00059.safetensors", + "model.layers.43.self_attn.v_proj.weight": "model-00045-of-00059.safetensors", + "model.layers.44.block_sparse_moe.experts.0.w1.weight": "model-00046-of-00059.safetensors", + "model.layers.44.block_sparse_moe.experts.0.w2.weight": "model-00046-of-00059.safetensors", + "model.layers.44.block_sparse_moe.experts.0.w3.weight": "model-00046-of-00059.safetensors", + "model.layers.44.block_sparse_moe.experts.1.w1.weight": "model-00046-of-00059.safetensors", + "model.layers.44.block_sparse_moe.experts.1.w2.weight": "model-00046-of-00059.safetensors", + "model.layers.44.block_sparse_moe.experts.1.w3.weight": "model-00047-of-00059.safetensors", + "model.layers.44.block_sparse_moe.experts.2.w1.weight": "model-00047-of-00059.safetensors", + "model.layers.44.block_sparse_moe.experts.2.w2.weight": "model-00047-of-00059.safetensors", + "model.layers.44.block_sparse_moe.experts.2.w3.weight": "model-00047-of-00059.safetensors", + "model.layers.44.block_sparse_moe.experts.3.w1.weight": "model-00047-of-00059.safetensors", + "model.layers.44.block_sparse_moe.experts.3.w2.weight": "model-00047-of-00059.safetensors", + "model.layers.44.block_sparse_moe.experts.3.w3.weight": "model-00047-of-00059.safetensors", + "model.layers.44.block_sparse_moe.experts.4.w1.weight": 
"model-00047-of-00059.safetensors", + "model.layers.44.block_sparse_moe.experts.4.w2.weight": "model-00047-of-00059.safetensors", + "model.layers.44.block_sparse_moe.experts.4.w3.weight": "model-00047-of-00059.safetensors", + "model.layers.44.block_sparse_moe.experts.5.w1.weight": "model-00047-of-00059.safetensors", + "model.layers.44.block_sparse_moe.experts.5.w2.weight": "model-00047-of-00059.safetensors", + "model.layers.44.block_sparse_moe.experts.5.w3.weight": "model-00047-of-00059.safetensors", + "model.layers.44.block_sparse_moe.experts.6.w1.weight": "model-00047-of-00059.safetensors", + "model.layers.44.block_sparse_moe.experts.6.w2.weight": "model-00047-of-00059.safetensors", + "model.layers.44.block_sparse_moe.experts.6.w3.weight": "model-00047-of-00059.safetensors", + "model.layers.44.block_sparse_moe.experts.7.w1.weight": "model-00047-of-00059.safetensors", + "model.layers.44.block_sparse_moe.experts.7.w2.weight": "model-00047-of-00059.safetensors", + "model.layers.44.block_sparse_moe.experts.7.w3.weight": "model-00047-of-00059.safetensors", + "model.layers.44.block_sparse_moe.gate.weight": "model-00046-of-00059.safetensors", + "model.layers.44.input_layernorm.weight": "model-00047-of-00059.safetensors", + "model.layers.44.post_attention_layernorm.weight": "model-00047-of-00059.safetensors", + "model.layers.44.self_attn.k_proj.weight": "model-00046-of-00059.safetensors", + "model.layers.44.self_attn.o_proj.weight": "model-00046-of-00059.safetensors", + "model.layers.44.self_attn.q_proj.weight": "model-00046-of-00059.safetensors", + "model.layers.44.self_attn.v_proj.weight": "model-00046-of-00059.safetensors", + "model.layers.45.block_sparse_moe.experts.0.w1.weight": "model-00047-of-00059.safetensors", + "model.layers.45.block_sparse_moe.experts.0.w2.weight": "model-00047-of-00059.safetensors", + "model.layers.45.block_sparse_moe.experts.0.w3.weight": "model-00047-of-00059.safetensors", + "model.layers.45.block_sparse_moe.experts.1.w1.weight": "model-00047-of-00059.safetensors", + "model.layers.45.block_sparse_moe.experts.1.w2.weight": "model-00048-of-00059.safetensors", + "model.layers.45.block_sparse_moe.experts.1.w3.weight": "model-00048-of-00059.safetensors", + "model.layers.45.block_sparse_moe.experts.2.w1.weight": "model-00048-of-00059.safetensors", + "model.layers.45.block_sparse_moe.experts.2.w2.weight": "model-00048-of-00059.safetensors", + "model.layers.45.block_sparse_moe.experts.2.w3.weight": "model-00048-of-00059.safetensors", + "model.layers.45.block_sparse_moe.experts.3.w1.weight": "model-00048-of-00059.safetensors", + "model.layers.45.block_sparse_moe.experts.3.w2.weight": "model-00048-of-00059.safetensors", + "model.layers.45.block_sparse_moe.experts.3.w3.weight": "model-00048-of-00059.safetensors", + "model.layers.45.block_sparse_moe.experts.4.w1.weight": "model-00048-of-00059.safetensors", + "model.layers.45.block_sparse_moe.experts.4.w2.weight": "model-00048-of-00059.safetensors", + "model.layers.45.block_sparse_moe.experts.4.w3.weight": "model-00048-of-00059.safetensors", + "model.layers.45.block_sparse_moe.experts.5.w1.weight": "model-00048-of-00059.safetensors", + "model.layers.45.block_sparse_moe.experts.5.w2.weight": "model-00048-of-00059.safetensors", + "model.layers.45.block_sparse_moe.experts.5.w3.weight": "model-00048-of-00059.safetensors", + "model.layers.45.block_sparse_moe.experts.6.w1.weight": "model-00048-of-00059.safetensors", + "model.layers.45.block_sparse_moe.experts.6.w2.weight": "model-00048-of-00059.safetensors", + 
"model.layers.45.block_sparse_moe.experts.6.w3.weight": "model-00048-of-00059.safetensors", + "model.layers.45.block_sparse_moe.experts.7.w1.weight": "model-00048-of-00059.safetensors", + "model.layers.45.block_sparse_moe.experts.7.w2.weight": "model-00048-of-00059.safetensors", + "model.layers.45.block_sparse_moe.experts.7.w3.weight": "model-00048-of-00059.safetensors", + "model.layers.45.block_sparse_moe.gate.weight": "model-00047-of-00059.safetensors", + "model.layers.45.input_layernorm.weight": "model-00048-of-00059.safetensors", + "model.layers.45.post_attention_layernorm.weight": "model-00048-of-00059.safetensors", + "model.layers.45.self_attn.k_proj.weight": "model-00047-of-00059.safetensors", + "model.layers.45.self_attn.o_proj.weight": "model-00047-of-00059.safetensors", + "model.layers.45.self_attn.q_proj.weight": "model-00047-of-00059.safetensors", + "model.layers.45.self_attn.v_proj.weight": "model-00047-of-00059.safetensors", + "model.layers.46.block_sparse_moe.experts.0.w1.weight": "model-00048-of-00059.safetensors", + "model.layers.46.block_sparse_moe.experts.0.w2.weight": "model-00048-of-00059.safetensors", + "model.layers.46.block_sparse_moe.experts.0.w3.weight": "model-00048-of-00059.safetensors", + "model.layers.46.block_sparse_moe.experts.1.w1.weight": "model-00049-of-00059.safetensors", + "model.layers.46.block_sparse_moe.experts.1.w2.weight": "model-00049-of-00059.safetensors", + "model.layers.46.block_sparse_moe.experts.1.w3.weight": "model-00049-of-00059.safetensors", + "model.layers.46.block_sparse_moe.experts.2.w1.weight": "model-00049-of-00059.safetensors", + "model.layers.46.block_sparse_moe.experts.2.w2.weight": "model-00049-of-00059.safetensors", + "model.layers.46.block_sparse_moe.experts.2.w3.weight": "model-00049-of-00059.safetensors", + "model.layers.46.block_sparse_moe.experts.3.w1.weight": "model-00049-of-00059.safetensors", + "model.layers.46.block_sparse_moe.experts.3.w2.weight": "model-00049-of-00059.safetensors", + "model.layers.46.block_sparse_moe.experts.3.w3.weight": "model-00049-of-00059.safetensors", + "model.layers.46.block_sparse_moe.experts.4.w1.weight": "model-00049-of-00059.safetensors", + "model.layers.46.block_sparse_moe.experts.4.w2.weight": "model-00049-of-00059.safetensors", + "model.layers.46.block_sparse_moe.experts.4.w3.weight": "model-00049-of-00059.safetensors", + "model.layers.46.block_sparse_moe.experts.5.w1.weight": "model-00049-of-00059.safetensors", + "model.layers.46.block_sparse_moe.experts.5.w2.weight": "model-00049-of-00059.safetensors", + "model.layers.46.block_sparse_moe.experts.5.w3.weight": "model-00049-of-00059.safetensors", + "model.layers.46.block_sparse_moe.experts.6.w1.weight": "model-00049-of-00059.safetensors", + "model.layers.46.block_sparse_moe.experts.6.w2.weight": "model-00049-of-00059.safetensors", + "model.layers.46.block_sparse_moe.experts.6.w3.weight": "model-00049-of-00059.safetensors", + "model.layers.46.block_sparse_moe.experts.7.w1.weight": "model-00049-of-00059.safetensors", + "model.layers.46.block_sparse_moe.experts.7.w2.weight": "model-00049-of-00059.safetensors", + "model.layers.46.block_sparse_moe.experts.7.w3.weight": "model-00049-of-00059.safetensors", + "model.layers.46.block_sparse_moe.gate.weight": "model-00048-of-00059.safetensors", + "model.layers.46.input_layernorm.weight": "model-00049-of-00059.safetensors", + "model.layers.46.post_attention_layernorm.weight": "model-00049-of-00059.safetensors", + "model.layers.46.self_attn.k_proj.weight": "model-00048-of-00059.safetensors", + 
"model.layers.46.self_attn.o_proj.weight": "model-00048-of-00059.safetensors", + "model.layers.46.self_attn.q_proj.weight": "model-00048-of-00059.safetensors", + "model.layers.46.self_attn.v_proj.weight": "model-00048-of-00059.safetensors", + "model.layers.47.block_sparse_moe.experts.0.w1.weight": "model-00049-of-00059.safetensors", + "model.layers.47.block_sparse_moe.experts.0.w2.weight": "model-00049-of-00059.safetensors", + "model.layers.47.block_sparse_moe.experts.0.w3.weight": "model-00050-of-00059.safetensors", + "model.layers.47.block_sparse_moe.experts.1.w1.weight": "model-00050-of-00059.safetensors", + "model.layers.47.block_sparse_moe.experts.1.w2.weight": "model-00050-of-00059.safetensors", + "model.layers.47.block_sparse_moe.experts.1.w3.weight": "model-00050-of-00059.safetensors", + "model.layers.47.block_sparse_moe.experts.2.w1.weight": "model-00050-of-00059.safetensors", + "model.layers.47.block_sparse_moe.experts.2.w2.weight": "model-00050-of-00059.safetensors", + "model.layers.47.block_sparse_moe.experts.2.w3.weight": "model-00050-of-00059.safetensors", + "model.layers.47.block_sparse_moe.experts.3.w1.weight": "model-00050-of-00059.safetensors", + "model.layers.47.block_sparse_moe.experts.3.w2.weight": "model-00050-of-00059.safetensors", + "model.layers.47.block_sparse_moe.experts.3.w3.weight": "model-00050-of-00059.safetensors", + "model.layers.47.block_sparse_moe.experts.4.w1.weight": "model-00050-of-00059.safetensors", + "model.layers.47.block_sparse_moe.experts.4.w2.weight": "model-00050-of-00059.safetensors", + "model.layers.47.block_sparse_moe.experts.4.w3.weight": "model-00050-of-00059.safetensors", + "model.layers.47.block_sparse_moe.experts.5.w1.weight": "model-00050-of-00059.safetensors", + "model.layers.47.block_sparse_moe.experts.5.w2.weight": "model-00050-of-00059.safetensors", + "model.layers.47.block_sparse_moe.experts.5.w3.weight": "model-00050-of-00059.safetensors", + "model.layers.47.block_sparse_moe.experts.6.w1.weight": "model-00050-of-00059.safetensors", + "model.layers.47.block_sparse_moe.experts.6.w2.weight": "model-00050-of-00059.safetensors", + "model.layers.47.block_sparse_moe.experts.6.w3.weight": "model-00050-of-00059.safetensors", + "model.layers.47.block_sparse_moe.experts.7.w1.weight": "model-00050-of-00059.safetensors", + "model.layers.47.block_sparse_moe.experts.7.w2.weight": "model-00050-of-00059.safetensors", + "model.layers.47.block_sparse_moe.experts.7.w3.weight": "model-00050-of-00059.safetensors", + "model.layers.47.block_sparse_moe.gate.weight": "model-00049-of-00059.safetensors", + "model.layers.47.input_layernorm.weight": "model-00050-of-00059.safetensors", + "model.layers.47.post_attention_layernorm.weight": "model-00050-of-00059.safetensors", + "model.layers.47.self_attn.k_proj.weight": "model-00049-of-00059.safetensors", + "model.layers.47.self_attn.o_proj.weight": "model-00049-of-00059.safetensors", + "model.layers.47.self_attn.q_proj.weight": "model-00049-of-00059.safetensors", + "model.layers.47.self_attn.v_proj.weight": "model-00049-of-00059.safetensors", + "model.layers.48.block_sparse_moe.experts.0.w1.weight": "model-00050-of-00059.safetensors", + "model.layers.48.block_sparse_moe.experts.0.w2.weight": "model-00051-of-00059.safetensors", + "model.layers.48.block_sparse_moe.experts.0.w3.weight": "model-00051-of-00059.safetensors", + "model.layers.48.block_sparse_moe.experts.1.w1.weight": "model-00051-of-00059.safetensors", + "model.layers.48.block_sparse_moe.experts.1.w2.weight": "model-00051-of-00059.safetensors", + 
"model.layers.48.block_sparse_moe.experts.1.w3.weight": "model-00051-of-00059.safetensors", + "model.layers.48.block_sparse_moe.experts.2.w1.weight": "model-00051-of-00059.safetensors", + "model.layers.48.block_sparse_moe.experts.2.w2.weight": "model-00051-of-00059.safetensors", + "model.layers.48.block_sparse_moe.experts.2.w3.weight": "model-00051-of-00059.safetensors", + "model.layers.48.block_sparse_moe.experts.3.w1.weight": "model-00051-of-00059.safetensors", + "model.layers.48.block_sparse_moe.experts.3.w2.weight": "model-00051-of-00059.safetensors", + "model.layers.48.block_sparse_moe.experts.3.w3.weight": "model-00051-of-00059.safetensors", + "model.layers.48.block_sparse_moe.experts.4.w1.weight": "model-00051-of-00059.safetensors", + "model.layers.48.block_sparse_moe.experts.4.w2.weight": "model-00051-of-00059.safetensors", + "model.layers.48.block_sparse_moe.experts.4.w3.weight": "model-00051-of-00059.safetensors", + "model.layers.48.block_sparse_moe.experts.5.w1.weight": "model-00051-of-00059.safetensors", + "model.layers.48.block_sparse_moe.experts.5.w2.weight": "model-00051-of-00059.safetensors", + "model.layers.48.block_sparse_moe.experts.5.w3.weight": "model-00051-of-00059.safetensors", + "model.layers.48.block_sparse_moe.experts.6.w1.weight": "model-00051-of-00059.safetensors", + "model.layers.48.block_sparse_moe.experts.6.w2.weight": "model-00051-of-00059.safetensors", + "model.layers.48.block_sparse_moe.experts.6.w3.weight": "model-00051-of-00059.safetensors", + "model.layers.48.block_sparse_moe.experts.7.w1.weight": "model-00051-of-00059.safetensors", + "model.layers.48.block_sparse_moe.experts.7.w2.weight": "model-00051-of-00059.safetensors", + "model.layers.48.block_sparse_moe.experts.7.w3.weight": "model-00051-of-00059.safetensors", + "model.layers.48.block_sparse_moe.gate.weight": "model-00050-of-00059.safetensors", + "model.layers.48.input_layernorm.weight": "model-00051-of-00059.safetensors", + "model.layers.48.post_attention_layernorm.weight": "model-00051-of-00059.safetensors", + "model.layers.48.self_attn.k_proj.weight": "model-00050-of-00059.safetensors", + "model.layers.48.self_attn.o_proj.weight": "model-00050-of-00059.safetensors", + "model.layers.48.self_attn.q_proj.weight": "model-00050-of-00059.safetensors", + "model.layers.48.self_attn.v_proj.weight": "model-00050-of-00059.safetensors", + "model.layers.49.block_sparse_moe.experts.0.w1.weight": "model-00052-of-00059.safetensors", + "model.layers.49.block_sparse_moe.experts.0.w2.weight": "model-00052-of-00059.safetensors", + "model.layers.49.block_sparse_moe.experts.0.w3.weight": "model-00052-of-00059.safetensors", + "model.layers.49.block_sparse_moe.experts.1.w1.weight": "model-00052-of-00059.safetensors", + "model.layers.49.block_sparse_moe.experts.1.w2.weight": "model-00052-of-00059.safetensors", + "model.layers.49.block_sparse_moe.experts.1.w3.weight": "model-00052-of-00059.safetensors", + "model.layers.49.block_sparse_moe.experts.2.w1.weight": "model-00052-of-00059.safetensors", + "model.layers.49.block_sparse_moe.experts.2.w2.weight": "model-00052-of-00059.safetensors", + "model.layers.49.block_sparse_moe.experts.2.w3.weight": "model-00052-of-00059.safetensors", + "model.layers.49.block_sparse_moe.experts.3.w1.weight": "model-00052-of-00059.safetensors", + "model.layers.49.block_sparse_moe.experts.3.w2.weight": "model-00052-of-00059.safetensors", + "model.layers.49.block_sparse_moe.experts.3.w3.weight": "model-00052-of-00059.safetensors", + "model.layers.49.block_sparse_moe.experts.4.w1.weight": 
"model-00052-of-00059.safetensors", + "model.layers.49.block_sparse_moe.experts.4.w2.weight": "model-00052-of-00059.safetensors", + "model.layers.49.block_sparse_moe.experts.4.w3.weight": "model-00052-of-00059.safetensors", + "model.layers.49.block_sparse_moe.experts.5.w1.weight": "model-00052-of-00059.safetensors", + "model.layers.49.block_sparse_moe.experts.5.w2.weight": "model-00052-of-00059.safetensors", + "model.layers.49.block_sparse_moe.experts.5.w3.weight": "model-00052-of-00059.safetensors", + "model.layers.49.block_sparse_moe.experts.6.w1.weight": "model-00052-of-00059.safetensors", + "model.layers.49.block_sparse_moe.experts.6.w2.weight": "model-00052-of-00059.safetensors", + "model.layers.49.block_sparse_moe.experts.6.w3.weight": "model-00052-of-00059.safetensors", + "model.layers.49.block_sparse_moe.experts.7.w1.weight": "model-00052-of-00059.safetensors", + "model.layers.49.block_sparse_moe.experts.7.w2.weight": "model-00052-of-00059.safetensors", + "model.layers.49.block_sparse_moe.experts.7.w3.weight": "model-00052-of-00059.safetensors", + "model.layers.49.block_sparse_moe.gate.weight": "model-00051-of-00059.safetensors", + "model.layers.49.input_layernorm.weight": "model-00052-of-00059.safetensors", + "model.layers.49.post_attention_layernorm.weight": "model-00052-of-00059.safetensors", + "model.layers.49.self_attn.k_proj.weight": "model-00051-of-00059.safetensors", + "model.layers.49.self_attn.o_proj.weight": "model-00051-of-00059.safetensors", + "model.layers.49.self_attn.q_proj.weight": "model-00051-of-00059.safetensors", + "model.layers.49.self_attn.v_proj.weight": "model-00051-of-00059.safetensors", + "model.layers.5.block_sparse_moe.experts.0.w1.weight": "model-00006-of-00059.safetensors", + "model.layers.5.block_sparse_moe.experts.0.w2.weight": "model-00006-of-00059.safetensors", + "model.layers.5.block_sparse_moe.experts.0.w3.weight": "model-00006-of-00059.safetensors", + "model.layers.5.block_sparse_moe.experts.1.w1.weight": "model-00006-of-00059.safetensors", + "model.layers.5.block_sparse_moe.experts.1.w2.weight": "model-00006-of-00059.safetensors", + "model.layers.5.block_sparse_moe.experts.1.w3.weight": "model-00006-of-00059.safetensors", + "model.layers.5.block_sparse_moe.experts.2.w1.weight": "model-00006-of-00059.safetensors", + "model.layers.5.block_sparse_moe.experts.2.w2.weight": "model-00006-of-00059.safetensors", + "model.layers.5.block_sparse_moe.experts.2.w3.weight": "model-00006-of-00059.safetensors", + "model.layers.5.block_sparse_moe.experts.3.w1.weight": "model-00006-of-00059.safetensors", + "model.layers.5.block_sparse_moe.experts.3.w2.weight": "model-00006-of-00059.safetensors", + "model.layers.5.block_sparse_moe.experts.3.w3.weight": "model-00006-of-00059.safetensors", + "model.layers.5.block_sparse_moe.experts.4.w1.weight": "model-00006-of-00059.safetensors", + "model.layers.5.block_sparse_moe.experts.4.w2.weight": "model-00006-of-00059.safetensors", + "model.layers.5.block_sparse_moe.experts.4.w3.weight": "model-00006-of-00059.safetensors", + "model.layers.5.block_sparse_moe.experts.5.w1.weight": "model-00006-of-00059.safetensors", + "model.layers.5.block_sparse_moe.experts.5.w2.weight": "model-00006-of-00059.safetensors", + "model.layers.5.block_sparse_moe.experts.5.w3.weight": "model-00007-of-00059.safetensors", + "model.layers.5.block_sparse_moe.experts.6.w1.weight": "model-00007-of-00059.safetensors", + "model.layers.5.block_sparse_moe.experts.6.w2.weight": "model-00007-of-00059.safetensors", + 
"model.layers.5.block_sparse_moe.experts.6.w3.weight": "model-00007-of-00059.safetensors", + "model.layers.5.block_sparse_moe.experts.7.w1.weight": "model-00007-of-00059.safetensors", + "model.layers.5.block_sparse_moe.experts.7.w2.weight": "model-00007-of-00059.safetensors", + "model.layers.5.block_sparse_moe.experts.7.w3.weight": "model-00007-of-00059.safetensors", + "model.layers.5.block_sparse_moe.gate.weight": "model-00006-of-00059.safetensors", + "model.layers.5.input_layernorm.weight": "model-00007-of-00059.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00007-of-00059.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00006-of-00059.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00006-of-00059.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00006-of-00059.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00006-of-00059.safetensors", + "model.layers.50.block_sparse_moe.experts.0.w1.weight": "model-00053-of-00059.safetensors", + "model.layers.50.block_sparse_moe.experts.0.w2.weight": "model-00053-of-00059.safetensors", + "model.layers.50.block_sparse_moe.experts.0.w3.weight": "model-00053-of-00059.safetensors", + "model.layers.50.block_sparse_moe.experts.1.w1.weight": "model-00053-of-00059.safetensors", + "model.layers.50.block_sparse_moe.experts.1.w2.weight": "model-00053-of-00059.safetensors", + "model.layers.50.block_sparse_moe.experts.1.w3.weight": "model-00053-of-00059.safetensors", + "model.layers.50.block_sparse_moe.experts.2.w1.weight": "model-00053-of-00059.safetensors", + "model.layers.50.block_sparse_moe.experts.2.w2.weight": "model-00053-of-00059.safetensors", + "model.layers.50.block_sparse_moe.experts.2.w3.weight": "model-00053-of-00059.safetensors", + "model.layers.50.block_sparse_moe.experts.3.w1.weight": "model-00053-of-00059.safetensors", + "model.layers.50.block_sparse_moe.experts.3.w2.weight": "model-00053-of-00059.safetensors", + "model.layers.50.block_sparse_moe.experts.3.w3.weight": "model-00053-of-00059.safetensors", + "model.layers.50.block_sparse_moe.experts.4.w1.weight": "model-00053-of-00059.safetensors", + "model.layers.50.block_sparse_moe.experts.4.w2.weight": "model-00053-of-00059.safetensors", + "model.layers.50.block_sparse_moe.experts.4.w3.weight": "model-00053-of-00059.safetensors", + "model.layers.50.block_sparse_moe.experts.5.w1.weight": "model-00053-of-00059.safetensors", + "model.layers.50.block_sparse_moe.experts.5.w2.weight": "model-00053-of-00059.safetensors", + "model.layers.50.block_sparse_moe.experts.5.w3.weight": "model-00053-of-00059.safetensors", + "model.layers.50.block_sparse_moe.experts.6.w1.weight": "model-00053-of-00059.safetensors", + "model.layers.50.block_sparse_moe.experts.6.w2.weight": "model-00053-of-00059.safetensors", + "model.layers.50.block_sparse_moe.experts.6.w3.weight": "model-00053-of-00059.safetensors", + "model.layers.50.block_sparse_moe.experts.7.w1.weight": "model-00053-of-00059.safetensors", + "model.layers.50.block_sparse_moe.experts.7.w2.weight": "model-00053-of-00059.safetensors", + "model.layers.50.block_sparse_moe.experts.7.w3.weight": "model-00053-of-00059.safetensors", + "model.layers.50.block_sparse_moe.gate.weight": "model-00053-of-00059.safetensors", + "model.layers.50.input_layernorm.weight": "model-00053-of-00059.safetensors", + "model.layers.50.post_attention_layernorm.weight": "model-00053-of-00059.safetensors", + "model.layers.50.self_attn.k_proj.weight": "model-00052-of-00059.safetensors", + 
"model.layers.50.self_attn.o_proj.weight": "model-00053-of-00059.safetensors", + "model.layers.50.self_attn.q_proj.weight": "model-00052-of-00059.safetensors", + "model.layers.50.self_attn.v_proj.weight": "model-00052-of-00059.safetensors", + "model.layers.51.block_sparse_moe.experts.0.w1.weight": "model-00054-of-00059.safetensors", + "model.layers.51.block_sparse_moe.experts.0.w2.weight": "model-00054-of-00059.safetensors", + "model.layers.51.block_sparse_moe.experts.0.w3.weight": "model-00054-of-00059.safetensors", + "model.layers.51.block_sparse_moe.experts.1.w1.weight": "model-00054-of-00059.safetensors", + "model.layers.51.block_sparse_moe.experts.1.w2.weight": "model-00054-of-00059.safetensors", + "model.layers.51.block_sparse_moe.experts.1.w3.weight": "model-00054-of-00059.safetensors", + "model.layers.51.block_sparse_moe.experts.2.w1.weight": "model-00054-of-00059.safetensors", + "model.layers.51.block_sparse_moe.experts.2.w2.weight": "model-00054-of-00059.safetensors", + "model.layers.51.block_sparse_moe.experts.2.w3.weight": "model-00054-of-00059.safetensors", + "model.layers.51.block_sparse_moe.experts.3.w1.weight": "model-00054-of-00059.safetensors", + "model.layers.51.block_sparse_moe.experts.3.w2.weight": "model-00054-of-00059.safetensors", + "model.layers.51.block_sparse_moe.experts.3.w3.weight": "model-00054-of-00059.safetensors", + "model.layers.51.block_sparse_moe.experts.4.w1.weight": "model-00054-of-00059.safetensors", + "model.layers.51.block_sparse_moe.experts.4.w2.weight": "model-00054-of-00059.safetensors", + "model.layers.51.block_sparse_moe.experts.4.w3.weight": "model-00054-of-00059.safetensors", + "model.layers.51.block_sparse_moe.experts.5.w1.weight": "model-00054-of-00059.safetensors", + "model.layers.51.block_sparse_moe.experts.5.w2.weight": "model-00054-of-00059.safetensors", + "model.layers.51.block_sparse_moe.experts.5.w3.weight": "model-00054-of-00059.safetensors", + "model.layers.51.block_sparse_moe.experts.6.w1.weight": "model-00054-of-00059.safetensors", + "model.layers.51.block_sparse_moe.experts.6.w2.weight": "model-00054-of-00059.safetensors", + "model.layers.51.block_sparse_moe.experts.6.w3.weight": "model-00054-of-00059.safetensors", + "model.layers.51.block_sparse_moe.experts.7.w1.weight": "model-00054-of-00059.safetensors", + "model.layers.51.block_sparse_moe.experts.7.w2.weight": "model-00054-of-00059.safetensors", + "model.layers.51.block_sparse_moe.experts.7.w3.weight": "model-00054-of-00059.safetensors", + "model.layers.51.block_sparse_moe.gate.weight": "model-00054-of-00059.safetensors", + "model.layers.51.input_layernorm.weight": "model-00054-of-00059.safetensors", + "model.layers.51.post_attention_layernorm.weight": "model-00054-of-00059.safetensors", + "model.layers.51.self_attn.k_proj.weight": "model-00053-of-00059.safetensors", + "model.layers.51.self_attn.o_proj.weight": "model-00054-of-00059.safetensors", + "model.layers.51.self_attn.q_proj.weight": "model-00053-of-00059.safetensors", + "model.layers.51.self_attn.v_proj.weight": "model-00054-of-00059.safetensors", + "model.layers.52.block_sparse_moe.experts.0.w1.weight": "model-00055-of-00059.safetensors", + "model.layers.52.block_sparse_moe.experts.0.w2.weight": "model-00055-of-00059.safetensors", + "model.layers.52.block_sparse_moe.experts.0.w3.weight": "model-00055-of-00059.safetensors", + "model.layers.52.block_sparse_moe.experts.1.w1.weight": "model-00055-of-00059.safetensors", + "model.layers.52.block_sparse_moe.experts.1.w2.weight": "model-00055-of-00059.safetensors", + 
"model.layers.52.block_sparse_moe.experts.1.w3.weight": "model-00055-of-00059.safetensors", + "model.layers.52.block_sparse_moe.experts.2.w1.weight": "model-00055-of-00059.safetensors", + "model.layers.52.block_sparse_moe.experts.2.w2.weight": "model-00055-of-00059.safetensors", + "model.layers.52.block_sparse_moe.experts.2.w3.weight": "model-00055-of-00059.safetensors", + "model.layers.52.block_sparse_moe.experts.3.w1.weight": "model-00055-of-00059.safetensors", + "model.layers.52.block_sparse_moe.experts.3.w2.weight": "model-00055-of-00059.safetensors", + "model.layers.52.block_sparse_moe.experts.3.w3.weight": "model-00055-of-00059.safetensors", + "model.layers.52.block_sparse_moe.experts.4.w1.weight": "model-00055-of-00059.safetensors", + "model.layers.52.block_sparse_moe.experts.4.w2.weight": "model-00055-of-00059.safetensors", + "model.layers.52.block_sparse_moe.experts.4.w3.weight": "model-00055-of-00059.safetensors", + "model.layers.52.block_sparse_moe.experts.5.w1.weight": "model-00055-of-00059.safetensors", + "model.layers.52.block_sparse_moe.experts.5.w2.weight": "model-00055-of-00059.safetensors", + "model.layers.52.block_sparse_moe.experts.5.w3.weight": "model-00055-of-00059.safetensors", + "model.layers.52.block_sparse_moe.experts.6.w1.weight": "model-00055-of-00059.safetensors", + "model.layers.52.block_sparse_moe.experts.6.w2.weight": "model-00055-of-00059.safetensors", + "model.layers.52.block_sparse_moe.experts.6.w3.weight": "model-00055-of-00059.safetensors", + "model.layers.52.block_sparse_moe.experts.7.w1.weight": "model-00055-of-00059.safetensors", + "model.layers.52.block_sparse_moe.experts.7.w2.weight": "model-00055-of-00059.safetensors", + "model.layers.52.block_sparse_moe.experts.7.w3.weight": "model-00055-of-00059.safetensors", + "model.layers.52.block_sparse_moe.gate.weight": "model-00055-of-00059.safetensors", + "model.layers.52.input_layernorm.weight": "model-00055-of-00059.safetensors", + "model.layers.52.post_attention_layernorm.weight": "model-00055-of-00059.safetensors", + "model.layers.52.self_attn.k_proj.weight": "model-00055-of-00059.safetensors", + "model.layers.52.self_attn.o_proj.weight": "model-00055-of-00059.safetensors", + "model.layers.52.self_attn.q_proj.weight": "model-00054-of-00059.safetensors", + "model.layers.52.self_attn.v_proj.weight": "model-00055-of-00059.safetensors", + "model.layers.53.block_sparse_moe.experts.0.w1.weight": "model-00056-of-00059.safetensors", + "model.layers.53.block_sparse_moe.experts.0.w2.weight": "model-00056-of-00059.safetensors", + "model.layers.53.block_sparse_moe.experts.0.w3.weight": "model-00056-of-00059.safetensors", + "model.layers.53.block_sparse_moe.experts.1.w1.weight": "model-00056-of-00059.safetensors", + "model.layers.53.block_sparse_moe.experts.1.w2.weight": "model-00056-of-00059.safetensors", + "model.layers.53.block_sparse_moe.experts.1.w3.weight": "model-00056-of-00059.safetensors", + "model.layers.53.block_sparse_moe.experts.2.w1.weight": "model-00056-of-00059.safetensors", + "model.layers.53.block_sparse_moe.experts.2.w2.weight": "model-00056-of-00059.safetensors", + "model.layers.53.block_sparse_moe.experts.2.w3.weight": "model-00056-of-00059.safetensors", + "model.layers.53.block_sparse_moe.experts.3.w1.weight": "model-00056-of-00059.safetensors", + "model.layers.53.block_sparse_moe.experts.3.w2.weight": "model-00056-of-00059.safetensors", + "model.layers.53.block_sparse_moe.experts.3.w3.weight": "model-00056-of-00059.safetensors", + "model.layers.53.block_sparse_moe.experts.4.w1.weight": 
"model-00056-of-00059.safetensors", + "model.layers.53.block_sparse_moe.experts.4.w2.weight": "model-00056-of-00059.safetensors", + "model.layers.53.block_sparse_moe.experts.4.w3.weight": "model-00056-of-00059.safetensors", + "model.layers.53.block_sparse_moe.experts.5.w1.weight": "model-00056-of-00059.safetensors", + "model.layers.53.block_sparse_moe.experts.5.w2.weight": "model-00056-of-00059.safetensors", + "model.layers.53.block_sparse_moe.experts.5.w3.weight": "model-00056-of-00059.safetensors", + "model.layers.53.block_sparse_moe.experts.6.w1.weight": "model-00056-of-00059.safetensors", + "model.layers.53.block_sparse_moe.experts.6.w2.weight": "model-00056-of-00059.safetensors", + "model.layers.53.block_sparse_moe.experts.6.w3.weight": "model-00056-of-00059.safetensors", + "model.layers.53.block_sparse_moe.experts.7.w1.weight": "model-00056-of-00059.safetensors", + "model.layers.53.block_sparse_moe.experts.7.w2.weight": "model-00056-of-00059.safetensors", + "model.layers.53.block_sparse_moe.experts.7.w3.weight": "model-00057-of-00059.safetensors", + "model.layers.53.block_sparse_moe.gate.weight": "model-00056-of-00059.safetensors", + "model.layers.53.input_layernorm.weight": "model-00057-of-00059.safetensors", + "model.layers.53.post_attention_layernorm.weight": "model-00057-of-00059.safetensors", + "model.layers.53.self_attn.k_proj.weight": "model-00056-of-00059.safetensors", + "model.layers.53.self_attn.o_proj.weight": "model-00056-of-00059.safetensors", + "model.layers.53.self_attn.q_proj.weight": "model-00056-of-00059.safetensors", + "model.layers.53.self_attn.v_proj.weight": "model-00056-of-00059.safetensors", + "model.layers.54.block_sparse_moe.experts.0.w1.weight": "model-00057-of-00059.safetensors", + "model.layers.54.block_sparse_moe.experts.0.w2.weight": "model-00057-of-00059.safetensors", + "model.layers.54.block_sparse_moe.experts.0.w3.weight": "model-00057-of-00059.safetensors", + "model.layers.54.block_sparse_moe.experts.1.w1.weight": "model-00057-of-00059.safetensors", + "model.layers.54.block_sparse_moe.experts.1.w2.weight": "model-00057-of-00059.safetensors", + "model.layers.54.block_sparse_moe.experts.1.w3.weight": "model-00057-of-00059.safetensors", + "model.layers.54.block_sparse_moe.experts.2.w1.weight": "model-00057-of-00059.safetensors", + "model.layers.54.block_sparse_moe.experts.2.w2.weight": "model-00057-of-00059.safetensors", + "model.layers.54.block_sparse_moe.experts.2.w3.weight": "model-00057-of-00059.safetensors", + "model.layers.54.block_sparse_moe.experts.3.w1.weight": "model-00057-of-00059.safetensors", + "model.layers.54.block_sparse_moe.experts.3.w2.weight": "model-00057-of-00059.safetensors", + "model.layers.54.block_sparse_moe.experts.3.w3.weight": "model-00057-of-00059.safetensors", + "model.layers.54.block_sparse_moe.experts.4.w1.weight": "model-00057-of-00059.safetensors", + "model.layers.54.block_sparse_moe.experts.4.w2.weight": "model-00057-of-00059.safetensors", + "model.layers.54.block_sparse_moe.experts.4.w3.weight": "model-00057-of-00059.safetensors", + "model.layers.54.block_sparse_moe.experts.5.w1.weight": "model-00057-of-00059.safetensors", + "model.layers.54.block_sparse_moe.experts.5.w2.weight": "model-00057-of-00059.safetensors", + "model.layers.54.block_sparse_moe.experts.5.w3.weight": "model-00057-of-00059.safetensors", + "model.layers.54.block_sparse_moe.experts.6.w1.weight": "model-00057-of-00059.safetensors", + "model.layers.54.block_sparse_moe.experts.6.w2.weight": "model-00057-of-00059.safetensors", + 
"model.layers.54.block_sparse_moe.experts.6.w3.weight": "model-00057-of-00059.safetensors", + "model.layers.54.block_sparse_moe.experts.7.w1.weight": "model-00057-of-00059.safetensors", + "model.layers.54.block_sparse_moe.experts.7.w2.weight": "model-00058-of-00059.safetensors", + "model.layers.54.block_sparse_moe.experts.7.w3.weight": "model-00058-of-00059.safetensors", + "model.layers.54.block_sparse_moe.gate.weight": "model-00057-of-00059.safetensors", + "model.layers.54.input_layernorm.weight": "model-00058-of-00059.safetensors", + "model.layers.54.post_attention_layernorm.weight": "model-00058-of-00059.safetensors", + "model.layers.54.self_attn.k_proj.weight": "model-00057-of-00059.safetensors", + "model.layers.54.self_attn.o_proj.weight": "model-00057-of-00059.safetensors", + "model.layers.54.self_attn.q_proj.weight": "model-00057-of-00059.safetensors", + "model.layers.54.self_attn.v_proj.weight": "model-00057-of-00059.safetensors", + "model.layers.55.block_sparse_moe.experts.0.w1.weight": "model-00058-of-00059.safetensors", + "model.layers.55.block_sparse_moe.experts.0.w2.weight": "model-00058-of-00059.safetensors", + "model.layers.55.block_sparse_moe.experts.0.w3.weight": "model-00058-of-00059.safetensors", + "model.layers.55.block_sparse_moe.experts.1.w1.weight": "model-00058-of-00059.safetensors", + "model.layers.55.block_sparse_moe.experts.1.w2.weight": "model-00058-of-00059.safetensors", + "model.layers.55.block_sparse_moe.experts.1.w3.weight": "model-00058-of-00059.safetensors", + "model.layers.55.block_sparse_moe.experts.2.w1.weight": "model-00058-of-00059.safetensors", + "model.layers.55.block_sparse_moe.experts.2.w2.weight": "model-00058-of-00059.safetensors", + "model.layers.55.block_sparse_moe.experts.2.w3.weight": "model-00058-of-00059.safetensors", + "model.layers.55.block_sparse_moe.experts.3.w1.weight": "model-00058-of-00059.safetensors", + "model.layers.55.block_sparse_moe.experts.3.w2.weight": "model-00058-of-00059.safetensors", + "model.layers.55.block_sparse_moe.experts.3.w3.weight": "model-00058-of-00059.safetensors", + "model.layers.55.block_sparse_moe.experts.4.w1.weight": "model-00058-of-00059.safetensors", + "model.layers.55.block_sparse_moe.experts.4.w2.weight": "model-00058-of-00059.safetensors", + "model.layers.55.block_sparse_moe.experts.4.w3.weight": "model-00058-of-00059.safetensors", + "model.layers.55.block_sparse_moe.experts.5.w1.weight": "model-00058-of-00059.safetensors", + "model.layers.55.block_sparse_moe.experts.5.w2.weight": "model-00058-of-00059.safetensors", + "model.layers.55.block_sparse_moe.experts.5.w3.weight": "model-00058-of-00059.safetensors", + "model.layers.55.block_sparse_moe.experts.6.w1.weight": "model-00058-of-00059.safetensors", + "model.layers.55.block_sparse_moe.experts.6.w2.weight": "model-00058-of-00059.safetensors", + "model.layers.55.block_sparse_moe.experts.6.w3.weight": "model-00058-of-00059.safetensors", + "model.layers.55.block_sparse_moe.experts.7.w1.weight": "model-00059-of-00059.safetensors", + "model.layers.55.block_sparse_moe.experts.7.w2.weight": "model-00059-of-00059.safetensors", + "model.layers.55.block_sparse_moe.experts.7.w3.weight": "model-00059-of-00059.safetensors", + "model.layers.55.block_sparse_moe.gate.weight": "model-00058-of-00059.safetensors", + "model.layers.55.input_layernorm.weight": "model-00059-of-00059.safetensors", + "model.layers.55.post_attention_layernorm.weight": "model-00059-of-00059.safetensors", + "model.layers.55.self_attn.k_proj.weight": "model-00058-of-00059.safetensors", + 
"model.layers.55.self_attn.o_proj.weight": "model-00058-of-00059.safetensors", + "model.layers.55.self_attn.q_proj.weight": "model-00058-of-00059.safetensors", + "model.layers.55.self_attn.v_proj.weight": "model-00058-of-00059.safetensors", + "model.layers.6.block_sparse_moe.experts.0.w1.weight": "model-00007-of-00059.safetensors", + "model.layers.6.block_sparse_moe.experts.0.w2.weight": "model-00007-of-00059.safetensors", + "model.layers.6.block_sparse_moe.experts.0.w3.weight": "model-00007-of-00059.safetensors", + "model.layers.6.block_sparse_moe.experts.1.w1.weight": "model-00007-of-00059.safetensors", + "model.layers.6.block_sparse_moe.experts.1.w2.weight": "model-00007-of-00059.safetensors", + "model.layers.6.block_sparse_moe.experts.1.w3.weight": "model-00007-of-00059.safetensors", + "model.layers.6.block_sparse_moe.experts.2.w1.weight": "model-00007-of-00059.safetensors", + "model.layers.6.block_sparse_moe.experts.2.w2.weight": "model-00007-of-00059.safetensors", + "model.layers.6.block_sparse_moe.experts.2.w3.weight": "model-00007-of-00059.safetensors", + "model.layers.6.block_sparse_moe.experts.3.w1.weight": "model-00007-of-00059.safetensors", + "model.layers.6.block_sparse_moe.experts.3.w2.weight": "model-00007-of-00059.safetensors", + "model.layers.6.block_sparse_moe.experts.3.w3.weight": "model-00007-of-00059.safetensors", + "model.layers.6.block_sparse_moe.experts.4.w1.weight": "model-00007-of-00059.safetensors", + "model.layers.6.block_sparse_moe.experts.4.w2.weight": "model-00007-of-00059.safetensors", + "model.layers.6.block_sparse_moe.experts.4.w3.weight": "model-00007-of-00059.safetensors", + "model.layers.6.block_sparse_moe.experts.5.w1.weight": "model-00007-of-00059.safetensors", + "model.layers.6.block_sparse_moe.experts.5.w2.weight": "model-00008-of-00059.safetensors", + "model.layers.6.block_sparse_moe.experts.5.w3.weight": "model-00008-of-00059.safetensors", + "model.layers.6.block_sparse_moe.experts.6.w1.weight": "model-00008-of-00059.safetensors", + "model.layers.6.block_sparse_moe.experts.6.w2.weight": "model-00008-of-00059.safetensors", + "model.layers.6.block_sparse_moe.experts.6.w3.weight": "model-00008-of-00059.safetensors", + "model.layers.6.block_sparse_moe.experts.7.w1.weight": "model-00008-of-00059.safetensors", + "model.layers.6.block_sparse_moe.experts.7.w2.weight": "model-00008-of-00059.safetensors", + "model.layers.6.block_sparse_moe.experts.7.w3.weight": "model-00008-of-00059.safetensors", + "model.layers.6.block_sparse_moe.gate.weight": "model-00007-of-00059.safetensors", + "model.layers.6.input_layernorm.weight": "model-00008-of-00059.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00008-of-00059.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00007-of-00059.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00007-of-00059.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00007-of-00059.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00007-of-00059.safetensors", + "model.layers.7.block_sparse_moe.experts.0.w1.weight": "model-00008-of-00059.safetensors", + "model.layers.7.block_sparse_moe.experts.0.w2.weight": "model-00008-of-00059.safetensors", + "model.layers.7.block_sparse_moe.experts.0.w3.weight": "model-00008-of-00059.safetensors", + "model.layers.7.block_sparse_moe.experts.1.w1.weight": "model-00008-of-00059.safetensors", + "model.layers.7.block_sparse_moe.experts.1.w2.weight": "model-00008-of-00059.safetensors", + 
"model.layers.7.block_sparse_moe.experts.1.w3.weight": "model-00008-of-00059.safetensors", + "model.layers.7.block_sparse_moe.experts.2.w1.weight": "model-00008-of-00059.safetensors", + "model.layers.7.block_sparse_moe.experts.2.w2.weight": "model-00008-of-00059.safetensors", + "model.layers.7.block_sparse_moe.experts.2.w3.weight": "model-00008-of-00059.safetensors", + "model.layers.7.block_sparse_moe.experts.3.w1.weight": "model-00008-of-00059.safetensors", + "model.layers.7.block_sparse_moe.experts.3.w2.weight": "model-00008-of-00059.safetensors", + "model.layers.7.block_sparse_moe.experts.3.w3.weight": "model-00008-of-00059.safetensors", + "model.layers.7.block_sparse_moe.experts.4.w1.weight": "model-00008-of-00059.safetensors", + "model.layers.7.block_sparse_moe.experts.4.w2.weight": "model-00008-of-00059.safetensors", + "model.layers.7.block_sparse_moe.experts.4.w3.weight": "model-00008-of-00059.safetensors", + "model.layers.7.block_sparse_moe.experts.5.w1.weight": "model-00009-of-00059.safetensors", + "model.layers.7.block_sparse_moe.experts.5.w2.weight": "model-00009-of-00059.safetensors", + "model.layers.7.block_sparse_moe.experts.5.w3.weight": "model-00009-of-00059.safetensors", + "model.layers.7.block_sparse_moe.experts.6.w1.weight": "model-00009-of-00059.safetensors", + "model.layers.7.block_sparse_moe.experts.6.w2.weight": "model-00009-of-00059.safetensors", + "model.layers.7.block_sparse_moe.experts.6.w3.weight": "model-00009-of-00059.safetensors", + "model.layers.7.block_sparse_moe.experts.7.w1.weight": "model-00009-of-00059.safetensors", + "model.layers.7.block_sparse_moe.experts.7.w2.weight": "model-00009-of-00059.safetensors", + "model.layers.7.block_sparse_moe.experts.7.w3.weight": "model-00009-of-00059.safetensors", + "model.layers.7.block_sparse_moe.gate.weight": "model-00008-of-00059.safetensors", + "model.layers.7.input_layernorm.weight": "model-00009-of-00059.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00009-of-00059.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00008-of-00059.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00008-of-00059.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00008-of-00059.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00008-of-00059.safetensors", + "model.layers.8.block_sparse_moe.experts.0.w1.weight": "model-00009-of-00059.safetensors", + "model.layers.8.block_sparse_moe.experts.0.w2.weight": "model-00009-of-00059.safetensors", + "model.layers.8.block_sparse_moe.experts.0.w3.weight": "model-00009-of-00059.safetensors", + "model.layers.8.block_sparse_moe.experts.1.w1.weight": "model-00009-of-00059.safetensors", + "model.layers.8.block_sparse_moe.experts.1.w2.weight": "model-00009-of-00059.safetensors", + "model.layers.8.block_sparse_moe.experts.1.w3.weight": "model-00009-of-00059.safetensors", + "model.layers.8.block_sparse_moe.experts.2.w1.weight": "model-00009-of-00059.safetensors", + "model.layers.8.block_sparse_moe.experts.2.w2.weight": "model-00009-of-00059.safetensors", + "model.layers.8.block_sparse_moe.experts.2.w3.weight": "model-00009-of-00059.safetensors", + "model.layers.8.block_sparse_moe.experts.3.w1.weight": "model-00009-of-00059.safetensors", + "model.layers.8.block_sparse_moe.experts.3.w2.weight": "model-00009-of-00059.safetensors", + "model.layers.8.block_sparse_moe.experts.3.w3.weight": "model-00009-of-00059.safetensors", + "model.layers.8.block_sparse_moe.experts.4.w1.weight": "model-00009-of-00059.safetensors", + 
"model.layers.8.block_sparse_moe.experts.4.w2.weight": "model-00009-of-00059.safetensors", + "model.layers.8.block_sparse_moe.experts.4.w3.weight": "model-00010-of-00059.safetensors", + "model.layers.8.block_sparse_moe.experts.5.w1.weight": "model-00010-of-00059.safetensors", + "model.layers.8.block_sparse_moe.experts.5.w2.weight": "model-00010-of-00059.safetensors", + "model.layers.8.block_sparse_moe.experts.5.w3.weight": "model-00010-of-00059.safetensors", + "model.layers.8.block_sparse_moe.experts.6.w1.weight": "model-00010-of-00059.safetensors", + "model.layers.8.block_sparse_moe.experts.6.w2.weight": "model-00010-of-00059.safetensors", + "model.layers.8.block_sparse_moe.experts.6.w3.weight": "model-00010-of-00059.safetensors", + "model.layers.8.block_sparse_moe.experts.7.w1.weight": "model-00010-of-00059.safetensors", + "model.layers.8.block_sparse_moe.experts.7.w2.weight": "model-00010-of-00059.safetensors", + "model.layers.8.block_sparse_moe.experts.7.w3.weight": "model-00010-of-00059.safetensors", + "model.layers.8.block_sparse_moe.gate.weight": "model-00009-of-00059.safetensors", + "model.layers.8.input_layernorm.weight": "model-00010-of-00059.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00010-of-00059.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00009-of-00059.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00009-of-00059.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00009-of-00059.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00009-of-00059.safetensors", + "model.layers.9.block_sparse_moe.experts.0.w1.weight": "model-00010-of-00059.safetensors", + "model.layers.9.block_sparse_moe.experts.0.w2.weight": "model-00010-of-00059.safetensors", + "model.layers.9.block_sparse_moe.experts.0.w3.weight": "model-00010-of-00059.safetensors", + "model.layers.9.block_sparse_moe.experts.1.w1.weight": "model-00010-of-00059.safetensors", + "model.layers.9.block_sparse_moe.experts.1.w2.weight": "model-00010-of-00059.safetensors", + "model.layers.9.block_sparse_moe.experts.1.w3.weight": "model-00010-of-00059.safetensors", + "model.layers.9.block_sparse_moe.experts.2.w1.weight": "model-00010-of-00059.safetensors", + "model.layers.9.block_sparse_moe.experts.2.w2.weight": "model-00010-of-00059.safetensors", + "model.layers.9.block_sparse_moe.experts.2.w3.weight": "model-00010-of-00059.safetensors", + "model.layers.9.block_sparse_moe.experts.3.w1.weight": "model-00010-of-00059.safetensors", + "model.layers.9.block_sparse_moe.experts.3.w2.weight": "model-00010-of-00059.safetensors", + "model.layers.9.block_sparse_moe.experts.3.w3.weight": "model-00010-of-00059.safetensors", + "model.layers.9.block_sparse_moe.experts.4.w1.weight": "model-00010-of-00059.safetensors", + "model.layers.9.block_sparse_moe.experts.4.w2.weight": "model-00011-of-00059.safetensors", + "model.layers.9.block_sparse_moe.experts.4.w3.weight": "model-00011-of-00059.safetensors", + "model.layers.9.block_sparse_moe.experts.5.w1.weight": "model-00011-of-00059.safetensors", + "model.layers.9.block_sparse_moe.experts.5.w2.weight": "model-00011-of-00059.safetensors", + "model.layers.9.block_sparse_moe.experts.5.w3.weight": "model-00011-of-00059.safetensors", + "model.layers.9.block_sparse_moe.experts.6.w1.weight": "model-00011-of-00059.safetensors", + "model.layers.9.block_sparse_moe.experts.6.w2.weight": "model-00011-of-00059.safetensors", + "model.layers.9.block_sparse_moe.experts.6.w3.weight": "model-00011-of-00059.safetensors", + 
"model.layers.9.block_sparse_moe.experts.7.w1.weight": "model-00011-of-00059.safetensors", + "model.layers.9.block_sparse_moe.experts.7.w2.weight": "model-00011-of-00059.safetensors", + "model.layers.9.block_sparse_moe.experts.7.w3.weight": "model-00011-of-00059.safetensors", + "model.layers.9.block_sparse_moe.gate.weight": "model-00010-of-00059.safetensors", + "model.layers.9.input_layernorm.weight": "model-00011-of-00059.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00011-of-00059.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00010-of-00059.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00010-of-00059.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00010-of-00059.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00010-of-00059.safetensors", + "model.norm.weight": "model-00059-of-00059.safetensors" + } +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..14761dcf1466dc232bd41de9c21d4c617b15755e --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.model b/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8b443ef19c2a19acc3ac64fb9c3db4a72921dff6 --- /dev/null +++ b/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055 +size 493443 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fd92a7b4ef5fd2a5b85ed21c2458d3f79ce9ba90 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [], + "bos_token": "", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{{ messages[0]['content'].strip() }}{% else %}{% set loop_messages = messages %}{{ 'A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user\\'s questions.' 
diff --git a/tokenizer_config.json b/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..fd92a7b4ef5fd2a5b85ed21c2458d3f79ce9ba90
--- /dev/null
+++ b/tokenizer_config.json
@@ -0,0 +1,44 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [],
+  "bos_token": "<s>",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{{ messages[0]['content'].strip() }}{% else %}{% set loop_messages = messages %}{{ 'A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user\\'s questions.' }}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{% if message['role'] == 'system' or message['role'] == 'user' %}{{ ' USER: ' + message['content'].strip() }}{% else %}{{ ' ASSISTANT: ' + message['content'].strip() + eos_token }}{% endif %}{% else %}{% if message['role'] == 'system' or message['role'] == 'user' %}{{ '\nUSER: ' + message['content'].strip() }}{% else %}{{ ' ASSISTANT: ' + message['content'].strip() + eos_token }}{% endif %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ ' ASSISTANT:' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": true,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "",
+  "padding_side": "right",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": true
+}
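The `chat_template` above is the Vicuna-style `USER:`/`ASSISTANT:` format, with a default system prompt baked in when none is supplied. A short sketch of how it renders, assuming `tokenizer` was loaded as in the earlier snippet; the messages are invented for illustration:

```python
# Minimal sketch: render the Vicuna-style chat template shipped in tokenizer_config.json.
messages = [
    {"role": "system", "content": "You are a narrator for a collaborative fantasy story."},  # example text
    {"role": "user", "content": "Describe the sorcerer's tower."},                           # example text
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
# -> the system text, followed by " USER: Describe the sorcerer's tower. ASSISTANT:"
```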
diff --git a/train/sorc.toml b/train/sorc.toml
new file mode 100644
index 0000000000000000000000000000000000000000..4a948ca49c00948047de2b09323c60d8bba1c12f
--- /dev/null
+++ b/train/sorc.toml
@@ -0,0 +1,154 @@
+# Paths
+model = '/workspace/model'
+output_dir = '/workspace/out'
+
+# Lora configuration
+# can use full_fine_tune=true and no quantization to train the whole model instead of a LoRA
+#full_fine_tune = true
+lora_rank = 16
+lora_alpha = 32
+lora_dropout = 0.05
+
+# Train only specific modules. This is passed to the parameter of the same name in the LoraConfig.
+# If not set, adapt all linear modules.
+# Note, this ALSO affects full fine tuning. In that case, if this is set, only weights containing one
+# of these keys as substring will have requires_grad. If not set everything is trained.
+#target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj']
+
+# can specify layers to adapt with LoRA if you want
+#layers_to_transform = '16:31'
+
+# for Mixtral, set the load balancing coefficient
+# load_balancing_loss_coef = 0.02
+
+# Optimization configuration
+epochs = 2
+lr_scheduler = 'cosine' # can also be 'constant'
+warmup_steps = 50
+
+# might be useful if resuming from a checkpoint and you want to change the LR and force it to something
+#force_constant_lr = 5e-5
+
+# hard clamp the magnitude of the LoRA weights
+#scale_weight_norms = 1.0
+
+# dynamic batch size, targeting this many tokens per batch, per device
+# if set, completely ignores the batch size in the deepspeed JSON config file
+# can be thought of as a replacement for sample packing
+batch_size_tokens = 10000
+
+# Performance settings
+pipeline_stages = 8 # number of pipeline parallel stages, must evenly divide the number of GPUs you launch the script with
+logging_steps = 10 # how often to log in Tensorboard
+eval_steps = 500
+save_steps = 500
+checkpoint_every_n_minutes = 60
+eval_before_first_step = false # do an eval before any training happens
+# dtype to load the underlying model weights in
+model_weight_dtype = 'bfloat16'
+# dtype for the LoRA weights
+lora_weight_dtype = 'bfloat16'
+# Can have the saved weights be different dtype. Don't need to set this. Could be useful for
+# training in float32 but saving with float16.
+#save_dtype = 'bfloat16'
+# Keep this number of stepXXXX (model saves) and global_stepXXX (checkpoint saves) and delete the rest
+# (this only applies to the current training session, and resumed training sessions will not touch
+# old saves)
+keep_states = 5
+
+# sort examples by length before dividing them into batches
+# this makes all examples in a batch approximately the same length, to minimize padding
+# the batches are still shuffled after that
+# you should probably always have this set to true
+group_by_length = true
+
+# This can also be 'unsloth' to offload hidden states to CPU, saving potentially a lot of VRAM
+# for a minor performance hit.
+# Example: 4x4090, PCIE 3.0 16x, pipeline_stages=4, training QLoRA on Llama 3 70B with 4096 sequence length.
+# true: 75s step time, 19.7G peak per-GPU VRAM usage.
+# 'unsloth': 78s step time, 16.2G peak per-GPU VRAM usage.
+activation_checkpointing = 'unsloth'
+
+# Keep MLP weights on system RAM until they are needed. Can save a ton of VRAM with a
+# moderate hit to performance. If using an MoE model, this can also be an integer, in
+# which case only that many experts are offloaded (tradeoff between VRAM and speed).
+offload_mlp_to_cpu = 2
+
+# Resume a prior run
+# if true, we attempt to resume training from the most recent directory inside output_dir (the directory names are timestamps)
+# so, to resume, just run the exact same command but set this to true first
+resume_from_checkpoint = false
+
+# Loading the optimizer states seems to cause some kind of unavoidable VRAM memory leak.
+# It's very small, only about 0.2 GB in cases I've seen. But if you are very close to the
+# limit, it can cause resuming from checkpoint to OOM. As a last resort, you can uncomment
+# this to not load the optimizer states and hopefully the resumption won't OOM.
+#load_optimizer_states = false
+
+
+# Dataset configuration
+
+# How to combine multiple datasets if you have more than one.
+# Can be 'concatenate' or 'interleave'. Will be 'concatenate' if not set.
+dataset_combination_mode = 'interleave'
+# When to stop interleaving datasets when using mode 'interleave'. Either 'first_exhausted' or 'all_exhausted'.
+# Default if not set: 'first_exhausted'
+dataset_interleave_stopping_strategy = 'all_exhausted'
+# Can set this lower than training, so we don't drop as many examples when trying to make equal-sized batches.
+# Default if not set: same as training GAS.
+eval_gradient_accumulation_steps = 1
+
+# bitsandbytes 4 bit quantization. The parameters here become arguments to Transformers BitsAndBytesConfig.
+#[quantization.bnb]
+#load_in_4bit = true
+#bnb_4bit_use_double_quant = false
+#bnb_4bit_compute_dtype = 'bfloat16'
+
+# HQQ quantization. The parameters here become arguments to CustomHQQConfig.
+# [quantization.hqq]
+# nbits = 4
+# group_size = 64
+# compute_dtype = 'bfloat16'
+
+# (Optional) You can override the quant params for certain modules. This does substring matching, e.g. if 'gate_proj'
+# is a substring of the full module name, anything specified overwrites the defaults in [quantization.hqq].
+# [quantization.hqq.dynamic_config]
+# gate_proj = {nbits = 2, group_size = 16, quant_zero = true, quant_scale = true}
+# up_proj = {nbits = 2, group_size = 16, quant_zero = true, quant_scale = true}
+# down_proj = {nbits = 2, group_size = 16, quant_zero = true, quant_scale = true}
+
+[optimizer]
+# options: adamw_kahan, AdamW, AdamW8bit
+type = 'adamw_kahan'
+lr = 5e-5
+beta1 = 0.9
+beta2 = 0.99
+weight_decay = 0.1
+
+[[datasets]]
+# Arbitrary name, used only for separately logging eval metrics. Will be dataset0, dataset1, etc if not set.
+name = 'c2'
+dataset_type = 'axolotl'
+dataset_path = '../axolotl/sorc.yml'
+sequence_len = 8192
+eval_size = 0.01
+# Relative sampling weight, when using combination mode 'interleave'. Will be 1 if not set.
+sample_weight = 1
+
+#[[datasets]]
+#name = 'capybara'
+#dataset_type = 'axolotl'
+#dataset_path = 'examples/capybara.yml'
+#sequence_len = 2048
+#eval_size = 0.02
+#sample_weight = 1.5
+
+# In addition to using eval_size which splits off some of the dataset, we can have completely separate datasets for eval.
+# This can be useful if you're training on raw text data, so that the eval set remains completely fixed, even if
+# you change training sequence_len, etc.
+# This is just an example, typically you wouldn't have this overlap a training dataset.
+# [[eval_datasets]]
+# name = 'capybara'
+# dataset_type = 'axolotl'
+# dataset_path = 'examples/capybara.yml'
+# sequence_len = 2048
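`train/sorc.toml` is the qlora-pipe config used for the run: r=16, alpha=32, dropout 0.05, 2 epochs, cosine schedule, token-based batching, and 8 pipeline stages. For orientation, a rough sketch of the same LoRA hyperparameters expressed as a peft `LoraConfig`; qlora-pipe builds its adapter config internally, so this is an illustration only, and `target_modules="all-linear"` is an assumption standing in for the TOML's "adapt all linear modules" default:

```python
# Rough equivalent of the LoRA settings in train/sorc.toml, expressed with peft
# (illustration only; the actual run used qlora-pipe, not this snippet).
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=16,                         # lora_rank
    lora_alpha=32,                # lora_alpha
    lora_dropout=0.05,            # lora_dropout
    target_modules="all-linear",  # assumption: mirrors the "adapt all linear modules" default (needs a recent peft)
    task_type="CAUSAL_LM",
)
# peft_model = get_peft_model(base_model, lora_config)  # base_model loaded as in the first snippet
```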
diff --git a/train/sorc_ds.json b/train/sorc_ds.json
new file mode 100644
index 0000000000000000000000000000000000000000..b1b6ea22218a9bef21ae36fbb2643c89e86362c0
--- /dev/null
+++ b/train/sorc_ds.json
@@ -0,0 +1,6 @@
+{
+  "train_micro_batch_size_per_gpu": 1,
+  "gradient_accumulation_steps": 2,
+  "gradient_clipping": 1.0,
+  "steps_per_print": 1
+}
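`train/sorc_ds.json` supplies the remaining DeepSpeed settings (micro-batch size, gradient accumulation, gradient clipping). A back-of-envelope sketch of how the two configs combine; the GPU count is an assumption for illustration, not something the repo states:

```python
# Back-of-envelope: tokens per optimizer step under these configs (assumptions noted inline).
batch_size_tokens = 10_000        # from sorc.toml; per its comment, this replaces the micro-batch size
gradient_accumulation_steps = 2   # from sorc_ds.json
pipeline_stages = 8               # from sorc.toml
num_gpus = 8                      # assumed launch size; not stated in the repo

data_parallel_replicas = num_gpus // pipeline_stages
tokens_per_step = batch_size_tokens * gradient_accumulation_steps * data_parallel_replicas
print(tokens_per_step)  # 20000 tokens per optimizer step under these assumptions
```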