diff --git "a/ndarray-cache-b16.json" "b/ndarray-cache-b16.json" new file mode 100644--- /dev/null +++ "b/ndarray-cache-b16.json" @@ -0,0 +1,6843 @@ +{ + "metadata": { + "ParamSize": 540, + "ParamBytes": 9828481024.0, + "BitsPerParam": 5.006075648161553 + }, + "records": [ + { + "dataPath": "params_shard_0.bin", + "format": "raw-shard", + "nbytes": 104857600, + "records": [ + { + "name": "model.embed_tokens.q_weight", + "shape": [ + 102400, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 104857600, + "byteOffset": 0 + } + ], + "md5sum": "aecba7f129dff063694b75eb9df73aef" + }, + { + "dataPath": "params_shard_1.bin", + "format": "raw-shard", + "nbytes": 104857600, + "records": [ + { + "name": "lm_head.q_weight", + "shape": [ + 102400, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 104857600, + "byteOffset": 0 + } + ], + "md5sum": "9c777fb06365103e905464b96ab47072" + }, + { + "dataPath": "params_shard_2.bin", + "format": "raw-shard", + "nbytes": 31601664, + "records": [ + { + "name": "model.embed_tokens.q_scale", + "shape": [ + 102400, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 13107200, + "byteOffset": 0 + }, + { + "name": "model.norm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 13107200 + }, + { + "name": "lm_head.q_scale", + "shape": [ + 102400, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 13107200, + "byteOffset": 13111296 + }, + { + "name": "model.layers.0.self_attn.q_proj.q_weight", + "shape": [ + 3072, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 3145728, + "byteOffset": 26218496 + }, + { + "name": "model.layers.0.self_attn.q_proj.q_scale", + "shape": [ + 3072, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 393216, + "byteOffset": 29364224 + }, + { + "name": "model.layers.0.self_attn.kv_a_proj_with_mqa.q_weight", + "shape": [ + 576, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 589824, + "byteOffset": 29757440 + }, + { + "name": "model.layers.0.self_attn.kv_a_proj_with_mqa.q_scale", + "shape": [ + 576, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 73728, + "byteOffset": 30347264 + }, + { + "name": "model.layers.0.self_attn.kv_a_layernorm.weight", + "shape": [ + 512 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1024, + "byteOffset": 30420992 + }, + { + "name": "model.layers.0.self_attn.kv_b_proj.q_weight", + "shape": [ + 4096, + 64 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 1048576, + "byteOffset": 30422016 + }, + { + "name": "model.layers.0.self_attn.kv_b_proj.q_scale", + "shape": [ + 4096, + 16 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 131072, + "byteOffset": 31470592 + } + ], + "md5sum": "3edbb7d4fea234666177e0002d8012e1" + }, + { + "dataPath": "params_shard_3.bin", + "format": "raw-shard", + "nbytes": 27574272, + "records": [ + { + "name": "model.layers.0.self_attn.o_proj.q_weight", + "shape": [ + 2048, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2097152, + "byteOffset": 0 + }, + { + "name": "model.layers.0.self_attn.o_proj.q_scale", + "shape": [ + 2048, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 2097152 + }, + { + "name": "model.layers.0.mlp.gate_up_proj.q_weight", + "shape": [ + 21888, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 22413312, + "byteOffset": 2359296 + }, + { + "name": "model.layers.0.mlp.gate_up_proj.q_scale", + "shape": [ + 21888, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 2801664, + "byteOffset": 24772608 + } + ], + "md5sum": "5c550c39b3503f125dc0e8939f6db19e" + }, + { + "dataPath": "params_shard_4.bin", + "format": "raw-shard", + "nbytes": 184549376, + "records": [ + { + "name": "model.layers.1.mlp.moe_gate_up_proj.q_weight", + "shape": [ + 64, + 2816, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 184549376, + "byteOffset": 0 + } + ], + "md5sum": "2cb32a075e79c2e7977117d320c7158a" + }, + { + "dataPath": "params_shard_5.bin", + "format": "raw-shard", + "nbytes": 23068672, + "records": [ + { + "name": "model.layers.1.mlp.moe_gate_up_proj.q_scale", + "shape": [ + 64, + 2816, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 23068672, + "byteOffset": 0 + } + ], + "md5sum": "e119fa8ffb9e4331d8e160b7787551d9" + }, + { + "dataPath": "params_shard_6.bin", + "format": "raw-shard", + "nbytes": 92274688, + "records": [ + { + "name": "model.layers.1.mlp.moe_down_proj.q_weight", + "shape": [ + 64, + 2048, + 176 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 92274688, + "byteOffset": 0 + } + ], + "md5sum": "130e1f34c853f056a36481565657501f" + }, + { + "dataPath": "params_shard_7.bin", + "format": "raw-shard", + "nbytes": 30352384, + "records": [ + { + "name": "model.layers.0.mlp.down_proj.q_weight", + "shape": [ + 2048, + 1368 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 11206656, + "byteOffset": 0 + }, + { + "name": "model.layers.0.mlp.down_proj.q_scale", + "shape": [ + 2048, + 342 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1400832, + "byteOffset": 11206656 + }, + { + "name": "model.layers.0.input_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 12607488 + }, + { + "name": "model.layers.0.post_attention_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 12611584 + }, + { + "name": "model.layers.1.self_attn.q_proj.q_weight", + "shape": [ + 3072, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 3145728, + "byteOffset": 12615680 + }, + { + "name": "model.layers.1.self_attn.q_proj.q_scale", + "shape": [ + 3072, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 393216, + "byteOffset": 15761408 + }, + { + "name": "model.layers.1.self_attn.kv_a_proj_with_mqa.q_weight", + "shape": [ + 576, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 589824, + "byteOffset": 16154624 + }, + { + "name": "model.layers.1.self_attn.kv_a_proj_with_mqa.q_scale", + "shape": [ + 576, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 73728, + "byteOffset": 16744448 + }, + { + "name": "model.layers.1.self_attn.kv_a_layernorm.weight", + "shape": [ + 512 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1024, + "byteOffset": 16818176 + }, + { + "name": "model.layers.1.self_attn.kv_b_proj.q_weight", + "shape": [ + 4096, + 64 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 1048576, + "byteOffset": 16819200 + }, + { + "name": "model.layers.1.self_attn.kv_b_proj.q_scale", + "shape": [ + 4096, + 16 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 131072, + "byteOffset": 17867776 + }, + { + "name": "model.layers.1.self_attn.o_proj.q_weight", + "shape": [ + 2048, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2097152, + "byteOffset": 17998848 + }, + { + "name": "model.layers.1.self_attn.o_proj.q_scale", + "shape": [ + 2048, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 20096000 + }, + { + "name": "model.layers.1.mlp.gate.weight", + "shape": [ + 64, + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 20358144 + }, + { + "name": "model.layers.1.mlp.shared_experts.gate_up_proj.q_weight", + "shape": [ + 5632, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 5767168, + "byteOffset": 20620288 + }, + { + "name": "model.layers.1.mlp.shared_experts.gate_up_proj.q_scale", + "shape": [ + 5632, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 720896, + "byteOffset": 26387456 + }, + { + "name": "model.layers.1.mlp.shared_experts.down_proj.q_weight", + "shape": [ + 2048, + 352 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2883584, + "byteOffset": 27108352 + }, + { + "name": "model.layers.1.mlp.shared_experts.down_proj.q_scale", + "shape": [ + 2048, + 88 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 360448, + "byteOffset": 29991936 + } + ], + "md5sum": "b44c15db5d762e1e20c897ce21ab125c" + }, + { + "dataPath": "params_shard_8.bin", + "format": "raw-shard", + "nbytes": 184549376, + "records": [ + { + "name": "model.layers.2.mlp.moe_gate_up_proj.q_weight", + "shape": [ + 64, + 2816, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 184549376, + "byteOffset": 0 + } + ], + "md5sum": "bf13ebf4204d41ac2c696d2746cb8109" + }, + { + "dataPath": "params_shard_9.bin", + "format": "raw-shard", + "nbytes": 23068672, + "records": [ + { + "name": "model.layers.2.mlp.moe_gate_up_proj.q_scale", + "shape": [ + 64, + 2816, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 23068672, + "byteOffset": 0 + } + ], + "md5sum": "c4fe0af0dd214a94b671e4fa54c53199" + }, + { + "dataPath": "params_shard_10.bin", + "format": "raw-shard", + "nbytes": 92274688, + "records": [ + { + "name": "model.layers.2.mlp.moe_down_proj.q_weight", + "shape": [ + 64, + 2048, + 176 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 92274688, + "byteOffset": 0 + } + ], + "md5sum": "a5858f0200fec84b0f14556c372a0380" + }, + { + "dataPath": "params_shard_11.bin", + "format": "raw-shard", + "nbytes": 29279232, + "records": [ + { + "name": "model.layers.1.mlp.moe_down_proj.q_scale", + "shape": [ + 64, + 2048, + 44 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 11534336, + "byteOffset": 0 + }, + { + "name": "model.layers.1.input_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11534336 + }, + { + "name": "model.layers.1.post_attention_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11538432 + }, + { + "name": "model.layers.2.self_attn.q_proj.q_weight", + "shape": [ + 3072, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 3145728, + "byteOffset": 11542528 + }, + { + "name": "model.layers.2.self_attn.q_proj.q_scale", + "shape": [ + 3072, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 393216, + "byteOffset": 14688256 + }, + { + "name": "model.layers.2.self_attn.kv_a_proj_with_mqa.q_weight", + "shape": [ + 576, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 589824, + "byteOffset": 15081472 + }, + { + "name": "model.layers.2.self_attn.kv_a_proj_with_mqa.q_scale", + "shape": [ + 576, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 73728, + "byteOffset": 15671296 + }, + { + "name": "model.layers.2.self_attn.kv_a_layernorm.weight", + "shape": [ + 512 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1024, + "byteOffset": 15745024 + }, + { + "name": "model.layers.2.self_attn.kv_b_proj.q_weight", + "shape": [ + 4096, + 64 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 1048576, + "byteOffset": 15746048 + }, + { + "name": "model.layers.2.self_attn.kv_b_proj.q_scale", + "shape": [ + 4096, + 16 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 131072, + "byteOffset": 16794624 + }, + { + "name": "model.layers.2.self_attn.o_proj.q_weight", + "shape": [ + 2048, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2097152, + "byteOffset": 16925696 + }, + { + "name": "model.layers.2.self_attn.o_proj.q_scale", + "shape": [ + 2048, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19022848 + }, + { + "name": "model.layers.2.mlp.gate.weight", + "shape": [ + 64, + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19284992 + }, + { + "name": "model.layers.2.mlp.shared_experts.gate_up_proj.q_weight", + "shape": [ + 5632, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 5767168, + "byteOffset": 19547136 + }, + { + "name": "model.layers.2.mlp.shared_experts.gate_up_proj.q_scale", + "shape": [ + 5632, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 720896, + "byteOffset": 25314304 + }, + { + "name": "model.layers.2.mlp.shared_experts.down_proj.q_weight", + "shape": [ + 2048, + 352 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2883584, + "byteOffset": 26035200 + }, + { + "name": "model.layers.2.mlp.shared_experts.down_proj.q_scale", + "shape": [ + 2048, + 88 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 360448, + "byteOffset": 28918784 + } + ], + "md5sum": "5a1448ed18f2f0d9c4e15ca551cbbf8b" + }, + { + "dataPath": "params_shard_12.bin", + "format": "raw-shard", + "nbytes": 184549376, + "records": [ + { + "name": "model.layers.3.mlp.moe_gate_up_proj.q_weight", + "shape": [ + 64, + 2816, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 184549376, + "byteOffset": 0 + } + ], + "md5sum": "8e265f4994f7b62590c5e6f0081359cb" + }, + { + "dataPath": "params_shard_13.bin", + "format": "raw-shard", + "nbytes": 23068672, + "records": [ + { + "name": "model.layers.3.mlp.moe_gate_up_proj.q_scale", + "shape": [ + 64, + 2816, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 23068672, + "byteOffset": 0 + } + ], + "md5sum": "6836686ab66e573994f265475180fec2" + }, + { + "dataPath": "params_shard_14.bin", + "format": "raw-shard", + "nbytes": 92274688, + "records": [ + { + "name": "model.layers.3.mlp.moe_down_proj.q_weight", + "shape": [ + 64, + 2048, + 176 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 92274688, + "byteOffset": 0 + } + ], + "md5sum": "1963f0f3e0b5420326fa62c238c6c937" + }, + { + "dataPath": "params_shard_15.bin", + "format": "raw-shard", + "nbytes": 29279232, + "records": [ + { + "name": "model.layers.2.mlp.moe_down_proj.q_scale", + "shape": [ + 64, + 2048, + 44 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 11534336, + "byteOffset": 0 + }, + { + "name": "model.layers.2.input_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11534336 + }, + { + "name": "model.layers.2.post_attention_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11538432 + }, + { + "name": "model.layers.3.self_attn.q_proj.q_weight", + "shape": [ + 3072, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 3145728, + "byteOffset": 11542528 + }, + { + "name": "model.layers.3.self_attn.q_proj.q_scale", + "shape": [ + 3072, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 393216, + "byteOffset": 14688256 + }, + { + "name": "model.layers.3.self_attn.kv_a_proj_with_mqa.q_weight", + "shape": [ + 576, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 589824, + "byteOffset": 15081472 + }, + { + "name": "model.layers.3.self_attn.kv_a_proj_with_mqa.q_scale", + "shape": [ + 576, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 73728, + "byteOffset": 15671296 + }, + { + "name": "model.layers.3.self_attn.kv_a_layernorm.weight", + "shape": [ + 512 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1024, + "byteOffset": 15745024 + }, + { + "name": "model.layers.3.self_attn.kv_b_proj.q_weight", + "shape": [ + 4096, + 64 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 1048576, + "byteOffset": 15746048 + }, + { + "name": "model.layers.3.self_attn.kv_b_proj.q_scale", + "shape": [ + 4096, + 16 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 131072, + "byteOffset": 16794624 + }, + { + "name": "model.layers.3.self_attn.o_proj.q_weight", + "shape": [ + 2048, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2097152, + "byteOffset": 16925696 + }, + { + "name": "model.layers.3.self_attn.o_proj.q_scale", + "shape": [ + 2048, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19022848 + }, + { + "name": "model.layers.3.mlp.gate.weight", + "shape": [ + 64, + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19284992 + }, + { + "name": "model.layers.3.mlp.shared_experts.gate_up_proj.q_weight", + "shape": [ + 5632, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 5767168, + "byteOffset": 19547136 + }, + { + "name": "model.layers.3.mlp.shared_experts.gate_up_proj.q_scale", + "shape": [ + 5632, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 720896, + "byteOffset": 25314304 + }, + { + "name": "model.layers.3.mlp.shared_experts.down_proj.q_weight", + "shape": [ + 2048, + 352 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2883584, + "byteOffset": 26035200 + }, + { + "name": "model.layers.3.mlp.shared_experts.down_proj.q_scale", + "shape": [ + 2048, + 88 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 360448, + "byteOffset": 28918784 + } + ], + "md5sum": "04724f86db655f3a1c1205c2bd5a62cd" + }, + { + "dataPath": "params_shard_16.bin", + "format": "raw-shard", + "nbytes": 184549376, + "records": [ + { + "name": "model.layers.4.mlp.moe_gate_up_proj.q_weight", + "shape": [ + 64, + 2816, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 184549376, + "byteOffset": 0 + } + ], + "md5sum": "2dcb861888a6ef52f333dcee6afa955f" + }, + { + "dataPath": "params_shard_17.bin", + "format": "raw-shard", + "nbytes": 23068672, + "records": [ + { + "name": "model.layers.4.mlp.moe_gate_up_proj.q_scale", + "shape": [ + 64, + 2816, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 23068672, + "byteOffset": 0 + } + ], + "md5sum": "241a231e639e927c55bb30a338ecb319" + }, + { + "dataPath": "params_shard_18.bin", + "format": "raw-shard", + "nbytes": 92274688, + "records": [ + { + "name": "model.layers.4.mlp.moe_down_proj.q_weight", + "shape": [ + 64, + 2048, + 176 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 92274688, + "byteOffset": 0 + } + ], + "md5sum": "9ee7b1730034f7d27a08ad750c633faa" + }, + { + "dataPath": "params_shard_19.bin", + "format": "raw-shard", + "nbytes": 29279232, + "records": [ + { + "name": "model.layers.3.mlp.moe_down_proj.q_scale", + "shape": [ + 64, + 2048, + 44 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 11534336, + "byteOffset": 0 + }, + { + "name": "model.layers.3.input_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11534336 + }, + { + "name": "model.layers.3.post_attention_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11538432 + }, + { + "name": "model.layers.4.self_attn.q_proj.q_weight", + "shape": [ + 3072, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 3145728, + "byteOffset": 11542528 + }, + { + "name": "model.layers.4.self_attn.q_proj.q_scale", + "shape": [ + 3072, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 393216, + "byteOffset": 14688256 + }, + { + "name": "model.layers.4.self_attn.kv_a_proj_with_mqa.q_weight", + "shape": [ + 576, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 589824, + "byteOffset": 15081472 + }, + { + "name": "model.layers.4.self_attn.kv_a_proj_with_mqa.q_scale", + "shape": [ + 576, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 73728, + "byteOffset": 15671296 + }, + { + "name": "model.layers.4.self_attn.kv_a_layernorm.weight", + "shape": [ + 512 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1024, + "byteOffset": 15745024 + }, + { + "name": "model.layers.4.self_attn.kv_b_proj.q_weight", + "shape": [ + 4096, + 64 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 1048576, + "byteOffset": 15746048 + }, + { + "name": "model.layers.4.self_attn.kv_b_proj.q_scale", + "shape": [ + 4096, + 16 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 131072, + "byteOffset": 16794624 + }, + { + "name": "model.layers.4.self_attn.o_proj.q_weight", + "shape": [ + 2048, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2097152, + "byteOffset": 16925696 + }, + { + "name": "model.layers.4.self_attn.o_proj.q_scale", + "shape": [ + 2048, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19022848 + }, + { + "name": "model.layers.4.mlp.gate.weight", + "shape": [ + 64, + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19284992 + }, + { + "name": "model.layers.4.mlp.shared_experts.gate_up_proj.q_weight", + "shape": [ + 5632, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 5767168, + "byteOffset": 19547136 + }, + { + "name": "model.layers.4.mlp.shared_experts.gate_up_proj.q_scale", + "shape": [ + 5632, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 720896, + "byteOffset": 25314304 + }, + { + "name": "model.layers.4.mlp.shared_experts.down_proj.q_weight", + "shape": [ + 2048, + 352 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2883584, + "byteOffset": 26035200 + }, + { + "name": "model.layers.4.mlp.shared_experts.down_proj.q_scale", + "shape": [ + 2048, + 88 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 360448, + "byteOffset": 28918784 + } + ], + "md5sum": "2ef24a770fc746dceaefbefa282dc530" + }, + { + "dataPath": "params_shard_20.bin", + "format": "raw-shard", + "nbytes": 184549376, + "records": [ + { + "name": "model.layers.5.mlp.moe_gate_up_proj.q_weight", + "shape": [ + 64, + 2816, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 184549376, + "byteOffset": 0 + } + ], + "md5sum": "3a225c76960137d8241b870ebe4726bd" + }, + { + "dataPath": "params_shard_21.bin", + "format": "raw-shard", + "nbytes": 23068672, + "records": [ + { + "name": "model.layers.5.mlp.moe_gate_up_proj.q_scale", + "shape": [ + 64, + 2816, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 23068672, + "byteOffset": 0 + } + ], + "md5sum": "7808ba6cd5d693d41b62fb7db932c2be" + }, + { + "dataPath": "params_shard_22.bin", + "format": "raw-shard", + "nbytes": 92274688, + "records": [ + { + "name": "model.layers.5.mlp.moe_down_proj.q_weight", + "shape": [ + 64, + 2048, + 176 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 92274688, + "byteOffset": 0 + } + ], + "md5sum": "5425fd0fd748c4f8899f0a4c0d02060c" + }, + { + "dataPath": "params_shard_23.bin", + "format": "raw-shard", + "nbytes": 29279232, + "records": [ + { + "name": "model.layers.4.mlp.moe_down_proj.q_scale", + "shape": [ + 64, + 2048, + 44 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 11534336, + "byteOffset": 0 + }, + { + "name": "model.layers.4.input_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11534336 + }, + { + "name": "model.layers.4.post_attention_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11538432 + }, + { + "name": "model.layers.5.self_attn.q_proj.q_weight", + "shape": [ + 3072, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 3145728, + "byteOffset": 11542528 + }, + { + "name": "model.layers.5.self_attn.q_proj.q_scale", + "shape": [ + 3072, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 393216, + "byteOffset": 14688256 + }, + { + "name": "model.layers.5.self_attn.kv_a_proj_with_mqa.q_weight", + "shape": [ + 576, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 589824, + "byteOffset": 15081472 + }, + { + "name": "model.layers.5.self_attn.kv_a_proj_with_mqa.q_scale", + "shape": [ + 576, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 73728, + "byteOffset": 15671296 + }, + { + "name": "model.layers.5.self_attn.kv_a_layernorm.weight", + "shape": [ + 512 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1024, + "byteOffset": 15745024 + }, + { + "name": "model.layers.5.self_attn.kv_b_proj.q_weight", + "shape": [ + 4096, + 64 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 1048576, + "byteOffset": 15746048 + }, + { + "name": "model.layers.5.self_attn.kv_b_proj.q_scale", + "shape": [ + 4096, + 16 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 131072, + "byteOffset": 16794624 + }, + { + "name": "model.layers.5.self_attn.o_proj.q_weight", + "shape": [ + 2048, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2097152, + "byteOffset": 16925696 + }, + { + "name": "model.layers.5.self_attn.o_proj.q_scale", + "shape": [ + 2048, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19022848 + }, + { + "name": "model.layers.5.mlp.gate.weight", + "shape": [ + 64, + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19284992 + }, + { + "name": "model.layers.5.mlp.shared_experts.gate_up_proj.q_weight", + "shape": [ + 5632, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 5767168, + "byteOffset": 19547136 + }, + { + "name": "model.layers.5.mlp.shared_experts.gate_up_proj.q_scale", + "shape": [ + 5632, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 720896, + "byteOffset": 25314304 + }, + { + "name": "model.layers.5.mlp.shared_experts.down_proj.q_weight", + "shape": [ + 2048, + 352 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2883584, + "byteOffset": 26035200 + }, + { + "name": "model.layers.5.mlp.shared_experts.down_proj.q_scale", + "shape": [ + 2048, + 88 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 360448, + "byteOffset": 28918784 + } + ], + "md5sum": "12f2aad47ba38fea275645c8f952ca35" + }, + { + "dataPath": "params_shard_24.bin", + "format": "raw-shard", + "nbytes": 184549376, + "records": [ + { + "name": "model.layers.6.mlp.moe_gate_up_proj.q_weight", + "shape": [ + 64, + 2816, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 184549376, + "byteOffset": 0 + } + ], + "md5sum": "2c5200430f16add6e4d28373b8eb5042" + }, + { + "dataPath": "params_shard_25.bin", + "format": "raw-shard", + "nbytes": 23068672, + "records": [ + { + "name": "model.layers.6.mlp.moe_gate_up_proj.q_scale", + "shape": [ + 64, + 2816, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 23068672, + "byteOffset": 0 + } + ], + "md5sum": "7df4cef8e3c4fa2b8bc082389d0e416a" + }, + { + "dataPath": "params_shard_26.bin", + "format": "raw-shard", + "nbytes": 92274688, + "records": [ + { + "name": "model.layers.6.mlp.moe_down_proj.q_weight", + "shape": [ + 64, + 2048, + 176 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 92274688, + "byteOffset": 0 + } + ], + "md5sum": "db0c6dcd77691f83cd499c3394c6d503" + }, + { + "dataPath": "params_shard_27.bin", + "format": "raw-shard", + "nbytes": 29279232, + "records": [ + { + "name": "model.layers.5.mlp.moe_down_proj.q_scale", + "shape": [ + 64, + 2048, + 44 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 11534336, + "byteOffset": 0 + }, + { + "name": "model.layers.5.input_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11534336 + }, + { + "name": "model.layers.5.post_attention_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11538432 + }, + { + "name": "model.layers.6.self_attn.q_proj.q_weight", + "shape": [ + 3072, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 3145728, + "byteOffset": 11542528 + }, + { + "name": "model.layers.6.self_attn.q_proj.q_scale", + "shape": [ + 3072, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 393216, + "byteOffset": 14688256 + }, + { + "name": "model.layers.6.self_attn.kv_a_proj_with_mqa.q_weight", + "shape": [ + 576, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 589824, + "byteOffset": 15081472 + }, + { + "name": "model.layers.6.self_attn.kv_a_proj_with_mqa.q_scale", + "shape": [ + 576, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 73728, + "byteOffset": 15671296 + }, + { + "name": "model.layers.6.self_attn.kv_a_layernorm.weight", + "shape": [ + 512 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1024, + "byteOffset": 15745024 + }, + { + "name": "model.layers.6.self_attn.kv_b_proj.q_weight", + "shape": [ + 4096, + 64 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 1048576, + "byteOffset": 15746048 + }, + { + "name": "model.layers.6.self_attn.kv_b_proj.q_scale", + "shape": [ + 4096, + 16 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 131072, + "byteOffset": 16794624 + }, + { + "name": "model.layers.6.self_attn.o_proj.q_weight", + "shape": [ + 2048, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2097152, + "byteOffset": 16925696 + }, + { + "name": "model.layers.6.self_attn.o_proj.q_scale", + "shape": [ + 2048, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19022848 + }, + { + "name": "model.layers.6.mlp.gate.weight", + "shape": [ + 64, + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19284992 + }, + { + "name": "model.layers.6.mlp.shared_experts.gate_up_proj.q_weight", + "shape": [ + 5632, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 5767168, + "byteOffset": 19547136 + }, + { + "name": "model.layers.6.mlp.shared_experts.gate_up_proj.q_scale", + "shape": [ + 5632, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 720896, + "byteOffset": 25314304 + }, + { + "name": "model.layers.6.mlp.shared_experts.down_proj.q_weight", + "shape": [ + 2048, + 352 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2883584, + "byteOffset": 26035200 + }, + { + "name": "model.layers.6.mlp.shared_experts.down_proj.q_scale", + "shape": [ + 2048, + 88 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 360448, + "byteOffset": 28918784 + } + ], + "md5sum": "d55c4cc540d9aad19a052efaaa1a1de6" + }, + { + "dataPath": "params_shard_28.bin", + "format": "raw-shard", + "nbytes": 184549376, + "records": [ + { + "name": "model.layers.7.mlp.moe_gate_up_proj.q_weight", + "shape": [ + 64, + 2816, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 184549376, + "byteOffset": 0 + } + ], + "md5sum": "341e0ea06cbb6260bc0150a7ef58719e" + }, + { + "dataPath": "params_shard_29.bin", + "format": "raw-shard", + "nbytes": 23068672, + "records": [ + { + "name": "model.layers.7.mlp.moe_gate_up_proj.q_scale", + "shape": [ + 64, + 2816, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 23068672, + "byteOffset": 0 + } + ], + "md5sum": "e4105bb9a01df049f1df6f9f4b8b84e1" + }, + { + "dataPath": "params_shard_30.bin", + "format": "raw-shard", + "nbytes": 92274688, + "records": [ + { + "name": "model.layers.7.mlp.moe_down_proj.q_weight", + "shape": [ + 64, + 2048, + 176 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 92274688, + "byteOffset": 0 + } + ], + "md5sum": "7376bb84a86d5815102a7b8c2c52e4ee" + }, + { + "dataPath": "params_shard_31.bin", + "format": "raw-shard", + "nbytes": 29279232, + "records": [ + { + "name": "model.layers.6.mlp.moe_down_proj.q_scale", + "shape": [ + 64, + 2048, + 44 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 11534336, + "byteOffset": 0 + }, + { + "name": "model.layers.6.input_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11534336 + }, + { + "name": "model.layers.6.post_attention_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11538432 + }, + { + "name": "model.layers.7.self_attn.q_proj.q_weight", + "shape": [ + 3072, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 3145728, + "byteOffset": 11542528 + }, + { + "name": "model.layers.7.self_attn.q_proj.q_scale", + "shape": [ + 3072, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 393216, + "byteOffset": 14688256 + }, + { + "name": "model.layers.7.self_attn.kv_a_proj_with_mqa.q_weight", + "shape": [ + 576, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 589824, + "byteOffset": 15081472 + }, + { + "name": "model.layers.7.self_attn.kv_a_proj_with_mqa.q_scale", + "shape": [ + 576, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 73728, + "byteOffset": 15671296 + }, + { + "name": "model.layers.7.self_attn.kv_a_layernorm.weight", + "shape": [ + 512 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1024, + "byteOffset": 15745024 + }, + { + "name": "model.layers.7.self_attn.kv_b_proj.q_weight", + "shape": [ + 4096, + 64 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 1048576, + "byteOffset": 15746048 + }, + { + "name": "model.layers.7.self_attn.kv_b_proj.q_scale", + "shape": [ + 4096, + 16 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 131072, + "byteOffset": 16794624 + }, + { + "name": "model.layers.7.self_attn.o_proj.q_weight", + "shape": [ + 2048, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2097152, + "byteOffset": 16925696 + }, + { + "name": "model.layers.7.self_attn.o_proj.q_scale", + "shape": [ + 2048, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19022848 + }, + { + "name": "model.layers.7.mlp.gate.weight", + "shape": [ + 64, + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19284992 + }, + { + "name": "model.layers.7.mlp.shared_experts.gate_up_proj.q_weight", + "shape": [ + 5632, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 5767168, + "byteOffset": 19547136 + }, + { + "name": "model.layers.7.mlp.shared_experts.gate_up_proj.q_scale", + "shape": [ + 5632, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 720896, + "byteOffset": 25314304 + }, + { + "name": "model.layers.7.mlp.shared_experts.down_proj.q_weight", + "shape": [ + 2048, + 352 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2883584, + "byteOffset": 26035200 + }, + { + "name": "model.layers.7.mlp.shared_experts.down_proj.q_scale", + "shape": [ + 2048, + 88 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 360448, + "byteOffset": 28918784 + } + ], + "md5sum": "42739b8922ca745d33d7e07e3375d857" + }, + { + "dataPath": "params_shard_32.bin", + "format": "raw-shard", + "nbytes": 184549376, + "records": [ + { + "name": "model.layers.8.mlp.moe_gate_up_proj.q_weight", + "shape": [ + 64, + 2816, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 184549376, + "byteOffset": 0 + } + ], + "md5sum": "d0b521b707b9eaf7780a9b9b5864708d" + }, + { + "dataPath": "params_shard_33.bin", + "format": "raw-shard", + "nbytes": 23068672, + "records": [ + { + "name": "model.layers.8.mlp.moe_gate_up_proj.q_scale", + "shape": [ + 64, + 2816, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 23068672, + "byteOffset": 0 + } + ], + "md5sum": "83e8bd7d70cc18b7696745b343922231" + }, + { + "dataPath": "params_shard_34.bin", + "format": "raw-shard", + "nbytes": 92274688, + "records": [ + { + "name": "model.layers.8.mlp.moe_down_proj.q_weight", + "shape": [ + 64, + 2048, + 176 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 92274688, + "byteOffset": 0 + } + ], + "md5sum": "0b37805532160a011505e7ac5a99b5bc" + }, + { + "dataPath": "params_shard_35.bin", + "format": "raw-shard", + "nbytes": 29279232, + "records": [ + { + "name": "model.layers.7.mlp.moe_down_proj.q_scale", + "shape": [ + 64, + 2048, + 44 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 11534336, + "byteOffset": 0 + }, + { + "name": "model.layers.7.input_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11534336 + }, + { + "name": "model.layers.7.post_attention_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11538432 + }, + { + "name": "model.layers.8.self_attn.q_proj.q_weight", + "shape": [ + 3072, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 3145728, + "byteOffset": 11542528 + }, + { + "name": "model.layers.8.self_attn.q_proj.q_scale", + "shape": [ + 3072, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 393216, + "byteOffset": 14688256 + }, + { + "name": "model.layers.8.self_attn.kv_a_proj_with_mqa.q_weight", + "shape": [ + 576, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 589824, + "byteOffset": 15081472 + }, + { + "name": "model.layers.8.self_attn.kv_a_proj_with_mqa.q_scale", + "shape": [ + 576, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 73728, + "byteOffset": 15671296 + }, + { + "name": "model.layers.8.self_attn.kv_a_layernorm.weight", + "shape": [ + 512 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1024, + "byteOffset": 15745024 + }, + { + "name": "model.layers.8.self_attn.kv_b_proj.q_weight", + "shape": [ + 4096, + 64 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 1048576, + "byteOffset": 15746048 + }, + { + "name": "model.layers.8.self_attn.kv_b_proj.q_scale", + "shape": [ + 4096, + 16 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 131072, + "byteOffset": 16794624 + }, + { + "name": "model.layers.8.self_attn.o_proj.q_weight", + "shape": [ + 2048, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2097152, + "byteOffset": 16925696 + }, + { + "name": "model.layers.8.self_attn.o_proj.q_scale", + "shape": [ + 2048, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19022848 + }, + { + "name": "model.layers.8.mlp.gate.weight", + "shape": [ + 64, + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19284992 + }, + { + "name": "model.layers.8.mlp.shared_experts.gate_up_proj.q_weight", + "shape": [ + 5632, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 5767168, + "byteOffset": 19547136 + }, + { + "name": "model.layers.8.mlp.shared_experts.gate_up_proj.q_scale", + "shape": [ + 5632, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 720896, + "byteOffset": 25314304 + }, + { + "name": "model.layers.8.mlp.shared_experts.down_proj.q_weight", + "shape": [ + 2048, + 352 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2883584, + "byteOffset": 26035200 + }, + { + "name": "model.layers.8.mlp.shared_experts.down_proj.q_scale", + "shape": [ + 2048, + 88 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 360448, + "byteOffset": 28918784 + } + ], + "md5sum": "2cdca14a871e54eb7fecb5bba21a7bc1" + }, + { + "dataPath": "params_shard_36.bin", + "format": "raw-shard", + "nbytes": 184549376, + "records": [ + { + "name": "model.layers.9.mlp.moe_gate_up_proj.q_weight", + "shape": [ + 64, + 2816, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 184549376, + "byteOffset": 0 + } + ], + "md5sum": "0f513f9a0d03775b32821402934b7676" + }, + { + "dataPath": "params_shard_37.bin", + "format": "raw-shard", + "nbytes": 23068672, + "records": [ + { + "name": "model.layers.9.mlp.moe_gate_up_proj.q_scale", + "shape": [ + 64, + 2816, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 23068672, + "byteOffset": 0 + } + ], + "md5sum": "d3b9afb8b1f8460fe9af2f496d0cf6bb" + }, + { + "dataPath": "params_shard_38.bin", + "format": "raw-shard", + "nbytes": 92274688, + "records": [ + { + "name": "model.layers.9.mlp.moe_down_proj.q_weight", + "shape": [ + 64, + 2048, + 176 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 92274688, + "byteOffset": 0 + } + ], + "md5sum": "5dbd8536bf1148115653b0bc436fd155" + }, + { + "dataPath": "params_shard_39.bin", + "format": "raw-shard", + "nbytes": 29279232, + "records": [ + { + "name": "model.layers.8.mlp.moe_down_proj.q_scale", + "shape": [ + 64, + 2048, + 44 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 11534336, + "byteOffset": 0 + }, + { + "name": "model.layers.8.input_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11534336 + }, + { + "name": "model.layers.8.post_attention_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11538432 + }, + { + "name": "model.layers.9.self_attn.q_proj.q_weight", + "shape": [ + 3072, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 3145728, + "byteOffset": 11542528 + }, + { + "name": "model.layers.9.self_attn.q_proj.q_scale", + "shape": [ + 3072, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 393216, + "byteOffset": 14688256 + }, + { + "name": "model.layers.9.self_attn.kv_a_proj_with_mqa.q_weight", + "shape": [ + 576, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 589824, + "byteOffset": 15081472 + }, + { + "name": "model.layers.9.self_attn.kv_a_proj_with_mqa.q_scale", + "shape": [ + 576, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 73728, + "byteOffset": 15671296 + }, + { + "name": "model.layers.9.self_attn.kv_a_layernorm.weight", + "shape": [ + 512 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1024, + "byteOffset": 15745024 + }, + { + "name": "model.layers.9.self_attn.kv_b_proj.q_weight", + "shape": [ + 4096, + 64 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 1048576, + "byteOffset": 15746048 + }, + { + "name": "model.layers.9.self_attn.kv_b_proj.q_scale", + "shape": [ + 4096, + 16 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 131072, + "byteOffset": 16794624 + }, + { + "name": "model.layers.9.self_attn.o_proj.q_weight", + "shape": [ + 2048, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2097152, + "byteOffset": 16925696 + }, + { + "name": "model.layers.9.self_attn.o_proj.q_scale", + "shape": [ + 2048, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19022848 + }, + { + "name": "model.layers.9.mlp.gate.weight", + "shape": [ + 64, + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19284992 + }, + { + "name": "model.layers.9.mlp.shared_experts.gate_up_proj.q_weight", + "shape": [ + 5632, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 5767168, + "byteOffset": 19547136 + }, + { + "name": "model.layers.9.mlp.shared_experts.gate_up_proj.q_scale", + "shape": [ + 5632, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 720896, + "byteOffset": 25314304 + }, + { + "name": "model.layers.9.mlp.shared_experts.down_proj.q_weight", + "shape": [ + 2048, + 352 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2883584, + "byteOffset": 26035200 + }, + { + "name": "model.layers.9.mlp.shared_experts.down_proj.q_scale", + "shape": [ + 2048, + 88 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 360448, + "byteOffset": 28918784 + } + ], + "md5sum": "feecefb1c56bf343ffe3714060c27550" + }, + { + "dataPath": "params_shard_40.bin", + "format": "raw-shard", + "nbytes": 184549376, + "records": [ + { + "name": "model.layers.10.mlp.moe_gate_up_proj.q_weight", + "shape": [ + 64, + 2816, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 184549376, + "byteOffset": 0 + } + ], + "md5sum": "d16eaba8c42bdf5d2bfd17dd51e95659" + }, + { + "dataPath": "params_shard_41.bin", + "format": "raw-shard", + "nbytes": 23068672, + "records": [ + { + "name": "model.layers.10.mlp.moe_gate_up_proj.q_scale", + "shape": [ + 64, + 2816, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 23068672, + "byteOffset": 0 + } + ], + "md5sum": "dafb0ac8d6a35846a4cf4b237f71d2d6" + }, + { + "dataPath": "params_shard_42.bin", + "format": "raw-shard", + "nbytes": 92274688, + "records": [ + { + "name": "model.layers.10.mlp.moe_down_proj.q_weight", + "shape": [ + 64, + 2048, + 176 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 92274688, + "byteOffset": 0 + } + ], + "md5sum": "9df235058e7f7d718a6eaa1db5764e53" + }, + { + "dataPath": "params_shard_43.bin", + "format": "raw-shard", + "nbytes": 29279232, + "records": [ + { + "name": "model.layers.9.mlp.moe_down_proj.q_scale", + "shape": [ + 64, + 2048, + 44 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 11534336, + "byteOffset": 0 + }, + { + "name": "model.layers.9.input_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11534336 + }, + { + "name": "model.layers.9.post_attention_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11538432 + }, + { + "name": "model.layers.10.self_attn.q_proj.q_weight", + "shape": [ + 3072, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 3145728, + "byteOffset": 11542528 + }, + { + "name": "model.layers.10.self_attn.q_proj.q_scale", + "shape": [ + 3072, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 393216, + "byteOffset": 14688256 + }, + { + "name": "model.layers.10.self_attn.kv_a_proj_with_mqa.q_weight", + "shape": [ + 576, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 589824, + "byteOffset": 15081472 + }, + { + "name": "model.layers.10.self_attn.kv_a_proj_with_mqa.q_scale", + "shape": [ + 576, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 73728, + "byteOffset": 15671296 + }, + { + "name": "model.layers.10.self_attn.kv_a_layernorm.weight", + "shape": [ + 512 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1024, + "byteOffset": 15745024 + }, + { + "name": "model.layers.10.self_attn.kv_b_proj.q_weight", + "shape": [ + 4096, + 64 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 1048576, + "byteOffset": 15746048 + }, + { + "name": "model.layers.10.self_attn.kv_b_proj.q_scale", + "shape": [ + 4096, + 16 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 131072, + "byteOffset": 16794624 + }, + { + "name": "model.layers.10.self_attn.o_proj.q_weight", + "shape": [ + 2048, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2097152, + "byteOffset": 16925696 + }, + { + "name": "model.layers.10.self_attn.o_proj.q_scale", + "shape": [ + 2048, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19022848 + }, + { + "name": "model.layers.10.mlp.gate.weight", + "shape": [ + 64, + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19284992 + }, + { + "name": "model.layers.10.mlp.shared_experts.gate_up_proj.q_weight", + "shape": [ + 5632, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 5767168, + "byteOffset": 19547136 + }, + { + "name": "model.layers.10.mlp.shared_experts.gate_up_proj.q_scale", + "shape": [ + 5632, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 720896, + "byteOffset": 25314304 + }, + { + "name": "model.layers.10.mlp.shared_experts.down_proj.q_weight", + "shape": [ + 2048, + 352 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2883584, + "byteOffset": 26035200 + }, + { + "name": "model.layers.10.mlp.shared_experts.down_proj.q_scale", + "shape": [ + 2048, + 88 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 360448, + "byteOffset": 28918784 + } + ], + "md5sum": "40baf16d40e0fe719d6c90e13435bb56" + }, + { + "dataPath": "params_shard_44.bin", + "format": "raw-shard", + "nbytes": 184549376, + "records": [ + { + "name": "model.layers.11.mlp.moe_gate_up_proj.q_weight", + "shape": [ + 64, + 2816, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 184549376, + "byteOffset": 0 + } + ], + "md5sum": "4a5e3d29c874445c2b1e691799703552" + }, + { + "dataPath": "params_shard_45.bin", + "format": "raw-shard", + "nbytes": 23068672, + "records": [ + { + "name": "model.layers.11.mlp.moe_gate_up_proj.q_scale", + "shape": [ + 64, + 2816, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 23068672, + "byteOffset": 0 + } + ], + "md5sum": "f7d823c624b6069389338731955626d9" + }, + { + "dataPath": "params_shard_46.bin", + "format": "raw-shard", + "nbytes": 92274688, + "records": [ + { + "name": "model.layers.11.mlp.moe_down_proj.q_weight", + "shape": [ + 64, + 2048, + 176 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 92274688, + "byteOffset": 0 + } + ], + "md5sum": "6b4e8b74259e5cfc71ec8cac9ce88e81" + }, + { + "dataPath": "params_shard_47.bin", + "format": "raw-shard", + "nbytes": 29279232, + "records": [ + { + "name": "model.layers.10.mlp.moe_down_proj.q_scale", + "shape": [ + 64, + 2048, + 44 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 11534336, + "byteOffset": 0 + }, + { + "name": "model.layers.10.input_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11534336 + }, + { + "name": "model.layers.10.post_attention_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11538432 + }, + { + "name": "model.layers.11.self_attn.q_proj.q_weight", + "shape": [ + 3072, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 3145728, + "byteOffset": 11542528 + }, + { + "name": "model.layers.11.self_attn.q_proj.q_scale", + "shape": [ + 3072, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 393216, + "byteOffset": 14688256 + }, + { + "name": "model.layers.11.self_attn.kv_a_proj_with_mqa.q_weight", + "shape": [ + 576, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 589824, + "byteOffset": 15081472 + }, + { + "name": "model.layers.11.self_attn.kv_a_proj_with_mqa.q_scale", + "shape": [ + 576, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 73728, + "byteOffset": 15671296 + }, + { + "name": "model.layers.11.self_attn.kv_a_layernorm.weight", + "shape": [ + 512 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1024, + "byteOffset": 15745024 + }, + { + "name": "model.layers.11.self_attn.kv_b_proj.q_weight", + "shape": [ + 4096, + 64 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 1048576, + "byteOffset": 15746048 + }, + { + "name": "model.layers.11.self_attn.kv_b_proj.q_scale", + "shape": [ + 4096, + 16 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 131072, + "byteOffset": 16794624 + }, + { + "name": "model.layers.11.self_attn.o_proj.q_weight", + "shape": [ + 2048, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2097152, + "byteOffset": 16925696 + }, + { + "name": "model.layers.11.self_attn.o_proj.q_scale", + "shape": [ + 2048, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19022848 + }, + { + "name": "model.layers.11.mlp.gate.weight", + "shape": [ + 64, + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19284992 + }, + { + "name": "model.layers.11.mlp.shared_experts.gate_up_proj.q_weight", + "shape": [ + 5632, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 5767168, + "byteOffset": 19547136 + }, + { + "name": "model.layers.11.mlp.shared_experts.gate_up_proj.q_scale", + "shape": [ + 5632, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 720896, + "byteOffset": 25314304 + }, + { + "name": "model.layers.11.mlp.shared_experts.down_proj.q_weight", + "shape": [ + 2048, + 352 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2883584, + "byteOffset": 26035200 + }, + { + "name": "model.layers.11.mlp.shared_experts.down_proj.q_scale", + "shape": [ + 2048, + 88 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 360448, + "byteOffset": 28918784 + } + ], + "md5sum": "6efdb660233d84db84220a0df941207e" + }, + { + "dataPath": "params_shard_48.bin", + "format": "raw-shard", + "nbytes": 184549376, + "records": [ + { + "name": "model.layers.12.mlp.moe_gate_up_proj.q_weight", + "shape": [ + 64, + 2816, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 184549376, + "byteOffset": 0 + } + ], + "md5sum": "ec3c62c29a5c880dc1676b6136911932" + }, + { + "dataPath": "params_shard_49.bin", + "format": "raw-shard", + "nbytes": 23068672, + "records": [ + { + "name": "model.layers.12.mlp.moe_gate_up_proj.q_scale", + "shape": [ + 64, + 2816, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 23068672, + "byteOffset": 0 + } + ], + "md5sum": "f786eedd3709cb65e05567c7b1c2e968" + }, + { + "dataPath": "params_shard_50.bin", + "format": "raw-shard", + "nbytes": 92274688, + "records": [ + { + "name": "model.layers.12.mlp.moe_down_proj.q_weight", + "shape": [ + 64, + 2048, + 176 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 92274688, + "byteOffset": 0 + } + ], + "md5sum": "93dc1b693e9442e9d1802297bc9d8080" + }, + { + "dataPath": "params_shard_51.bin", + "format": "raw-shard", + "nbytes": 29279232, + "records": [ + { + "name": "model.layers.11.mlp.moe_down_proj.q_scale", + "shape": [ + 64, + 2048, + 44 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 11534336, + "byteOffset": 0 + }, + { + "name": "model.layers.11.input_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11534336 + }, + { + "name": "model.layers.11.post_attention_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11538432 + }, + { + "name": "model.layers.12.self_attn.q_proj.q_weight", + "shape": [ + 3072, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 3145728, + "byteOffset": 11542528 + }, + { + "name": "model.layers.12.self_attn.q_proj.q_scale", + "shape": [ + 3072, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 393216, + "byteOffset": 14688256 + }, + { + "name": "model.layers.12.self_attn.kv_a_proj_with_mqa.q_weight", + "shape": [ + 576, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 589824, + "byteOffset": 15081472 + }, + { + "name": "model.layers.12.self_attn.kv_a_proj_with_mqa.q_scale", + "shape": [ + 576, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 73728, + "byteOffset": 15671296 + }, + { + "name": "model.layers.12.self_attn.kv_a_layernorm.weight", + "shape": [ + 512 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1024, + "byteOffset": 15745024 + }, + { + "name": "model.layers.12.self_attn.kv_b_proj.q_weight", + "shape": [ + 4096, + 64 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 1048576, + "byteOffset": 15746048 + }, + { + "name": "model.layers.12.self_attn.kv_b_proj.q_scale", + "shape": [ + 4096, + 16 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 131072, + "byteOffset": 16794624 + }, + { + "name": "model.layers.12.self_attn.o_proj.q_weight", + "shape": [ + 2048, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2097152, + "byteOffset": 16925696 + }, + { + "name": "model.layers.12.self_attn.o_proj.q_scale", + "shape": [ + 2048, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19022848 + }, + { + "name": "model.layers.12.mlp.gate.weight", + "shape": [ + 64, + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19284992 + }, + { + "name": "model.layers.12.mlp.shared_experts.gate_up_proj.q_weight", + "shape": [ + 5632, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 5767168, + "byteOffset": 19547136 + }, + { + "name": "model.layers.12.mlp.shared_experts.gate_up_proj.q_scale", + "shape": [ + 5632, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 720896, + "byteOffset": 25314304 + }, + { + "name": "model.layers.12.mlp.shared_experts.down_proj.q_weight", + "shape": [ + 2048, + 352 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2883584, + "byteOffset": 26035200 + }, + { + "name": "model.layers.12.mlp.shared_experts.down_proj.q_scale", + "shape": [ + 2048, + 88 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 360448, + "byteOffset": 28918784 + } + ], + "md5sum": "f5cf59fe827ba499af3622ef68b53104" + }, + { + "dataPath": "params_shard_52.bin", + "format": "raw-shard", + "nbytes": 184549376, + "records": [ + { + "name": "model.layers.13.mlp.moe_gate_up_proj.q_weight", + "shape": [ + 64, + 2816, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 184549376, + "byteOffset": 0 + } + ], + "md5sum": "4c4d6cb1461facc9765a268ba04fa6ee" + }, + { + "dataPath": "params_shard_53.bin", + "format": "raw-shard", + "nbytes": 23068672, + "records": [ + { + "name": "model.layers.13.mlp.moe_gate_up_proj.q_scale", + "shape": [ + 64, + 2816, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 23068672, + "byteOffset": 0 + } + ], + "md5sum": "641ebb59a0d5426d628c8c9c5ca64361" + }, + { + "dataPath": "params_shard_54.bin", + "format": "raw-shard", + "nbytes": 92274688, + "records": [ + { + "name": "model.layers.13.mlp.moe_down_proj.q_weight", + "shape": [ + 64, + 2048, + 176 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 92274688, + "byteOffset": 0 + } + ], + "md5sum": "bd6316e9a7e6f95bfa26a2d3f9ccd2e3" + }, + { + "dataPath": "params_shard_55.bin", + "format": "raw-shard", + "nbytes": 29279232, + "records": [ + { + "name": "model.layers.12.mlp.moe_down_proj.q_scale", + "shape": [ + 64, + 2048, + 44 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 11534336, + "byteOffset": 0 + }, + { + "name": "model.layers.12.input_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11534336 + }, + { + "name": "model.layers.12.post_attention_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11538432 + }, + { + "name": "model.layers.13.self_attn.q_proj.q_weight", + "shape": [ + 3072, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 3145728, + "byteOffset": 11542528 + }, + { + "name": "model.layers.13.self_attn.q_proj.q_scale", + "shape": [ + 3072, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 393216, + "byteOffset": 14688256 + }, + { + "name": "model.layers.13.self_attn.kv_a_proj_with_mqa.q_weight", + "shape": [ + 576, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 589824, + "byteOffset": 15081472 + }, + { + "name": "model.layers.13.self_attn.kv_a_proj_with_mqa.q_scale", + "shape": [ + 576, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 73728, + "byteOffset": 15671296 + }, + { + "name": "model.layers.13.self_attn.kv_a_layernorm.weight", + "shape": [ + 512 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1024, + "byteOffset": 15745024 + }, + { + "name": "model.layers.13.self_attn.kv_b_proj.q_weight", + "shape": [ + 4096, + 64 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 1048576, + "byteOffset": 15746048 + }, + { + "name": "model.layers.13.self_attn.kv_b_proj.q_scale", + "shape": [ + 4096, + 16 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 131072, + "byteOffset": 16794624 + }, + { + "name": "model.layers.13.self_attn.o_proj.q_weight", + "shape": [ + 2048, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2097152, + "byteOffset": 16925696 + }, + { + "name": "model.layers.13.self_attn.o_proj.q_scale", + "shape": [ + 2048, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19022848 + }, + { + "name": "model.layers.13.mlp.gate.weight", + "shape": [ + 64, + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19284992 + }, + { + "name": "model.layers.13.mlp.shared_experts.gate_up_proj.q_weight", + "shape": [ + 5632, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 5767168, + "byteOffset": 19547136 + }, + { + "name": "model.layers.13.mlp.shared_experts.gate_up_proj.q_scale", + "shape": [ + 5632, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 720896, + "byteOffset": 25314304 + }, + { + "name": "model.layers.13.mlp.shared_experts.down_proj.q_weight", + "shape": [ + 2048, + 352 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2883584, + "byteOffset": 26035200 + }, + { + "name": "model.layers.13.mlp.shared_experts.down_proj.q_scale", + "shape": [ + 2048, + 88 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 360448, + "byteOffset": 28918784 + } + ], + "md5sum": "360346127e1a16e9dff6db2ae9055787" + }, + { + "dataPath": "params_shard_56.bin", + "format": "raw-shard", + "nbytes": 184549376, + "records": [ + { + "name": "model.layers.14.mlp.moe_gate_up_proj.q_weight", + "shape": [ + 64, + 2816, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 184549376, + "byteOffset": 0 + } + ], + "md5sum": "9a2ca6638a396fbb824388bb7a4ed793" + }, + { + "dataPath": "params_shard_57.bin", + "format": "raw-shard", + "nbytes": 23068672, + "records": [ + { + "name": "model.layers.14.mlp.moe_gate_up_proj.q_scale", + "shape": [ + 64, + 2816, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 23068672, + "byteOffset": 0 + } + ], + "md5sum": "b832d5df54d210efacd31fd310915b31" + }, + { + "dataPath": "params_shard_58.bin", + "format": "raw-shard", + "nbytes": 92274688, + "records": [ + { + "name": "model.layers.14.mlp.moe_down_proj.q_weight", + "shape": [ + 64, + 2048, + 176 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 92274688, + "byteOffset": 0 + } + ], + "md5sum": "dc976e7be7cfaf6325e789a1af2ebe3e" + }, + { + "dataPath": "params_shard_59.bin", + "format": "raw-shard", + "nbytes": 29279232, + "records": [ + { + "name": "model.layers.13.mlp.moe_down_proj.q_scale", + "shape": [ + 64, + 2048, + 44 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 11534336, + "byteOffset": 0 + }, + { + "name": "model.layers.13.input_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11534336 + }, + { + "name": "model.layers.13.post_attention_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11538432 + }, + { + "name": "model.layers.14.self_attn.q_proj.q_weight", + "shape": [ + 3072, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 3145728, + "byteOffset": 11542528 + }, + { + "name": "model.layers.14.self_attn.q_proj.q_scale", + "shape": [ + 3072, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 393216, + "byteOffset": 14688256 + }, + { + "name": "model.layers.14.self_attn.kv_a_proj_with_mqa.q_weight", + "shape": [ + 576, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 589824, + "byteOffset": 15081472 + }, + { + "name": "model.layers.14.self_attn.kv_a_proj_with_mqa.q_scale", + "shape": [ + 576, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 73728, + "byteOffset": 15671296 + }, + { + "name": "model.layers.14.self_attn.kv_a_layernorm.weight", + "shape": [ + 512 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1024, + "byteOffset": 15745024 + }, + { + "name": "model.layers.14.self_attn.kv_b_proj.q_weight", + "shape": [ + 4096, + 64 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 1048576, + "byteOffset": 15746048 + }, + { + "name": "model.layers.14.self_attn.kv_b_proj.q_scale", + "shape": [ + 4096, + 16 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 131072, + "byteOffset": 16794624 + }, + { + "name": "model.layers.14.self_attn.o_proj.q_weight", + "shape": [ + 2048, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2097152, + "byteOffset": 16925696 + }, + { + "name": "model.layers.14.self_attn.o_proj.q_scale", + "shape": [ + 2048, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19022848 + }, + { + "name": "model.layers.14.mlp.gate.weight", + "shape": [ + 64, + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19284992 + }, + { + "name": "model.layers.14.mlp.shared_experts.gate_up_proj.q_weight", + "shape": [ + 5632, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 5767168, + "byteOffset": 19547136 + }, + { + "name": "model.layers.14.mlp.shared_experts.gate_up_proj.q_scale", + "shape": [ + 5632, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 720896, + "byteOffset": 25314304 + }, + { + "name": "model.layers.14.mlp.shared_experts.down_proj.q_weight", + "shape": [ + 2048, + 352 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2883584, + "byteOffset": 26035200 + }, + { + "name": "model.layers.14.mlp.shared_experts.down_proj.q_scale", + "shape": [ + 2048, + 88 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 360448, + "byteOffset": 28918784 + } + ], + "md5sum": "683f47a8fc4093d4a015a0541b2be473" + }, + { + "dataPath": "params_shard_60.bin", + "format": "raw-shard", + "nbytes": 184549376, + "records": [ + { + "name": "model.layers.15.mlp.moe_gate_up_proj.q_weight", + "shape": [ + 64, + 2816, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 184549376, + "byteOffset": 0 + } + ], + "md5sum": "5c823c361d0272b251f5ffb006c6fa4a" + }, + { + "dataPath": "params_shard_61.bin", + "format": "raw-shard", + "nbytes": 23068672, + "records": [ + { + "name": "model.layers.15.mlp.moe_gate_up_proj.q_scale", + "shape": [ + 64, + 2816, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 23068672, + "byteOffset": 0 + } + ], + "md5sum": "1300ab997707e3bac452b549f6f1b791" + }, + { + "dataPath": "params_shard_62.bin", + "format": "raw-shard", + "nbytes": 92274688, + "records": [ + { + "name": "model.layers.15.mlp.moe_down_proj.q_weight", + "shape": [ + 64, + 2048, + 176 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 92274688, + "byteOffset": 0 + } + ], + "md5sum": "daf7b029dac6a8ca4f625e1643d2eeed" + }, + { + "dataPath": "params_shard_63.bin", + "format": "raw-shard", + "nbytes": 29279232, + "records": [ + { + "name": "model.layers.14.mlp.moe_down_proj.q_scale", + "shape": [ + 64, + 2048, + 44 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 11534336, + "byteOffset": 0 + }, + { + "name": "model.layers.14.input_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11534336 + }, + { + "name": "model.layers.14.post_attention_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11538432 + }, + { + "name": "model.layers.15.self_attn.q_proj.q_weight", + "shape": [ + 3072, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 3145728, + "byteOffset": 11542528 + }, + { + "name": "model.layers.15.self_attn.q_proj.q_scale", + "shape": [ + 3072, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 393216, + "byteOffset": 14688256 + }, + { + "name": "model.layers.15.self_attn.kv_a_proj_with_mqa.q_weight", + "shape": [ + 576, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 589824, + "byteOffset": 15081472 + }, + { + "name": "model.layers.15.self_attn.kv_a_proj_with_mqa.q_scale", + "shape": [ + 576, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 73728, + "byteOffset": 15671296 + }, + { + "name": "model.layers.15.self_attn.kv_a_layernorm.weight", + "shape": [ + 512 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1024, + "byteOffset": 15745024 + }, + { + "name": "model.layers.15.self_attn.kv_b_proj.q_weight", + "shape": [ + 4096, + 64 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 1048576, + "byteOffset": 15746048 + }, + { + "name": "model.layers.15.self_attn.kv_b_proj.q_scale", + "shape": [ + 4096, + 16 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 131072, + "byteOffset": 16794624 + }, + { + "name": "model.layers.15.self_attn.o_proj.q_weight", + "shape": [ + 2048, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2097152, + "byteOffset": 16925696 + }, + { + "name": "model.layers.15.self_attn.o_proj.q_scale", + "shape": [ + 2048, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19022848 + }, + { + "name": "model.layers.15.mlp.gate.weight", + "shape": [ + 64, + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19284992 + }, + { + "name": "model.layers.15.mlp.shared_experts.gate_up_proj.q_weight", + "shape": [ + 5632, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 5767168, + "byteOffset": 19547136 + }, + { + "name": "model.layers.15.mlp.shared_experts.gate_up_proj.q_scale", + "shape": [ + 5632, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 720896, + "byteOffset": 25314304 + }, + { + "name": "model.layers.15.mlp.shared_experts.down_proj.q_weight", + "shape": [ + 2048, + 352 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2883584, + "byteOffset": 26035200 + }, + { + "name": "model.layers.15.mlp.shared_experts.down_proj.q_scale", + "shape": [ + 2048, + 88 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 360448, + "byteOffset": 28918784 + } + ], + "md5sum": "7e38de5399b867c755340e40361477d8" + }, + { + "dataPath": "params_shard_64.bin", + "format": "raw-shard", + "nbytes": 184549376, + "records": [ + { + "name": "model.layers.16.mlp.moe_gate_up_proj.q_weight", + "shape": [ + 64, + 2816, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 184549376, + "byteOffset": 0 + } + ], + "md5sum": "7683e954027028f578b46e774f2e4e0d" + }, + { + "dataPath": "params_shard_65.bin", + "format": "raw-shard", + "nbytes": 23068672, + "records": [ + { + "name": "model.layers.16.mlp.moe_gate_up_proj.q_scale", + "shape": [ + 64, + 2816, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 23068672, + "byteOffset": 0 + } + ], + "md5sum": "c30b17863a3bc75384e74e9c07cd6d1d" + }, + { + "dataPath": "params_shard_66.bin", + "format": "raw-shard", + "nbytes": 92274688, + "records": [ + { + "name": "model.layers.16.mlp.moe_down_proj.q_weight", + "shape": [ + 64, + 2048, + 176 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 92274688, + "byteOffset": 0 + } + ], + "md5sum": "7df0c8975e0966abcaec8a6d8dc3666e" + }, + { + "dataPath": "params_shard_67.bin", + "format": "raw-shard", + "nbytes": 29279232, + "records": [ + { + "name": "model.layers.15.mlp.moe_down_proj.q_scale", + "shape": [ + 64, + 2048, + 44 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 11534336, + "byteOffset": 0 + }, + { + "name": "model.layers.15.input_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11534336 + }, + { + "name": "model.layers.15.post_attention_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11538432 + }, + { + "name": "model.layers.16.self_attn.q_proj.q_weight", + "shape": [ + 3072, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 3145728, + "byteOffset": 11542528 + }, + { + "name": "model.layers.16.self_attn.q_proj.q_scale", + "shape": [ + 3072, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 393216, + "byteOffset": 14688256 + }, + { + "name": "model.layers.16.self_attn.kv_a_proj_with_mqa.q_weight", + "shape": [ + 576, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 589824, + "byteOffset": 15081472 + }, + { + "name": "model.layers.16.self_attn.kv_a_proj_with_mqa.q_scale", + "shape": [ + 576, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 73728, + "byteOffset": 15671296 + }, + { + "name": "model.layers.16.self_attn.kv_a_layernorm.weight", + "shape": [ + 512 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1024, + "byteOffset": 15745024 + }, + { + "name": "model.layers.16.self_attn.kv_b_proj.q_weight", + "shape": [ + 4096, + 64 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 1048576, + "byteOffset": 15746048 + }, + { + "name": "model.layers.16.self_attn.kv_b_proj.q_scale", + "shape": [ + 4096, + 16 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 131072, + "byteOffset": 16794624 + }, + { + "name": "model.layers.16.self_attn.o_proj.q_weight", + "shape": [ + 2048, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2097152, + "byteOffset": 16925696 + }, + { + "name": "model.layers.16.self_attn.o_proj.q_scale", + "shape": [ + 2048, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19022848 + }, + { + "name": "model.layers.16.mlp.gate.weight", + "shape": [ + 64, + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19284992 + }, + { + "name": "model.layers.16.mlp.shared_experts.gate_up_proj.q_weight", + "shape": [ + 5632, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 5767168, + "byteOffset": 19547136 + }, + { + "name": "model.layers.16.mlp.shared_experts.gate_up_proj.q_scale", + "shape": [ + 5632, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 720896, + "byteOffset": 25314304 + }, + { + "name": "model.layers.16.mlp.shared_experts.down_proj.q_weight", + "shape": [ + 2048, + 352 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2883584, + "byteOffset": 26035200 + }, + { + "name": "model.layers.16.mlp.shared_experts.down_proj.q_scale", + "shape": [ + 2048, + 88 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 360448, + "byteOffset": 28918784 + } + ], + "md5sum": "7f16dd445dfb75203d04689a8de74733" + }, + { + "dataPath": "params_shard_68.bin", + "format": "raw-shard", + "nbytes": 184549376, + "records": [ + { + "name": "model.layers.17.mlp.moe_gate_up_proj.q_weight", + "shape": [ + 64, + 2816, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 184549376, + "byteOffset": 0 + } + ], + "md5sum": "72458d312469b9c35a845ad0ea760977" + }, + { + "dataPath": "params_shard_69.bin", + "format": "raw-shard", + "nbytes": 23068672, + "records": [ + { + "name": "model.layers.17.mlp.moe_gate_up_proj.q_scale", + "shape": [ + 64, + 2816, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 23068672, + "byteOffset": 0 + } + ], + "md5sum": "4da660ec13d5680d859634572e3a790b" + }, + { + "dataPath": "params_shard_70.bin", + "format": "raw-shard", + "nbytes": 92274688, + "records": [ + { + "name": "model.layers.17.mlp.moe_down_proj.q_weight", + "shape": [ + 64, + 2048, + 176 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 92274688, + "byteOffset": 0 + } + ], + "md5sum": "7823fda41ec073e680b3f4a3b1d9b2bf" + }, + { + "dataPath": "params_shard_71.bin", + "format": "raw-shard", + "nbytes": 29279232, + "records": [ + { + "name": "model.layers.16.mlp.moe_down_proj.q_scale", + "shape": [ + 64, + 2048, + 44 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 11534336, + "byteOffset": 0 + }, + { + "name": "model.layers.16.input_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11534336 + }, + { + "name": "model.layers.16.post_attention_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11538432 + }, + { + "name": "model.layers.17.self_attn.q_proj.q_weight", + "shape": [ + 3072, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 3145728, + "byteOffset": 11542528 + }, + { + "name": "model.layers.17.self_attn.q_proj.q_scale", + "shape": [ + 3072, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 393216, + "byteOffset": 14688256 + }, + { + "name": "model.layers.17.self_attn.kv_a_proj_with_mqa.q_weight", + "shape": [ + 576, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 589824, + "byteOffset": 15081472 + }, + { + "name": "model.layers.17.self_attn.kv_a_proj_with_mqa.q_scale", + "shape": [ + 576, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 73728, + "byteOffset": 15671296 + }, + { + "name": "model.layers.17.self_attn.kv_a_layernorm.weight", + "shape": [ + 512 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1024, + "byteOffset": 15745024 + }, + { + "name": "model.layers.17.self_attn.kv_b_proj.q_weight", + "shape": [ + 4096, + 64 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 1048576, + "byteOffset": 15746048 + }, + { + "name": "model.layers.17.self_attn.kv_b_proj.q_scale", + "shape": [ + 4096, + 16 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 131072, + "byteOffset": 16794624 + }, + { + "name": "model.layers.17.self_attn.o_proj.q_weight", + "shape": [ + 2048, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2097152, + "byteOffset": 16925696 + }, + { + "name": "model.layers.17.self_attn.o_proj.q_scale", + "shape": [ + 2048, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19022848 + }, + { + "name": "model.layers.17.mlp.gate.weight", + "shape": [ + 64, + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19284992 + }, + { + "name": "model.layers.17.mlp.shared_experts.gate_up_proj.q_weight", + "shape": [ + 5632, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 5767168, + "byteOffset": 19547136 + }, + { + "name": "model.layers.17.mlp.shared_experts.gate_up_proj.q_scale", + "shape": [ + 5632, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 720896, + "byteOffset": 25314304 + }, + { + "name": "model.layers.17.mlp.shared_experts.down_proj.q_weight", + "shape": [ + 2048, + 352 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2883584, + "byteOffset": 26035200 + }, + { + "name": "model.layers.17.mlp.shared_experts.down_proj.q_scale", + "shape": [ + 2048, + 88 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 360448, + "byteOffset": 28918784 + } + ], + "md5sum": "c77169eca0567ead25d2b4f807f85eb6" + }, + { + "dataPath": "params_shard_72.bin", + "format": "raw-shard", + "nbytes": 184549376, + "records": [ + { + "name": "model.layers.18.mlp.moe_gate_up_proj.q_weight", + "shape": [ + 64, + 2816, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 184549376, + "byteOffset": 0 + } + ], + "md5sum": "c3e81e764b5456e48a603942b80274a6" + }, + { + "dataPath": "params_shard_73.bin", + "format": "raw-shard", + "nbytes": 23068672, + "records": [ + { + "name": "model.layers.18.mlp.moe_gate_up_proj.q_scale", + "shape": [ + 64, + 2816, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 23068672, + "byteOffset": 0 + } + ], + "md5sum": "23411bbfabdec689846a2cd064ec08f6" + }, + { + "dataPath": "params_shard_74.bin", + "format": "raw-shard", + "nbytes": 92274688, + "records": [ + { + "name": "model.layers.18.mlp.moe_down_proj.q_weight", + "shape": [ + 64, + 2048, + 176 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 92274688, + "byteOffset": 0 + } + ], + "md5sum": "358262ba79ac0495b92f7e6fcb5e2ae7" + }, + { + "dataPath": "params_shard_75.bin", + "format": "raw-shard", + "nbytes": 29279232, + "records": [ + { + "name": "model.layers.17.mlp.moe_down_proj.q_scale", + "shape": [ + 64, + 2048, + 44 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 11534336, + "byteOffset": 0 + }, + { + "name": "model.layers.17.input_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11534336 + }, + { + "name": "model.layers.17.post_attention_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11538432 + }, + { + "name": "model.layers.18.self_attn.q_proj.q_weight", + "shape": [ + 3072, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 3145728, + "byteOffset": 11542528 + }, + { + "name": "model.layers.18.self_attn.q_proj.q_scale", + "shape": [ + 3072, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 393216, + "byteOffset": 14688256 + }, + { + "name": "model.layers.18.self_attn.kv_a_proj_with_mqa.q_weight", + "shape": [ + 576, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 589824, + "byteOffset": 15081472 + }, + { + "name": "model.layers.18.self_attn.kv_a_proj_with_mqa.q_scale", + "shape": [ + 576, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 73728, + "byteOffset": 15671296 + }, + { + "name": "model.layers.18.self_attn.kv_a_layernorm.weight", + "shape": [ + 512 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1024, + "byteOffset": 15745024 + }, + { + "name": "model.layers.18.self_attn.kv_b_proj.q_weight", + "shape": [ + 4096, + 64 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 1048576, + "byteOffset": 15746048 + }, + { + "name": "model.layers.18.self_attn.kv_b_proj.q_scale", + "shape": [ + 4096, + 16 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 131072, + "byteOffset": 16794624 + }, + { + "name": "model.layers.18.self_attn.o_proj.q_weight", + "shape": [ + 2048, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2097152, + "byteOffset": 16925696 + }, + { + "name": "model.layers.18.self_attn.o_proj.q_scale", + "shape": [ + 2048, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19022848 + }, + { + "name": "model.layers.18.mlp.gate.weight", + "shape": [ + 64, + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19284992 + }, + { + "name": "model.layers.18.mlp.shared_experts.gate_up_proj.q_weight", + "shape": [ + 5632, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 5767168, + "byteOffset": 19547136 + }, + { + "name": "model.layers.18.mlp.shared_experts.gate_up_proj.q_scale", + "shape": [ + 5632, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 720896, + "byteOffset": 25314304 + }, + { + "name": "model.layers.18.mlp.shared_experts.down_proj.q_weight", + "shape": [ + 2048, + 352 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2883584, + "byteOffset": 26035200 + }, + { + "name": "model.layers.18.mlp.shared_experts.down_proj.q_scale", + "shape": [ + 2048, + 88 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 360448, + "byteOffset": 28918784 + } + ], + "md5sum": "87a3f84d55f0964a343e6d255c30a29b" + }, + { + "dataPath": "params_shard_76.bin", + "format": "raw-shard", + "nbytes": 184549376, + "records": [ + { + "name": "model.layers.19.mlp.moe_gate_up_proj.q_weight", + "shape": [ + 64, + 2816, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 184549376, + "byteOffset": 0 + } + ], + "md5sum": "e1cc38cb3dec8a8fb8fa08af72eae37c" + }, + { + "dataPath": "params_shard_77.bin", + "format": "raw-shard", + "nbytes": 23068672, + "records": [ + { + "name": "model.layers.19.mlp.moe_gate_up_proj.q_scale", + "shape": [ + 64, + 2816, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 23068672, + "byteOffset": 0 + } + ], + "md5sum": "45a0f508d0bb33224b060fdde2fab9fd" + }, + { + "dataPath": "params_shard_78.bin", + "format": "raw-shard", + "nbytes": 92274688, + "records": [ + { + "name": "model.layers.19.mlp.moe_down_proj.q_weight", + "shape": [ + 64, + 2048, + 176 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 92274688, + "byteOffset": 0 + } + ], + "md5sum": "dbbeb4832b7529767841dda7eb703467" + }, + { + "dataPath": "params_shard_79.bin", + "format": "raw-shard", + "nbytes": 29279232, + "records": [ + { + "name": "model.layers.18.mlp.moe_down_proj.q_scale", + "shape": [ + 64, + 2048, + 44 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 11534336, + "byteOffset": 0 + }, + { + "name": "model.layers.18.input_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11534336 + }, + { + "name": "model.layers.18.post_attention_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11538432 + }, + { + "name": "model.layers.19.self_attn.q_proj.q_weight", + "shape": [ + 3072, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 3145728, + "byteOffset": 11542528 + }, + { + "name": "model.layers.19.self_attn.q_proj.q_scale", + "shape": [ + 3072, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 393216, + "byteOffset": 14688256 + }, + { + "name": "model.layers.19.self_attn.kv_a_proj_with_mqa.q_weight", + "shape": [ + 576, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 589824, + "byteOffset": 15081472 + }, + { + "name": "model.layers.19.self_attn.kv_a_proj_with_mqa.q_scale", + "shape": [ + 576, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 73728, + "byteOffset": 15671296 + }, + { + "name": "model.layers.19.self_attn.kv_a_layernorm.weight", + "shape": [ + 512 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1024, + "byteOffset": 15745024 + }, + { + "name": "model.layers.19.self_attn.kv_b_proj.q_weight", + "shape": [ + 4096, + 64 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 1048576, + "byteOffset": 15746048 + }, + { + "name": "model.layers.19.self_attn.kv_b_proj.q_scale", + "shape": [ + 4096, + 16 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 131072, + "byteOffset": 16794624 + }, + { + "name": "model.layers.19.self_attn.o_proj.q_weight", + "shape": [ + 2048, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2097152, + "byteOffset": 16925696 + }, + { + "name": "model.layers.19.self_attn.o_proj.q_scale", + "shape": [ + 2048, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19022848 + }, + { + "name": "model.layers.19.mlp.gate.weight", + "shape": [ + 64, + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19284992 + }, + { + "name": "model.layers.19.mlp.shared_experts.gate_up_proj.q_weight", + "shape": [ + 5632, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 5767168, + "byteOffset": 19547136 + }, + { + "name": "model.layers.19.mlp.shared_experts.gate_up_proj.q_scale", + "shape": [ + 5632, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 720896, + "byteOffset": 25314304 + }, + { + "name": "model.layers.19.mlp.shared_experts.down_proj.q_weight", + "shape": [ + 2048, + 352 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2883584, + "byteOffset": 26035200 + }, + { + "name": "model.layers.19.mlp.shared_experts.down_proj.q_scale", + "shape": [ + 2048, + 88 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 360448, + "byteOffset": 28918784 + } + ], + "md5sum": "cb00bc5bc69031f2baa0a818593a6308" + }, + { + "dataPath": "params_shard_80.bin", + "format": "raw-shard", + "nbytes": 184549376, + "records": [ + { + "name": "model.layers.20.mlp.moe_gate_up_proj.q_weight", + "shape": [ + 64, + 2816, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 184549376, + "byteOffset": 0 + } + ], + "md5sum": "89040711e6aa2b61e405e0b977f0e5c7" + }, + { + "dataPath": "params_shard_81.bin", + "format": "raw-shard", + "nbytes": 23068672, + "records": [ + { + "name": "model.layers.20.mlp.moe_gate_up_proj.q_scale", + "shape": [ + 64, + 2816, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 23068672, + "byteOffset": 0 + } + ], + "md5sum": "8da6d8dc34dcfc03c6c08905cfe79437" + }, + { + "dataPath": "params_shard_82.bin", + "format": "raw-shard", + "nbytes": 92274688, + "records": [ + { + "name": "model.layers.20.mlp.moe_down_proj.q_weight", + "shape": [ + 64, + 2048, + 176 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 92274688, + "byteOffset": 0 + } + ], + "md5sum": "da5cf3a28182036c810447738a13a66b" + }, + { + "dataPath": "params_shard_83.bin", + "format": "raw-shard", + "nbytes": 29279232, + "records": [ + { + "name": "model.layers.19.mlp.moe_down_proj.q_scale", + "shape": [ + 64, + 2048, + 44 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 11534336, + "byteOffset": 0 + }, + { + "name": "model.layers.19.input_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11534336 + }, + { + "name": "model.layers.19.post_attention_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11538432 + }, + { + "name": "model.layers.20.self_attn.q_proj.q_weight", + "shape": [ + 3072, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 3145728, + "byteOffset": 11542528 + }, + { + "name": "model.layers.20.self_attn.q_proj.q_scale", + "shape": [ + 3072, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 393216, + "byteOffset": 14688256 + }, + { + "name": "model.layers.20.self_attn.kv_a_proj_with_mqa.q_weight", + "shape": [ + 576, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 589824, + "byteOffset": 15081472 + }, + { + "name": "model.layers.20.self_attn.kv_a_proj_with_mqa.q_scale", + "shape": [ + 576, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 73728, + "byteOffset": 15671296 + }, + { + "name": "model.layers.20.self_attn.kv_a_layernorm.weight", + "shape": [ + 512 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1024, + "byteOffset": 15745024 + }, + { + "name": "model.layers.20.self_attn.kv_b_proj.q_weight", + "shape": [ + 4096, + 64 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 1048576, + "byteOffset": 15746048 + }, + { + "name": "model.layers.20.self_attn.kv_b_proj.q_scale", + "shape": [ + 4096, + 16 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 131072, + "byteOffset": 16794624 + }, + { + "name": "model.layers.20.self_attn.o_proj.q_weight", + "shape": [ + 2048, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2097152, + "byteOffset": 16925696 + }, + { + "name": "model.layers.20.self_attn.o_proj.q_scale", + "shape": [ + 2048, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19022848 + }, + { + "name": "model.layers.20.mlp.gate.weight", + "shape": [ + 64, + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19284992 + }, + { + "name": "model.layers.20.mlp.shared_experts.gate_up_proj.q_weight", + "shape": [ + 5632, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 5767168, + "byteOffset": 19547136 + }, + { + "name": "model.layers.20.mlp.shared_experts.gate_up_proj.q_scale", + "shape": [ + 5632, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 720896, + "byteOffset": 25314304 + }, + { + "name": "model.layers.20.mlp.shared_experts.down_proj.q_weight", + "shape": [ + 2048, + 352 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2883584, + "byteOffset": 26035200 + }, + { + "name": "model.layers.20.mlp.shared_experts.down_proj.q_scale", + "shape": [ + 2048, + 88 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 360448, + "byteOffset": 28918784 + } + ], + "md5sum": "bd0bc40aa2beaa94a66609ba234eb98a" + }, + { + "dataPath": "params_shard_84.bin", + "format": "raw-shard", + "nbytes": 184549376, + "records": [ + { + "name": "model.layers.21.mlp.moe_gate_up_proj.q_weight", + "shape": [ + 64, + 2816, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 184549376, + "byteOffset": 0 + } + ], + "md5sum": "6fe4ea05e1d514efc23566ad8897c1a4" + }, + { + "dataPath": "params_shard_85.bin", + "format": "raw-shard", + "nbytes": 23068672, + "records": [ + { + "name": "model.layers.21.mlp.moe_gate_up_proj.q_scale", + "shape": [ + 64, + 2816, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 23068672, + "byteOffset": 0 + } + ], + "md5sum": "edb10d3a0362259c844abd681f00b57f" + }, + { + "dataPath": "params_shard_86.bin", + "format": "raw-shard", + "nbytes": 92274688, + "records": [ + { + "name": "model.layers.21.mlp.moe_down_proj.q_weight", + "shape": [ + 64, + 2048, + 176 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 92274688, + "byteOffset": 0 + } + ], + "md5sum": "8cf790f09998c2ee8726dadbee27776d" + }, + { + "dataPath": "params_shard_87.bin", + "format": "raw-shard", + "nbytes": 29279232, + "records": [ + { + "name": "model.layers.20.mlp.moe_down_proj.q_scale", + "shape": [ + 64, + 2048, + 44 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 11534336, + "byteOffset": 0 + }, + { + "name": "model.layers.20.input_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11534336 + }, + { + "name": "model.layers.20.post_attention_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11538432 + }, + { + "name": "model.layers.21.self_attn.q_proj.q_weight", + "shape": [ + 3072, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 3145728, + "byteOffset": 11542528 + }, + { + "name": "model.layers.21.self_attn.q_proj.q_scale", + "shape": [ + 3072, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 393216, + "byteOffset": 14688256 + }, + { + "name": "model.layers.21.self_attn.kv_a_proj_with_mqa.q_weight", + "shape": [ + 576, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 589824, + "byteOffset": 15081472 + }, + { + "name": "model.layers.21.self_attn.kv_a_proj_with_mqa.q_scale", + "shape": [ + 576, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 73728, + "byteOffset": 15671296 + }, + { + "name": "model.layers.21.self_attn.kv_a_layernorm.weight", + "shape": [ + 512 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1024, + "byteOffset": 15745024 + }, + { + "name": "model.layers.21.self_attn.kv_b_proj.q_weight", + "shape": [ + 4096, + 64 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 1048576, + "byteOffset": 15746048 + }, + { + "name": "model.layers.21.self_attn.kv_b_proj.q_scale", + "shape": [ + 4096, + 16 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 131072, + "byteOffset": 16794624 + }, + { + "name": "model.layers.21.self_attn.o_proj.q_weight", + "shape": [ + 2048, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2097152, + "byteOffset": 16925696 + }, + { + "name": "model.layers.21.self_attn.o_proj.q_scale", + "shape": [ + 2048, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19022848 + }, + { + "name": "model.layers.21.mlp.gate.weight", + "shape": [ + 64, + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19284992 + }, + { + "name": "model.layers.21.mlp.shared_experts.gate_up_proj.q_weight", + "shape": [ + 5632, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 5767168, + "byteOffset": 19547136 + }, + { + "name": "model.layers.21.mlp.shared_experts.gate_up_proj.q_scale", + "shape": [ + 5632, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 720896, + "byteOffset": 25314304 + }, + { + "name": "model.layers.21.mlp.shared_experts.down_proj.q_weight", + "shape": [ + 2048, + 352 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2883584, + "byteOffset": 26035200 + }, + { + "name": "model.layers.21.mlp.shared_experts.down_proj.q_scale", + "shape": [ + 2048, + 88 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 360448, + "byteOffset": 28918784 + } + ], + "md5sum": "8f9356ac0ce5c7a19349c4090f59fb30" + }, + { + "dataPath": "params_shard_88.bin", + "format": "raw-shard", + "nbytes": 184549376, + "records": [ + { + "name": "model.layers.22.mlp.moe_gate_up_proj.q_weight", + "shape": [ + 64, + 2816, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 184549376, + "byteOffset": 0 + } + ], + "md5sum": "e68534b8ad5519b90538aa42b53c043e" + }, + { + "dataPath": "params_shard_89.bin", + "format": "raw-shard", + "nbytes": 23068672, + "records": [ + { + "name": "model.layers.22.mlp.moe_gate_up_proj.q_scale", + "shape": [ + 64, + 2816, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 23068672, + "byteOffset": 0 + } + ], + "md5sum": "48e8b10ac27055abfffa65dcc62dcdff" + }, + { + "dataPath": "params_shard_90.bin", + "format": "raw-shard", + "nbytes": 92274688, + "records": [ + { + "name": "model.layers.22.mlp.moe_down_proj.q_weight", + "shape": [ + 64, + 2048, + 176 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 92274688, + "byteOffset": 0 + } + ], + "md5sum": "be93840acabfc15bed8960afd9788afb" + }, + { + "dataPath": "params_shard_91.bin", + "format": "raw-shard", + "nbytes": 29279232, + "records": [ + { + "name": "model.layers.21.mlp.moe_down_proj.q_scale", + "shape": [ + 64, + 2048, + 44 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 11534336, + "byteOffset": 0 + }, + { + "name": "model.layers.21.input_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11534336 + }, + { + "name": "model.layers.21.post_attention_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11538432 + }, + { + "name": "model.layers.22.self_attn.q_proj.q_weight", + "shape": [ + 3072, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 3145728, + "byteOffset": 11542528 + }, + { + "name": "model.layers.22.self_attn.q_proj.q_scale", + "shape": [ + 3072, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 393216, + "byteOffset": 14688256 + }, + { + "name": "model.layers.22.self_attn.kv_a_proj_with_mqa.q_weight", + "shape": [ + 576, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 589824, + "byteOffset": 15081472 + }, + { + "name": "model.layers.22.self_attn.kv_a_proj_with_mqa.q_scale", + "shape": [ + 576, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 73728, + "byteOffset": 15671296 + }, + { + "name": "model.layers.22.self_attn.kv_a_layernorm.weight", + "shape": [ + 512 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1024, + "byteOffset": 15745024 + }, + { + "name": "model.layers.22.self_attn.kv_b_proj.q_weight", + "shape": [ + 4096, + 64 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 1048576, + "byteOffset": 15746048 + }, + { + "name": "model.layers.22.self_attn.kv_b_proj.q_scale", + "shape": [ + 4096, + 16 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 131072, + "byteOffset": 16794624 + }, + { + "name": "model.layers.22.self_attn.o_proj.q_weight", + "shape": [ + 2048, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2097152, + "byteOffset": 16925696 + }, + { + "name": "model.layers.22.self_attn.o_proj.q_scale", + "shape": [ + 2048, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19022848 + }, + { + "name": "model.layers.22.mlp.gate.weight", + "shape": [ + 64, + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19284992 + }, + { + "name": "model.layers.22.mlp.shared_experts.gate_up_proj.q_weight", + "shape": [ + 5632, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 5767168, + "byteOffset": 19547136 + }, + { + "name": "model.layers.22.mlp.shared_experts.gate_up_proj.q_scale", + "shape": [ + 5632, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 720896, + "byteOffset": 25314304 + }, + { + "name": "model.layers.22.mlp.shared_experts.down_proj.q_weight", + "shape": [ + 2048, + 352 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2883584, + "byteOffset": 26035200 + }, + { + "name": "model.layers.22.mlp.shared_experts.down_proj.q_scale", + "shape": [ + 2048, + 88 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 360448, + "byteOffset": 28918784 + } + ], + "md5sum": "79215ec8e21962243d7840ddc1ce758f" + }, + { + "dataPath": "params_shard_92.bin", + "format": "raw-shard", + "nbytes": 184549376, + "records": [ + { + "name": "model.layers.23.mlp.moe_gate_up_proj.q_weight", + "shape": [ + 64, + 2816, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 184549376, + "byteOffset": 0 + } + ], + "md5sum": "db7acc0bacb9969ca6b7df2b96a81188" + }, + { + "dataPath": "params_shard_93.bin", + "format": "raw-shard", + "nbytes": 23068672, + "records": [ + { + "name": "model.layers.23.mlp.moe_gate_up_proj.q_scale", + "shape": [ + 64, + 2816, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 23068672, + "byteOffset": 0 + } + ], + "md5sum": "ddc3538ccdf293d97478b8cf1ae44c93" + }, + { + "dataPath": "params_shard_94.bin", + "format": "raw-shard", + "nbytes": 92274688, + "records": [ + { + "name": "model.layers.23.mlp.moe_down_proj.q_weight", + "shape": [ + 64, + 2048, + 176 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 92274688, + "byteOffset": 0 + } + ], + "md5sum": "533d2d37d864f37693c672b4e168294f" + }, + { + "dataPath": "params_shard_95.bin", + "format": "raw-shard", + "nbytes": 29279232, + "records": [ + { + "name": "model.layers.22.mlp.moe_down_proj.q_scale", + "shape": [ + 64, + 2048, + 44 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 11534336, + "byteOffset": 0 + }, + { + "name": "model.layers.22.input_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11534336 + }, + { + "name": "model.layers.22.post_attention_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11538432 + }, + { + "name": "model.layers.23.self_attn.q_proj.q_weight", + "shape": [ + 3072, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 3145728, + "byteOffset": 11542528 + }, + { + "name": "model.layers.23.self_attn.q_proj.q_scale", + "shape": [ + 3072, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 393216, + "byteOffset": 14688256 + }, + { + "name": "model.layers.23.self_attn.kv_a_proj_with_mqa.q_weight", + "shape": [ + 576, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 589824, + "byteOffset": 15081472 + }, + { + "name": "model.layers.23.self_attn.kv_a_proj_with_mqa.q_scale", + "shape": [ + 576, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 73728, + "byteOffset": 15671296 + }, + { + "name": "model.layers.23.self_attn.kv_a_layernorm.weight", + "shape": [ + 512 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1024, + "byteOffset": 15745024 + }, + { + "name": "model.layers.23.self_attn.kv_b_proj.q_weight", + "shape": [ + 4096, + 64 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 1048576, + "byteOffset": 15746048 + }, + { + "name": "model.layers.23.self_attn.kv_b_proj.q_scale", + "shape": [ + 4096, + 16 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 131072, + "byteOffset": 16794624 + }, + { + "name": "model.layers.23.self_attn.o_proj.q_weight", + "shape": [ + 2048, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2097152, + "byteOffset": 16925696 + }, + { + "name": "model.layers.23.self_attn.o_proj.q_scale", + "shape": [ + 2048, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19022848 + }, + { + "name": "model.layers.23.mlp.gate.weight", + "shape": [ + 64, + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19284992 + }, + { + "name": "model.layers.23.mlp.shared_experts.gate_up_proj.q_weight", + "shape": [ + 5632, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 5767168, + "byteOffset": 19547136 + }, + { + "name": "model.layers.23.mlp.shared_experts.gate_up_proj.q_scale", + "shape": [ + 5632, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 720896, + "byteOffset": 25314304 + }, + { + "name": "model.layers.23.mlp.shared_experts.down_proj.q_weight", + "shape": [ + 2048, + 352 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2883584, + "byteOffset": 26035200 + }, + { + "name": "model.layers.23.mlp.shared_experts.down_proj.q_scale", + "shape": [ + 2048, + 88 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 360448, + "byteOffset": 28918784 + } + ], + "md5sum": "240d7a194858bfbcd7faa56a87bcbeb5" + }, + { + "dataPath": "params_shard_96.bin", + "format": "raw-shard", + "nbytes": 184549376, + "records": [ + { + "name": "model.layers.24.mlp.moe_gate_up_proj.q_weight", + "shape": [ + 64, + 2816, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 184549376, + "byteOffset": 0 + } + ], + "md5sum": "033972268afd719b33a7b0d0c4ee69ea" + }, + { + "dataPath": "params_shard_97.bin", + "format": "raw-shard", + "nbytes": 23068672, + "records": [ + { + "name": "model.layers.24.mlp.moe_gate_up_proj.q_scale", + "shape": [ + 64, + 2816, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 23068672, + "byteOffset": 0 + } + ], + "md5sum": "ed51159afa8c766a3fc9a5632680060b" + }, + { + "dataPath": "params_shard_98.bin", + "format": "raw-shard", + "nbytes": 92274688, + "records": [ + { + "name": "model.layers.24.mlp.moe_down_proj.q_weight", + "shape": [ + 64, + 2048, + 176 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 92274688, + "byteOffset": 0 + } + ], + "md5sum": "83233d1114a93cc13911a0d8c67dd07d" + }, + { + "dataPath": "params_shard_99.bin", + "format": "raw-shard", + "nbytes": 29279232, + "records": [ + { + "name": "model.layers.23.mlp.moe_down_proj.q_scale", + "shape": [ + 64, + 2048, + 44 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 11534336, + "byteOffset": 0 + }, + { + "name": "model.layers.23.input_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11534336 + }, + { + "name": "model.layers.23.post_attention_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11538432 + }, + { + "name": "model.layers.24.self_attn.q_proj.q_weight", + "shape": [ + 3072, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 3145728, + "byteOffset": 11542528 + }, + { + "name": "model.layers.24.self_attn.q_proj.q_scale", + "shape": [ + 3072, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 393216, + "byteOffset": 14688256 + }, + { + "name": "model.layers.24.self_attn.kv_a_proj_with_mqa.q_weight", + "shape": [ + 576, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 589824, + "byteOffset": 15081472 + }, + { + "name": "model.layers.24.self_attn.kv_a_proj_with_mqa.q_scale", + "shape": [ + 576, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 73728, + "byteOffset": 15671296 + }, + { + "name": "model.layers.24.self_attn.kv_a_layernorm.weight", + "shape": [ + 512 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1024, + "byteOffset": 15745024 + }, + { + "name": "model.layers.24.self_attn.kv_b_proj.q_weight", + "shape": [ + 4096, + 64 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 1048576, + "byteOffset": 15746048 + }, + { + "name": "model.layers.24.self_attn.kv_b_proj.q_scale", + "shape": [ + 4096, + 16 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 131072, + "byteOffset": 16794624 + }, + { + "name": "model.layers.24.self_attn.o_proj.q_weight", + "shape": [ + 2048, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2097152, + "byteOffset": 16925696 + }, + { + "name": "model.layers.24.self_attn.o_proj.q_scale", + "shape": [ + 2048, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19022848 + }, + { + "name": "model.layers.24.mlp.gate.weight", + "shape": [ + 64, + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19284992 + }, + { + "name": "model.layers.24.mlp.shared_experts.gate_up_proj.q_weight", + "shape": [ + 5632, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 5767168, + "byteOffset": 19547136 + }, + { + "name": "model.layers.24.mlp.shared_experts.gate_up_proj.q_scale", + "shape": [ + 5632, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 720896, + "byteOffset": 25314304 + }, + { + "name": "model.layers.24.mlp.shared_experts.down_proj.q_weight", + "shape": [ + 2048, + 352 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2883584, + "byteOffset": 26035200 + }, + { + "name": "model.layers.24.mlp.shared_experts.down_proj.q_scale", + "shape": [ + 2048, + 88 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 360448, + "byteOffset": 28918784 + } + ], + "md5sum": "5a2280321bdb6cd01a8d0b1809a32c46" + }, + { + "dataPath": "params_shard_100.bin", + "format": "raw-shard", + "nbytes": 184549376, + "records": [ + { + "name": "model.layers.25.mlp.moe_gate_up_proj.q_weight", + "shape": [ + 64, + 2816, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 184549376, + "byteOffset": 0 + } + ], + "md5sum": "adcadc9e1b5aa18901949119936e60c8" + }, + { + "dataPath": "params_shard_101.bin", + "format": "raw-shard", + "nbytes": 23068672, + "records": [ + { + "name": "model.layers.25.mlp.moe_gate_up_proj.q_scale", + "shape": [ + 64, + 2816, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 23068672, + "byteOffset": 0 + } + ], + "md5sum": "0367e10b699e7d7629f0ca112235a940" + }, + { + "dataPath": "params_shard_102.bin", + "format": "raw-shard", + "nbytes": 92274688, + "records": [ + { + "name": "model.layers.25.mlp.moe_down_proj.q_weight", + "shape": [ + 64, + 2048, + 176 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 92274688, + "byteOffset": 0 + } + ], + "md5sum": "b43ae0388648936b710472b2a9e3e431" + }, + { + "dataPath": "params_shard_103.bin", + "format": "raw-shard", + "nbytes": 29279232, + "records": [ + { + "name": "model.layers.24.mlp.moe_down_proj.q_scale", + "shape": [ + 64, + 2048, + 44 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 11534336, + "byteOffset": 0 + }, + { + "name": "model.layers.24.input_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11534336 + }, + { + "name": "model.layers.24.post_attention_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11538432 + }, + { + "name": "model.layers.25.self_attn.q_proj.q_weight", + "shape": [ + 3072, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 3145728, + "byteOffset": 11542528 + }, + { + "name": "model.layers.25.self_attn.q_proj.q_scale", + "shape": [ + 3072, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 393216, + "byteOffset": 14688256 + }, + { + "name": "model.layers.25.self_attn.kv_a_proj_with_mqa.q_weight", + "shape": [ + 576, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 589824, + "byteOffset": 15081472 + }, + { + "name": "model.layers.25.self_attn.kv_a_proj_with_mqa.q_scale", + "shape": [ + 576, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 73728, + "byteOffset": 15671296 + }, + { + "name": "model.layers.25.self_attn.kv_a_layernorm.weight", + "shape": [ + 512 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1024, + "byteOffset": 15745024 + }, + { + "name": "model.layers.25.self_attn.kv_b_proj.q_weight", + "shape": [ + 4096, + 64 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 1048576, + "byteOffset": 15746048 + }, + { + "name": "model.layers.25.self_attn.kv_b_proj.q_scale", + "shape": [ + 4096, + 16 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 131072, + "byteOffset": 16794624 + }, + { + "name": "model.layers.25.self_attn.o_proj.q_weight", + "shape": [ + 2048, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2097152, + "byteOffset": 16925696 + }, + { + "name": "model.layers.25.self_attn.o_proj.q_scale", + "shape": [ + 2048, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19022848 + }, + { + "name": "model.layers.25.mlp.gate.weight", + "shape": [ + 64, + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19284992 + }, + { + "name": "model.layers.25.mlp.shared_experts.gate_up_proj.q_weight", + "shape": [ + 5632, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 5767168, + "byteOffset": 19547136 + }, + { + "name": "model.layers.25.mlp.shared_experts.gate_up_proj.q_scale", + "shape": [ + 5632, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 720896, + "byteOffset": 25314304 + }, + { + "name": "model.layers.25.mlp.shared_experts.down_proj.q_weight", + "shape": [ + 2048, + 352 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2883584, + "byteOffset": 26035200 + }, + { + "name": "model.layers.25.mlp.shared_experts.down_proj.q_scale", + "shape": [ + 2048, + 88 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 360448, + "byteOffset": 28918784 + } + ], + "md5sum": "8f4e4e60c9df90e548a0520c047af261" + }, + { + "dataPath": "params_shard_104.bin", + "format": "raw-shard", + "nbytes": 184549376, + "records": [ + { + "name": "model.layers.26.mlp.moe_gate_up_proj.q_weight", + "shape": [ + 64, + 2816, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 184549376, + "byteOffset": 0 + } + ], + "md5sum": "aeddb21991e6573caede2d7c68185015" + }, + { + "dataPath": "params_shard_105.bin", + "format": "raw-shard", + "nbytes": 23068672, + "records": [ + { + "name": "model.layers.26.mlp.moe_gate_up_proj.q_scale", + "shape": [ + 64, + 2816, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 23068672, + "byteOffset": 0 + } + ], + "md5sum": "ec689d3391d173aace5532bf59515e50" + }, + { + "dataPath": "params_shard_106.bin", + "format": "raw-shard", + "nbytes": 92274688, + "records": [ + { + "name": "model.layers.26.mlp.moe_down_proj.q_weight", + "shape": [ + 64, + 2048, + 176 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 92274688, + "byteOffset": 0 + } + ], + "md5sum": "f88861ef7b7936346d543a2d69b50ac8" + }, + { + "dataPath": "params_shard_107.bin", + "format": "raw-shard", + "nbytes": 29279232, + "records": [ + { + "name": "model.layers.25.mlp.moe_down_proj.q_scale", + "shape": [ + 64, + 2048, + 44 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 11534336, + "byteOffset": 0 + }, + { + "name": "model.layers.25.input_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11534336 + }, + { + "name": "model.layers.25.post_attention_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11538432 + }, + { + "name": "model.layers.26.self_attn.q_proj.q_weight", + "shape": [ + 3072, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 3145728, + "byteOffset": 11542528 + }, + { + "name": "model.layers.26.self_attn.q_proj.q_scale", + "shape": [ + 3072, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 393216, + "byteOffset": 14688256 + }, + { + "name": "model.layers.26.self_attn.kv_a_proj_with_mqa.q_weight", + "shape": [ + 576, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 589824, + "byteOffset": 15081472 + }, + { + "name": "model.layers.26.self_attn.kv_a_proj_with_mqa.q_scale", + "shape": [ + 576, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 73728, + "byteOffset": 15671296 + }, + { + "name": "model.layers.26.self_attn.kv_a_layernorm.weight", + "shape": [ + 512 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1024, + "byteOffset": 15745024 + }, + { + "name": "model.layers.26.self_attn.kv_b_proj.q_weight", + "shape": [ + 4096, + 64 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 1048576, + "byteOffset": 15746048 + }, + { + "name": "model.layers.26.self_attn.kv_b_proj.q_scale", + "shape": [ + 4096, + 16 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 131072, + "byteOffset": 16794624 + }, + { + "name": "model.layers.26.self_attn.o_proj.q_weight", + "shape": [ + 2048, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2097152, + "byteOffset": 16925696 + }, + { + "name": "model.layers.26.self_attn.o_proj.q_scale", + "shape": [ + 2048, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19022848 + }, + { + "name": "model.layers.26.mlp.gate.weight", + "shape": [ + 64, + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 262144, + "byteOffset": 19284992 + }, + { + "name": "model.layers.26.mlp.shared_experts.gate_up_proj.q_weight", + "shape": [ + 5632, + 256 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 5767168, + "byteOffset": 19547136 + }, + { + "name": "model.layers.26.mlp.shared_experts.gate_up_proj.q_scale", + "shape": [ + 5632, + 64 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 720896, + "byteOffset": 25314304 + }, + { + "name": "model.layers.26.mlp.shared_experts.down_proj.q_weight", + "shape": [ + 2048, + 352 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2883584, + "byteOffset": 26035200 + }, + { + "name": "model.layers.26.mlp.shared_experts.down_proj.q_scale", + "shape": [ + 2048, + 88 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 360448, + "byteOffset": 28918784 + } + ], + "md5sum": "f7a67a47df2a804e3296f0e30e5ff7a6" + }, + { + "dataPath": "params_shard_108.bin", + "format": "raw-shard", + "nbytes": 11542528, + "records": [ + { + "name": "model.layers.26.mlp.moe_down_proj.q_scale", + "shape": [ + 64, + 2048, + 44 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 11534336, + "byteOffset": 0 + }, + { + "name": "model.layers.26.input_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11534336 + }, + { + "name": "model.layers.26.post_attention_layernorm.weight", + "shape": [ + 2048 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 4096, + "byteOffset": 11538432 + } + ], + "md5sum": "b08e08a725e528c81d407433a0ada2a0" + } + ] +} \ No newline at end of file