|
/home/cfruan/.conda/envs/mlc-source-311/bin/python -m mlc_chat gen_config /ssd1/cfruan/mlc-llm-repos/mlc-llm-head/dist/models/phi-1_5 --quantization q4f16_1 --conv-template phi-2 --output /tmp/tmpxe445xtc |
|
[2023-12-28 23:33:19] INFO auto_config.py:115: [92mFound[0m model configuration: /ssd1/cfruan/mlc-llm-repos/mlc-llm-head/dist/models/phi-1_5/config.json |
|
[2023-12-28 23:33:19] INFO auto_config.py:151: [92mFound[0m model type: [1mphi-msft[0m. Use `--model-type` to override. |
|
[2023-12-28 23:33:19] INFO phi_model.py:59: [1mcontext_window_size[0m not found in config.json. Falling back to n_positions (2048) |
|
[2023-12-28 23:33:19] INFO gen_config.py:129: [91mNot found[0m tokenizer config: /ssd1/cfruan/mlc-llm-repos/mlc-llm-head/dist/models/phi-1_5/tokenizer.model |
|
[2023-12-28 23:33:19] INFO gen_config.py:127: [92mFound[0m tokenizer config: /ssd1/cfruan/mlc-llm-repos/mlc-llm-head/dist/models/phi-1_5/tokenizer.json. Copying to [1m/tmp/tmpxe445xtc/tokenizer.json[0m |
|
[2023-12-28 23:33:19] INFO gen_config.py:127: [92mFound[0m tokenizer config: /ssd1/cfruan/mlc-llm-repos/mlc-llm-head/dist/models/phi-1_5/vocab.json. Copying to [1m/tmp/tmpxe445xtc/vocab.json[0m |
|
[2023-12-28 23:33:19] INFO gen_config.py:127: [92mFound[0m tokenizer config: /ssd1/cfruan/mlc-llm-repos/mlc-llm-head/dist/models/phi-1_5/merges.txt. Copying to [1m/tmp/tmpxe445xtc/merges.txt[0m |
|
[2023-12-28 23:33:19] INFO gen_config.py:127: [92mFound[0m tokenizer config: /ssd1/cfruan/mlc-llm-repos/mlc-llm-head/dist/models/phi-1_5/added_tokens.json. Copying to [1m/tmp/tmpxe445xtc/added_tokens.json[0m |
|
[2023-12-28 23:33:19] INFO gen_config.py:127: [92mFound[0m tokenizer config: /ssd1/cfruan/mlc-llm-repos/mlc-llm-head/dist/models/phi-1_5/tokenizer_config.json. Copying to [1m/tmp/tmpxe445xtc/tokenizer_config.json[0m |
|
[2023-12-28 23:33:19] INFO gen_config.py:69: [System default] Setting [1mpad_token_id[0m: 0 |
|
[2023-12-28 23:33:19] INFO gen_config.py:69: [System default] Setting [1mbos_token_id[0m: 1 |
|
[2023-12-28 23:33:19] INFO gen_config.py:69: [System default] Setting [1meos_token_id[0m: 2 |
|
[2023-12-28 23:33:19] INFO gen_config.py:69: [System default] Setting [1mtemperature[0m: 0.7 |
|
[2023-12-28 23:33:19] INFO gen_config.py:69: [System default] Setting [1mrepetition_penalty[0m: 1.0 |
|
[2023-12-28 23:33:19] INFO gen_config.py:69: [System default] Setting [1mtop_p[0m: 0.95 |
|
[2023-12-28 23:33:19] INFO gen_config.py:69: [System default] Setting [1mmean_gen_len[0m: 128 |
|
[2023-12-28 23:33:19] INFO gen_config.py:69: [System default] Setting [1mmax_gen_len[0m: 512 |
|
[2023-12-28 23:33:19] INFO gen_config.py:69: [System default] Setting [1mshift_fill_factor[0m: 0.3 |
|
[2023-12-28 23:33:19] INFO gen_config.py:157: Dumping configuration file to: [1m/tmp/tmpxe445xtc/mlc-chat-config.json[0m |
|
/home/cfruan/.conda/envs/mlc-source-311/bin/python -m mlc_chat convert_weight /ssd1/cfruan/mlc-llm-repos/mlc-llm-head/dist/models/phi-1_5 --quantization q4f16_1 --source-format auto --output /tmp/tmpxe445xtc |
|
[2023-12-28 23:33:20] INFO auto_config.py:115: [92mFound[0m model configuration: /ssd1/cfruan/mlc-llm-repos/mlc-llm-head/dist/models/phi-1_5/config.json |
|
[2023-12-28 23:33:20] INFO auto_device.py:76: [92mFound[0m device: cuda:0 |
|
[2023-12-28 23:33:20] INFO auto_device.py:76: [92mFound[0m device: cuda:1 |
|
[2023-12-28 23:33:20] INFO auto_device.py:85: [91mNot found[0m device: rocm:0 |
|
[2023-12-28 23:33:20] INFO auto_device.py:85: [91mNot found[0m device: metal:0 |
|
[2023-12-28 23:33:21] INFO auto_device.py:76: [92mFound[0m device: vulkan:0 |
|
[2023-12-28 23:33:21] INFO auto_device.py:76: [92mFound[0m device: vulkan:1 |
|
[2023-12-28 23:33:21] INFO auto_device.py:76: [92mFound[0m device: vulkan:2 |
|
[2023-12-28 23:33:21] INFO auto_device.py:85: [91mNot found[0m device: opencl:0 |
|
[2023-12-28 23:33:21] INFO auto_device.py:33: Using device: [1mcuda:0[0m |
|
[2023-12-28 23:33:21] INFO auto_weight.py:70: Finding weights in: /ssd1/cfruan/mlc-llm-repos/mlc-llm-head/dist/models/phi-1_5 |
|
[2023-12-28 23:33:21] INFO auto_weight.py:129: [92mFound[0m source weight format: huggingface-torch. Source configuration: /ssd1/cfruan/mlc-llm-repos/mlc-llm-head/dist/models/phi-1_5/pytorch_model.bin |
|
[2023-12-28 23:33:21] INFO auto_weight.py:149: [91mNot found[0m Huggingface Safetensor |
|
[2023-12-28 23:33:21] INFO auto_weight.py:106: Using source weight configuration: [1m/ssd1/cfruan/mlc-llm-repos/mlc-llm-head/dist/models/phi-1_5/pytorch_model.bin[0m. Use `--source` to override. |
|
[2023-12-28 23:33:21] INFO auto_weight.py:110: Using source weight format: [1mhuggingface-torch[0m. Use `--source-format` to override. |
|
[2023-12-28 23:33:21] INFO auto_config.py:151: [92mFound[0m model type: [1mphi-msft[0m. Use `--model-type` to override. |
|
[2023-12-28 23:33:21] INFO phi_model.py:59: [1mcontext_window_size[0m not found in config.json. Falling back to n_positions (2048) |
|
[2023-12-28 23:33:24] INFO huggingface_loader.py:169: Loading HF parameters from: /ssd1/cfruan/mlc-llm-repos/mlc-llm-head/dist/models/phi-1_5/pytorch_model.bin |
|
[1mWeight conversion with arguments:[0m |
|
[1m--config[0m /ssd1/cfruan/mlc-llm-repos/mlc-llm-head/dist/models/phi-1_5/config.json |
|
[1m--quantization[0m GroupQuantize(name='q4f16_1', kind='group-quant', group_size=32, quantize_dtype='int4', storage_dtype='uint32', model_dtype='float16', num_elem_per_storage=8, num_storage_per_group=4, max_int_value=7) |
|
[1m--model-type[0m phi-msft |
|
[1m--device[0m cuda:0 |
|
[1m--source[0m /ssd1/cfruan/mlc-llm-repos/mlc-llm-head/dist/models/phi-1_5/pytorch_model.bin |
|
[1m--source-format[0m huggingface-torch |
|
[1m--output[0m /tmp/tmpxe445xtc |
|
0%| | 0/245 [00:00<?, ?it/s]
[2023-12-28 23:33:25] INFO group_quantization.py:200: Compiling quantize function for key: (51200, 2048, 'float16', 'cuda') |
|
0%| | 0/245 [00:00<?, ?it/s]
[2023-12-28 23:33:26] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.embd.q_weight[0m", shape: (51200, 256), dtype: uint32 |
|
0%| | 0/245 [00:01<?, ?it/s]
[2023-12-28 23:33:26] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.embd.q_scale[0m", shape: (51200, 64), dtype: float16 |
|
0%| | 0/245 [00:01<?, ?it/s]
0%|β | 1/245 [00:01<04:43, 1.16s/it]
[2023-12-28 23:33:26] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.0.ln.weight[0m", shape: (2048,), dtype: float16 |
|
0%|β | 1/245 [00:01<04:43, 1.16s/it]
[2023-12-28 23:33:26] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.0.ln.bias[0m", shape: (2048,), dtype: float16 |
|
0%|β | 1/245 [00:01<04:43, 1.16s/it]
[2023-12-28 23:33:26] INFO group_quantization.py:200: Compiling quantize function for key: (6144, 2048, 'float16', 'cuda') |
|
0%|β | 1/245 [00:01<04:43, 1.16s/it]
[2023-12-28 23:33:27] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.0.mixer.Wqkv.q_weight[0m", shape: (6144, 256), dtype: uint32 |
|
0%|β | 1/245 [00:01<04:43, 1.16s/it]
[2023-12-28 23:33:27] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.0.mixer.Wqkv.q_scale[0m", shape: (6144, 64), dtype: float16 |
|
0%|β | 1/245 [00:01<04:43, 1.16s/it]
2%|ββ | 4/245 [00:01<01:17, 3.10it/s]
[2023-12-28 23:33:27] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.0.mixer.Wqkv.bias[0m", shape: (6144,), dtype: float16 |
|
2%|ββ | 4/245 [00:01<01:17, 3.10it/s]
[2023-12-28 23:33:27] INFO group_quantization.py:200: Compiling quantize function for key: (2048, 2048, 'float16', 'cuda') |
|
2%|ββ | 4/245 [00:01<01:17, 3.10it/s]
[2023-12-28 23:33:27] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.0.mixer.out_proj.q_weight[0m", shape: (2048, 256), dtype: uint32 |
|
2%|ββ | 4/245 [00:01<01:17, 3.10it/s]
[2023-12-28 23:33:27] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.0.mixer.out_proj.q_scale[0m", shape: (2048, 64), dtype: float16 |
|
2%|ββ | 4/245 [00:01<01:17, 3.10it/s]
2%|βββ | 6/245 [00:01<01:02, 3.83it/s]
[2023-12-28 23:33:27] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.0.mixer.out_proj.bias[0m", shape: (2048,), dtype: float16 |
|
2%|βββ | 6/245 [00:01<01:02, 3.83it/s]
[2023-12-28 23:33:27] INFO group_quantization.py:200: Compiling quantize function for key: (8192, 2048, 'float16', 'cuda') |
|
2%|βββ | 6/245 [00:01<01:02, 3.83it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.0.mlp.fc1.q_weight[0m", shape: (8192, 256), dtype: uint32 |
|
2%|βββ | 6/245 [00:02<01:02, 3.83it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.0.mlp.fc1.q_scale[0m", shape: (8192, 64), dtype: float16 |
|
2%|βββ | 6/245 [00:02<01:02, 3.83it/s]
3%|ββββ | 8/245 [00:02<00:55, 4.29it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.0.mlp.fc1.bias[0m", shape: (8192,), dtype: float16 |
|
3%|ββββ | 8/245 [00:02<00:55, 4.29it/s]
[2023-12-28 23:33:28] INFO group_quantization.py:200: Compiling quantize function for key: (2048, 8192, 'float16', 'cuda') |
|
3%|ββββ | 8/245 [00:02<00:55, 4.29it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.0.mlp.fc2.q_weight[0m", shape: (2048, 1024), dtype: uint32 |
|
3%|ββββ | 8/245 [00:02<00:55, 4.29it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.0.mlp.fc2.q_scale[0m", shape: (2048, 256), dtype: float16 |
|
3%|ββββ | 8/245 [00:02<00:55, 4.29it/s]
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.0.mlp.fc2.bias[0m", shape: (2048,), dtype: float16 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.1.ln.weight[0m", shape: (2048,), dtype: float16 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.1.ln.bias[0m", shape: (2048,), dtype: float16 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.1.mixer.Wqkv.q_weight[0m", shape: (6144, 256), dtype: uint32 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.1.mixer.Wqkv.q_scale[0m", shape: (6144, 64), dtype: float16 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.1.mixer.Wqkv.bias[0m", shape: (6144,), dtype: float16 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.1.mixer.out_proj.q_weight[0m", shape: (2048, 256), dtype: uint32 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.1.mixer.out_proj.q_scale[0m", shape: (2048, 64), dtype: float16 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.1.mixer.out_proj.bias[0m", shape: (2048,), dtype: float16 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.1.mlp.fc1.q_weight[0m", shape: (8192, 256), dtype: uint32 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.1.mlp.fc1.q_scale[0m", shape: (8192, 64), dtype: float16 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.1.mlp.fc1.bias[0m", shape: (8192,), dtype: float16 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.1.mlp.fc2.q_weight[0m", shape: (2048, 1024), dtype: uint32 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.1.mlp.fc2.q_scale[0m", shape: (2048, 256), dtype: float16 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.1.mlp.fc2.bias[0m", shape: (2048,), dtype: float16 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.2.ln.weight[0m", shape: (2048,), dtype: float16 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.2.ln.bias[0m", shape: (2048,), dtype: float16 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.2.mixer.Wqkv.q_weight[0m", shape: (6144, 256), dtype: uint32 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.2.mixer.Wqkv.q_scale[0m", shape: (6144, 64), dtype: float16 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.2.mixer.Wqkv.bias[0m", shape: (6144,), dtype: float16 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.2.mixer.out_proj.q_weight[0m", shape: (2048, 256), dtype: uint32 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.2.mixer.out_proj.q_scale[0m", shape: (2048, 64), dtype: float16 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.2.mixer.out_proj.bias[0m", shape: (2048,), dtype: float16 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.2.mlp.fc1.q_weight[0m", shape: (8192, 256), dtype: uint32 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.2.mlp.fc1.q_scale[0m", shape: (8192, 64), dtype: float16 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.2.mlp.fc1.bias[0m", shape: (8192,), dtype: float16 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.2.mlp.fc2.q_weight[0m", shape: (2048, 1024), dtype: uint32 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.2.mlp.fc2.q_scale[0m", shape: (2048, 256), dtype: float16 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.2.mlp.fc2.bias[0m", shape: (2048,), dtype: float16 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.3.ln.weight[0m", shape: (2048,), dtype: float16 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.3.ln.bias[0m", shape: (2048,), dtype: float16 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.3.mixer.Wqkv.q_weight[0m", shape: (6144, 256), dtype: uint32 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.3.mixer.Wqkv.q_scale[0m", shape: (6144, 64), dtype: float16 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.3.mixer.Wqkv.bias[0m", shape: (6144,), dtype: float16 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.3.mixer.out_proj.q_weight[0m", shape: (2048, 256), dtype: uint32 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.3.mixer.out_proj.q_scale[0m", shape: (2048, 64), dtype: float16 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.3.mixer.out_proj.bias[0m", shape: (2048,), dtype: float16 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.3.mlp.fc1.q_weight[0m", shape: (8192, 256), dtype: uint32 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.3.mlp.fc1.q_scale[0m", shape: (8192, 64), dtype: float16 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.3.mlp.fc1.bias[0m", shape: (8192,), dtype: float16 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.3.mlp.fc2.q_weight[0m", shape: (2048, 1024), dtype: uint32 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.3.mlp.fc2.q_scale[0m", shape: (2048, 256), dtype: float16 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.3.mlp.fc2.bias[0m", shape: (2048,), dtype: float16 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.4.ln.weight[0m", shape: (2048,), dtype: float16 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.4.ln.bias[0m", shape: (2048,), dtype: float16 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.4.mixer.Wqkv.q_weight[0m", shape: (6144, 256), dtype: uint32 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.4.mixer.Wqkv.q_scale[0m", shape: (6144, 64), dtype: float16 |
|
4%|βββββ | 10/245 [00:02<00:51, 4.59it/s]
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.4.mixer.Wqkv.bias[0m", shape: (6144,), dtype: float16 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.4.mixer.out_proj.q_weight[0m", shape: (2048, 256), dtype: uint32 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.4.mixer.out_proj.q_scale[0m", shape: (2048, 64), dtype: float16 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.4.mixer.out_proj.bias[0m", shape: (2048,), dtype: float16 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.4.mlp.fc1.q_weight[0m", shape: (8192, 256), dtype: uint32 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.4.mlp.fc1.q_scale[0m", shape: (8192, 64), dtype: float16 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.4.mlp.fc1.bias[0m", shape: (8192,), dtype: float16 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.4.mlp.fc2.q_weight[0m", shape: (2048, 1024), dtype: uint32 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.4.mlp.fc2.q_scale[0m", shape: (2048, 256), dtype: float16 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.4.mlp.fc2.bias[0m", shape: (2048,), dtype: float16 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.5.ln.weight[0m", shape: (2048,), dtype: float16 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.5.ln.bias[0m", shape: (2048,), dtype: float16 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.5.mixer.Wqkv.q_weight[0m", shape: (6144, 256), dtype: uint32 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.5.mixer.Wqkv.q_scale[0m", shape: (6144, 64), dtype: float16 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.5.mixer.Wqkv.bias[0m", shape: (6144,), dtype: float16 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.5.mixer.out_proj.q_weight[0m", shape: (2048, 256), dtype: uint32 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.5.mixer.out_proj.q_scale[0m", shape: (2048, 64), dtype: float16 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.5.mixer.out_proj.bias[0m", shape: (2048,), dtype: float16 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.5.mlp.fc1.q_weight[0m", shape: (8192, 256), dtype: uint32 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.5.mlp.fc1.q_scale[0m", shape: (8192, 64), dtype: float16 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.5.mlp.fc1.bias[0m", shape: (8192,), dtype: float16 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.5.mlp.fc2.q_weight[0m", shape: (2048, 1024), dtype: uint32 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.5.mlp.fc2.q_scale[0m", shape: (2048, 256), dtype: float16 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.5.mlp.fc2.bias[0m", shape: (2048,), dtype: float16 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.6.ln.weight[0m", shape: (2048,), dtype: float16 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.6.ln.bias[0m", shape: (2048,), dtype: float16 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.6.mixer.Wqkv.q_weight[0m", shape: (6144, 256), dtype: uint32 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.6.mixer.Wqkv.q_scale[0m", shape: (6144, 64), dtype: float16 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.6.mixer.Wqkv.bias[0m", shape: (6144,), dtype: float16 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.6.mixer.out_proj.q_weight[0m", shape: (2048, 256), dtype: uint32 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.6.mixer.out_proj.q_scale[0m", shape: (2048, 64), dtype: float16 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.6.mixer.out_proj.bias[0m", shape: (2048,), dtype: float16 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.6.mlp.fc1.q_weight[0m", shape: (8192, 256), dtype: uint32 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.6.mlp.fc1.q_scale[0m", shape: (8192, 64), dtype: float16 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.6.mlp.fc1.bias[0m", shape: (8192,), dtype: float16 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.6.mlp.fc2.q_weight[0m", shape: (2048, 1024), dtype: uint32 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.6.mlp.fc2.q_scale[0m", shape: (2048, 256), dtype: float16 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.6.mlp.fc2.bias[0m", shape: (2048,), dtype: float16 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.7.ln.weight[0m", shape: (2048,), dtype: float16 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.7.ln.bias[0m", shape: (2048,), dtype: float16 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.7.mixer.Wqkv.q_weight[0m", shape: (6144, 256), dtype: uint32 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.7.mixer.Wqkv.q_scale[0m", shape: (6144, 64), dtype: float16 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.7.mixer.Wqkv.bias[0m", shape: (6144,), dtype: float16 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.7.mixer.out_proj.q_weight[0m", shape: (2048, 256), dtype: uint32 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.7.mixer.out_proj.q_scale[0m", shape: (2048, 64), dtype: float16 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.7.mixer.out_proj.bias[0m", shape: (2048,), dtype: float16 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.7.mlp.fc1.q_weight[0m", shape: (8192, 256), dtype: uint32 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.7.mlp.fc1.q_scale[0m", shape: (8192, 64), dtype: float16 |
|
18%|ββββββββββββββββββββββ | 44/245 [00:02<00:05, 39.49it/s]
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.7.mlp.fc1.bias[0m", shape: (8192,), dtype: float16 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.7.mlp.fc2.q_weight[0m", shape: (2048, 1024), dtype: uint32 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.7.mlp.fc2.q_scale[0m", shape: (2048, 256), dtype: float16 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.7.mlp.fc2.bias[0m", shape: (2048,), dtype: float16 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.8.ln.weight[0m", shape: (2048,), dtype: float16 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.8.ln.bias[0m", shape: (2048,), dtype: float16 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.8.mixer.Wqkv.q_weight[0m", shape: (6144, 256), dtype: uint32 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.8.mixer.Wqkv.q_scale[0m", shape: (6144, 64), dtype: float16 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.8.mixer.Wqkv.bias[0m", shape: (6144,), dtype: float16 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.8.mixer.out_proj.q_weight[0m", shape: (2048, 256), dtype: uint32 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.8.mixer.out_proj.q_scale[0m", shape: (2048, 64), dtype: float16 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.8.mixer.out_proj.bias[0m", shape: (2048,), dtype: float16 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.8.mlp.fc1.q_weight[0m", shape: (8192, 256), dtype: uint32 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.8.mlp.fc1.q_scale[0m", shape: (8192, 64), dtype: float16 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.8.mlp.fc1.bias[0m", shape: (8192,), dtype: float16 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.8.mlp.fc2.q_weight[0m", shape: (2048, 1024), dtype: uint32 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.8.mlp.fc2.q_scale[0m", shape: (2048, 256), dtype: float16 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.8.mlp.fc2.bias[0m", shape: (2048,), dtype: float16 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.9.ln.weight[0m", shape: (2048,), dtype: float16 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.9.ln.bias[0m", shape: (2048,), dtype: float16 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.9.mixer.Wqkv.q_weight[0m", shape: (6144, 256), dtype: uint32 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.9.mixer.Wqkv.q_scale[0m", shape: (6144, 64), dtype: float16 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.9.mixer.Wqkv.bias[0m", shape: (6144,), dtype: float16 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.9.mixer.out_proj.q_weight[0m", shape: (2048, 256), dtype: uint32 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.9.mixer.out_proj.q_scale[0m", shape: (2048, 64), dtype: float16 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.9.mixer.out_proj.bias[0m", shape: (2048,), dtype: float16 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.9.mlp.fc1.q_weight[0m", shape: (8192, 256), dtype: uint32 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.9.mlp.fc1.q_scale[0m", shape: (8192, 64), dtype: float16 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.9.mlp.fc1.bias[0m", shape: (8192,), dtype: float16 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.9.mlp.fc2.q_weight[0m", shape: (2048, 1024), dtype: uint32 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.9.mlp.fc2.q_scale[0m", shape: (2048, 256), dtype: float16 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.9.mlp.fc2.bias[0m", shape: (2048,), dtype: float16 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.10.ln.weight[0m", shape: (2048,), dtype: float16 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.10.ln.bias[0m", shape: (2048,), dtype: float16 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.10.mixer.Wqkv.q_weight[0m", shape: (6144, 256), dtype: uint32 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.10.mixer.Wqkv.q_scale[0m", shape: (6144, 64), dtype: float16 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.10.mixer.Wqkv.bias[0m", shape: (6144,), dtype: float16 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.10.mixer.out_proj.q_weight[0m", shape: (2048, 256), dtype: uint32 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.10.mixer.out_proj.q_scale[0m", shape: (2048, 64), dtype: float16 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.10.mixer.out_proj.bias[0m", shape: (2048,), dtype: float16 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.10.mlp.fc1.q_weight[0m", shape: (8192, 256), dtype: uint32 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.10.mlp.fc1.q_scale[0m", shape: (8192, 64), dtype: float16 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.10.mlp.fc1.bias[0m", shape: (8192,), dtype: float16 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.10.mlp.fc2.q_weight[0m", shape: (2048, 1024), dtype: uint32 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.10.mlp.fc2.q_scale[0m", shape: (2048, 256), dtype: float16 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.10.mlp.fc2.bias[0m", shape: (2048,), dtype: float16 |
|
32%|βββββββββββββββββββββββββββββββββββββββ | 78/245 [00:02<00:02, 78.26it/s]
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:02<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.11.ln.weight[0m", shape: (2048,), dtype: float16 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:02<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.11.ln.bias[0m", shape: (2048,), dtype: float16 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:02<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.11.mixer.Wqkv.q_weight[0m", shape: (6144, 256), dtype: uint32 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:02<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.11.mixer.Wqkv.q_scale[0m", shape: (6144, 64), dtype: float16 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:02<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.11.mixer.Wqkv.bias[0m", shape: (6144,), dtype: float16 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:02<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.11.mixer.out_proj.q_weight[0m", shape: (2048, 256), dtype: uint32 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:02<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.11.mixer.out_proj.q_scale[0m", shape: (2048, 64), dtype: float16 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:02<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.11.mixer.out_proj.bias[0m", shape: (2048,), dtype: float16 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:02<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.11.mlp.fc1.q_weight[0m", shape: (8192, 256), dtype: uint32 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:02<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.11.mlp.fc1.q_scale[0m", shape: (8192, 64), dtype: float16 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:02<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.11.mlp.fc1.bias[0m", shape: (8192,), dtype: float16 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:02<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.11.mlp.fc2.q_weight[0m", shape: (2048, 1024), dtype: uint32 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:02<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.11.mlp.fc2.q_scale[0m", shape: (2048, 256), dtype: float16 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:02<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.11.mlp.fc2.bias[0m", shape: (2048,), dtype: float16 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:02<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.12.ln.weight[0m", shape: (2048,), dtype: float16 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:02<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.12.ln.bias[0m", shape: (2048,), dtype: float16 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:02<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.12.mixer.Wqkv.q_weight[0m", shape: (6144, 256), dtype: uint32 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:02<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.12.mixer.Wqkv.q_scale[0m", shape: (6144, 64), dtype: float16 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:02<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.12.mixer.Wqkv.bias[0m", shape: (6144,), dtype: float16 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:02<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.12.mixer.out_proj.q_weight[0m", shape: (2048, 256), dtype: uint32 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:03<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.12.mixer.out_proj.q_scale[0m", shape: (2048, 64), dtype: float16 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:03<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.12.mixer.out_proj.bias[0m", shape: (2048,), dtype: float16 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:03<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.12.mlp.fc1.q_weight[0m", shape: (8192, 256), dtype: uint32 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:03<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.12.mlp.fc1.q_scale[0m", shape: (8192, 64), dtype: float16 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:03<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.12.mlp.fc1.bias[0m", shape: (8192,), dtype: float16 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:03<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.12.mlp.fc2.q_weight[0m", shape: (2048, 1024), dtype: uint32 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:03<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.12.mlp.fc2.q_scale[0m", shape: (2048, 256), dtype: float16 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:03<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.12.mlp.fc2.bias[0m", shape: (2048,), dtype: float16 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:03<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.13.ln.weight[0m", shape: (2048,), dtype: float16 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:03<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.13.ln.bias[0m", shape: (2048,), dtype: float16 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:03<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.13.mixer.Wqkv.q_weight[0m", shape: (6144, 256), dtype: uint32 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:03<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.13.mixer.Wqkv.q_scale[0m", shape: (6144, 64), dtype: float16 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:03<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.13.mixer.Wqkv.bias[0m", shape: (6144,), dtype: float16 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:03<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.13.mixer.out_proj.q_weight[0m", shape: (2048, 256), dtype: uint32 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:03<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.13.mixer.out_proj.q_scale[0m", shape: (2048, 64), dtype: float16 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:03<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.13.mixer.out_proj.bias[0m", shape: (2048,), dtype: float16 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:03<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.13.mlp.fc1.q_weight[0m", shape: (8192, 256), dtype: uint32 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:03<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.13.mlp.fc1.q_scale[0m", shape: (8192, 64), dtype: float16 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:03<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.13.mlp.fc1.bias[0m", shape: (8192,), dtype: float16 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:03<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.13.mlp.fc2.q_weight[0m", shape: (2048, 1024), dtype: uint32 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:03<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.13.mlp.fc2.q_scale[0m", shape: (2048, 256), dtype: float16 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:03<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.13.mlp.fc2.bias[0m", shape: (2048,), dtype: float16 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:03<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.14.ln.weight[0m", shape: (2048,), dtype: float16 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:03<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.14.ln.bias[0m", shape: (2048,), dtype: float16 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:03<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.14.mixer.Wqkv.q_weight[0m", shape: (6144, 256), dtype: uint32 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:03<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.14.mixer.Wqkv.q_scale[0m", shape: (6144, 64), dtype: float16 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:03<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.14.mixer.Wqkv.bias[0m", shape: (6144,), dtype: float16 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:03<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.14.mixer.out_proj.q_weight[0m", shape: (2048, 256), dtype: uint32 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:03<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.14.mixer.out_proj.q_scale[0m", shape: (2048, 64), dtype: float16 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:03<00:01, 117.47it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.14.mixer.out_proj.bias[0m", shape: (2048,), dtype: float16 |
|
45%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 111/245 [00:03<00:01, 117.47it/s]
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.14.mlp.fc1.q_weight[0m", shape: (8192, 256), dtype: uint32 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.14.mlp.fc1.q_scale[0m", shape: (8192, 64), dtype: float16 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.14.mlp.fc1.bias[0m", shape: (8192,), dtype: float16 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.14.mlp.fc2.q_weight[0m", shape: (2048, 1024), dtype: uint32 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.14.mlp.fc2.q_scale[0m", shape: (2048, 256), dtype: float16 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.14.mlp.fc2.bias[0m", shape: (2048,), dtype: float16 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.15.ln.weight[0m", shape: (2048,), dtype: float16 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.15.ln.bias[0m", shape: (2048,), dtype: float16 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.15.mixer.Wqkv.q_weight[0m", shape: (6144, 256), dtype: uint32 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.15.mixer.Wqkv.q_scale[0m", shape: (6144, 64), dtype: float16 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.15.mixer.Wqkv.bias[0m", shape: (6144,), dtype: float16 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.15.mixer.out_proj.q_weight[0m", shape: (2048, 256), dtype: uint32 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.15.mixer.out_proj.q_scale[0m", shape: (2048, 64), dtype: float16 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.15.mixer.out_proj.bias[0m", shape: (2048,), dtype: float16 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.15.mlp.fc1.q_weight[0m", shape: (8192, 256), dtype: uint32 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.15.mlp.fc1.q_scale[0m", shape: (8192, 64), dtype: float16 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.15.mlp.fc1.bias[0m", shape: (8192,), dtype: float16 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.15.mlp.fc2.q_weight[0m", shape: (2048, 1024), dtype: uint32 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.15.mlp.fc2.q_scale[0m", shape: (2048, 256), dtype: float16 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.15.mlp.fc2.bias[0m", shape: (2048,), dtype: float16 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.16.ln.weight[0m", shape: (2048,), dtype: float16 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.16.ln.bias[0m", shape: (2048,), dtype: float16 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.16.mixer.Wqkv.q_weight[0m", shape: (6144, 256), dtype: uint32 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.16.mixer.Wqkv.q_scale[0m", shape: (6144, 64), dtype: float16 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.16.mixer.Wqkv.bias[0m", shape: (6144,), dtype: float16 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.16.mixer.out_proj.q_weight[0m", shape: (2048, 256), dtype: uint32 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.16.mixer.out_proj.q_scale[0m", shape: (2048, 64), dtype: float16 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.16.mixer.out_proj.bias[0m", shape: (2048,), dtype: float16 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.16.mlp.fc1.q_weight[0m", shape: (8192, 256), dtype: uint32 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.16.mlp.fc1.q_scale[0m", shape: (8192, 64), dtype: float16 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.16.mlp.fc1.bias[0m", shape: (8192,), dtype: float16 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.16.mlp.fc2.q_weight[0m", shape: (2048, 1024), dtype: uint32 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.16.mlp.fc2.q_scale[0m", shape: (2048, 256), dtype: float16 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.16.mlp.fc2.bias[0m", shape: (2048,), dtype: float16 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.17.ln.weight[0m", shape: (2048,), dtype: float16 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.17.ln.bias[0m", shape: (2048,), dtype: float16 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.17.mixer.Wqkv.q_weight[0m", shape: (6144, 256), dtype: uint32 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.17.mixer.Wqkv.q_scale[0m", shape: (6144, 64), dtype: float16 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.17.mixer.Wqkv.bias[0m", shape: (6144,), dtype: float16 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.17.mixer.out_proj.q_weight[0m", shape: (2048, 256), dtype: uint32 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.17.mixer.out_proj.q_scale[0m", shape: (2048, 64), dtype: float16 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.17.mixer.out_proj.bias[0m", shape: (2048,), dtype: float16 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.17.mlp.fc1.q_weight[0m", shape: (8192, 256), dtype: uint32 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.17.mlp.fc1.q_scale[0m", shape: (8192, 64), dtype: float16 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.17.mlp.fc1.bias[0m", shape: (8192,), dtype: float16 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.17.mlp.fc2.q_weight[0m", shape: (2048, 1024), dtype: uint32 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.17.mlp.fc2.q_scale[0m", shape: (2048, 256), dtype: float16 |
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 147/245 [00:03<00:00, 161.54it/s]
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.17.mlp.fc2.bias[0m", shape: (2048,), dtype: float16 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.18.ln.weight[0m", shape: (2048,), dtype: float16 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.18.ln.bias[0m", shape: (2048,), dtype: float16 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.18.mixer.Wqkv.q_weight[0m", shape: (6144, 256), dtype: uint32 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.18.mixer.Wqkv.q_scale[0m", shape: (6144, 64), dtype: float16 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.18.mixer.Wqkv.bias[0m", shape: (6144,), dtype: float16 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.18.mixer.out_proj.q_weight[0m", shape: (2048, 256), dtype: uint32 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.18.mixer.out_proj.q_scale[0m", shape: (2048, 64), dtype: float16 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.18.mixer.out_proj.bias[0m", shape: (2048,), dtype: float16 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.18.mlp.fc1.q_weight[0m", shape: (8192, 256), dtype: uint32 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.18.mlp.fc1.q_scale[0m", shape: (8192, 64), dtype: float16 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.18.mlp.fc1.bias[0m", shape: (8192,), dtype: float16 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.18.mlp.fc2.q_weight[0m", shape: (2048, 1024), dtype: uint32 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.18.mlp.fc2.q_scale[0m", shape: (2048, 256), dtype: float16 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.18.mlp.fc2.bias[0m", shape: (2048,), dtype: float16 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.19.ln.weight[0m", shape: (2048,), dtype: float16 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.19.ln.bias[0m", shape: (2048,), dtype: float16 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.19.mixer.Wqkv.q_weight[0m", shape: (6144, 256), dtype: uint32 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.19.mixer.Wqkv.q_scale[0m", shape: (6144, 64), dtype: float16 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.19.mixer.Wqkv.bias[0m", shape: (6144,), dtype: float16 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.19.mixer.out_proj.q_weight[0m", shape: (2048, 256), dtype: uint32 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.19.mixer.out_proj.q_scale[0m", shape: (2048, 64), dtype: float16 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.19.mixer.out_proj.bias[0m", shape: (2048,), dtype: float16 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.19.mlp.fc1.q_weight[0m", shape: (8192, 256), dtype: uint32 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.19.mlp.fc1.q_scale[0m", shape: (8192, 64), dtype: float16 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.19.mlp.fc1.bias[0m", shape: (8192,), dtype: float16 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.19.mlp.fc2.q_weight[0m", shape: (2048, 1024), dtype: uint32 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.19.mlp.fc2.q_scale[0m", shape: (2048, 256), dtype: float16 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.19.mlp.fc2.bias[0m", shape: (2048,), dtype: float16 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.20.ln.weight[0m", shape: (2048,), dtype: float16 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.20.ln.bias[0m", shape: (2048,), dtype: float16 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.20.mixer.Wqkv.q_weight[0m", shape: (6144, 256), dtype: uint32 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.20.mixer.Wqkv.q_scale[0m", shape: (6144, 64), dtype: float16 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.20.mixer.Wqkv.bias[0m", shape: (6144,), dtype: float16 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.20.mixer.out_proj.q_weight[0m", shape: (2048, 256), dtype: uint32 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.20.mixer.out_proj.q_scale[0m", shape: (2048, 64), dtype: float16 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.20.mixer.out_proj.bias[0m", shape: (2048,), dtype: float16 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.20.mlp.fc1.q_weight[0m", shape: (8192, 256), dtype: uint32 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:28] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.20.mlp.fc1.q_scale[0m", shape: (8192, 64), dtype: float16 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.20.mlp.fc1.bias[0m", shape: (8192,), dtype: float16 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.20.mlp.fc2.q_weight[0m", shape: (2048, 1024), dtype: uint32 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.20.mlp.fc2.q_scale[0m", shape: (2048, 256), dtype: float16 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.20.mlp.fc2.bias[0m", shape: (2048,), dtype: float16 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.21.ln.weight[0m", shape: (2048,), dtype: float16 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.21.ln.bias[0m", shape: (2048,), dtype: float16 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.21.mixer.Wqkv.q_weight[0m", shape: (6144, 256), dtype: uint32 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.21.mixer.Wqkv.q_scale[0m", shape: (6144, 64), dtype: float16 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.21.mixer.Wqkv.bias[0m", shape: (6144,), dtype: float16 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.21.mixer.out_proj.q_weight[0m", shape: (2048, 256), dtype: uint32 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.21.mixer.out_proj.q_scale[0m", shape: (2048, 64), dtype: float16 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.21.mixer.out_proj.bias[0m", shape: (2048,), dtype: float16 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.21.mlp.fc1.q_weight[0m", shape: (8192, 256), dtype: uint32 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.21.mlp.fc1.q_scale[0m", shape: (8192, 64), dtype: float16 |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 180/245 [00:03<00:00, 192.97it/s]
89%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 218/245 [00:03<00:00, 230.43it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.21.mlp.fc1.bias[0m", shape: (8192,), dtype: float16 |
|
89%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 218/245 [00:03<00:00, 230.43it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.21.mlp.fc2.q_weight[0m", shape: (2048, 1024), dtype: uint32 |
|
89%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 218/245 [00:03<00:00, 230.43it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.21.mlp.fc2.q_scale[0m", shape: (2048, 256), dtype: float16 |
|
89%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 218/245 [00:03<00:00, 230.43it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.21.mlp.fc2.bias[0m", shape: (2048,), dtype: float16 |
|
89%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 218/245 [00:03<00:00, 230.43it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.22.ln.weight[0m", shape: (2048,), dtype: float16 |
|
89%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 218/245 [00:03<00:00, 230.43it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.22.ln.bias[0m", shape: (2048,), dtype: float16 |
|
89%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 218/245 [00:03<00:00, 230.43it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.22.mixer.Wqkv.q_weight[0m", shape: (6144, 256), dtype: uint32 |
|
89%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 218/245 [00:03<00:00, 230.43it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.22.mixer.Wqkv.q_scale[0m", shape: (6144, 64), dtype: float16 |
|
89%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 218/245 [00:03<00:00, 230.43it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.22.mixer.Wqkv.bias[0m", shape: (6144,), dtype: float16 |
|
89%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 218/245 [00:03<00:00, 230.43it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.22.mixer.out_proj.q_weight[0m", shape: (2048, 256), dtype: uint32 |
|
89%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 218/245 [00:03<00:00, 230.43it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.22.mixer.out_proj.q_scale[0m", shape: (2048, 64), dtype: float16 |
|
89%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 218/245 [00:03<00:00, 230.43it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.22.mixer.out_proj.bias[0m", shape: (2048,), dtype: float16 |
|
89%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 218/245 [00:03<00:00, 230.43it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.22.mlp.fc1.q_weight[0m", shape: (8192, 256), dtype: uint32 |
|
89%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 218/245 [00:03<00:00, 230.43it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.22.mlp.fc1.q_scale[0m", shape: (8192, 64), dtype: float16 |
|
89%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 218/245 [00:03<00:00, 230.43it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.22.mlp.fc1.bias[0m", shape: (8192,), dtype: float16 |
|
89%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 218/245 [00:03<00:00, 230.43it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.22.mlp.fc2.q_weight[0m", shape: (2048, 1024), dtype: uint32 |
|
89%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 218/245 [00:03<00:00, 230.43it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.22.mlp.fc2.q_scale[0m", shape: (2048, 256), dtype: float16 |
|
89%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 218/245 [00:03<00:00, 230.43it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.22.mlp.fc2.bias[0m", shape: (2048,), dtype: float16 |
|
89%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 218/245 [00:03<00:00, 230.43it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.23.ln.weight[0m", shape: (2048,), dtype: float16 |
|
89%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 218/245 [00:03<00:00, 230.43it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.23.ln.bias[0m", shape: (2048,), dtype: float16 |
|
89%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 218/245 [00:03<00:00, 230.43it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.23.mixer.Wqkv.q_weight[0m", shape: (6144, 256), dtype: uint32 |
|
89%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 218/245 [00:03<00:00, 230.43it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.23.mixer.Wqkv.q_scale[0m", shape: (6144, 64), dtype: float16 |
|
89%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 218/245 [00:03<00:00, 230.43it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.23.mixer.Wqkv.bias[0m", shape: (6144,), dtype: float16 |
|
89%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 218/245 [00:03<00:00, 230.43it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.23.mixer.out_proj.q_weight[0m", shape: (2048, 256), dtype: uint32 |
|
89%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 218/245 [00:03<00:00, 230.43it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.23.mixer.out_proj.q_scale[0m", shape: (2048, 64), dtype: float16 |
|
89%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 218/245 [00:03<00:00, 230.43it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.23.mixer.out_proj.bias[0m", shape: (2048,), dtype: float16 |
|
89%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 218/245 [00:03<00:00, 230.43it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.23.mlp.fc1.q_weight[0m", shape: (8192, 256), dtype: uint32 |
|
89%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 218/245 [00:03<00:00, 230.43it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.23.mlp.fc1.q_scale[0m", shape: (8192, 64), dtype: float16 |
|
89%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 218/245 [00:03<00:00, 230.43it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.23.mlp.fc1.bias[0m", shape: (8192,), dtype: float16 |
|
89%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 218/245 [00:03<00:00, 230.43it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.23.mlp.fc2.q_weight[0m", shape: (2048, 1024), dtype: uint32 |
|
89%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 218/245 [00:03<00:00, 230.43it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mtransformer.h.23.mlp.fc2.q_scale[0m", shape: (2048, 256), dtype: float16 |
|
89%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 218/245 [00:03<00:00, 230.43it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mtransformer.h.23.mlp.fc2.bias[0m", shape: (2048,), dtype: float16 |
|
89%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 218/245 [00:03<00:00, 230.43it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mlm_head.ln.weight[0m", shape: (2048,), dtype: float16 |
|
89%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 218/245 [00:03<00:00, 230.43it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mlm_head.ln.bias[0m", shape: (2048,), dtype: float16 |
|
89%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 218/245 [00:03<00:00, 230.43it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mlm_head.linear.q_weight[0m", shape: (51200, 256), dtype: uint32 |
|
89%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 218/245 [00:03<00:00, 230.43it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:121: [Quantized] Parameter: "[1mlm_head.linear.q_scale[0m", shape: (51200, 64), dtype: float16 |
|
89%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 218/245 [00:03<00:00, 230.43it/s]
[2023-12-28 23:33:29] INFO huggingface_loader.py:129: [Not quantized] Parameter: "[1mlm_head.linear.bias[0m", shape: (51200,), dtype: float16 |
|
89%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 218/245 [00:03<00:00, 230.43it/s]
100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 245/245 [00:03<00:00, 71.86it/s] |
|
[2023-12-28 23:33:29] INFO huggingface_loader.py:179: Unloading HF weight file: /ssd1/cfruan/mlc-llm-repos/mlc-llm-head/dist/models/phi-1_5/pytorch_model.bin |
|
[2023-12-28 23:33:29] INFO stats.py:71: [92mTime usage[0m: HF loading: 1.868 sec; Pre-quantization mapping: 0.359 sec; Quantization: 2.522 sec |
|
[2023-12-28 23:33:29] INFO stats.py:85: [92mRAM usage[0m: Peak RAM: 2.642 GB. Total bytes loaded from disk: 2.642 GB |
|
[2023-12-28 23:33:29] INFO convert_weight.py:110: [92mParameter size[0m after quantization: 0.744 GB |
|
[2023-12-28 23:33:29] INFO convert_weight.py:115: [92mTotal parameters[0m: 1,418,270,720 |
|
[2023-12-28 23:33:29] INFO convert_weight.py:116: [92mBits per parameter[0m: 4.505 |
|
Start storing to cache /tmp/tmpxe445xtc |
|
[0001/0343] saving transformer.embd.q_weight
[0002/0343] saving transformer.embd.q_scale
[0003/0343] saving transformer.h.0.ln.weight
[0004/0343] saving transformer.h.0.ln.bias
[0005/0343] saving transformer.h.0.mixer.Wqkv.q_weight
[0006/0343] saving transformer.h.0.mixer.Wqkv.q_scale
[0007/0343] saving transformer.h.0.mixer.Wqkv.bias
[0008/0343] saving transformer.h.0.mixer.out_proj.q_weight
[0009/0343] saving transformer.h.0.mixer.out_proj.q_scale
[0010/0343] saving transformer.h.0.mixer.out_proj.bias
[0011/0343] saving transformer.h.0.mlp.fc1.q_weight
[0012/0343] saving transformer.h.0.mlp.fc1.q_scale
[0013/0343] saving transformer.h.0.mlp.fc1.bias
[0014/0343] saving transformer.h.0.mlp.fc2.q_weight
[0015/0343] saving transformer.h.0.mlp.fc2.q_scale
[0016/0343] saving transformer.h.0.mlp.fc2.bias
[0017/0343] saving transformer.h.1.ln.weight
[0018/0343] saving transformer.h.1.ln.bias
[0019/0343] saving transformer.h.1.mixer.Wqkv.q_weight
[0020/0343] saving transformer.h.1.mixer.Wqkv.q_scale
[0021/0343] saving transformer.h.1.mixer.Wqkv.bias
[0022/0343] saving transformer.h.1.mixer.out_proj.q_weight
[0023/0343] saving transformer.h.1.mixer.out_proj.q_scale
[0024/0343] saving transformer.h.1.mixer.out_proj.bias
[0025/0343] saving transformer.h.1.mlp.fc1.q_weight
[0026/0343] saving transformer.h.1.mlp.fc1.q_scale
[0027/0343] saving transformer.h.1.mlp.fc1.bias
[0028/0343] saving transformer.h.1.mlp.fc2.q_weight
[0029/0343] saving transformer.h.1.mlp.fc2.q_scale
[0030/0343] saving transformer.h.1.mlp.fc2.bias
[0031/0343] saving transformer.h.2.ln.weight
[0032/0343] saving transformer.h.2.ln.bias
[0033/0343] saving transformer.h.2.mixer.Wqkv.q_weight
[0034/0343] saving transformer.h.2.mixer.Wqkv.q_scale
[0035/0343] saving transformer.h.2.mixer.Wqkv.bias
[0036/0343] saving transformer.h.2.mixer.out_proj.q_weight
[0037/0343] saving transformer.h.2.mixer.out_proj.q_scale
[0038/0343] saving transformer.h.2.mixer.out_proj.bias
[0039/0343] saving transformer.h.2.mlp.fc1.q_weight
[0040/0343] saving transformer.h.2.mlp.fc1.q_scale
[0041/0343] saving transformer.h.2.mlp.fc1.bias
[0042/0343] saving transformer.h.2.mlp.fc2.q_weight
[0043/0343] saving transformer.h.2.mlp.fc2.q_scale
[0044/0343] saving transformer.h.2.mlp.fc2.bias
[0045/0343] saving transformer.h.3.ln.weight
[0046/0343] saving transformer.h.3.ln.bias
[0047/0343] saving transformer.h.3.mixer.Wqkv.q_weight
[0048/0343] saving transformer.h.3.mixer.Wqkv.q_scale
[0049/0343] saving transformer.h.3.mixer.Wqkv.bias
[0050/0343] saving transformer.h.3.mixer.out_proj.q_weight
[0051/0343] saving transformer.h.3.mixer.out_proj.q_scale
[0052/0343] saving transformer.h.3.mixer.out_proj.bias
[0053/0343] saving transformer.h.3.mlp.fc1.q_weight
[0054/0343] saving transformer.h.3.mlp.fc1.q_scale
[0055/0343] saving transformer.h.3.mlp.fc1.bias
[0056/0343] saving transformer.h.3.mlp.fc2.q_weight
[0057/0343] saving transformer.h.3.mlp.fc2.q_scale
[0058/0343] saving transformer.h.3.mlp.fc2.bias
[0059/0343] saving transformer.h.4.ln.weight
[0060/0343] saving transformer.h.4.ln.bias
[0061/0343] saving transformer.h.4.mixer.Wqkv.q_weight
[0062/0343] saving transformer.h.4.mixer.Wqkv.q_scale
[0063/0343] saving transformer.h.4.mixer.Wqkv.bias
[0064/0343] saving transformer.h.4.mixer.out_proj.q_weight
[0065/0343] saving transformer.h.4.mixer.out_proj.q_scale
[0066/0343] saving transformer.h.4.mixer.out_proj.bias
[0067/0343] saving transformer.h.4.mlp.fc1.q_weight
[0068/0343] saving transformer.h.4.mlp.fc1.q_scale
[0069/0343] saving transformer.h.4.mlp.fc1.bias
[0070/0343] saving transformer.h.4.mlp.fc2.q_weight
[0071/0343] saving transformer.h.4.mlp.fc2.q_scale
[0072/0343] saving transformer.h.4.mlp.fc2.bias
[0073/0343] saving transformer.h.5.ln.weight
[0074/0343] saving transformer.h.5.ln.bias
[0075/0343] saving transformer.h.5.mixer.Wqkv.q_weight
[0076/0343] saving transformer.h.5.mixer.Wqkv.q_scale
[0077/0343] saving transformer.h.5.mixer.Wqkv.bias
[0078/0343] saving transformer.h.5.mixer.out_proj.q_weight
[0079/0343] saving transformer.h.5.mixer.out_proj.q_scale
[0080/0343] saving transformer.h.5.mixer.out_proj.bias
[0081/0343] saving transformer.h.5.mlp.fc1.q_weight
[0082/0343] saving transformer.h.5.mlp.fc1.q_scale
[0083/0343] saving transformer.h.5.mlp.fc1.bias
[0084/0343] saving transformer.h.5.mlp.fc2.q_weight
[0085/0343] saving transformer.h.5.mlp.fc2.q_scale
[0086/0343] saving transformer.h.5.mlp.fc2.bias
[0087/0343] saving transformer.h.6.ln.weight
[0088/0343] saving transformer.h.6.ln.bias
[0089/0343] saving transformer.h.6.mixer.Wqkv.q_weight
[0090/0343] saving transformer.h.6.mixer.Wqkv.q_scale
[0091/0343] saving transformer.h.6.mixer.Wqkv.bias
[0092/0343] saving transformer.h.6.mixer.out_proj.q_weight
[0093/0343] saving transformer.h.6.mixer.out_proj.q_scale
[0094/0343] saving transformer.h.6.mixer.out_proj.bias
[0095/0343] saving transformer.h.6.mlp.fc1.q_weight
[0096/0343] saving transformer.h.6.mlp.fc1.q_scale
[0097/0343] saving transformer.h.6.mlp.fc1.bias
[0098/0343] saving transformer.h.6.mlp.fc2.q_weight
[0099/0343] saving transformer.h.6.mlp.fc2.q_scale
[0100/0343] saving transformer.h.6.mlp.fc2.bias
[0101/0343] saving transformer.h.7.ln.weight
[0102/0343] saving transformer.h.7.ln.bias
[0103/0343] saving transformer.h.7.mixer.Wqkv.q_weight
[0104/0343] saving transformer.h.7.mixer.Wqkv.q_scale
[0105/0343] saving transformer.h.7.mixer.Wqkv.bias
[0106/0343] saving transformer.h.7.mixer.out_proj.q_weight
[0107/0343] saving transformer.h.7.mixer.out_proj.q_scale
[0108/0343] saving transformer.h.7.mixer.out_proj.bias
[0109/0343] saving transformer.h.7.mlp.fc1.q_weight
[0110/0343] saving transformer.h.7.mlp.fc1.q_scale
[0111/0343] saving transformer.h.7.mlp.fc1.bias
[0112/0343] saving transformer.h.7.mlp.fc2.q_weight
[0113/0343] saving transformer.h.7.mlp.fc2.q_scale
[0114/0343] saving transformer.h.7.mlp.fc2.bias
[0115/0343] saving transformer.h.8.ln.weight
[0116/0343] saving transformer.h.8.ln.bias
[0117/0343] saving transformer.h.8.mixer.Wqkv.q_weight
[0118/0343] saving transformer.h.8.mixer.Wqkv.q_scale
[0119/0343] saving transformer.h.8.mixer.Wqkv.bias
[0120/0343] saving transformer.h.8.mixer.out_proj.q_weight
[0121/0343] saving transformer.h.8.mixer.out_proj.q_scale
[0122/0343] saving transformer.h.8.mixer.out_proj.bias
[0123/0343] saving transformer.h.8.mlp.fc1.q_weight
[0124/0343] saving transformer.h.8.mlp.fc1.q_scale
[0125/0343] saving transformer.h.8.mlp.fc1.bias
[0126/0343] saving transformer.h.8.mlp.fc2.q_weight
[0127/0343] saving transformer.h.8.mlp.fc2.q_scale
[0128/0343] saving transformer.h.8.mlp.fc2.bias
[0129/0343] saving transformer.h.9.ln.weight
[0130/0343] saving transformer.h.9.ln.bias
[0131/0343] saving transformer.h.9.mixer.Wqkv.q_weight
[0132/0343] saving transformer.h.9.mixer.Wqkv.q_scale
[0133/0343] saving transformer.h.9.mixer.Wqkv.bias
[0134/0343] saving transformer.h.9.mixer.out_proj.q_weight
[0135/0343] saving transformer.h.9.mixer.out_proj.q_scale
[0136/0343] saving transformer.h.9.mixer.out_proj.bias
[0137/0343] saving transformer.h.9.mlp.fc1.q_weight
[0138/0343] saving transformer.h.9.mlp.fc1.q_scale
[0139/0343] saving transformer.h.9.mlp.fc1.bias
[0140/0343] saving transformer.h.9.mlp.fc2.q_weight
[0141/0343] saving transformer.h.9.mlp.fc2.q_scale
[0142/0343] saving transformer.h.9.mlp.fc2.bias
[0143/0343] saving transformer.h.10.ln.weight
[0144/0343] saving transformer.h.10.ln.bias
[0145/0343] saving transformer.h.10.mixer.Wqkv.q_weight
[0146/0343] saving transformer.h.10.mixer.Wqkv.q_scale
[0147/0343] saving transformer.h.10.mixer.Wqkv.bias
[0148/0343] saving transformer.h.10.mixer.out_proj.q_weight
[0149/0343] saving transformer.h.10.mixer.out_proj.q_scale
[0150/0343] saving transformer.h.10.mixer.out_proj.bias
[0151/0343] saving transformer.h.10.mlp.fc1.q_weight
[0152/0343] saving transformer.h.10.mlp.fc1.q_scale
[0153/0343] saving transformer.h.10.mlp.fc1.bias
[0154/0343] saving transformer.h.10.mlp.fc2.q_weight
[0155/0343] saving transformer.h.10.mlp.fc2.q_scale
[0156/0343] saving transformer.h.10.mlp.fc2.bias
[0157/0343] saving transformer.h.11.ln.weight
[0158/0343] saving transformer.h.11.ln.bias
[0159/0343] saving transformer.h.11.mixer.Wqkv.q_weight
[0160/0343] saving transformer.h.11.mixer.Wqkv.q_scale
[0161/0343] saving transformer.h.11.mixer.Wqkv.bias
[0162/0343] saving transformer.h.11.mixer.out_proj.q_weight
[0163/0343] saving transformer.h.11.mixer.out_proj.q_scale
[0164/0343] saving transformer.h.11.mixer.out_proj.bias
[0165/0343] saving transformer.h.11.mlp.fc1.q_weight
[0166/0343] saving transformer.h.11.mlp.fc1.q_scale
[0167/0343] saving transformer.h.11.mlp.fc1.bias
[0168/0343] saving transformer.h.11.mlp.fc2.q_weight
[0169/0343] saving transformer.h.11.mlp.fc2.q_scale
[0170/0343] saving transformer.h.11.mlp.fc2.bias
[0171/0343] saving transformer.h.12.ln.weight
[0172/0343] saving transformer.h.12.ln.bias
[0173/0343] saving transformer.h.12.mixer.Wqkv.q_weight
[0174/0343] saving transformer.h.12.mixer.Wqkv.q_scale
[0175/0343] saving transformer.h.12.mixer.Wqkv.bias
[0176/0343] saving transformer.h.12.mixer.out_proj.q_weight
[0177/0343] saving transformer.h.12.mixer.out_proj.q_scale
[0178/0343] saving transformer.h.12.mixer.out_proj.bias
[0179/0343] saving transformer.h.12.mlp.fc1.q_weight
[0180/0343] saving transformer.h.12.mlp.fc1.q_scale
[0181/0343] saving transformer.h.12.mlp.fc1.bias
[0182/0343] saving transformer.h.12.mlp.fc2.q_weight
[0183/0343] saving transformer.h.12.mlp.fc2.q_scale
[0184/0343] saving transformer.h.12.mlp.fc2.bias
[0185/0343] saving transformer.h.13.ln.weight
[0186/0343] saving transformer.h.13.ln.bias
[0187/0343] saving transformer.h.13.mixer.Wqkv.q_weight
[0188/0343] saving transformer.h.13.mixer.Wqkv.q_scale
[0189/0343] saving transformer.h.13.mixer.Wqkv.bias
[0190/0343] saving transformer.h.13.mixer.out_proj.q_weight
[0191/0343] saving transformer.h.13.mixer.out_proj.q_scale
[0192/0343] saving transformer.h.13.mixer.out_proj.bias
[0193/0343] saving transformer.h.13.mlp.fc1.q_weight
[0194/0343] saving transformer.h.13.mlp.fc1.q_scale
[0195/0343] saving transformer.h.13.mlp.fc1.bias
[0196/0343] saving transformer.h.13.mlp.fc2.q_weight
[0197/0343] saving transformer.h.13.mlp.fc2.q_scale
[0198/0343] saving transformer.h.13.mlp.fc2.bias
[0199/0343] saving transformer.h.14.ln.weight
[0200/0343] saving transformer.h.14.ln.bias
[0201/0343] saving transformer.h.14.mixer.Wqkv.q_weight
[0202/0343] saving transformer.h.14.mixer.Wqkv.q_scale
[0203/0343] saving transformer.h.14.mixer.Wqkv.bias
[0204/0343] saving transformer.h.14.mixer.out_proj.q_weight
[0205/0343] saving transformer.h.14.mixer.out_proj.q_scale
[0206/0343] saving transformer.h.14.mixer.out_proj.bias
[0207/0343] saving transformer.h.14.mlp.fc1.q_weight
[0208/0343] saving transformer.h.14.mlp.fc1.q_scale
[0209/0343] saving transformer.h.14.mlp.fc1.bias
[0210/0343] saving transformer.h.14.mlp.fc2.q_weight
[0211/0343] saving transformer.h.14.mlp.fc2.q_scale
[0212/0343] saving transformer.h.14.mlp.fc2.bias
[0213/0343] saving transformer.h.15.ln.weight
[0214/0343] saving transformer.h.15.ln.bias
[0215/0343] saving transformer.h.15.mixer.Wqkv.q_weight
[0216/0343] saving transformer.h.15.mixer.Wqkv.q_scale
[0217/0343] saving transformer.h.15.mixer.Wqkv.bias
[0218/0343] saving transformer.h.15.mixer.out_proj.q_weight
[0219/0343] saving transformer.h.15.mixer.out_proj.q_scale
[0220/0343] saving transformer.h.15.mixer.out_proj.bias
[0221/0343] saving transformer.h.15.mlp.fc1.q_weight
[0222/0343] saving transformer.h.15.mlp.fc1.q_scale
[0223/0343] saving transformer.h.15.mlp.fc1.bias
[0224/0343] saving transformer.h.15.mlp.fc2.q_weight
[0225/0343] saving transformer.h.15.mlp.fc2.q_scale
[0226/0343] saving transformer.h.15.mlp.fc2.bias
[0227/0343] saving transformer.h.16.ln.weight
[0228/0343] saving transformer.h.16.ln.bias
[0229/0343] saving transformer.h.16.mixer.Wqkv.q_weight
[0230/0343] saving transformer.h.16.mixer.Wqkv.q_scale
[0231/0343] saving transformer.h.16.mixer.Wqkv.bias
[0232/0343] saving transformer.h.16.mixer.out_proj.q_weight
[0233/0343] saving transformer.h.16.mixer.out_proj.q_scale
[0234/0343] saving transformer.h.16.mixer.out_proj.bias
[0235/0343] saving transformer.h.16.mlp.fc1.q_weight
[0236/0343] saving transformer.h.16.mlp.fc1.q_scale
[0237/0343] saving transformer.h.16.mlp.fc1.bias
[0238/0343] saving transformer.h.16.mlp.fc2.q_weight
[0239/0343] saving transformer.h.16.mlp.fc2.q_scale
[0240/0343] saving transformer.h.16.mlp.fc2.bias
[0241/0343] saving transformer.h.17.ln.weight
[0242/0343] saving transformer.h.17.ln.bias
[0243/0343] saving transformer.h.17.mixer.Wqkv.q_weight
[0244/0343] saving transformer.h.17.mixer.Wqkv.q_scale
[0245/0343] saving transformer.h.17.mixer.Wqkv.bias
[0246/0343] saving transformer.h.17.mixer.out_proj.q_weight
[0247/0343] saving transformer.h.17.mixer.out_proj.q_scale
[0248/0343] saving transformer.h.17.mixer.out_proj.bias
[0249/0343] saving transformer.h.17.mlp.fc1.q_weight
[0250/0343] saving transformer.h.17.mlp.fc1.q_scale
[0251/0343] saving transformer.h.17.mlp.fc1.bias
[0252/0343] saving transformer.h.17.mlp.fc2.q_weight
[0253/0343] saving transformer.h.17.mlp.fc2.q_scale
[0254/0343] saving transformer.h.17.mlp.fc2.bias
[0255/0343] saving transformer.h.18.ln.weight
[0256/0343] saving transformer.h.18.ln.bias
[0257/0343] saving transformer.h.18.mixer.Wqkv.q_weight
[0258/0343] saving transformer.h.18.mixer.Wqkv.q_scale
[0259/0343] saving transformer.h.18.mixer.Wqkv.bias
[0260/0343] saving transformer.h.18.mixer.out_proj.q_weight
[0261/0343] saving transformer.h.18.mixer.out_proj.q_scale
[0262/0343] saving transformer.h.18.mixer.out_proj.bias
[0263/0343] saving transformer.h.18.mlp.fc1.q_weight
[0264/0343] saving transformer.h.18.mlp.fc1.q_scale
[0265/0343] saving transformer.h.18.mlp.fc1.bias
[0266/0343] saving transformer.h.18.mlp.fc2.q_weight
[0267/0343] saving transformer.h.18.mlp.fc2.q_scale
[0268/0343] saving transformer.h.18.mlp.fc2.bias
[0269/0343] saving transformer.h.19.ln.weight
[0270/0343] saving transformer.h.19.ln.bias
[0271/0343] saving transformer.h.19.mixer.Wqkv.q_weight
[0272/0343] saving transformer.h.19.mixer.Wqkv.q_scale
[0273/0343] saving transformer.h.19.mixer.Wqkv.bias
[0274/0343] saving transformer.h.19.mixer.out_proj.q_weight
[0275/0343] saving transformer.h.19.mixer.out_proj.q_scale
[0276/0343] saving transformer.h.19.mixer.out_proj.bias
[0277/0343] saving transformer.h.19.mlp.fc1.q_weight
[0278/0343] saving transformer.h.19.mlp.fc1.q_scale
[0279/0343] saving transformer.h.19.mlp.fc1.bias
[0280/0343] saving transformer.h.19.mlp.fc2.q_weight
[0281/0343] saving transformer.h.19.mlp.fc2.q_scale
[0282/0343] saving transformer.h.19.mlp.fc2.bias
[0283/0343] saving transformer.h.20.ln.weight
[0284/0343] saving transformer.h.20.ln.bias
[0285/0343] saving transformer.h.20.mixer.Wqkv.q_weight
[0286/0343] saving transformer.h.20.mixer.Wqkv.q_scale
[0287/0343] saving transformer.h.20.mixer.Wqkv.bias
[0288/0343] saving transformer.h.20.mixer.out_proj.q_weight
[0289/0343] saving transformer.h.20.mixer.out_proj.q_scale
[0290/0343] saving transformer.h.20.mixer.out_proj.bias
[0291/0343] saving transformer.h.20.mlp.fc1.q_weight
[0292/0343] saving transformer.h.20.mlp.fc1.q_scale[2023-12-28 23:33:30] INFO convert_weight.py:132: Saved to directory: [1m/tmp/tmpxe445xtc[0m |
|
[0293/0343] saving transformer.h.20.mlp.fc1.bias
[0294/0343] saving transformer.h.20.mlp.fc2.q_weight
[0295/0343] saving transformer.h.20.mlp.fc2.q_scale
[0296/0343] saving transformer.h.20.mlp.fc2.bias
[0297/0343] saving transformer.h.21.ln.weight
[0298/0343] saving transformer.h.21.ln.bias
[0299/0343] saving transformer.h.21.mixer.Wqkv.q_weight
[0300/0343] saving transformer.h.21.mixer.Wqkv.q_scale
[0301/0343] saving transformer.h.21.mixer.Wqkv.bias
[0302/0343] saving transformer.h.21.mixer.out_proj.q_weight
[0303/0343] saving transformer.h.21.mixer.out_proj.q_scale
[0304/0343] saving transformer.h.21.mixer.out_proj.bias
[0305/0343] saving transformer.h.21.mlp.fc1.q_weight
[0306/0343] saving transformer.h.21.mlp.fc1.q_scale
[0307/0343] saving transformer.h.21.mlp.fc1.bias
[0308/0343] saving transformer.h.21.mlp.fc2.q_weight
[0309/0343] saving transformer.h.21.mlp.fc2.q_scale
[0310/0343] saving transformer.h.21.mlp.fc2.bias
[0311/0343] saving transformer.h.22.ln.weight
[0312/0343] saving transformer.h.22.ln.bias
[0313/0343] saving transformer.h.22.mixer.Wqkv.q_weight
[0314/0343] saving transformer.h.22.mixer.Wqkv.q_scale
[0315/0343] saving transformer.h.22.mixer.Wqkv.bias
[0316/0343] saving transformer.h.22.mixer.out_proj.q_weight
[0317/0343] saving transformer.h.22.mixer.out_proj.q_scale
[0318/0343] saving transformer.h.22.mixer.out_proj.bias
[0319/0343] saving transformer.h.22.mlp.fc1.q_weight
[0320/0343] saving transformer.h.22.mlp.fc1.q_scale
[0321/0343] saving transformer.h.22.mlp.fc1.bias
[0322/0343] saving transformer.h.22.mlp.fc2.q_weight
[0323/0343] saving transformer.h.22.mlp.fc2.q_scale
[0324/0343] saving transformer.h.22.mlp.fc2.bias
[0325/0343] saving transformer.h.23.ln.weight
[0326/0343] saving transformer.h.23.ln.bias
[0327/0343] saving transformer.h.23.mixer.Wqkv.q_weight
[0328/0343] saving transformer.h.23.mixer.Wqkv.q_scale
[0329/0343] saving transformer.h.23.mixer.Wqkv.bias
[0330/0343] saving transformer.h.23.mixer.out_proj.q_weight
[0331/0343] saving transformer.h.23.mixer.out_proj.q_scale
[0332/0343] saving transformer.h.23.mixer.out_proj.bias
[0333/0343] saving transformer.h.23.mlp.fc1.q_weight
[0334/0343] saving transformer.h.23.mlp.fc1.q_scale
[0335/0343] saving transformer.h.23.mlp.fc1.bias
[0336/0343] saving transformer.h.23.mlp.fc2.q_weight
[0337/0343] saving transformer.h.23.mlp.fc2.q_scale
[0338/0343] saving transformer.h.23.mlp.fc2.bias
[0339/0343] saving lm_head.ln.weight
[0340/0343] saving lm_head.ln.bias
[0341/0343] saving lm_head.linear.q_weight
[0342/0343] saving lm_head.linear.q_scale
[0343/0343] saving lm_head.linear.bias |
|
All finished, 27 total shards committed, record saved to /tmp/tmpxe445xtc/ndarray-cache.json |
|
|