[INFO|2025-07-09 15:57:37] configuration_utils.py:696 >> loading configuration file config.json from cache at /home/kiho/.cache/huggingface/hub/models--deepseek-ai--deepseek-coder-7b-instruct-v1.5/snapshots/2a050a4c59d687a85324d32e147517992117ed30/config.json
[INFO|2025-07-09 15:57:37] configuration_utils.py:768 >> Model config LlamaConfig {
  "_name_or_path": "deepseek-ai/deepseek-coder-7b-instruct-v1.5",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 100000,
  "eos_token_id": 100015,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 30,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.48.2",
  "use_cache": true,
  "vocab_size": 102400
}
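
The config dump above can be reproduced outside the trainer; a minimal sketch, assuming only that transformers is installed and the Hub cache is reachable (the model id is taken from the log):

# Minimal sketch: inspect the same LlamaConfig that the log prints.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("deepseek-ai/deepseek-coder-7b-instruct-v1.5")
print(config)  # should show the same fields as the dump above
assert config.num_hidden_layers == 30 and config.vocab_size == 102400
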
[INFO|2025-07-09 15:57:38] tokenization_utils_base.py:2034 >> loading file tokenizer.model from cache at None
[INFO|2025-07-09 15:57:38] tokenization_utils_base.py:2034 >> loading file tokenizer.json from cache at /home/kiho/.cache/huggingface/hub/models--deepseek-ai--deepseek-coder-7b-instruct-v1.5/snapshots/2a050a4c59d687a85324d32e147517992117ed30/tokenizer.json
[INFO|2025-07-09 15:57:38] tokenization_utils_base.py:2034 >> loading file added_tokens.json from cache at None
[INFO|2025-07-09 15:57:38] tokenization_utils_base.py:2034 >> loading file special_tokens_map.json from cache at None
[INFO|2025-07-09 15:57:38] tokenization_utils_base.py:2034 >> loading file tokenizer_config.json from cache at /home/kiho/.cache/huggingface/hub/models--deepseek-ai--deepseek-coder-7b-instruct-v1.5/snapshots/2a050a4c59d687a85324d32e147517992117ed30/tokenizer_config.json
[INFO|2025-07-09 15:57:38] tokenization_utils_base.py:2034 >> loading file chat_template.jinja from cache at None
[INFO|2025-07-09 15:57:38] tokenization_utils_base.py:2304 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
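
A matching tokenizer-loading sketch (note that tokenizer.model is absent; only tokenizer.json and tokenizer_config.json are resolved from the cache):

# Minimal sketch: load the tokenizer whose files are listed above.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-coder-7b-instruct-v1.5")
ids = tokenizer("def add(a, b):\n    return a + b")["input_ids"]
print(ids[:8], tokenizer.decode(ids))
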
[INFO|2025-07-09 15:57:39] configuration_utils.py:696 >> loading configuration file config.json from cache at /home/kiho/.cache/huggingface/hub/models--deepseek-ai--deepseek-coder-7b-instruct-v1.5/snapshots/2a050a4c59d687a85324d32e147517992117ed30/config.json
[INFO|2025-07-09 15:57:39] configuration_utils.py:768 >> Model config LlamaConfig {
  "_name_or_path": "deepseek-ai/deepseek-coder-7b-instruct-v1.5",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 100000,
  "eos_token_id": 100015,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 30,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.48.2",
  "use_cache": true,
  "vocab_size": 102400
}
[INFO|2025-07-09 15:57:39] tokenization_utils_base.py:2034 >> loading file tokenizer.model from cache at None
[INFO|2025-07-09 15:57:39] tokenization_utils_base.py:2034 >> loading file tokenizer.json from cache at /home/kiho/.cache/huggingface/hub/models--deepseek-ai--deepseek-coder-7b-instruct-v1.5/snapshots/2a050a4c59d687a85324d32e147517992117ed30/tokenizer.json
[INFO|2025-07-09 15:57:39] tokenization_utils_base.py:2034 >> loading file added_tokens.json from cache at None
[INFO|2025-07-09 15:57:39] tokenization_utils_base.py:2034 >> loading file special_tokens_map.json from cache at None
[INFO|2025-07-09 15:57:39] tokenization_utils_base.py:2034 >> loading file tokenizer_config.json from cache at /home/kiho/.cache/huggingface/hub/models--deepseek-ai--deepseek-coder-7b-instruct-v1.5/snapshots/2a050a4c59d687a85324d32e147517992117ed30/tokenizer_config.json
[INFO|2025-07-09 15:57:39] tokenization_utils_base.py:2034 >> loading file chat_template.jinja from cache at None
[INFO|2025-07-09 15:57:39] tokenization_utils_base.py:2304 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
[INFO|2025-07-09 15:57:40] logging.py:157 >> Loading dataset Codes3_query_filtered_553474_mark_less_than_8.0.json...
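
A sketch of what loading that dataset might look like with the datasets library (assumption: it is a local JSON file in an instruction-tuning format; only the file name comes from the log):

# Minimal sketch: load the JSON dataset named in the log entry above.
from datasets import load_dataset

ds = load_dataset(
    "json",
    data_files="Codes3_query_filtered_553474_mark_less_than_8.0.json",  # name per the log; adjust to its real location
    split="train",
)
print(len(ds), ds.column_names)
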
[INFO|2025-07-09 15:58:18] configuration_utils.py:696 >> loading configuration file config.json from cache at /home/kiho/.cache/huggingface/hub/models--deepseek-ai--deepseek-coder-7b-instruct-v1.5/snapshots/2a050a4c59d687a85324d32e147517992117ed30/config.json
[INFO|2025-07-09 15:58:18] configuration_utils.py:768 >> Model config LlamaConfig {
  "_name_or_path": "deepseek-ai/deepseek-coder-7b-instruct-v1.5",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 100000,
  "eos_token_id": 100015,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 30,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.48.2",
  "use_cache": true,
  "vocab_size": 102400
}
[WARNING|2025-07-09 15:58:18] logging.py:162 >> Input length is smaller than max length. Consider increasing the input length.
[INFO|2025-07-09 15:58:18] logging.py:157 >> Using llama3 scaling strategy and setting scaling factor to 1.0.
[INFO|2025-07-09 15:58:18] logging.py:157 >> Using block diagonal attention for sequence packing without cross-attention.
[INFO|2025-07-09 15:58:18] logging.py:157 >> Liger kernel has been applied to the model.
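
The block-diagonal attention mentioned above packs several short samples into one sequence while keeping them from attending to each other. A minimal, self-contained illustration of that mask (not LLaMA-Factory's actual implementation; the segment lengths are made up):

# Block-diagonal causal mask for sequence packing: causal inside each segment,
# no attention across segment boundaries.
import torch

def packed_causal_mask(seg_lens):
    total = sum(seg_lens)
    mask = torch.zeros(total, total, dtype=torch.bool)
    start = 0
    for n in seg_lens:
        mask[start:start + n, start:start + n] = torch.tril(torch.ones(n, n, dtype=torch.bool))
        start += n
    return mask  # True = position may be attended to

print(packed_causal_mask([3, 2]).int())
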
[INFO|2025-07-09 15:58:18] modeling_utils.py:3904 >> loading weights file model.safetensors from cache at /home/kiho/.cache/huggingface/hub/models--deepseek-ai--deepseek-coder-7b-instruct-v1.5/snapshots/2a050a4c59d687a85324d32e147517992117ed30/model.safetensors.index.json
[INFO|2025-07-09 15:58:18] modeling_utils.py:1582 >> Instantiating LlamaForCausalLM model under default dtype torch.bfloat16.
[INFO|2025-07-09 15:58:18] configuration_utils.py:1140 >> Generate config GenerationConfig {
  "bos_token_id": 100000,
  "eos_token_id": 100015
}
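
Loading the sharded checkpoint in bfloat16, as the entries above describe, looks roughly like this (a sketch; the SDPA choice is taken from a later log line):

# Minimal sketch: instantiate LlamaForCausalLM from the cached safetensors shards in bfloat16.
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "deepseek-ai/deepseek-coder-7b-instruct-v1.5",
    torch_dtype=torch.bfloat16,      # matches "default dtype torch.bfloat16" above
    attn_implementation="sdpa",      # matches "Using torch SDPA" reported later
)
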
[INFO|2025-07-09 15:58:26] modeling_utils.py:4888 >> All model checkpoint weights were used when initializing LlamaForCausalLM.
[INFO|2025-07-09 15:58:26] modeling_utils.py:4896 >> All the weights of LlamaForCausalLM were initialized from the model checkpoint at deepseek-ai/deepseek-coder-7b-instruct-v1.5.
If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training.
[INFO|2025-07-09 15:58:27] configuration_utils.py:1095 >> loading configuration file generation_config.json from cache at /home/kiho/.cache/huggingface/hub/models--deepseek-ai--deepseek-coder-7b-instruct-v1.5/snapshots/2a050a4c59d687a85324d32e147517992117ed30/generation_config.json
[INFO|2025-07-09 15:58:27] configuration_utils.py:1140 >> Generate config GenerationConfig {
  "bos_token_id": 100000,
  "eos_token_id": 100015
}
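
The generation defaults shown above can also be constructed explicitly if needed (a sketch using only the two ids from the log):

# Minimal sketch: the generation defaults reported by the log.
from transformers import GenerationConfig

gen_cfg = GenerationConfig(bos_token_id=100000, eos_token_id=100015)
print(gen_cfg)
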
[INFO|2025-07-09 15:58:27] logging.py:157 >> Gradient checkpointing enabled.
[INFO|2025-07-09 15:58:27] logging.py:157 >> Using torch SDPA for faster training and inference.
[INFO|2025-07-09 15:58:27] logging.py:157 >> Upcasting trainable params to float32.
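
A sketch of the preparation steps logged above, continuing from the model loaded earlier (the float32 upcast loop is illustrative; LLaMA-Factory's own helpers may differ, and disabling the KV cache is an assumption):

# Gradient checkpointing trades compute for activation memory; upcasting trainable
# parameters to float32 keeps optimizer updates numerically stable under bf16 training.
model.gradient_checkpointing_enable()
model.config.use_cache = False  # assumption: cache disabled during training with checkpointing

for param in model.parameters():
    if param.requires_grad:
        param.data = param.data.float()
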
[INFO|2025-07-09 15:58:27] logging.py:157 >> Fine-tuning method: Freeze
[INFO|2025-07-09 15:58:27] logging.py:157 >> Set trainable layers: .14.,.29.
[INFO|2025-07-09 15:58:27] logging.py:157 >> trainable params: 404,766,720 || all params: 6,910,365,696 || trainable%: 5.8574
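
"Freeze" tuning with trainable layers ".14.,.29." means everything is frozen except decoder layers 14 and 29. A minimal sketch that reproduces the parameter counts in the line above (continuing from the loaded model):

# Freeze all parameters, then unfreeze any parameter whose name contains ".14." or ".29."
# (the patterns shown in the log). Two LLaMA layers of this size hold 404,766,720 params.
for name, param in model.named_parameters():
    param.requires_grad = any(f".{i}." in name for i in (14, 29))

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"trainable params: {trainable:,} || all params: {total:,} || trainable%: {100 * trainable / total:.4f}")
# expected: trainable params: 404,766,720 || all params: 6,910,365,696 || trainable%: 5.8574
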
[INFO|2025-07-09 15:58:27] trainer.py:741 >> Using auto half precision backend
[INFO|2025-07-09 15:58:27] logging.py:157 >> Found linear modules: k_proj,q_proj,v_proj,down_proj,gate_proj,up_proj,o_proj
[INFO|2025-07-09 15:58:27] logging.py:157 >> Using APOLLO optimizer with args: {'rank': 256, 'proj': 'random', 'proj_type': 'std', 'update_proj_gap': 200, 'scale': 1, 'scale_type': 'channel', 'scale_front': False}.
[INFO|2025-07-09 15:58:27] trainer.py:2369 >> ***** Running training *****
[INFO|2025-07-09 15:58:27] trainer.py:2370 >> Num examples = 25,964
[INFO|2025-07-09 15:58:27] trainer.py:2371 >> Num Epochs = 1
[INFO|2025-07-09 15:58:27] trainer.py:2372 >> Instantaneous batch size per device = 16
[INFO|2025-07-09 15:58:27] trainer.py:2375 >> Total train batch size (w. parallel, distributed & accumulation) = 384
[INFO|2025-07-09 15:58:27] trainer.py:2376 >> Gradient Accumulation steps = 8
[INFO|2025-07-09 15:58:27] trainer.py:2377 >> Total optimization steps = 67
[INFO|2025-07-09 15:58:27] trainer.py:2378 >> Number of trainable parameters = 404,766,720
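
The batch-size and step counts above are internally consistent; a quick arithmetic check (pure Python, no framework calls):

# 16 per device x 8 accumulation steps x N devices = 384, so the run used 3 devices,
# and one epoch over 25,964 packed examples yields 67 optimization steps.
per_device_batch, grad_accum, total_batch = 16, 8, 384
num_devices = total_batch // (per_device_batch * grad_accum)   # -> 3
steps_per_epoch = 25_964 // total_batch                        # -> 67
print(num_devices, steps_per_epoch)
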
[INFO|2025-07-09 16:01:10] logging.py:157 >> {'loss': 1.5801, 'learning_rate': 4.9973e-05, 'epoch': 0.01, 'throughput': 9730.57}
[INFO|2025-07-09 16:03:43] logging.py:157 >> {'loss': 1.3696, 'learning_rate': 4.9890e-05, 'epoch': 0.03, 'throughput': 9996.29}
[INFO|2025-07-09 16:06:16] logging.py:157 >> {'loss': 1.2964, 'learning_rate': 4.9753e-05, 'epoch': 0.04, 'throughput': 10081.95}
[INFO|2025-07-09 16:08:49] logging.py:157 >> {'loss': 1.2465, 'learning_rate': 4.9562e-05, 'epoch': 0.06, 'throughput': 10131.27}
[INFO|2025-07-09 16:11:22] logging.py:157 >> {'loss': 1.1284, 'learning_rate': 4.9316e-05, 'epoch': 0.07, 'throughput': 10163.81}
[INFO|2025-07-09 16:13:55] logging.py:157 >> {'loss': 1.0455, 'learning_rate': 4.9017e-05, 'epoch': 0.09, 'throughput': 10183.83}
[INFO|2025-07-09 16:16:28] logging.py:157 >> {'loss': 0.9850, 'learning_rate': 4.8665e-05, 'epoch': 0.10, 'throughput': 10199.69}
[INFO|2025-07-09 16:19:01] logging.py:157 >> {'loss': 0.9570, 'learning_rate': 4.8262e-05, 'epoch': 0.12, 'throughput': 10208.81}
[INFO|2025-07-09 16:21:34] logging.py:157 >> {'loss': 0.9121, 'learning_rate': 4.7807e-05, 'epoch': 0.13, 'throughput': 10215.43}
[INFO|2025-07-09 16:24:07] logging.py:157 >> {'loss': 0.8857, 'learning_rate': 4.7302e-05, 'epoch': 0.15, 'throughput': 10220.33}
[INFO|2025-07-09 16:26:40] logging.py:157 >> {'loss': 0.8871, 'learning_rate': 4.6748e-05, 'epoch': 0.16, 'throughput': 10224.60}
[INFO|2025-07-09 16:29:14] logging.py:157 >> {'loss': 0.8668, 'learning_rate': 4.6146e-05, 'epoch': 0.18, 'throughput': 10228.07}
[INFO|2025-07-09 16:31:47] logging.py:157 >> {'loss': 0.8264, 'learning_rate': 4.5497e-05, 'epoch': 0.19, 'throughput': 10231.09}
[INFO|2025-07-09 16:34:20] logging.py:157 >> {'loss': 0.8316, 'learning_rate': 4.4804e-05, 'epoch': 0.21, 'throughput': 10233.42}
[INFO|2025-07-09 16:36:53] logging.py:157 >> {'loss': 0.8214, 'learning_rate': 4.4067e-05, 'epoch': 0.22, 'throughput': 10236.31}
[INFO|2025-07-09 16:39:26] logging.py:157 >> {'loss': 0.8074, 'learning_rate': 4.3288e-05, 'epoch': 0.24, 'throughput': 10238.37}
[INFO|2025-07-09 16:41:59] logging.py:157 >> {'loss': 0.7947, 'learning_rate': 4.2469e-05, 'epoch': 0.25, 'throughput': 10239.79}
[INFO|2025-07-09 16:44:33] logging.py:157 >> {'loss': 0.7868, 'learning_rate': 4.1612e-05, 'epoch': 0.27, 'throughput': 10240.87}
[INFO|2025-07-09 16:47:06] logging.py:157 >> {'loss': 0.7657, 'learning_rate': 4.0718e-05, 'epoch': 0.28, 'throughput': 10241.86}
[INFO|2025-07-09 16:49:39] logging.py:157 >> {'loss': 0.7821, 'learning_rate': 3.9789e-05, 'epoch': 0.30, 'throughput': 10242.44}
[INFO|2025-07-09 16:52:13] logging.py:157 >> {'loss': 0.7640, 'learning_rate': 3.8828e-05, 'epoch': 0.31, 'throughput': 10242.80}
[INFO|2025-07-09 16:54:46] logging.py:157 >> {'loss': 0.7781, 'learning_rate': 3.7837e-05, 'epoch': 0.33, 'throughput': 10243.48}
[INFO|2025-07-09 16:57:19] logging.py:157 >> {'loss': 0.7663, 'learning_rate': 3.6817e-05, 'epoch': 0.34, 'throughput': 10244.34}
[INFO|2025-07-09 16:59:53] logging.py:157 >> {'loss': 0.7303, 'learning_rate': 3.5772e-05, 'epoch': 0.35, 'throughput': 10245.23}
[INFO|2025-07-09 17:02:26] logging.py:157 >> {'loss': 0.7626, 'learning_rate': 3.4702e-05, 'epoch': 0.37, 'throughput': 10245.49}
[INFO|2025-07-09 17:04:59] logging.py:157 >> {'loss': 0.7709, 'learning_rate': 3.3612e-05, 'epoch': 0.38, 'throughput': 10246.02}
[INFO|2025-07-09 17:07:33] logging.py:157 >> {'loss': 0.7638, 'learning_rate': 3.2502e-05, 'epoch': 0.40, 'throughput': 10246.34}
[INFO|2025-07-09 17:10:06] logging.py:157 >> {'loss': 0.7561, 'learning_rate': 3.1376e-05, 'epoch': 0.41, 'throughput': 10246.39}
[INFO|2025-07-09 17:12:40] logging.py:157 >> {'loss': 0.7372, 'learning_rate': 3.0236e-05, 'epoch': 0.43, 'throughput': 10246.90}
[INFO|2025-07-09 17:15:13] logging.py:157 >> {'loss': 0.7281, 'learning_rate': 2.9084e-05, 'epoch': 0.44, 'throughput': 10246.62}
[INFO|2025-07-09 17:17:47] logging.py:157 >> {'loss': 0.7601, 'learning_rate': 2.7924e-05, 'epoch': 0.46, 'throughput': 10246.53}
[INFO|2025-07-09 17:20:20] logging.py:157 >> {'loss': 0.7296, 'learning_rate': 2.6757e-05, 'epoch': 0.47, 'throughput': 10246.68}
[INFO|2025-07-09 17:22:54] logging.py:157 >> {'loss': 0.7410, 'learning_rate': 2.5586e-05, 'epoch': 0.49, 'throughput': 10246.53}
[INFO|2025-07-09 17:25:27] logging.py:157 >> {'loss': 0.7251, 'learning_rate': 2.4414e-05, 'epoch': 0.50, 'throughput': 10246.23}
[INFO|2025-07-09 17:28:01] logging.py:157 >> {'loss': 0.7385, 'learning_rate': 2.3243e-05, 'epoch': 0.52, 'throughput': 10246.00}
[INFO|2025-07-09 17:30:35] logging.py:157 >> {'loss': 0.7460, 'learning_rate': 2.2076e-05, 'epoch': 0.53, 'throughput': 10245.76}
[INFO|2025-07-09 17:33:08] logging.py:157 >> {'loss': 0.7315, 'learning_rate': 2.0916e-05, 'epoch': 0.55, 'throughput': 10245.41}
[INFO|2025-07-09 17:35:42] logging.py:157 >> {'loss': 0.7267, 'learning_rate': 1.9764e-05, 'epoch': 0.56, 'throughput': 10245.47}
[INFO|2025-07-09 17:38:15] logging.py:157 >> {'loss': 0.7443, 'learning_rate': 1.8624e-05, 'epoch': 0.58, 'throughput': 10246.13}
[INFO|2025-07-09 17:40:48] logging.py:157 >> {'loss': 0.7163, 'learning_rate': 1.7498e-05, 'epoch': 0.59, 'throughput': 10246.67}
[INFO|2025-07-09 17:43:22] logging.py:157 >> {'loss': 0.7300, 'learning_rate': 1.6388e-05, 'epoch': 0.61, 'throughput': 10246.53}
[INFO|2025-07-09 17:45:55] logging.py:157 >> {'loss': 0.7230, 'learning_rate': 1.5298e-05, 'epoch': 0.62, 'throughput': 10246.38}
[INFO|2025-07-09 17:48:29] logging.py:157 >> {'loss': 0.7176, 'learning_rate': 1.4228e-05, 'epoch': 0.64, 'throughput': 10246.13}
[INFO|2025-07-09 17:51:03] logging.py:157 >> {'loss': 0.7066, 'learning_rate': 1.3183e-05, 'epoch': 0.65, 'throughput': 10245.31}
[INFO|2025-07-09 17:53:37] logging.py:157 >> {'loss': 0.7244, 'learning_rate': 1.2163e-05, 'epoch': 0.67, 'throughput': 10245.33}
[INFO|2025-07-09 17:56:10] logging.py:157 >> {'loss': 0.7349, 'learning_rate': 1.1172e-05, 'epoch': 0.68, 'throughput': 10245.14}
[INFO|2025-07-09 17:58:44] logging.py:157 >> {'loss': 0.7487, 'learning_rate': 1.0211e-05, 'epoch': 0.70, 'throughput': 10245.15}
[INFO|2025-07-09 18:01:17] logging.py:157 >> {'loss': 0.7381, 'learning_rate': 9.2822e-06, 'epoch': 0.71, 'throughput': 10245.27}
[INFO|2025-07-09 18:03:51] logging.py:157 >> {'loss': 0.7262, 'learning_rate': 8.3882e-06, 'epoch': 0.72, 'throughput': 10245.11}
[INFO|2025-07-09 18:06:25] logging.py:157 >> {'loss': 0.7473, 'learning_rate': 7.5308e-06, 'epoch': 0.74, 'throughput': 10244.43}
[INFO|2025-07-09 18:08:58] logging.py:157 >> {'loss': 0.6998, 'learning_rate': 6.7117e-06, 'epoch': 0.75, 'throughput': 10244.28}
[INFO|2025-07-09 18:11:32] logging.py:157 >> {'loss': 0.7218, 'learning_rate': 5.9329e-06, 'epoch': 0.77, 'throughput': 10244.28}
[INFO|2025-07-09 18:14:06] logging.py:157 >> {'loss': 0.7314, 'learning_rate': 5.1959e-06, 'epoch': 0.78, 'throughput': 10244.27}
[INFO|2025-07-09 18:16:39] logging.py:157 >> {'loss': 0.7028, 'learning_rate': 4.5025e-06, 'epoch': 0.80, 'throughput': 10244.46}
[INFO|2025-07-09 18:19:13] logging.py:157 >> {'loss': 0.7084, 'learning_rate': 3.8542e-06, 'epoch': 0.81, 'throughput': 10244.41}
[INFO|2025-07-09 18:21:46] logging.py:157 >> {'loss': 0.7223, 'learning_rate': 3.2523e-06, 'epoch': 0.83, 'throughput': 10244.78}
[INFO|2025-07-09 18:24:19] logging.py:157 >> {'loss': 0.7195, 'learning_rate': 2.6983e-06, 'epoch': 0.84, 'throughput': 10244.84}
[INFO|2025-07-09 18:26:53] logging.py:157 >> {'loss': 0.7431, 'learning_rate': 2.1933e-06, 'epoch': 0.86, 'throughput': 10244.76}
[INFO|2025-07-09 18:29:26] logging.py:157 >> {'loss': 0.7320, 'learning_rate': 1.7384e-06, 'epoch': 0.87, 'throughput': 10245.04}
[INFO|2025-07-09 18:32:00] logging.py:157 >> {'loss': 0.7302, 'learning_rate': 1.3346e-06, 'epoch': 0.89, 'throughput': 10245.15}
[INFO|2025-07-09 18:34:33] logging.py:157 >> {'loss': 0.7197, 'learning_rate': 9.8287e-07, 'epoch': 0.90, 'throughput': 10244.99}
[INFO|2025-07-09 18:37:07] logging.py:157 >> {'loss': 0.7203, 'learning_rate': 6.8393e-07, 'epoch': 0.92, 'throughput': 10244.88}
[INFO|2025-07-09 18:39:40] logging.py:157 >> {'loss': 0.7474, 'learning_rate': 4.3844e-07, 'epoch': 0.93, 'throughput': 10244.81}
[INFO|2025-07-09 18:42:14] logging.py:157 >> {'loss': 0.7403, 'learning_rate': 2.4694e-07, 'epoch': 0.95, 'throughput': 10244.76}
[INFO|2025-07-09 18:44:48] logging.py:157 >> {'loss': 0.7327, 'learning_rate': 1.0985e-07, 'epoch': 0.96, 'throughput': 10244.72}
[INFO|2025-07-09 18:47:21] logging.py:157 >> {'loss': 0.7250, 'learning_rate': 2.7478e-08, 'epoch': 0.98, 'throughput': 10244.49}
[INFO|2025-07-09 18:49:55] logging.py:157 >> {'loss': 0.7209, 'learning_rate': 0.0000e+00, 'epoch': 0.99, 'throughput': 10244.41}
[INFO|2025-07-09 18:49:55] trainer.py:3910 >> Saving model checkpoint to saves/DeepSeek-Coder-7B-Instruct/freeze/deepseek_under8_nsx/checkpoint-67
[INFO|2025-07-09 18:49:55] configuration_utils.py:420 >> Configuration saved in saves/DeepSeek-Coder-7B-Instruct/freeze/deepseek_under8_nsx/checkpoint-67/config.json
[INFO|2025-07-09 18:49:55] configuration_utils.py:909 >> Configuration saved in saves/DeepSeek-Coder-7B-Instruct/freeze/deepseek_under8_nsx/checkpoint-67/generation_config.json
[INFO|2025-07-09 18:50:16] modeling_utils.py:2996 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 3 checkpoint shards. You can find where each parameter has been saved in the index located at saves/DeepSeek-Coder-7B-Instruct/freeze/deepseek_under8_nsx/checkpoint-67/model.safetensors.index.json.
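
The sharded save described above corresponds to a plain save_pretrained call with a 5 GB shard limit (a sketch continuing from the earlier model and tokenizer objects; the path is taken from the log):

# Minimal sketch: write the checkpoint as safetensors shards plus an index file.
save_dir = "saves/DeepSeek-Coder-7B-Instruct/freeze/deepseek_under8_nsx/checkpoint-67"
model.save_pretrained(save_dir, max_shard_size="5GB")  # -> model-0000x-of-00003.safetensors + model.safetensors.index.json
tokenizer.save_pretrained(save_dir)                    # -> tokenizer_config.json, special_tokens_map.json
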
[INFO|2025-07-09 18:50:16] tokenization_utils_base.py:2491 >> tokenizer config file saved in saves/DeepSeek-Coder-7B-Instruct/freeze/deepseek_under8_nsx/checkpoint-67/tokenizer_config.json
[INFO|2025-07-09 18:50:16] tokenization_utils_base.py:2500 >> Special tokens file saved in saves/DeepSeek-Coder-7B-Instruct/freeze/deepseek_under8_nsx/checkpoint-67/special_tokens_map.json
[INFO|2025-07-09 18:50:17] trainer.py:2643 >>
Training completed. Do not forget to share your model on huggingface.co/models =)
[INFO|2025-07-09 18:50:17] trainer.py:3910 >> Saving model checkpoint to saves/DeepSeek-Coder-7B-Instruct/freeze/deepseek_under8_nsx
[INFO|2025-07-09 18:50:17] configuration_utils.py:420 >> Configuration saved in saves/DeepSeek-Coder-7B-Instruct/freeze/deepseek_under8_nsx/config.json
[INFO|2025-07-09 18:50:17] configuration_utils.py:909 >> Configuration saved in saves/DeepSeek-Coder-7B-Instruct/freeze/deepseek_under8_nsx/generation_config.json
[INFO|2025-07-09 18:50:39] modeling_utils.py:2996 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 3 checkpoint shards. You can find where each parameter has been saved in the index located at saves/DeepSeek-Coder-7B-Instruct/freeze/deepseek_under8_nsx/model.safetensors.index.json.
[INFO|2025-07-09 18:50:39] tokenization_utils_base.py:2491 >> tokenizer config file saved in saves/DeepSeek-Coder-7B-Instruct/freeze/deepseek_under8_nsx/tokenizer_config.json
[INFO|2025-07-09 18:50:39] tokenization_utils_base.py:2500 >> Special tokens file saved in saves/DeepSeek-Coder-7B-Instruct/freeze/deepseek_under8_nsx/special_tokens_map.json
[WARNING|2025-07-09 18:50:39] logging.py:162 >> No metric eval_loss to plot.
[WARNING|2025-07-09 18:50:39] logging.py:162 >> No metric eval_accuracy to plot.
[INFO|2025-07-09 18:50:39] modelcard.py:449 >> Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}