k1h0 committed
Commit fbd8905 · verified · 1 parent: 0323faf

Upload folder using huggingface_hub

README.md ADDED
@@ -0,0 +1,60 @@
+ ---
+ library_name: transformers
+ license: other
+ base_model: deepseek-ai/deepseek-coder-7b-instruct-v1.5
+ tags:
+ - llama-factory
+ - freeze
+ - generated_from_trainer
+ model-index:
+ - name: deepseek_nsx_8_1
+   results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # deepseek_nsx_8_1
+
+ This model is a fine-tuned version of [deepseek-ai/deepseek-coder-7b-instruct-v1.5](https://huggingface.co/deepseek-ai/deepseek-coder-7b-instruct-v1.5) on the codes_nsx_over81 dataset.
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 5e-05
+ - train_batch_size: 16
+ - eval_batch_size: 8
+ - seed: 42
+ - distributed_type: multi-GPU
+ - num_devices: 4
+ - gradient_accumulation_steps: 8
+ - total_train_batch_size: 512
+ - total_eval_batch_size: 32
+ - optimizer: adamw_torch with betas=(0.9, 0.999) and epsilon=1e-08 (no additional optimizer arguments)
+ - lr_scheduler_type: cosine
+ - num_epochs: 1.0
+
+ ### Training results
+
+
+
+ ### Framework versions
+
+ - Transformers 4.48.2
+ - Pytorch 2.5.1+cu124
+ - Datasets 3.2.0
+ - Tokenizers 0.21.0
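
A minimal inference sketch for this checkpoint (not part of the generated card; the repo id below is a placeholder, and the prompt format comes from the deepseekcoder chat template shipped in tokenizer_config.json):

```python
# Minimal usage sketch; the repo id is a placeholder, substitute the actual repo id or a local path.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "your-username/deepseek_nsx_8_1"  # placeholder
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map="auto")

messages = [{"role": "user", "content": "Write a Python function that checks whether a string is a palindrome."}]
inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)
outputs = model.generate(inputs, max_new_tokens=256, eos_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True))
```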
all_results.json ADDED
@@ -0,0 +1,9 @@
+ {
+ "epoch": 0.9795918367346939,
+ "num_input_tokens_seen": 25165824,
+ "total_flos": 9.800984115271434e+17,
+ "train_loss": 0.7565137396256129,
+ "train_runtime": 1221.0904,
+ "train_samples_per_second": 5.136,
+ "train_steps_per_second": 0.01
+ }
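
A quick consistency check on these metrics (my own arithmetic, using the batch and step counts recorded elsewhere in this commit):

```python
# Back-of-the-envelope check of all_results.json; assumed relationships, not repo code.
cutoff_len = 4096          # from training_args.yaml (packed sequence length)
total_train_batch = 512    # 16 per device * 4 GPUs * 8 grad-accum (README)
steps = 12                 # total optimization steps (running_log.txt)
num_examples = 6272        # packed training examples (running_log.txt)
train_runtime = 1221.0904  # seconds

print(steps * total_train_batch * cutoff_len)            # 25165824  == num_input_tokens_seen
print(steps * total_train_batch / num_examples)          # 0.97959... == epoch
print(round(num_examples / train_runtime, 3))            # 5.136     == train_samples_per_second
```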
config.json ADDED
@@ -0,0 +1,36 @@
+ {
+ "_name_or_path": "deepseek-ai/deepseek-coder-7b-instruct-v1.5",
+ "architectures": [
+ "LlamaForCausalLM"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "bos_token_id": 100000,
+ "eos_token_id": 100015,
+ "head_dim": 128,
+ "hidden_act": "silu",
+ "hidden_size": 4096,
+ "initializer_range": 0.02,
+ "intermediate_size": 11008,
+ "max_position_embeddings": 4096,
+ "mlp_bias": false,
+ "model_type": "llama",
+ "num_attention_heads": 32,
+ "num_hidden_layers": 30,
+ "num_key_value_heads": 32,
+ "pretraining_tp": 1,
+ "rms_norm_eps": 1e-06,
+ "rope_scaling": {
+ "factor": 1.0,
+ "high_freq_factor": 4.0,
+ "low_freq_factor": 1.0,
+ "original_max_position_embeddings": 4096,
+ "rope_type": "llama3"
+ },
+ "rope_theta": 10000.0,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.48.2",
+ "use_cache": false,
+ "vocab_size": 102400
+ }
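
As a sanity check (my derivation, assuming the standard Llama parameter layout, not something stated in this file), the sizes in this config reproduce the parameter counts reported later in running_log.txt:

```python
# Parameter count implied by config.json (standard Llama layout; my derivation, not repo code).
vocab, hidden, inter, layers = 102400, 4096, 11008, 30

embed = vocab * hidden            # input embeddings
lm_head = vocab * hidden          # untied output head (tie_word_embeddings: false)
attn = 4 * hidden * hidden        # q, k, v, o projections (no biases)
mlp = 3 * hidden * inter          # gate, up, down projections
norms = 2 * hidden                # input + post-attention RMSNorm weights
per_layer = attn + mlp + norms

total = embed + lm_head + layers * per_layer + hidden  # + final model.norm
print(total)          # 6910365696 -> matches "all params" in running_log.txt
print(2 * per_layer)  # 404766720  -> matches trainable params (layers 28-29 unfrozen)
```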
generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "_from_model_config": true,
+ "bos_token_id": 100000,
+ "eos_token_id": 100015,
+ "transformers_version": "4.48.2"
+ }
llamaboard_config.yaml ADDED
@@ -0,0 +1,77 @@
+ top.booster: liger_kernel
+ top.checkpoint_path: null
+ top.finetuning_type: freeze
+ top.model_name: DeepSeek-Coder-7B-Instruct
+ top.quantization_bit: none
+ top.quantization_method: bitsandbytes
+ top.rope_scaling: llama3
+ top.template: deepseekcoder
+ train.additional_target: ''
+ train.apollo_rank: 256
+ train.apollo_scale: 1
+ train.apollo_target: all
+ train.apollo_update_interval: 200
+ train.badam_mode: layer
+ train.badam_switch_interval: 50
+ train.badam_switch_mode: ascending
+ train.badam_update_ratio: 0.05
+ train.batch_size: 16
+ train.compute_type: bf16
+ train.create_new_adapter: false
+ train.cutoff_len: 4096
+ train.dataset:
+ - codes_nsx_over81
+ train.dataset_dir: data
+ train.ds_offload: false
+ train.ds_stage: none
+ train.extra_args: '{}'
+ train.freeze_extra_modules: ''
+ train.freeze_trainable_layers: 2
+ train.freeze_trainable_modules: all
+ train.galore_rank: 16
+ train.galore_scale: 2
+ train.galore_target: all
+ train.galore_update_interval: 200
+ train.gradient_accumulation_steps: 8
+ train.learning_rate: 5e-5
+ train.logging_steps: 1
+ train.lora_alpha: 16
+ train.lora_dropout: 0
+ train.lora_rank: 8
+ train.lora_target: ''
+ train.loraplus_lr_ratio: 0
+ train.lr_scheduler_type: cosine
+ train.mask_history: false
+ train.max_grad_norm: '1.0'
+ train.max_samples: '50000000'
+ train.neat_packing: true
+ train.neftune_alpha: 0
+ train.num_train_epochs: '1'
+ train.packing: true
+ train.ppo_score_norm: false
+ train.ppo_whiten_rewards: false
+ train.pref_beta: 0.1
+ train.pref_ftx: 0
+ train.pref_loss: sigmoid
+ train.report_to:
+ - none
+ train.resize_vocab: false
+ train.reward_model: null
+ train.save_steps: 500
+ train.swanlab_api_key: ''
+ train.swanlab_mode: cloud
+ train.swanlab_project: llamafactory
+ train.swanlab_run_name: ''
+ train.swanlab_workspace: ''
+ train.train_on_prompt: false
+ train.training_stage: Supervised Fine-Tuning
+ train.use_apollo: true
+ train.use_badam: false
+ train.use_dora: false
+ train.use_galore: false
+ train.use_llama_pro: false
+ train.use_pissa: false
+ train.use_rslora: false
+ train.use_swanlab: false
+ train.val_size: 0
+ train.warmup_steps: 0
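
With `finetuning_type: freeze`, `freeze_trainable_layers: 2` and `freeze_trainable_modules: all`, only the last two decoder blocks (layers 28 and 29 of 30) are updated; the running log below confirms this ("Set trainable layers: .28.,.29."). A rough illustration of the effect, assuming standard PyTorch/transformers APIs and not LLaMA-Factory's actual implementation:

```python
# Illustration of freeze tuning with the last 2 of 30 decoder layers trainable
# (my sketch of the effect; not LLaMA-Factory's code).
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("deepseek-ai/deepseek-coder-7b-instruct-v1.5")

trainable_tags = {".28.", ".29."}
for name, param in model.named_parameters():
    param.requires_grad = any(tag in name for tag in trainable_tags)

n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
n_total = sum(p.numel() for p in model.parameters())
print(f"trainable: {n_trainable:,} / {n_total:,} ({100 * n_trainable / n_total:.4f}%)")
# Expected: 404,766,720 / 6,910,365,696 (~5.8574%), as reported in running_log.txt
```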
model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:28aba7f82f5ce656674a3e922d68c0b7bd12d2f2e6da09e47fa72eec539c53c2
+ size 4987202208
model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a9e815d13d98a67f7370ade76666bfa350f8083dea55f75a60050d05d8bb0728
+ size 4980945440
model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:62d37279dcee5098f4af1886178d9dc77c51467b6a1e8b5fd7b7f4d4e9c9335a
+ size 4662148920
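
The three shard entries above are Git LFS pointers, so only the sha256 oid and byte size appear in the diff. Once the shards are downloaded, they can be checked against those oids; a minimal sketch using the first shard's oid:

```python
# Verify a downloaded shard against the sha256 recorded in its LFS pointer (generic check, not repo tooling).
import hashlib

def sha256_of(path, chunk_size=1 << 20):
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

expected = "28aba7f82f5ce656674a3e922d68c0b7bd12d2f2e6da09e47fa72eec539c53c2"  # model-00001-of-00003
print(sha256_of("model-00001-of-00003.safetensors") == expected)
```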
model.safetensors.index.json ADDED
@@ -0,0 +1,280 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 14630264832
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00003-of-00003.safetensors",
7
+ "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
8
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
9
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
10
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
11
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
12
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
13
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
14
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
15
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
16
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
17
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
18
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
19
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
20
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
21
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
22
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
23
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
24
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
25
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
26
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors",
27
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
28
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
29
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
30
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
31
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
32
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
33
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
34
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
35
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors",
36
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
37
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
38
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
39
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
40
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
41
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
42
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
43
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
44
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors",
45
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
46
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
47
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
48
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
49
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
50
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
51
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
52
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
53
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors",
54
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
55
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
56
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
57
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
58
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
59
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
60
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
61
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
62
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors",
63
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
64
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
65
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
66
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
67
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
68
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
69
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
70
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
71
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
72
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
73
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
74
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
75
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
76
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
77
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
78
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
79
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
80
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
81
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
82
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
83
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
84
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
85
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
86
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
87
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
88
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
89
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
90
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
91
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
92
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
93
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
94
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
95
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
96
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
97
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
98
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
99
+ "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
100
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
101
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
102
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
103
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
104
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
105
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
106
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
107
+ "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
108
+ "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
109
+ "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
110
+ "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
111
+ "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
112
+ "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
113
+ "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
114
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
115
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
116
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
117
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
118
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
119
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
120
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
121
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
122
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
123
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
124
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
125
+ "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
126
+ "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
127
+ "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
128
+ "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
129
+ "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
130
+ "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
131
+ "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
132
+ "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
133
+ "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
134
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
135
+ "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
136
+ "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
137
+ "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
138
+ "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
139
+ "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
140
+ "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
141
+ "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
142
+ "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
143
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00003.safetensors",
144
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
145
+ "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
146
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
147
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
148
+ "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
149
+ "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
150
+ "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
151
+ "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
152
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00003.safetensors",
153
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
154
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
155
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
156
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
157
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
158
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
159
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
160
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
161
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00003.safetensors",
162
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
163
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
164
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
165
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
166
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
167
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
168
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
169
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
170
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00003.safetensors",
171
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
172
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
173
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
174
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
175
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
176
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
177
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
178
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
179
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00003.safetensors",
180
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
181
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
182
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
183
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
184
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
185
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
186
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
187
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
188
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00003.safetensors",
189
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
190
+ "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
191
+ "model.layers.27.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
192
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
193
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
194
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
195
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
196
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
197
+ "model.layers.28.input_layernorm.weight": "model-00003-of-00003.safetensors",
198
+ "model.layers.28.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
199
+ "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
200
+ "model.layers.28.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
201
+ "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
202
+ "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
203
+ "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
204
+ "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
205
+ "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
206
+ "model.layers.29.input_layernorm.weight": "model-00003-of-00003.safetensors",
207
+ "model.layers.29.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
208
+ "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
209
+ "model.layers.29.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
210
+ "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
211
+ "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
212
+ "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
213
+ "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
214
+ "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
215
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
216
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
217
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
218
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
219
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
220
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
221
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
222
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
223
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
224
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
225
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
226
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
227
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
228
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
229
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
230
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
231
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
232
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
233
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
234
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
235
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
236
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
237
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
238
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
239
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
240
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
241
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
242
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
243
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
244
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
245
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
246
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
247
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
248
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
249
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
250
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
251
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
252
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
253
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
254
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
255
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
256
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
257
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
258
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
259
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
260
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors",
261
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
262
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
263
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
264
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
265
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
266
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
267
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
268
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
269
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors",
270
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
271
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
272
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
273
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
274
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
275
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
276
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
277
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
278
+ "model.norm.weight": "model-00003-of-00003.safetensors"
279
+ }
280
+ }
running_log.txt ADDED
@@ -0,0 +1,259 @@
1
+ [INFO|2025-05-29 22:52:47] configuration_utils.py:696 >> loading configuration file config.json from cache at /home/kiho/.cache/huggingface/hub/models--deepseek-ai--deepseek-coder-7b-instruct-v1.5/snapshots/2a050a4c59d687a85324d32e147517992117ed30/config.json
2
+
3
+ [INFO|2025-05-29 22:52:47] configuration_utils.py:768 >> Model config LlamaConfig {
4
+ "_name_or_path": "deepseek-ai/deepseek-coder-7b-instruct-v1.5",
5
+ "architectures": [
6
+ "LlamaForCausalLM"
7
+ ],
8
+ "attention_bias": false,
9
+ "attention_dropout": 0.0,
10
+ "bos_token_id": 100000,
11
+ "eos_token_id": 100015,
12
+ "head_dim": 128,
13
+ "hidden_act": "silu",
14
+ "hidden_size": 4096,
15
+ "initializer_range": 0.02,
16
+ "intermediate_size": 11008,
17
+ "max_position_embeddings": 4096,
18
+ "mlp_bias": false,
19
+ "model_type": "llama",
20
+ "num_attention_heads": 32,
21
+ "num_hidden_layers": 30,
22
+ "num_key_value_heads": 32,
23
+ "pretraining_tp": 1,
24
+ "rms_norm_eps": 1e-06,
25
+ "rope_scaling": null,
26
+ "rope_theta": 10000.0,
27
+ "tie_word_embeddings": false,
28
+ "torch_dtype": "bfloat16",
29
+ "transformers_version": "4.48.2",
30
+ "use_cache": true,
31
+ "vocab_size": 102400
32
+ }
33
+
34
+
35
+ [INFO|2025-05-29 22:52:47] tokenization_utils_base.py:2034 >> loading file tokenizer.model from cache at None
36
+
37
+ [INFO|2025-05-29 22:52:47] tokenization_utils_base.py:2034 >> loading file tokenizer.json from cache at /home/kiho/.cache/huggingface/hub/models--deepseek-ai--deepseek-coder-7b-instruct-v1.5/snapshots/2a050a4c59d687a85324d32e147517992117ed30/tokenizer.json
38
+
39
+ [INFO|2025-05-29 22:52:47] tokenization_utils_base.py:2034 >> loading file added_tokens.json from cache at None
40
+
41
+ [INFO|2025-05-29 22:52:47] tokenization_utils_base.py:2034 >> loading file special_tokens_map.json from cache at None
42
+
43
+ [INFO|2025-05-29 22:52:47] tokenization_utils_base.py:2034 >> loading file tokenizer_config.json from cache at /home/kiho/.cache/huggingface/hub/models--deepseek-ai--deepseek-coder-7b-instruct-v1.5/snapshots/2a050a4c59d687a85324d32e147517992117ed30/tokenizer_config.json
44
+
45
+ [INFO|2025-05-29 22:52:47] tokenization_utils_base.py:2034 >> loading file chat_template.jinja from cache at None
46
+
47
+ [INFO|2025-05-29 22:52:47] tokenization_utils_base.py:2304 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
48
+
49
+ [INFO|2025-05-29 22:52:48] configuration_utils.py:696 >> loading configuration file config.json from cache at /home/kiho/.cache/huggingface/hub/models--deepseek-ai--deepseek-coder-7b-instruct-v1.5/snapshots/2a050a4c59d687a85324d32e147517992117ed30/config.json
50
+
51
+ [INFO|2025-05-29 22:52:48] configuration_utils.py:768 >> Model config LlamaConfig {
52
+ "_name_or_path": "deepseek-ai/deepseek-coder-7b-instruct-v1.5",
53
+ "architectures": [
54
+ "LlamaForCausalLM"
55
+ ],
56
+ "attention_bias": false,
57
+ "attention_dropout": 0.0,
58
+ "bos_token_id": 100000,
59
+ "eos_token_id": 100015,
60
+ "head_dim": 128,
61
+ "hidden_act": "silu",
62
+ "hidden_size": 4096,
63
+ "initializer_range": 0.02,
64
+ "intermediate_size": 11008,
65
+ "max_position_embeddings": 4096,
66
+ "mlp_bias": false,
67
+ "model_type": "llama",
68
+ "num_attention_heads": 32,
69
+ "num_hidden_layers": 30,
70
+ "num_key_value_heads": 32,
71
+ "pretraining_tp": 1,
72
+ "rms_norm_eps": 1e-06,
73
+ "rope_scaling": null,
74
+ "rope_theta": 10000.0,
75
+ "tie_word_embeddings": false,
76
+ "torch_dtype": "bfloat16",
77
+ "transformers_version": "4.48.2",
78
+ "use_cache": true,
79
+ "vocab_size": 102400
80
+ }
81
+
82
+
83
+ [INFO|2025-05-29 22:52:49] tokenization_utils_base.py:2034 >> loading file tokenizer.model from cache at None
84
+
85
+ [INFO|2025-05-29 22:52:49] tokenization_utils_base.py:2034 >> loading file tokenizer.json from cache at /home/kiho/.cache/huggingface/hub/models--deepseek-ai--deepseek-coder-7b-instruct-v1.5/snapshots/2a050a4c59d687a85324d32e147517992117ed30/tokenizer.json
86
+
87
+ [INFO|2025-05-29 22:52:49] tokenization_utils_base.py:2034 >> loading file added_tokens.json from cache at None
88
+
89
+ [INFO|2025-05-29 22:52:49] tokenization_utils_base.py:2034 >> loading file special_tokens_map.json from cache at None
90
+
91
+ [INFO|2025-05-29 22:52:49] tokenization_utils_base.py:2034 >> loading file tokenizer_config.json from cache at /home/kiho/.cache/huggingface/hub/models--deepseek-ai--deepseek-coder-7b-instruct-v1.5/snapshots/2a050a4c59d687a85324d32e147517992117ed30/tokenizer_config.json
92
+
93
+ [INFO|2025-05-29 22:52:49] tokenization_utils_base.py:2034 >> loading file chat_template.jinja from cache at None
94
+
95
+ [INFO|2025-05-29 22:52:49] tokenization_utils_base.py:2304 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
96
+
97
+ [INFO|2025-05-29 22:52:49] logging.py:157 >> Loading dataset Codes_query_filtered_330k_ns_over8_1.json...
98
+
99
+ [INFO|2025-05-29 22:53:02] configuration_utils.py:696 >> loading configuration file config.json from cache at /home/kiho/.cache/huggingface/hub/models--deepseek-ai--deepseek-coder-7b-instruct-v1.5/snapshots/2a050a4c59d687a85324d32e147517992117ed30/config.json
100
+
101
+ [INFO|2025-05-29 22:53:02] configuration_utils.py:768 >> Model config LlamaConfig {
102
+ "_name_or_path": "deepseek-ai/deepseek-coder-7b-instruct-v1.5",
103
+ "architectures": [
104
+ "LlamaForCausalLM"
105
+ ],
106
+ "attention_bias": false,
107
+ "attention_dropout": 0.0,
108
+ "bos_token_id": 100000,
109
+ "eos_token_id": 100015,
110
+ "head_dim": 128,
111
+ "hidden_act": "silu",
112
+ "hidden_size": 4096,
113
+ "initializer_range": 0.02,
114
+ "intermediate_size": 11008,
115
+ "max_position_embeddings": 4096,
116
+ "mlp_bias": false,
117
+ "model_type": "llama",
118
+ "num_attention_heads": 32,
119
+ "num_hidden_layers": 30,
120
+ "num_key_value_heads": 32,
121
+ "pretraining_tp": 1,
122
+ "rms_norm_eps": 1e-06,
123
+ "rope_scaling": null,
124
+ "rope_theta": 10000.0,
125
+ "tie_word_embeddings": false,
126
+ "torch_dtype": "bfloat16",
127
+ "transformers_version": "4.48.2",
128
+ "use_cache": true,
129
+ "vocab_size": 102400
130
+ }
131
+
132
+
133
+ [WARNING|2025-05-29 22:53:02] logging.py:162 >> Input length is smaller than max length. Consider increase input length.
134
+
135
+ [INFO|2025-05-29 22:53:02] logging.py:157 >> Using llama3 scaling strategy and setting scaling factor to 1.0.
136
+
137
+ [INFO|2025-05-29 22:53:02] logging.py:157 >> Using block diagonal attention for sequence packing without cross-attention.
138
+
139
+ [INFO|2025-05-29 22:53:02] logging.py:157 >> Liger kernel has been applied to the model.
140
+
141
+ [INFO|2025-05-29 22:53:02] modeling_utils.py:3904 >> loading weights file model.safetensors from cache at /home/kiho/.cache/huggingface/hub/models--deepseek-ai--deepseek-coder-7b-instruct-v1.5/snapshots/2a050a4c59d687a85324d32e147517992117ed30/model.safetensors.index.json
142
+
143
+ [INFO|2025-05-29 22:53:02] modeling_utils.py:1582 >> Instantiating LlamaForCausalLM model under default dtype torch.bfloat16.
144
+
145
+ [INFO|2025-05-29 22:53:02] configuration_utils.py:1140 >> Generate config GenerationConfig {
146
+ "bos_token_id": 100000,
147
+ "eos_token_id": 100015
148
+ }
149
+
150
+
151
+ [INFO|2025-05-29 22:53:11] modeling_utils.py:4888 >> All model checkpoint weights were used when initializing LlamaForCausalLM.
152
+
153
+
154
+ [INFO|2025-05-29 22:53:11] modeling_utils.py:4896 >> All the weights of LlamaForCausalLM were initialized from the model checkpoint at deepseek-ai/deepseek-coder-7b-instruct-v1.5.
155
+ If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training.
156
+
157
+ [INFO|2025-05-29 22:53:11] configuration_utils.py:1095 >> loading configuration file generation_config.json from cache at /home/kiho/.cache/huggingface/hub/models--deepseek-ai--deepseek-coder-7b-instruct-v1.5/snapshots/2a050a4c59d687a85324d32e147517992117ed30/generation_config.json
158
+
159
+ [INFO|2025-05-29 22:53:11] configuration_utils.py:1140 >> Generate config GenerationConfig {
160
+ "bos_token_id": 100000,
161
+ "eos_token_id": 100015
162
+ }
163
+
164
+
165
+ [INFO|2025-05-29 22:53:11] logging.py:157 >> Gradient checkpointing enabled.
166
+
167
+ [INFO|2025-05-29 22:53:11] logging.py:157 >> Using torch SDPA for faster training and inference.
168
+
169
+ [INFO|2025-05-29 22:53:11] logging.py:157 >> Upcasting trainable params to float32.
170
+
171
+ [INFO|2025-05-29 22:53:11] logging.py:157 >> Fine-tuning method: Freeze
172
+
173
+ [INFO|2025-05-29 22:53:11] logging.py:157 >> Set trainable layers: .28.,.29.
174
+
175
+ [INFO|2025-05-29 22:53:11] logging.py:157 >> trainable params: 404,766,720 || all params: 6,910,365,696 || trainable%: 5.8574
176
+
177
+ [INFO|2025-05-29 22:53:11] trainer.py:741 >> Using auto half precision backend
178
+
179
+ [INFO|2025-05-29 22:53:11] logging.py:157 >> Found linear modules: down_proj,gate_proj,o_proj,k_proj,up_proj,q_proj,v_proj
180
+
181
+ [INFO|2025-05-29 22:53:11] logging.py:157 >> Using APOLLO optimizer with args: {'rank': 256, 'proj': 'random', 'proj_type': 'std', 'update_proj_gap': 200, 'scale': 1, 'scale_type': 'channel', 'scale_front': False}.
182
+
183
+ [INFO|2025-05-29 22:53:11] trainer.py:2369 >> ***** Running training *****
184
+
185
+ [INFO|2025-05-29 22:53:11] trainer.py:2370 >> Num examples = 6,272
186
+
187
+ [INFO|2025-05-29 22:53:11] trainer.py:2371 >> Num Epochs = 1
188
+
189
+ [INFO|2025-05-29 22:53:11] trainer.py:2372 >> Instantaneous batch size per device = 16
190
+
191
+ [INFO|2025-05-29 22:53:11] trainer.py:2375 >> Total train batch size (w. parallel, distributed & accumulation) = 512
192
+
193
+ [INFO|2025-05-29 22:53:11] trainer.py:2376 >> Gradient Accumulation steps = 8
194
+
195
+ [INFO|2025-05-29 22:53:11] trainer.py:2377 >> Total optimization steps = 12
196
+
197
+ [INFO|2025-05-29 22:53:11] trainer.py:2378 >> Number of trainable parameters = 404,766,720
198
+
199
+ [INFO|2025-05-29 22:54:58] logging.py:157 >> {'loss': 0.8480, 'learning_rate': 4.9148e-05, 'epoch': 0.08, 'throughput': 19880.44}
200
+
201
+ [INFO|2025-05-29 22:56:38] logging.py:157 >> {'loss': 0.8111, 'learning_rate': 4.6651e-05, 'epoch': 0.16, 'throughput': 20427.48}
202
+
203
+ [INFO|2025-05-29 22:58:17] logging.py:157 >> {'loss': 0.7984, 'learning_rate': 4.2678e-05, 'epoch': 0.24, 'throughput': 20649.67}
204
+
205
+ [INFO|2025-05-29 22:59:56] logging.py:157 >> {'loss': 0.7649, 'learning_rate': 3.7500e-05, 'epoch': 0.33, 'throughput': 20755.74}
206
+
207
+ [INFO|2025-05-29 23:01:36] logging.py:157 >> {'loss': 0.7687, 'learning_rate': 3.1470e-05, 'epoch': 0.41, 'throughput': 20825.02}
208
+
209
+ [INFO|2025-05-29 23:03:15] logging.py:157 >> {'loss': 0.7696, 'learning_rate': 2.5000e-05, 'epoch': 0.49, 'throughput': 20874.16}
210
+
211
+ [INFO|2025-05-29 23:04:54] logging.py:157 >> {'loss': 0.7488, 'learning_rate': 1.8530e-05, 'epoch': 0.57, 'throughput': 20909.51}
212
+
213
+ [INFO|2025-05-29 23:06:34] logging.py:157 >> {'loss': 0.7368, 'learning_rate': 1.2500e-05, 'epoch': 0.65, 'throughput': 20931.46}
214
+
215
+ [INFO|2025-05-29 23:08:13] logging.py:157 >> {'loss': 0.7088, 'learning_rate': 7.3223e-06, 'epoch': 0.73, 'throughput': 20947.01}
216
+
217
+ [INFO|2025-05-29 23:09:53] logging.py:157 >> {'loss': 0.7054, 'learning_rate': 3.3494e-06, 'epoch': 0.82, 'throughput': 20961.84}
218
+
219
+ [INFO|2025-05-29 23:11:32] logging.py:157 >> {'loss': 0.7032, 'learning_rate': 8.5185e-07, 'epoch': 0.90, 'throughput': 20974.14}
220
+
221
+ [INFO|2025-05-29 23:13:11] logging.py:157 >> {'loss': 0.7144, 'learning_rate': 0.0000e+00, 'epoch': 0.98, 'throughput': 20986.53}
222
+
223
+ [INFO|2025-05-29 23:13:11] trainer.py:3910 >> Saving model checkpoint to saves/DeepSeek-Coder-7B-Instruct/freeze/deepseek_nsx_8_1/checkpoint-12
224
+
225
+ [INFO|2025-05-29 23:13:11] configuration_utils.py:420 >> Configuration saved in saves/DeepSeek-Coder-7B-Instruct/freeze/deepseek_nsx_8_1/checkpoint-12/config.json
226
+
227
+ [INFO|2025-05-29 23:13:11] configuration_utils.py:909 >> Configuration saved in saves/DeepSeek-Coder-7B-Instruct/freeze/deepseek_nsx_8_1/checkpoint-12/generation_config.json
228
+
229
+ [INFO|2025-05-29 23:13:32] modeling_utils.py:2996 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 3 checkpoint shards. You can find where each parameters has been saved in the index located at saves/DeepSeek-Coder-7B-Instruct/freeze/deepseek_nsx_8_1/checkpoint-12/model.safetensors.index.json.
230
+
231
+ [INFO|2025-05-29 23:13:32] tokenization_utils_base.py:2491 >> tokenizer config file saved in saves/DeepSeek-Coder-7B-Instruct/freeze/deepseek_nsx_8_1/checkpoint-12/tokenizer_config.json
232
+
233
+ [INFO|2025-05-29 23:13:32] tokenization_utils_base.py:2500 >> Special tokens file saved in saves/DeepSeek-Coder-7B-Instruct/freeze/deepseek_nsx_8_1/checkpoint-12/special_tokens_map.json
234
+
235
+ [INFO|2025-05-29 23:13:32] trainer.py:2643 >>
236
+
237
+ Training completed. Do not forget to share your model on huggingface.co/models =)
238
+
239
+
240
+
241
+ [INFO|2025-05-29 23:13:32] trainer.py:3910 >> Saving model checkpoint to saves/DeepSeek-Coder-7B-Instruct/freeze/deepseek_nsx_8_1
242
+
243
+ [INFO|2025-05-29 23:13:32] configuration_utils.py:420 >> Configuration saved in saves/DeepSeek-Coder-7B-Instruct/freeze/deepseek_nsx_8_1/config.json
244
+
245
+ [INFO|2025-05-29 23:13:32] configuration_utils.py:909 >> Configuration saved in saves/DeepSeek-Coder-7B-Instruct/freeze/deepseek_nsx_8_1/generation_config.json
246
+
247
+ [INFO|2025-05-29 23:13:53] modeling_utils.py:2996 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 3 checkpoint shards. You can find where each parameters has been saved in the index located at saves/DeepSeek-Coder-7B-Instruct/freeze/deepseek_nsx_8_1/model.safetensors.index.json.
248
+
249
+ [INFO|2025-05-29 23:13:53] tokenization_utils_base.py:2491 >> tokenizer config file saved in saves/DeepSeek-Coder-7B-Instruct/freeze/deepseek_nsx_8_1/tokenizer_config.json
250
+
251
+ [INFO|2025-05-29 23:13:53] tokenization_utils_base.py:2500 >> Special tokens file saved in saves/DeepSeek-Coder-7B-Instruct/freeze/deepseek_nsx_8_1/special_tokens_map.json
252
+
253
+ [WARNING|2025-05-29 23:13:54] logging.py:162 >> No metric eval_loss to plot.
254
+
255
+ [WARNING|2025-05-29 23:13:54] logging.py:162 >> No metric eval_accuracy to plot.
256
+
257
+ [INFO|2025-05-29 23:13:54] modelcard.py:449 >> Dropping the following result as it does not have all the necessary fields:
258
+ {'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
259
+
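
One detail worth noting from this log: the trainable parameters are upcast to float32, which is consistent with the total_size recorded in model.safetensors.index.json earlier in this commit. A quick check (my arithmetic; the saved-dtype split is inferred, not stated):

```python
# Why the sharded checkpoint totals 14,630,264,832 bytes: frozen weights stay bfloat16,
# while the upcast trainable layers appear to be saved in float32 (my inference from the sizes).
total_params = 6_910_365_696    # "all params" in the log above
trainable_params = 404_766_720  # "trainable params" (layers 28-29)

frozen_bf16 = (total_params - trainable_params) * 2  # 2 bytes per bf16 weight
trainable_fp32 = trainable_params * 4                # 4 bytes per fp32 weight
print(frozen_bf16 + trainable_fp32)                  # 14630264832 == index.json "total_size"
```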
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
+ {
+ "bos_token": {
+ "content": "<|begin▁of▁sentence|>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|EOT|>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|end▁of▁sentence|>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,149 @@
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "100000": {
7
+ "content": "<|begin▁of▁sentence|>",
8
+ "lstrip": false,
9
+ "normalized": true,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "100001": {
15
+ "content": "<|end▁of▁sentence|>",
16
+ "lstrip": false,
17
+ "normalized": true,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "100002": {
23
+ "content": "ø",
24
+ "lstrip": false,
25
+ "normalized": true,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": false
29
+ },
30
+ "100003": {
31
+ "content": "ö",
32
+ "lstrip": false,
33
+ "normalized": true,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": false
37
+ },
38
+ "100004": {
39
+ "content": "ú",
40
+ "lstrip": false,
41
+ "normalized": true,
42
+ "rstrip": false,
43
+ "single_word": false,
44
+ "special": false
45
+ },
46
+ "100005": {
47
+ "content": "ÿ",
48
+ "lstrip": false,
49
+ "normalized": true,
50
+ "rstrip": false,
51
+ "single_word": false,
52
+ "special": false
53
+ },
54
+ "100006": {
55
+ "content": "õ",
56
+ "lstrip": false,
57
+ "normalized": true,
58
+ "rstrip": false,
59
+ "single_word": false,
60
+ "special": false
61
+ },
62
+ "100007": {
63
+ "content": "÷",
64
+ "lstrip": false,
65
+ "normalized": true,
66
+ "rstrip": false,
67
+ "single_word": false,
68
+ "special": false
69
+ },
70
+ "100008": {
71
+ "content": "û",
72
+ "lstrip": false,
73
+ "normalized": true,
74
+ "rstrip": false,
75
+ "single_word": false,
76
+ "special": false
77
+ },
78
+ "100009": {
79
+ "content": "ý",
80
+ "lstrip": false,
81
+ "normalized": true,
82
+ "rstrip": false,
83
+ "single_word": false,
84
+ "special": false
85
+ },
86
+ "100010": {
87
+ "content": "À",
88
+ "lstrip": false,
89
+ "normalized": true,
90
+ "rstrip": false,
91
+ "single_word": false,
92
+ "special": false
93
+ },
94
+ "100011": {
95
+ "content": "ù",
96
+ "lstrip": false,
97
+ "normalized": true,
98
+ "rstrip": false,
99
+ "single_word": false,
100
+ "special": false
101
+ },
102
+ "100012": {
103
+ "content": "Á",
104
+ "lstrip": false,
105
+ "normalized": true,
106
+ "rstrip": false,
107
+ "single_word": false,
108
+ "special": false
109
+ },
110
+ "100013": {
111
+ "content": "þ",
112
+ "lstrip": false,
113
+ "normalized": true,
114
+ "rstrip": false,
115
+ "single_word": false,
116
+ "special": false
117
+ },
118
+ "100014": {
119
+ "content": "ü",
120
+ "lstrip": false,
121
+ "normalized": true,
122
+ "rstrip": false,
123
+ "single_word": false,
124
+ "special": false
125
+ },
126
+ "100015": {
127
+ "content": "<|EOT|>",
128
+ "lstrip": false,
129
+ "normalized": true,
130
+ "rstrip": false,
131
+ "single_word": false,
132
+ "special": true
133
+ }
134
+ },
135
+ "bos_token": "<|begin▁of▁sentence|>",
136
+ "chat_template": "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}",
137
+ "clean_up_tokenization_spaces": false,
138
+ "eos_token": "<|EOT|>",
139
+ "extra_special_tokens": {},
140
+ "legacy": true,
141
+ "model_max_length": 4096,
142
+ "pad_token": "<|end▁of▁sentence|>",
143
+ "padding_side": "right",
144
+ "sp_model_kwargs": {},
145
+ "split_special_tokens": false,
146
+ "tokenizer_class": "LlamaTokenizer",
147
+ "unk_token": null,
148
+ "use_default_system_prompt": false
149
+ }
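
The chat_template above implements the DeepSeek Coder instruction format (bos token, default system prompt, then "### Instruction:" / "### Response:" turns). A quick way to inspect the rendered prompt, assuming a placeholder repo id:

```python
# Inspect the prompt string produced by the chat template above (repo id is a placeholder).
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("your-username/deepseek_nsx_8_1")  # placeholder
prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Reverse a linked list in Python."}],
    tokenize=False,
    add_generation_prompt=True,
)
print(prompt)  # default system prompt, then "### Instruction:\n...\n### Response:"
```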
train_results.json ADDED
@@ -0,0 +1,9 @@
+ {
+ "epoch": 0.9795918367346939,
+ "num_input_tokens_seen": 25165824,
+ "total_flos": 9.800984115271434e+17,
+ "train_loss": 0.7565137396256129,
+ "train_runtime": 1221.0904,
+ "train_samples_per_second": 5.136,
+ "train_steps_per_second": 0.01
+ }
trainer_log.jsonl ADDED
@@ -0,0 +1,13 @@
+ {"current_steps": 1, "total_steps": 12, "loss": 0.848, "lr": 4.914814565722671e-05, "epoch": 0.08163265306122448, "percentage": 8.33, "elapsed_time": "0:01:45", "remaining_time": "0:19:20", "throughput": 19880.44, "total_tokens": 2097152}
+ {"current_steps": 2, "total_steps": 12, "loss": 0.8111, "lr": 4.665063509461097e-05, "epoch": 0.16326530612244897, "percentage": 16.67, "elapsed_time": "0:03:25", "remaining_time": "0:17:06", "throughput": 20427.48, "total_tokens": 4194304}
+ {"current_steps": 3, "total_steps": 12, "loss": 0.7984, "lr": 4.267766952966369e-05, "epoch": 0.24489795918367346, "percentage": 25.0, "elapsed_time": "0:05:04", "remaining_time": "0:15:14", "throughput": 20649.67, "total_tokens": 6291456}
+ {"current_steps": 4, "total_steps": 12, "loss": 0.7649, "lr": 3.7500000000000003e-05, "epoch": 0.32653061224489793, "percentage": 33.33, "elapsed_time": "0:06:44", "remaining_time": "0:13:28", "throughput": 20755.74, "total_tokens": 8388608}
+ {"current_steps": 5, "total_steps": 12, "loss": 0.7687, "lr": 3.147047612756302e-05, "epoch": 0.40816326530612246, "percentage": 41.67, "elapsed_time": "0:08:23", "remaining_time": "0:11:44", "throughput": 20825.02, "total_tokens": 10485760}
+ {"current_steps": 6, "total_steps": 12, "loss": 0.7696, "lr": 2.5e-05, "epoch": 0.4897959183673469, "percentage": 50.0, "elapsed_time": "0:10:02", "remaining_time": "0:10:02", "throughput": 20874.16, "total_tokens": 12582912}
+ {"current_steps": 7, "total_steps": 12, "loss": 0.7488, "lr": 1.852952387243698e-05, "epoch": 0.5714285714285714, "percentage": 58.33, "elapsed_time": "0:11:42", "remaining_time": "0:08:21", "throughput": 20909.51, "total_tokens": 14680064}
+ {"current_steps": 8, "total_steps": 12, "loss": 0.7368, "lr": 1.2500000000000006e-05, "epoch": 0.6530612244897959, "percentage": 66.67, "elapsed_time": "0:13:21", "remaining_time": "0:06:40", "throughput": 20931.46, "total_tokens": 16777216}
+ {"current_steps": 9, "total_steps": 12, "loss": 0.7088, "lr": 7.3223304703363135e-06, "epoch": 0.7346938775510204, "percentage": 75.0, "elapsed_time": "0:15:01", "remaining_time": "0:05:00", "throughput": 20947.01, "total_tokens": 18874368}
+ {"current_steps": 10, "total_steps": 12, "loss": 0.7054, "lr": 3.3493649053890326e-06, "epoch": 0.8163265306122449, "percentage": 83.33, "elapsed_time": "0:16:40", "remaining_time": "0:03:20", "throughput": 20961.84, "total_tokens": 20971520}
+ {"current_steps": 11, "total_steps": 12, "loss": 0.7032, "lr": 8.51854342773295e-07, "epoch": 0.8979591836734694, "percentage": 91.67, "elapsed_time": "0:18:19", "remaining_time": "0:01:39", "throughput": 20974.14, "total_tokens": 23068672}
+ {"current_steps": 12, "total_steps": 12, "loss": 0.7144, "lr": 0.0, "epoch": 0.9795918367346939, "percentage": 100.0, "elapsed_time": "0:19:59", "remaining_time": "0:00:00", "throughput": 20986.53, "total_tokens": 25165824}
+ {"current_steps": 12, "total_steps": 12, "epoch": 0.9795918367346939, "percentage": 100.0, "elapsed_time": "0:20:20", "remaining_time": "0:00:00", "throughput": 20626.1, "total_tokens": 25165824}
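
The lr column above follows a plain cosine schedule with zero warmup over 12 steps; a small reproduction (my sketch of the formula, matching the logged values):

```python
# Reproduce the logged learning rates with cosine decay over 12 steps, no warmup
# (my sketch; matches the "lr" values in trainer_log.jsonl).
import math

base_lr, total_steps = 5e-05, 12
for step in range(1, total_steps + 1):
    lr = 0.5 * base_lr * (1 + math.cos(math.pi * step / total_steps))
    print(step, f"{lr:.6e}")
# step 1 -> 4.914815e-05, step 2 -> 4.665064e-05, ..., step 12 -> 0.0
```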
trainer_state.json ADDED
@@ -0,0 +1,139 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.9795918367346939,
5
+ "eval_steps": 500,
6
+ "global_step": 12,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.08163265306122448,
13
+ "grad_norm": 0.5826467275619507,
14
+ "learning_rate": 4.914814565722671e-05,
15
+ "loss": 0.848,
16
+ "num_input_tokens_seen": 2097152,
17
+ "step": 1
18
+ },
19
+ {
20
+ "epoch": 0.16326530612244897,
21
+ "grad_norm": 0.5116532444953918,
22
+ "learning_rate": 4.665063509461097e-05,
23
+ "loss": 0.8111,
24
+ "num_input_tokens_seen": 4194304,
25
+ "step": 2
26
+ },
27
+ {
28
+ "epoch": 0.24489795918367346,
29
+ "grad_norm": 0.46351540088653564,
30
+ "learning_rate": 4.267766952966369e-05,
31
+ "loss": 0.7984,
32
+ "num_input_tokens_seen": 6291456,
33
+ "step": 3
34
+ },
35
+ {
36
+ "epoch": 0.32653061224489793,
37
+ "grad_norm": 0.39698782563209534,
38
+ "learning_rate": 3.7500000000000003e-05,
39
+ "loss": 0.7649,
40
+ "num_input_tokens_seen": 8388608,
41
+ "step": 4
42
+ },
43
+ {
44
+ "epoch": 0.40816326530612246,
45
+ "grad_norm": 0.3687557578086853,
46
+ "learning_rate": 3.147047612756302e-05,
47
+ "loss": 0.7687,
48
+ "num_input_tokens_seen": 10485760,
49
+ "step": 5
50
+ },
51
+ {
52
+ "epoch": 0.4897959183673469,
53
+ "grad_norm": 0.3421611785888672,
54
+ "learning_rate": 2.5e-05,
55
+ "loss": 0.7696,
56
+ "num_input_tokens_seen": 12582912,
57
+ "step": 6
58
+ },
59
+ {
60
+ "epoch": 0.5714285714285714,
61
+ "grad_norm": 0.3205947279930115,
62
+ "learning_rate": 1.852952387243698e-05,
63
+ "loss": 0.7488,
64
+ "num_input_tokens_seen": 14680064,
65
+ "step": 7
66
+ },
67
+ {
68
+ "epoch": 0.6530612244897959,
69
+ "grad_norm": 0.2887260913848877,
70
+ "learning_rate": 1.2500000000000006e-05,
71
+ "loss": 0.7368,
72
+ "num_input_tokens_seen": 16777216,
73
+ "step": 8
74
+ },
75
+ {
76
+ "epoch": 0.7346938775510204,
77
+ "grad_norm": 0.3012808561325073,
78
+ "learning_rate": 7.3223304703363135e-06,
79
+ "loss": 0.7088,
80
+ "num_input_tokens_seen": 18874368,
81
+ "step": 9
82
+ },
83
+ {
84
+ "epoch": 0.8163265306122449,
85
+ "grad_norm": 0.3112427592277527,
86
+ "learning_rate": 3.3493649053890326e-06,
87
+ "loss": 0.7054,
88
+ "num_input_tokens_seen": 20971520,
89
+ "step": 10
90
+ },
91
+ {
92
+ "epoch": 0.8979591836734694,
93
+ "grad_norm": 0.29755619168281555,
94
+ "learning_rate": 8.51854342773295e-07,
95
+ "loss": 0.7032,
96
+ "num_input_tokens_seen": 23068672,
97
+ "step": 11
98
+ },
99
+ {
100
+ "epoch": 0.9795918367346939,
101
+ "grad_norm": 0.30344462394714355,
102
+ "learning_rate": 0.0,
103
+ "loss": 0.7144,
104
+ "num_input_tokens_seen": 25165824,
105
+ "step": 12
106
+ },
107
+ {
108
+ "epoch": 0.9795918367346939,
109
+ "num_input_tokens_seen": 25165824,
110
+ "step": 12,
111
+ "total_flos": 9.800984115271434e+17,
112
+ "train_loss": 0.7565137396256129,
113
+ "train_runtime": 1221.0904,
114
+ "train_samples_per_second": 5.136,
115
+ "train_steps_per_second": 0.01
116
+ }
117
+ ],
118
+ "logging_steps": 1,
119
+ "max_steps": 12,
120
+ "num_input_tokens_seen": 25165824,
121
+ "num_train_epochs": 1,
122
+ "save_steps": 500,
123
+ "stateful_callbacks": {
124
+ "TrainerControl": {
125
+ "args": {
126
+ "should_epoch_stop": false,
127
+ "should_evaluate": false,
128
+ "should_log": false,
129
+ "should_save": true,
130
+ "should_training_stop": true
131
+ },
132
+ "attributes": {}
133
+ }
134
+ },
135
+ "total_flos": 9.800984115271434e+17,
136
+ "train_batch_size": 16,
137
+ "trial_name": null,
138
+ "trial_params": null
139
+ }
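
trainer_state.json carries the same per-step loss history that training_loss.png (added at the end of this commit) plots; it can be replotted directly. A minimal sketch, assuming the file is available locally (not the script that produced the repo's figure):

```python
# Re-plot the training loss curve from trainer_state.json.
import json
import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

entries = [e for e in state["log_history"] if "loss" in e]  # skip the final summary entry
steps = [e["step"] for e in entries]
losses = [e["loss"] for e in entries]

plt.plot(steps, losses, marker="o")
plt.xlabel("optimization step")
plt.ylabel("training loss")
plt.title("deepseek_nsx_8_1 fine-tuning loss")
plt.savefig("training_loss_replot.png")
```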
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8f8b77590e8f79fc02f78a48259c2e8ebd4d92de94debe9fdb2a985d8788b984
+ size 5688
training_args.yaml ADDED
@@ -0,0 +1,38 @@
+ apollo_rank: 256
+ apollo_scale: 1
+ apollo_target: all
+ apollo_update_interval: 200
+ bf16: true
+ cutoff_len: 4096
+ dataset: codes_nsx_over81
+ dataset_dir: data
+ ddp_timeout: 180000000
+ do_train: true
+ enable_liger_kernel: true
+ finetuning_type: freeze
+ flash_attn: auto
+ freeze_trainable_layers: 2
+ freeze_trainable_modules: all
+ gradient_accumulation_steps: 8
+ include_num_input_tokens_seen: true
+ learning_rate: 5.0e-05
+ logging_steps: 1
+ lr_scheduler_type: cosine
+ max_grad_norm: 1.0
+ max_samples: 50000000
+ model_name_or_path: deepseek-ai/deepseek-coder-7b-instruct-v1.5
+ neat_packing: true
+ num_train_epochs: 1.0
+ output_dir: saves/DeepSeek-Coder-7B-Instruct/freeze/deepseek_nsx_8_1
+ packing: true
+ per_device_train_batch_size: 16
+ plot_loss: true
+ preprocessing_num_workers: 16
+ report_to: none
+ rope_scaling: llama3
+ save_steps: 500
+ stage: sft
+ template: deepseekcoder
+ trust_remote_code: true
+ use_apollo: true
+ warmup_steps: 0
training_loss.png ADDED