bharati2324 committed on
Commit
3dc05d0
·
verified ·
1 Parent(s): 0b0c74f

Training in progress, step 400, checkpoint

Browse files
checkpoint-400/README.md CHANGED
@@ -199,4 +199,4 @@ Carbon emissions can be estimated using the [Machine Learning Impact calculator]
199
  [More Information Needed]
200
  ### Framework versions
201
 
202
- - PEFT 0.13.2
 
199
  [More Information Needed]
200
  ### Framework versions
201
 
202
+ - PEFT 0.14.0
checkpoint-400/adapter_config.json CHANGED
@@ -3,6 +3,8 @@
3
  "auto_mapping": null,
4
  "base_model_name_or_path": "unsloth/Llama-3.2-1B-Instruct",
5
  "bias": "none",
 
 
6
  "fan_in_fan_out": false,
7
  "inference_mode": true,
8
  "init_lora_weights": true,
@@ -11,6 +13,7 @@
11
  "layers_to_transform": null,
12
  "loftq_config": {},
13
  "lora_alpha": 16,
 
14
  "lora_dropout": 0,
15
  "megatron_config": null,
16
  "megatron_core": "megatron.core",
@@ -20,9 +23,13 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "up_proj",
 
 
24
  "down_proj",
25
- "gate_proj"
 
 
26
  ],
27
  "task_type": "CAUSAL_LM",
28
  "use_dora": false,
 
3
  "auto_mapping": null,
4
  "base_model_name_or_path": "unsloth/Llama-3.2-1B-Instruct",
5
  "bias": "none",
6
+ "eva_config": null,
7
+ "exclude_modules": null,
8
  "fan_in_fan_out": false,
9
  "inference_mode": true,
10
  "init_lora_weights": true,
 
13
  "layers_to_transform": null,
14
  "loftq_config": {},
15
  "lora_alpha": 16,
16
+ "lora_bias": false,
17
  "lora_dropout": 0,
18
  "megatron_config": null,
19
  "megatron_core": "megatron.core",
 
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
+ "q_proj",
27
+ "o_proj",
28
+ "v_proj",
29
  "down_proj",
30
+ "up_proj",
31
+ "gate_proj",
32
+ "k_proj"
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
checkpoint-400/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9531e392f57d682e9df532da57890a1bd7da38d148a740b873837538803d871d
3
- size 31469800
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b252a7207ca6e4d8ec5a01d39855fbcd6089aaf3e443515c903c21aff667eb1
3
+ size 45118424
checkpoint-400/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8c8dfdb4d4cd9f771845d30ca2a97e0f484b5472dc0f9641e55e968fb18f2517
3
- size 16089082
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a3fa4ab8e334a8e854b6d9f6bc7484419951d9e0294c5838d891b2594dfa19a
3
+ size 23159546
checkpoint-400/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:614e91b565c762accb7eb6cf425f4e176bceb16ec6adc15e8ac14171894c88b4
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa7345d0af93cc516d093cf07a0ee13b062df879c9c662d4d88703be4ea3a5e9
3
  size 1064
checkpoint-400/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.4878048780487805,
5
  "eval_steps": 500,
6
  "global_step": 400,
7
  "is_hyper_param_search": false,
@@ -9,148 +9,148 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.024390243902439025,
13
- "grad_norm": 0.33140748739242554,
14
- "learning_rate": 0.00019631901840490797,
15
- "loss": 2.6356,
16
  "step": 20
17
  },
18
  {
19
- "epoch": 0.04878048780487805,
20
- "grad_norm": 0.26923373341560364,
21
- "learning_rate": 0.0001914110429447853,
22
- "loss": 2.3438,
23
  "step": 40
24
  },
25
  {
26
- "epoch": 0.07317073170731707,
27
- "grad_norm": 0.259231835603714,
28
- "learning_rate": 0.00018650306748466258,
29
- "loss": 2.3363,
30
  "step": 60
31
  },
32
  {
33
- "epoch": 0.0975609756097561,
34
- "grad_norm": 0.2962253987789154,
35
- "learning_rate": 0.00018159509202453987,
36
- "loss": 2.278,
37
  "step": 80
38
  },
39
  {
40
- "epoch": 0.12195121951219512,
41
- "grad_norm": 0.2886357605457306,
42
- "learning_rate": 0.0001766871165644172,
43
- "loss": 2.2543,
44
  "step": 100
45
  },
46
  {
47
- "epoch": 0.14634146341463414,
48
- "grad_norm": 0.40607205033302307,
49
- "learning_rate": 0.0001717791411042945,
50
- "loss": 2.2058,
51
  "step": 120
52
  },
53
  {
54
- "epoch": 0.17073170731707318,
55
- "grad_norm": 0.4145870804786682,
56
- "learning_rate": 0.00016687116564417177,
57
- "loss": 2.3036,
58
  "step": 140
59
  },
60
  {
61
- "epoch": 0.1951219512195122,
62
- "grad_norm": 0.2872335612773895,
63
- "learning_rate": 0.00016196319018404909,
64
- "loss": 2.1944,
65
  "step": 160
66
  },
67
  {
68
- "epoch": 0.21951219512195122,
69
- "grad_norm": 0.4880731403827667,
70
- "learning_rate": 0.0001570552147239264,
71
- "loss": 2.1981,
72
  "step": 180
73
  },
74
  {
75
- "epoch": 0.24390243902439024,
76
- "grad_norm": 0.3285306394100189,
77
- "learning_rate": 0.0001521472392638037,
78
- "loss": 2.2533,
79
  "step": 200
80
  },
81
  {
82
- "epoch": 0.2682926829268293,
83
- "grad_norm": 0.304231196641922,
84
- "learning_rate": 0.00014723926380368098,
85
- "loss": 2.2148,
86
  "step": 220
87
  },
88
  {
89
- "epoch": 0.2926829268292683,
90
- "grad_norm": 0.26624172925949097,
91
- "learning_rate": 0.00014233128834355828,
92
- "loss": 2.204,
93
  "step": 240
94
  },
95
  {
96
- "epoch": 0.3170731707317073,
97
- "grad_norm": 0.32761627435684204,
98
- "learning_rate": 0.0001374233128834356,
99
- "loss": 2.1843,
100
  "step": 260
101
  },
102
  {
103
- "epoch": 0.34146341463414637,
104
- "grad_norm": 0.2757970094680786,
105
- "learning_rate": 0.00013251533742331288,
106
- "loss": 2.1697,
107
  "step": 280
108
  },
109
  {
110
- "epoch": 0.36585365853658536,
111
- "grad_norm": 0.31138375401496887,
112
- "learning_rate": 0.00012760736196319017,
113
- "loss": 2.188,
114
  "step": 300
115
  },
116
  {
117
- "epoch": 0.3902439024390244,
118
- "grad_norm": 0.31954225897789,
119
- "learning_rate": 0.0001226993865030675,
120
- "loss": 2.181,
121
  "step": 320
122
  },
123
  {
124
- "epoch": 0.4146341463414634,
125
- "grad_norm": 0.2957305908203125,
126
- "learning_rate": 0.0001177914110429448,
127
- "loss": 2.2415,
128
  "step": 340
129
  },
130
  {
131
- "epoch": 0.43902439024390244,
132
- "grad_norm": 0.30089232325553894,
133
- "learning_rate": 0.00011288343558282209,
134
- "loss": 2.1094,
135
  "step": 360
136
  },
137
  {
138
- "epoch": 0.4634146341463415,
139
- "grad_norm": 0.2776000201702118,
140
- "learning_rate": 0.00010797546012269939,
141
- "loss": 2.2362,
142
  "step": 380
143
  },
144
  {
145
- "epoch": 0.4878048780487805,
146
- "grad_norm": 0.3909365236759186,
147
- "learning_rate": 0.0001030674846625767,
148
- "loss": 2.2577,
149
  "step": 400
150
  }
151
  ],
152
  "logging_steps": 20,
153
- "max_steps": 820,
154
  "num_input_tokens_seen": 0,
155
  "num_train_epochs": 1,
156
  "save_steps": 200,
@@ -166,7 +166,7 @@
166
  "attributes": {}
167
  }
168
  },
169
- "total_flos": 9.59851999002624e+16,
170
  "train_batch_size": 2,
171
  "trial_name": null,
172
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.4266666666666667,
5
  "eval_steps": 500,
6
  "global_step": 400,
7
  "is_hyper_param_search": false,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.021333333333333333,
13
+ "grad_norm": 0.46613356471061707,
14
+ "learning_rate": 0.00019678111587982831,
15
+ "loss": 1.2799,
16
  "step": 20
17
  },
18
  {
19
+ "epoch": 0.042666666666666665,
20
+ "grad_norm": 0.3468495309352875,
21
+ "learning_rate": 0.0001924892703862661,
22
+ "loss": 0.8919,
23
  "step": 40
24
  },
25
  {
26
+ "epoch": 0.064,
27
+ "grad_norm": 0.4602198004722595,
28
+ "learning_rate": 0.00018819742489270387,
29
+ "loss": 0.8586,
30
  "step": 60
31
  },
32
  {
33
+ "epoch": 0.08533333333333333,
34
+ "grad_norm": 0.480325311422348,
35
+ "learning_rate": 0.00018390557939914164,
36
+ "loss": 0.7571,
37
  "step": 80
38
  },
39
  {
40
+ "epoch": 0.10666666666666667,
41
+ "grad_norm": 0.30179363489151,
42
+ "learning_rate": 0.00017961373390557942,
43
+ "loss": 0.7793,
44
  "step": 100
45
  },
46
  {
47
+ "epoch": 0.128,
48
+ "grad_norm": 0.3483397364616394,
49
+ "learning_rate": 0.00017532188841201717,
50
+ "loss": 0.7647,
51
  "step": 120
52
  },
53
  {
54
+ "epoch": 0.14933333333333335,
55
+ "grad_norm": 0.29965728521347046,
56
+ "learning_rate": 0.00017103004291845494,
57
+ "loss": 0.6741,
58
  "step": 140
59
  },
60
  {
61
+ "epoch": 0.17066666666666666,
62
+ "grad_norm": 0.26644188165664673,
63
+ "learning_rate": 0.00016673819742489272,
64
+ "loss": 0.7586,
65
  "step": 160
66
  },
67
  {
68
+ "epoch": 0.192,
69
+ "grad_norm": 0.2962466776371002,
70
+ "learning_rate": 0.0001624463519313305,
71
+ "loss": 0.7364,
72
  "step": 180
73
  },
74
  {
75
+ "epoch": 0.21333333333333335,
76
+ "grad_norm": 0.36480244994163513,
77
+ "learning_rate": 0.00015815450643776824,
78
+ "loss": 0.7944,
79
  "step": 200
80
  },
81
  {
82
+ "epoch": 0.23466666666666666,
83
+ "grad_norm": 0.23963908851146698,
84
+ "learning_rate": 0.000153862660944206,
85
+ "loss": 0.7055,
86
  "step": 220
87
  },
88
  {
89
+ "epoch": 0.256,
90
+ "grad_norm": 0.3207215666770935,
91
+ "learning_rate": 0.00014957081545064377,
92
+ "loss": 0.7495,
93
  "step": 240
94
  },
95
  {
96
+ "epoch": 0.2773333333333333,
97
+ "grad_norm": 0.34940779209136963,
98
+ "learning_rate": 0.00014527896995708155,
99
+ "loss": 0.7739,
100
  "step": 260
101
  },
102
  {
103
+ "epoch": 0.2986666666666667,
104
+ "grad_norm": 0.2764255702495575,
105
+ "learning_rate": 0.00014098712446351932,
106
+ "loss": 0.7126,
107
  "step": 280
108
  },
109
  {
110
+ "epoch": 0.32,
111
+ "grad_norm": 0.27612850069999695,
112
+ "learning_rate": 0.0001366952789699571,
113
+ "loss": 0.7308,
114
  "step": 300
115
  },
116
  {
117
+ "epoch": 0.3413333333333333,
118
+ "grad_norm": 0.2778555154800415,
119
+ "learning_rate": 0.00013240343347639485,
120
+ "loss": 0.6974,
121
  "step": 320
122
  },
123
  {
124
+ "epoch": 0.3626666666666667,
125
+ "grad_norm": 0.33450472354888916,
126
+ "learning_rate": 0.00012811158798283262,
127
+ "loss": 0.7557,
128
  "step": 340
129
  },
130
  {
131
+ "epoch": 0.384,
132
+ "grad_norm": 0.2894728183746338,
133
+ "learning_rate": 0.0001238197424892704,
134
+ "loss": 0.7222,
135
  "step": 360
136
  },
137
  {
138
+ "epoch": 0.4053333333333333,
139
+ "grad_norm": 0.34458762407302856,
140
+ "learning_rate": 0.00011952789699570816,
141
+ "loss": 0.7467,
142
  "step": 380
143
  },
144
  {
145
+ "epoch": 0.4266666666666667,
146
+ "grad_norm": 0.42291197180747986,
147
+ "learning_rate": 0.00011523605150214594,
148
+ "loss": 0.6999,
149
  "step": 400
150
  }
151
  ],
152
  "logging_steps": 20,
153
+ "max_steps": 937,
154
  "num_input_tokens_seen": 0,
155
  "num_train_epochs": 1,
156
  "save_steps": 200,
 
166
  "attributes": {}
167
  }
168
  },
169
+ "total_flos": 9.63186370019328e+16,
170
  "train_batch_size": 2,
171
  "trial_name": null,
172
  "trial_params": null
checkpoint-400/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e8d1652c64d5701d7542543217cc8eb0453db379d2bc34a961ad431eb6ebe7fc
3
  size 5560
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f519abf615386e0857c941fcb28a9140901798289aceaff057539afc5159bd3d
3
  size 5560