bharati2324 committed (verified)
Commit d39fb79 · 1 Parent(s): fad5619

Training in progress, step 600, checkpoint

checkpoint-600/README.md CHANGED
@@ -199,4 +199,4 @@ Carbon emissions can be estimated using the [Machine Learning Impact calculator]
  [More Information Needed]
  ### Framework versions

- - PEFT 0.13.2
+ - PEFT 0.14.0
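
The only model-card change is the recorded framework version (PEFT 0.13.2 → 0.14.0). That version bump is also what appears to explain the extra keys added in adapter_config.json below (`eva_config`, `exclude_modules`, `lora_bias`), which newer PEFT releases serialize. As a minimal sketch (not part of this commit), the installed version can be checked before reloading the checkpoint:

```python
# Print the locally installed PEFT version; this checkpoint's config files were
# written by PEFT 0.14.0, so 0.14.0 or newer is the safe choice for reloading.
import peft

print(peft.__version__)
```
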
checkpoint-600/adapter_config.json CHANGED
@@ -3,6 +3,8 @@
  "auto_mapping": null,
  "base_model_name_or_path": "unsloth/Llama-3.2-1B-Instruct",
  "bias": "none",
+ "eva_config": null,
+ "exclude_modules": null,
  "fan_in_fan_out": false,
  "inference_mode": true,
  "init_lora_weights": true,
@@ -11,6 +13,7 @@
  "layers_to_transform": null,
  "loftq_config": {},
  "lora_alpha": 16,
+ "lora_bias": false,
  "lora_dropout": 0,
  "megatron_config": null,
  "megatron_core": "megatron.core",
@@ -20,9 +23,13 @@
  "rank_pattern": {},
  "revision": null,
  "target_modules": [
- "up_proj",
+ "q_proj",
+ "o_proj",
+ "v_proj",
  "down_proj",
- "gate_proj"
+ "up_proj",
+ "gate_proj",
+ "k_proj"
  ],
  "task_type": "CAUSAL_LM",
  "use_dora": false,
checkpoint-600/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:6d55511db55c22e9481cdb62e936cd07e378ba9c5435ba8bdc16485fefb63f2e
- size 31469800
+ oid sha256:7e4abd84eb501a43e0be823bf73473d2641b4e716bfd5d73bb82757c8d193555
+ size 45118424
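
The adapter file grows from 31,469,800 to 45,118,424 bytes, which is consistent with LoRA matrices now being saved for seven projection types instead of three. A back-of-the-envelope check, assuming 16-bit weights and the usual Llama-3.2-1B shapes (hidden size 2048, MLP size 8192, grouped-query KV width 512; these shapes are assumptions, not read from this commit):

```python
# Rough size check for adapter_model.safetensors: bytes / 2 ≈ 16-bit parameter count.
old_bytes, new_bytes = 31_469_800, 45_118_424
print(f"old ≈ {old_bytes / 2 / 1e6:.1f}M params, new ≈ {new_bytes / 2 / 1e6:.1f}M params")

# LoRA adds roughly r * (d_in + d_out) parameters per wrapped projection, so the
# growth ratio should match (MLP + attention dims) / (MLP dims alone).
mlp = 3 * (2048 + 8192)                       # gate_proj / up_proj / down_proj (old set)
attn = 2 * (2048 + 2048) + 2 * (2048 + 512)   # q_proj, o_proj plus k_proj, v_proj (added)
print(f"expected growth ≈ {(mlp + attn) / mlp:.2f}x, observed ≈ {new_bytes / old_bytes:.2f}x")
```
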
checkpoint-600/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:e90561c4d9affdace9ed78167aba765dbd349020c201815e4e1a845547f85c94
- size 16089082
+ oid sha256:93326e1274960d6decb84243e3e57fd46599fd57647309c56240c686aacce6bc
+ size 23159546
checkpoint-600/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:ae95b990300e394bedcc204f87da7868a72ef45d2c286d15335be5fed4753224
+ oid sha256:8e6ab2b1312663c255ba7ae61be8a65664cb48e4e09398877c3783034333efeb
  size 1064
checkpoint-600/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 0.7317073170731707,
+ "epoch": 0.64,
  "eval_steps": 500,
  "global_step": 600,
  "is_hyper_param_search": false,
@@ -9,218 +9,218 @@
  "is_world_process_zero": true,
  "log_history": [
  {
- "epoch": 0.024390243902439025,
- "grad_norm": 0.33140748739242554,
- "learning_rate": 0.00019631901840490797,
- "loss": 2.6356,
+ "epoch": 0.021333333333333333,
+ "grad_norm": 0.46613356471061707,
+ "learning_rate": 0.00019678111587982831,
+ "loss": 1.2799,
  "step": 20
  },
  {
- "epoch": 0.04878048780487805,
- "grad_norm": 0.26923373341560364,
- "learning_rate": 0.0001914110429447853,
- "loss": 2.3438,
+ "epoch": 0.042666666666666665,
+ "grad_norm": 0.3468495309352875,
+ "learning_rate": 0.0001924892703862661,
+ "loss": 0.8919,
  "step": 40
  },
  {
- "epoch": 0.07317073170731707,
- "grad_norm": 0.259231835603714,
- "learning_rate": 0.00018650306748466258,
- "loss": 2.3363,
+ "epoch": 0.064,
+ "grad_norm": 0.4602198004722595,
+ "learning_rate": 0.00018819742489270387,
+ "loss": 0.8586,
  "step": 60
  },
  {
- "epoch": 0.0975609756097561,
- "grad_norm": 0.2962253987789154,
- "learning_rate": 0.00018159509202453987,
- "loss": 2.278,
+ "epoch": 0.08533333333333333,
+ "grad_norm": 0.480325311422348,
+ "learning_rate": 0.00018390557939914164,
+ "loss": 0.7571,
  "step": 80
  },
  {
- "epoch": 0.12195121951219512,
- "grad_norm": 0.2886357605457306,
- "learning_rate": 0.0001766871165644172,
- "loss": 2.2543,
+ "epoch": 0.10666666666666667,
+ "grad_norm": 0.30179363489151,
+ "learning_rate": 0.00017961373390557942,
+ "loss": 0.7793,
  "step": 100
  },
  {
- "epoch": 0.14634146341463414,
- "grad_norm": 0.40607205033302307,
- "learning_rate": 0.0001717791411042945,
- "loss": 2.2058,
+ "epoch": 0.128,
+ "grad_norm": 0.3483397364616394,
+ "learning_rate": 0.00017532188841201717,
+ "loss": 0.7647,
  "step": 120
  },
  {
- "epoch": 0.17073170731707318,
- "grad_norm": 0.4145870804786682,
- "learning_rate": 0.00016687116564417177,
- "loss": 2.3036,
+ "epoch": 0.14933333333333335,
+ "grad_norm": 0.29965728521347046,
+ "learning_rate": 0.00017103004291845494,
+ "loss": 0.6741,
  "step": 140
  },
  {
- "epoch": 0.1951219512195122,
- "grad_norm": 0.2872335612773895,
- "learning_rate": 0.00016196319018404909,
- "loss": 2.1944,
+ "epoch": 0.17066666666666666,
+ "grad_norm": 0.26644188165664673,
+ "learning_rate": 0.00016673819742489272,
+ "loss": 0.7586,
  "step": 160
  },
  {
- "epoch": 0.21951219512195122,
- "grad_norm": 0.4880731403827667,
- "learning_rate": 0.0001570552147239264,
- "loss": 2.1981,
+ "epoch": 0.192,
+ "grad_norm": 0.2962466776371002,
+ "learning_rate": 0.0001624463519313305,
+ "loss": 0.7364,
  "step": 180
  },
  {
- "epoch": 0.24390243902439024,
- "grad_norm": 0.3285306394100189,
- "learning_rate": 0.0001521472392638037,
- "loss": 2.2533,
+ "epoch": 0.21333333333333335,
+ "grad_norm": 0.36480244994163513,
+ "learning_rate": 0.00015815450643776824,
+ "loss": 0.7944,
  "step": 200
  },
  {
- "epoch": 0.2682926829268293,
- "grad_norm": 0.304231196641922,
- "learning_rate": 0.00014723926380368098,
- "loss": 2.2148,
+ "epoch": 0.23466666666666666,
+ "grad_norm": 0.23963908851146698,
+ "learning_rate": 0.000153862660944206,
+ "loss": 0.7055,
  "step": 220
  },
  {
- "epoch": 0.2926829268292683,
- "grad_norm": 0.26624172925949097,
- "learning_rate": 0.00014233128834355828,
- "loss": 2.204,
+ "epoch": 0.256,
+ "grad_norm": 0.3207215666770935,
+ "learning_rate": 0.00014957081545064377,
+ "loss": 0.7495,
  "step": 240
  },
  {
- "epoch": 0.3170731707317073,
- "grad_norm": 0.32761627435684204,
- "learning_rate": 0.0001374233128834356,
- "loss": 2.1843,
+ "epoch": 0.2773333333333333,
+ "grad_norm": 0.34940779209136963,
+ "learning_rate": 0.00014527896995708155,
+ "loss": 0.7739,
  "step": 260
  },
  {
- "epoch": 0.34146341463414637,
- "grad_norm": 0.2757970094680786,
- "learning_rate": 0.00013251533742331288,
- "loss": 2.1697,
+ "epoch": 0.2986666666666667,
+ "grad_norm": 0.2764255702495575,
+ "learning_rate": 0.00014098712446351932,
+ "loss": 0.7126,
  "step": 280
  },
  {
- "epoch": 0.36585365853658536,
- "grad_norm": 0.31138375401496887,
- "learning_rate": 0.00012760736196319017,
- "loss": 2.188,
+ "epoch": 0.32,
+ "grad_norm": 0.27612850069999695,
+ "learning_rate": 0.0001366952789699571,
+ "loss": 0.7308,
  "step": 300
  },
  {
- "epoch": 0.3902439024390244,
- "grad_norm": 0.31954225897789,
- "learning_rate": 0.0001226993865030675,
- "loss": 2.181,
+ "epoch": 0.3413333333333333,
+ "grad_norm": 0.2778555154800415,
+ "learning_rate": 0.00013240343347639485,
+ "loss": 0.6974,
  "step": 320
  },
  {
- "epoch": 0.4146341463414634,
- "grad_norm": 0.2957305908203125,
- "learning_rate": 0.0001177914110429448,
- "loss": 2.2415,
+ "epoch": 0.3626666666666667,
+ "grad_norm": 0.33450472354888916,
+ "learning_rate": 0.00012811158798283262,
+ "loss": 0.7557,
  "step": 340
  },
  {
- "epoch": 0.43902439024390244,
- "grad_norm": 0.30089232325553894,
- "learning_rate": 0.00011288343558282209,
- "loss": 2.1094,
+ "epoch": 0.384,
+ "grad_norm": 0.2894728183746338,
+ "learning_rate": 0.0001238197424892704,
+ "loss": 0.7222,
  "step": 360
  },
  {
- "epoch": 0.4634146341463415,
- "grad_norm": 0.2776000201702118,
- "learning_rate": 0.00010797546012269939,
- "loss": 2.2362,
+ "epoch": 0.4053333333333333,
+ "grad_norm": 0.34458762407302856,
+ "learning_rate": 0.00011952789699570816,
+ "loss": 0.7467,
  "step": 380
  },
  {
- "epoch": 0.4878048780487805,
- "grad_norm": 0.3909365236759186,
- "learning_rate": 0.0001030674846625767,
- "loss": 2.2577,
+ "epoch": 0.4266666666666667,
+ "grad_norm": 0.42291197180747986,
+ "learning_rate": 0.00011523605150214594,
+ "loss": 0.6999,
  "step": 400
  },
  {
- "epoch": 0.5121951219512195,
- "grad_norm": 0.3422461748123169,
- "learning_rate": 9.815950920245399e-05,
- "loss": 2.2153,
+ "epoch": 0.448,
+ "grad_norm": 0.3315523862838745,
+ "learning_rate": 0.0001109442060085837,
+ "loss": 0.7554,
  "step": 420
  },
  {
- "epoch": 0.5365853658536586,
- "grad_norm": 0.32766982913017273,
- "learning_rate": 9.325153374233129e-05,
- "loss": 2.1739,
+ "epoch": 0.4693333333333333,
+ "grad_norm": 0.3429376482963562,
+ "learning_rate": 0.00010665236051502145,
+ "loss": 0.7347,
  "step": 440
  },
  {
- "epoch": 0.5609756097560976,
- "grad_norm": 0.3100208640098572,
- "learning_rate": 8.83435582822086e-05,
- "loss": 2.2126,
+ "epoch": 0.49066666666666664,
+ "grad_norm": 0.35784032940864563,
+ "learning_rate": 0.00010236051502145923,
+ "loss": 0.7075,
  "step": 460
  },
  {
- "epoch": 0.5853658536585366,
- "grad_norm": 0.368093878030777,
- "learning_rate": 8.343558282208588e-05,
- "loss": 2.1664,
+ "epoch": 0.512,
+ "grad_norm": 0.3431866765022278,
+ "learning_rate": 9.8068669527897e-05,
+ "loss": 0.7338,
  "step": 480
  },
  {
- "epoch": 0.6097560975609756,
- "grad_norm": 0.2647826671600342,
- "learning_rate": 7.85276073619632e-05,
- "loss": 2.2018,
+ "epoch": 0.5333333333333333,
+ "grad_norm": 0.34916117787361145,
+ "learning_rate": 9.377682403433476e-05,
+ "loss": 0.727,
  "step": 500
  },
  {
- "epoch": 0.6341463414634146,
- "grad_norm": 0.3373850882053375,
- "learning_rate": 7.361963190184049e-05,
- "loss": 2.1439,
+ "epoch": 0.5546666666666666,
+ "grad_norm": 0.39798951148986816,
+ "learning_rate": 8.948497854077254e-05,
+ "loss": 0.7505,
  "step": 520
  },
  {
- "epoch": 0.6585365853658537,
- "grad_norm": 0.3536331355571747,
- "learning_rate": 6.87116564417178e-05,
- "loss": 2.1583,
+ "epoch": 0.576,
+ "grad_norm": 0.33197829127311707,
+ "learning_rate": 8.51931330472103e-05,
+ "loss": 0.7085,
  "step": 540
  },
  {
- "epoch": 0.6829268292682927,
- "grad_norm": 0.3301340341567993,
- "learning_rate": 6.380368098159509e-05,
- "loss": 2.179,
+ "epoch": 0.5973333333333334,
+ "grad_norm": 0.31665658950805664,
+ "learning_rate": 8.090128755364808e-05,
+ "loss": 0.699,
  "step": 560
  },
  {
- "epoch": 0.7073170731707317,
- "grad_norm": 0.3494204878807068,
- "learning_rate": 5.88957055214724e-05,
- "loss": 2.1753,
+ "epoch": 0.6186666666666667,
+ "grad_norm": 0.3004639744758606,
+ "learning_rate": 7.660944206008584e-05,
+ "loss": 0.7368,
  "step": 580
  },
  {
- "epoch": 0.7317073170731707,
- "grad_norm": 0.3503289520740509,
- "learning_rate": 5.3987730061349695e-05,
- "loss": 2.1724,
+ "epoch": 0.64,
+ "grad_norm": 0.32609978318214417,
+ "learning_rate": 7.23175965665236e-05,
+ "loss": 0.7168,
  "step": 600
  }
  ],
  "logging_steps": 20,
- "max_steps": 820,
+ "max_steps": 937,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 200,
@@ -236,7 +236,7 @@
  "attributes": {}
  }
  },
- "total_flos": 1.439777998503936e+17,
+ "total_flos": 1.444779555028992e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
checkpoint-600/training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:e8d1652c64d5701d7542543217cc8eb0453db379d2bc34a961ad431eb6ebe7fc
+ oid sha256:f519abf615386e0857c941fcb28a9140901798289aceaff057539afc5159bd3d
  size 5560