error577 committed
Commit 497cf2b · verified · 1 Parent(s): 6b32b03

Training in progress, step 189, checkpoint

last-checkpoint/adapter_config.json CHANGED
@@ -20,13 +20,13 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "v_proj",
24
- "gate_proj",
25
  "o_proj",
26
  "q_proj",
27
  "down_proj",
28
- "up_proj",
29
- "k_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
+ "up_proj",
24
+ "k_proj",
25
  "o_proj",
26
  "q_proj",
27
+ "gate_proj",
28
  "down_proj",
29
+ "v_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:edd65420b862e61429ff861bbea459eea1f76c5ea075342be0906d19981cf945
3
  size 30026872
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ffa3fbe172594fa81874fb5432c66f1b77eedc91fa66bf84d31baa09c65c39a
3
  size 30026872
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3cec3656a6fd13d3cbd4b1b68f333291d23d3d1c324d87d87923bc8e4d234296
3
  size 15611412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1fdea71780448dacfcff61d8446a676493035a43a9e7dd4d1906153dce3b04b
3
  size 15611412
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b84ff11766f1301a428c8db7102a6c637326291ad08a682caba1cafb5929e758
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abcfe02bdad7747d2c3c0c8f55623c1482ce271854d294109599bf6292cf696b
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6eed27652e01d7de28ec286f20e81c34433ab622c5447b261848a9fb6d6bd739
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fbceb056846578e9bbec42a2f09fff452739d7101bfebf65236834058dbeb39a
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,312 +1,1390 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.2987481434330575,
5
  "eval_steps": 32,
6
- "global_step": 176,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.001697432633142372,
13
  "eval_loss": 2.4177019596099854,
14
- "eval_runtime": 13.7299,
15
- "eval_samples_per_second": 18.136,
16
- "eval_steps_per_second": 18.136,
17
  "step": 1
18
  },
19
  {
20
  "epoch": 0.008487163165711862,
21
- "grad_norm": 0.5580189228057861,
22
  "learning_rate": 5e-06,
23
- "loss": 2.0835,
24
  "step": 5
25
  },
26
  {
27
  "epoch": 0.016974326331423723,
28
- "grad_norm": 0.5617932081222534,
29
  "learning_rate": 1e-05,
30
- "loss": 2.3638,
31
  "step": 10
32
  },
33
  {
34
  "epoch": 0.025461489497135583,
35
- "grad_norm": 0.637174129486084,
36
  "learning_rate": 9.99743108100344e-06,
37
- "loss": 2.3443,
38
  "step": 15
39
  },
40
  {
41
  "epoch": 0.033948652662847446,
42
- "grad_norm": 0.7906777858734131,
43
  "learning_rate": 9.989726963751683e-06,
44
- "loss": 2.4875,
45
  "step": 20
46
  },
47
  {
48
  "epoch": 0.042435815828559306,
49
- "grad_norm": 0.7220119833946228,
50
  "learning_rate": 9.976895564745993e-06,
51
- "loss": 2.2905,
52
  "step": 25
53
  },
54
  {
55
  "epoch": 0.050922978994271166,
56
- "grad_norm": 0.4569860100746155,
57
  "learning_rate": 9.95895006911623e-06,
58
- "loss": 2.8207,
59
  "step": 30
60
  },
61
  {
62
  "epoch": 0.05431784426055591,
63
- "eval_loss": 2.396263599395752,
64
- "eval_runtime": 13.7534,
65
- "eval_samples_per_second": 18.105,
66
- "eval_steps_per_second": 18.105,
67
  "step": 32
68
  },
69
  {
70
  "epoch": 0.059410142159983026,
71
- "grad_norm": 0.5562223196029663,
72
  "learning_rate": 9.935908917072253e-06,
73
- "loss": 2.3774,
74
  "step": 35
75
  },
76
  {
77
  "epoch": 0.06789730532569489,
78
- "grad_norm": 0.8851813077926636,
79
  "learning_rate": 9.907795784955327e-06,
80
- "loss": 2.3059,
81
  "step": 40
82
  },
83
  {
84
  "epoch": 0.07638446849140675,
85
- "grad_norm": 0.8263425827026367,
86
  "learning_rate": 9.874639560909118e-06,
87
- "loss": 2.2858,
88
  "step": 45
89
  },
90
  {
91
  "epoch": 0.08487163165711861,
92
- "grad_norm": 0.9496198296546936,
93
  "learning_rate": 9.836474315195148e-06,
94
- "loss": 2.2606,
95
  "step": 50
96
  },
97
  {
98
  "epoch": 0.09335879482283047,
99
- "grad_norm": 0.8389888405799866,
100
  "learning_rate": 9.793339265183303e-06,
101
- "loss": 2.4757,
102
  "step": 55
103
  },
104
  {
105
  "epoch": 0.10184595798854233,
106
- "grad_norm": 0.9090803861618042,
107
  "learning_rate": 9.745278735053345e-06,
108
- "loss": 2.2428,
109
  "step": 60
110
  },
111
  {
112
  "epoch": 0.10863568852111181,
113
- "eval_loss": 2.3315982818603516,
114
- "eval_runtime": 13.995,
115
- "eval_samples_per_second": 17.792,
116
- "eval_steps_per_second": 17.792,
117
  "step": 64
118
  },
119
  {
120
  "epoch": 0.11033312115425419,
121
- "grad_norm": 0.8944710493087769,
122
  "learning_rate": 9.692342110248802e-06,
123
- "loss": 2.361,
124
  "step": 65
125
  },
126
  {
127
  "epoch": 0.11882028431996605,
128
- "grad_norm": 0.8705277442932129,
129
  "learning_rate": 9.63458378673011e-06,
130
- "loss": 2.2061,
131
  "step": 70
132
  },
133
  {
134
  "epoch": 0.1273074474856779,
135
- "grad_norm": 1.0183981657028198,
136
  "learning_rate": 9.572063115079063e-06,
137
- "loss": 2.3014,
138
  "step": 75
139
  },
140
  {
141
  "epoch": 0.13579461065138979,
142
- "grad_norm": 0.9694010615348816,
143
  "learning_rate": 9.504844339512096e-06,
144
- "loss": 2.4273,
145
  "step": 80
146
  },
147
  {
148
  "epoch": 0.14428177381710164,
149
- "grad_norm": 0.6600094437599182,
150
  "learning_rate": 9.432996531865001e-06,
151
- "loss": 2.2039,
152
  "step": 85
153
  },
154
  {
155
  "epoch": 0.1527689369828135,
156
- "grad_norm": 1.437016487121582,
157
  "learning_rate": 9.356593520616948e-06,
158
- "loss": 2.4129,
159
  "step": 90
160
  },
161
  {
162
  "epoch": 0.16125610014852534,
163
- "grad_norm": 1.1358604431152344,
164
  "learning_rate": 9.275713815026732e-06,
165
- "loss": 2.2346,
166
  "step": 95
167
  },
168
  {
169
  "epoch": 0.16295353278166771,
170
- "eval_loss": 2.2801592350006104,
171
- "eval_runtime": 14.3531,
172
- "eval_samples_per_second": 17.348,
173
- "eval_steps_per_second": 17.348,
174
  "step": 96
175
  },
176
  {
177
  "epoch": 0.16974326331423722,
178
- "grad_norm": 0.8347494006156921,
179
  "learning_rate": 9.190440524459203e-06,
180
- "loss": 2.5003,
181
  "step": 100
182
  },
183
  {
184
  "epoch": 0.17823042647994908,
185
- "grad_norm": 0.9528422355651855,
186
  "learning_rate": 9.10086127298478e-06,
187
- "loss": 2.2398,
188
  "step": 105
189
  },
190
  {
191
  "epoch": 0.18671758964566093,
192
- "grad_norm": 0.7451781630516052,
193
  "learning_rate": 9.007068109339783e-06,
194
- "loss": 2.253,
195
  "step": 110
196
  },
197
  {
198
  "epoch": 0.1952047528113728,
199
- "grad_norm": 0.6891763210296631,
200
  "learning_rate": 8.90915741234015e-06,
201
- "loss": 2.0703,
202
  "step": 115
203
  },
204
  {
205
  "epoch": 0.20369191597708466,
206
- "grad_norm": 0.7363041639328003,
207
  "learning_rate": 8.807229791845673e-06,
208
- "loss": 2.3083,
209
  "step": 120
210
  },
211
  {
212
  "epoch": 0.21217907914279652,
213
- "grad_norm": 0.7747501730918884,
214
  "learning_rate": 8.701389985376578e-06,
215
- "loss": 2.2058,
216
  "step": 125
217
  },
218
  {
219
  "epoch": 0.21727137704222363,
220
- "eval_loss": 2.245945692062378,
221
- "eval_runtime": 14.3131,
222
- "eval_samples_per_second": 17.397,
223
- "eval_steps_per_second": 17.397,
224
  "step": 128
225
  },
226
  {
227
  "epoch": 0.22066624230850837,
228
- "grad_norm": 0.8016952276229858,
229
  "learning_rate": 8.591746750488639e-06,
230
- "loss": 2.1023,
231
  "step": 130
232
  },
233
  {
234
  "epoch": 0.22915340547422025,
235
- "grad_norm": 1.4869773387908936,
236
  "learning_rate": 8.478412753017433e-06,
237
- "loss": 2.2433,
238
  "step": 135
239
  },
240
  {
241
  "epoch": 0.2376405686399321,
242
- "grad_norm": 1.111100673675537,
243
  "learning_rate": 8.361504451306585e-06,
244
- "loss": 2.0758,
245
  "step": 140
246
  },
247
  {
248
  "epoch": 0.24612773180564396,
249
- "grad_norm": 0.7958012819290161,
250
  "learning_rate": 8.241141976538944e-06,
251
- "loss": 2.1178,
252
  "step": 145
253
  },
254
  {
255
  "epoch": 0.2546148949713558,
256
- "grad_norm": 0.6610277891159058,
257
  "learning_rate": 8.117449009293668e-06,
258
- "loss": 2.0289,
259
  "step": 150
260
  },
261
  {
262
  "epoch": 0.26310205813706766,
263
- "grad_norm": 0.9238812923431396,
264
  "learning_rate": 7.99055265245608e-06,
265
- "loss": 2.3871,
266
  "step": 155
267
  },
268
  {
269
  "epoch": 0.27158922130277957,
270
- "grad_norm": 0.9540156126022339,
271
  "learning_rate": 7.860583300610849e-06,
272
- "loss": 2.069,
273
  "step": 160
274
  },
275
  {
276
  "epoch": 0.27158922130277957,
277
- "eval_loss": 2.2213292121887207,
278
- "eval_runtime": 14.5462,
279
- "eval_samples_per_second": 17.118,
280
- "eval_steps_per_second": 17.118,
281
  "step": 160
282
  },
283
  {
284
  "epoch": 0.2800763844684914,
285
- "grad_norm": 1.1984089612960815,
286
  "learning_rate": 7.727674506052744e-06,
287
- "loss": 1.9745,
288
  "step": 165
289
  },
290
  {
291
  "epoch": 0.2885635476342033,
292
- "grad_norm": 0.8632123470306396,
293
  "learning_rate": 7.591962841552627e-06,
294
- "loss": 1.9814,
295
  "step": 170
296
  },
297
  {
298
  "epoch": 0.29705071079991513,
299
- "grad_norm": 0.6884729266166687,
300
  "learning_rate": 7.453587760019691e-06,
301
- "loss": 2.1216,
302
  "step": 175
303
  }
304
  ],
305
- "logging_steps": 5,
306
  "max_steps": 500,
307
  "num_input_tokens_seen": 0,
308
  "num_train_epochs": 1,
309
- "save_steps": 16,
310
  "stateful_callbacks": {
311
  "TrainerControl": {
312
  "args": {
@@ -319,7 +1397,7 @@
319
  "attributes": {}
320
  }
321
  },
322
- "total_flos": 5596099588915200.0,
323
  "train_batch_size": 1,
324
  "trial_name": null,
325
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.3208147676639083,
5
  "eval_steps": 32,
6
+ "global_step": 189,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
+ {
12
+ "epoch": 0.001697432633142372,
13
+ "grad_norm": 0.812809407711029,
14
+ "learning_rate": 1.0000000000000002e-06,
15
+ "loss": 1.8795,
16
+ "step": 1
17
+ },
18
  {
19
  "epoch": 0.001697432633142372,
20
  "eval_loss": 2.4177019596099854,
21
+ "eval_runtime": 13.752,
22
+ "eval_samples_per_second": 18.106,
23
+ "eval_steps_per_second": 18.106,
24
  "step": 1
25
  },
26
+ {
27
+ "epoch": 0.003394865266284744,
28
+ "grad_norm": 0.543270468711853,
29
+ "learning_rate": 2.0000000000000003e-06,
30
+ "loss": 2.0484,
31
+ "step": 2
32
+ },
33
+ {
34
+ "epoch": 0.005092297899427116,
35
+ "grad_norm": 0.7654951214790344,
36
+ "learning_rate": 3e-06,
37
+ "loss": 2.2181,
38
+ "step": 3
39
+ },
40
+ {
41
+ "epoch": 0.006789730532569488,
42
+ "grad_norm": 0.4694255292415619,
43
+ "learning_rate": 4.000000000000001e-06,
44
+ "loss": 1.8391,
45
+ "step": 4
46
+ },
47
  {
48
  "epoch": 0.008487163165711862,
49
+ "grad_norm": 0.5502790212631226,
50
  "learning_rate": 5e-06,
51
+ "loss": 2.4332,
52
  "step": 5
53
  },
54
+ {
55
+ "epoch": 0.010184595798854232,
56
+ "grad_norm": 0.7908716797828674,
57
+ "learning_rate": 6e-06,
58
+ "loss": 2.5234,
59
+ "step": 6
60
+ },
61
+ {
62
+ "epoch": 0.011882028431996604,
63
+ "grad_norm": 0.6904019117355347,
64
+ "learning_rate": 7e-06,
65
+ "loss": 2.1753,
66
+ "step": 7
67
+ },
68
+ {
69
+ "epoch": 0.013579461065138977,
70
+ "grad_norm": 0.534662127494812,
71
+ "learning_rate": 8.000000000000001e-06,
72
+ "loss": 2.4121,
73
+ "step": 8
74
+ },
75
+ {
76
+ "epoch": 0.015276893698281349,
77
+ "grad_norm": 0.7569778561592102,
78
+ "learning_rate": 9e-06,
79
+ "loss": 2.2132,
80
+ "step": 9
81
+ },
82
  {
83
  "epoch": 0.016974326331423723,
84
+ "grad_norm": 0.5512822866439819,
85
  "learning_rate": 1e-05,
86
+ "loss": 2.4866,
87
  "step": 10
88
  },
89
+ {
90
+ "epoch": 0.018671758964566094,
91
+ "grad_norm": 1.0798619985580444,
92
+ "learning_rate": 9.999897234791831e-06,
93
+ "loss": 2.394,
94
+ "step": 11
95
+ },
96
+ {
97
+ "epoch": 0.020369191597708464,
98
+ "grad_norm": 0.9266297221183777,
99
+ "learning_rate": 9.999588943391597e-06,
100
+ "loss": 2.7079,
101
+ "step": 12
102
+ },
103
+ {
104
+ "epoch": 0.02206662423085084,
105
+ "grad_norm": 0.6924354434013367,
106
+ "learning_rate": 9.99907513847195e-06,
107
+ "loss": 2.0942,
108
+ "step": 13
109
+ },
110
+ {
111
+ "epoch": 0.02376405686399321,
112
+ "grad_norm": 0.8434525728225708,
113
+ "learning_rate": 9.9983558411534e-06,
114
+ "loss": 2.441,
115
+ "step": 14
116
+ },
117
  {
118
  "epoch": 0.025461489497135583,
119
+ "grad_norm": 0.6289700269699097,
120
  "learning_rate": 9.99743108100344e-06,
121
+ "loss": 2.0808,
122
  "step": 15
123
  },
124
+ {
125
+ "epoch": 0.027158922130277954,
126
+ "grad_norm": 0.49521392583847046,
127
+ "learning_rate": 9.99630089603534e-06,
128
+ "loss": 2.4327,
129
+ "step": 16
130
+ },
131
+ {
132
+ "epoch": 0.028856354763420328,
133
+ "grad_norm": 0.7913414239883423,
134
+ "learning_rate": 9.994965332706574e-06,
135
+ "loss": 2.3748,
136
+ "step": 17
137
+ },
138
+ {
139
+ "epoch": 0.030553787396562698,
140
+ "grad_norm": 0.8467509746551514,
141
+ "learning_rate": 9.993424445916923e-06,
142
+ "loss": 2.9525,
143
+ "step": 18
144
+ },
145
+ {
146
+ "epoch": 0.03225122002970507,
147
+ "grad_norm": 0.8036819696426392,
148
+ "learning_rate": 9.991678299006206e-06,
149
+ "loss": 2.119,
150
+ "step": 19
151
+ },
152
  {
153
  "epoch": 0.033948652662847446,
154
+ "grad_norm": 0.7837047576904297,
155
  "learning_rate": 9.989726963751683e-06,
156
+ "loss": 2.5605,
157
  "step": 20
158
  },
159
+ {
160
+ "epoch": 0.03564608529598982,
161
+ "grad_norm": 0.6784385442733765,
162
+ "learning_rate": 9.987570520365105e-06,
163
+ "loss": 2.3979,
164
+ "step": 21
165
+ },
166
+ {
167
+ "epoch": 0.03734351792913219,
168
+ "grad_norm": 0.47587576508522034,
169
+ "learning_rate": 9.98520905748941e-06,
170
+ "loss": 2.255,
171
+ "step": 22
172
+ },
173
+ {
174
+ "epoch": 0.03904095056227456,
175
+ "grad_norm": 0.8532634377479553,
176
+ "learning_rate": 9.982642672195093e-06,
177
+ "loss": 2.1105,
178
+ "step": 23
179
+ },
180
+ {
181
+ "epoch": 0.04073838319541693,
182
+ "grad_norm": 0.9278027415275574,
183
+ "learning_rate": 9.979871469976197e-06,
184
+ "loss": 2.2521,
185
+ "step": 24
186
+ },
187
  {
188
  "epoch": 0.042435815828559306,
189
+ "grad_norm": 0.7016995549201965,
190
  "learning_rate": 9.976895564745993e-06,
191
+ "loss": 2.4353,
192
  "step": 25
193
  },
194
+ {
195
+ "epoch": 0.04413324846170168,
196
+ "grad_norm": 0.9779638051986694,
197
+ "learning_rate": 9.973715078832288e-06,
198
+ "loss": 2.1199,
199
+ "step": 26
200
+ },
201
+ {
202
+ "epoch": 0.04583068109484405,
203
+ "grad_norm": 1.0116640329360962,
204
+ "learning_rate": 9.970330142972403e-06,
205
+ "loss": 2.4153,
206
+ "step": 27
207
+ },
208
+ {
209
+ "epoch": 0.04752811372798642,
210
+ "grad_norm": 0.7033591866493225,
211
+ "learning_rate": 9.966740896307791e-06,
212
+ "loss": 2.2794,
213
+ "step": 28
214
+ },
215
+ {
216
+ "epoch": 0.049225546361128796,
217
+ "grad_norm": 0.45710867643356323,
218
+ "learning_rate": 9.962947486378325e-06,
219
+ "loss": 3.6342,
220
+ "step": 29
221
+ },
222
  {
223
  "epoch": 0.050922978994271166,
224
+ "grad_norm": 0.45169001817703247,
225
  "learning_rate": 9.95895006911623e-06,
226
+ "loss": 3.6554,
227
  "step": 30
228
  },
229
+ {
230
+ "epoch": 0.05262041162741354,
231
+ "grad_norm": 0.8629238605499268,
232
+ "learning_rate": 9.954748808839675e-06,
233
+ "loss": 2.4657,
234
+ "step": 31
235
+ },
236
+ {
237
+ "epoch": 0.05431784426055591,
238
+ "grad_norm": 1.053633451461792,
239
+ "learning_rate": 9.950343878246011e-06,
240
+ "loss": 2.5273,
241
+ "step": 32
242
+ },
243
  {
244
  "epoch": 0.05431784426055591,
245
+ "eval_loss": 2.396038293838501,
246
+ "eval_runtime": 13.7517,
247
+ "eval_samples_per_second": 18.107,
248
+ "eval_steps_per_second": 18.107,
249
  "step": 32
250
  },
251
+ {
252
+ "epoch": 0.056015276893698285,
253
+ "grad_norm": 1.0445197820663452,
254
+ "learning_rate": 9.945735458404681e-06,
255
+ "loss": 2.2506,
256
+ "step": 33
257
+ },
258
+ {
259
+ "epoch": 0.057712709526840655,
260
+ "grad_norm": 1.2565211057662964,
261
+ "learning_rate": 9.94092373874978e-06,
262
+ "loss": 2.1758,
263
+ "step": 34
264
+ },
265
  {
266
  "epoch": 0.059410142159983026,
267
+ "grad_norm": 0.5416985750198364,
268
  "learning_rate": 9.935908917072253e-06,
269
+ "loss": 2.4685,
270
  "step": 35
271
  },
272
+ {
273
+ "epoch": 0.061107574793125397,
274
+ "grad_norm": 0.4666396975517273,
275
+ "learning_rate": 9.930691199511775e-06,
276
+ "loss": 2.3469,
277
+ "step": 36
278
+ },
279
+ {
280
+ "epoch": 0.06280500742626777,
281
+ "grad_norm": 0.8385149240493774,
282
+ "learning_rate": 9.925270800548285e-06,
283
+ "loss": 2.319,
284
+ "step": 37
285
+ },
286
+ {
287
+ "epoch": 0.06450244005941014,
288
+ "grad_norm": 0.9896085262298584,
289
+ "learning_rate": 9.91964794299315e-06,
290
+ "loss": 2.3799,
291
+ "step": 38
292
+ },
293
+ {
294
+ "epoch": 0.06619987269255251,
295
+ "grad_norm": 0.6610036492347717,
296
+ "learning_rate": 9.91382285798002e-06,
297
+ "loss": 2.1173,
298
+ "step": 39
299
+ },
300
  {
301
  "epoch": 0.06789730532569489,
302
+ "grad_norm": 0.8618804216384888,
303
  "learning_rate": 9.907795784955327e-06,
304
+ "loss": 2.3758,
305
  "step": 40
306
  },
307
+ {
308
+ "epoch": 0.06959473795883726,
309
+ "grad_norm": 0.7183472514152527,
310
+ "learning_rate": 9.901566971668437e-06,
311
+ "loss": 2.3252,
312
+ "step": 41
313
+ },
314
+ {
315
+ "epoch": 0.07129217059197963,
316
+ "grad_norm": 0.9099292755126953,
317
+ "learning_rate": 9.895136674161466e-06,
318
+ "loss": 2.1801,
319
+ "step": 42
320
+ },
321
+ {
322
+ "epoch": 0.072989603225122,
323
+ "grad_norm": 1.4539014101028442,
324
+ "learning_rate": 9.888505156758758e-06,
325
+ "loss": 2.4889,
326
+ "step": 43
327
+ },
328
+ {
329
+ "epoch": 0.07468703585826438,
330
+ "grad_norm": 0.8664920330047607,
331
+ "learning_rate": 9.881672692056022e-06,
332
+ "loss": 2.2992,
333
+ "step": 44
334
+ },
335
  {
336
  "epoch": 0.07638446849140675,
337
+ "grad_norm": 0.811949610710144,
338
  "learning_rate": 9.874639560909118e-06,
339
+ "loss": 2.127,
340
  "step": 45
341
  },
342
+ {
343
+ "epoch": 0.07808190112454912,
344
+ "grad_norm": 1.517981767654419,
345
+ "learning_rate": 9.867406052422525e-06,
346
+ "loss": 2.127,
347
+ "step": 46
348
+ },
349
+ {
350
+ "epoch": 0.07977933375769149,
351
+ "grad_norm": 0.7088892459869385,
352
+ "learning_rate": 9.85997246393744e-06,
353
+ "loss": 1.7491,
354
+ "step": 47
355
+ },
356
+ {
357
+ "epoch": 0.08147676639083386,
358
+ "grad_norm": 1.0426262617111206,
359
+ "learning_rate": 9.852339101019574e-06,
360
+ "loss": 2.7149,
361
+ "step": 48
362
+ },
363
+ {
364
+ "epoch": 0.08317419902397624,
365
+ "grad_norm": 1.0637565851211548,
366
+ "learning_rate": 9.844506277446577e-06,
367
+ "loss": 2.3593,
368
+ "step": 49
369
+ },
370
  {
371
  "epoch": 0.08487163165711861,
372
+ "grad_norm": 0.9380516409873962,
373
  "learning_rate": 9.836474315195148e-06,
374
+ "loss": 2.3564,
375
  "step": 50
376
  },
377
+ {
378
+ "epoch": 0.08656906429026098,
379
+ "grad_norm": 1.4351186752319336,
380
+ "learning_rate": 9.828243544427795e-06,
381
+ "loss": 2.5773,
382
+ "step": 51
383
+ },
384
+ {
385
+ "epoch": 0.08826649692340335,
386
+ "grad_norm": 1.0541988611221313,
387
+ "learning_rate": 9.819814303479268e-06,
388
+ "loss": 2.5121,
389
+ "step": 52
390
+ },
391
+ {
392
+ "epoch": 0.08996392955654572,
393
+ "grad_norm": 0.7792794108390808,
394
+ "learning_rate": 9.811186938842645e-06,
395
+ "loss": 2.5692,
396
+ "step": 53
397
+ },
398
+ {
399
+ "epoch": 0.0916613621896881,
400
+ "grad_norm": 0.6798502802848816,
401
+ "learning_rate": 9.802361805155097e-06,
402
+ "loss": 2.4711,
403
+ "step": 54
404
+ },
405
  {
406
  "epoch": 0.09335879482283047,
407
+ "grad_norm": 0.8218227624893188,
408
  "learning_rate": 9.793339265183303e-06,
409
+ "loss": 2.2474,
410
  "step": 55
411
  },
412
+ {
413
+ "epoch": 0.09505622745597284,
414
+ "grad_norm": 0.6220968961715698,
415
+ "learning_rate": 9.784119689808545e-06,
416
+ "loss": 2.35,
417
+ "step": 56
418
+ },
419
+ {
420
+ "epoch": 0.0967536600891152,
421
+ "grad_norm": 1.2888144254684448,
422
+ "learning_rate": 9.774703458011453e-06,
423
+ "loss": 2.3106,
424
+ "step": 57
425
+ },
426
+ {
427
+ "epoch": 0.09845109272225759,
428
+ "grad_norm": 1.092092514038086,
429
+ "learning_rate": 9.765090956856437e-06,
430
+ "loss": 2.2174,
431
+ "step": 58
432
+ },
433
+ {
434
+ "epoch": 0.10014852535539996,
435
+ "grad_norm": 0.8670210242271423,
436
+ "learning_rate": 9.755282581475769e-06,
437
+ "loss": 2.13,
438
+ "step": 59
439
+ },
440
  {
441
  "epoch": 0.10184595798854233,
442
+ "grad_norm": 0.9023963212966919,
443
  "learning_rate": 9.745278735053345e-06,
444
+ "loss": 2.2009,
445
  "step": 60
446
  },
447
+ {
448
+ "epoch": 0.1035433906216847,
449
+ "grad_norm": 0.5580175518989563,
450
+ "learning_rate": 9.735079828808107e-06,
451
+ "loss": 3.6874,
452
+ "step": 61
453
+ },
454
+ {
455
+ "epoch": 0.10524082325482707,
456
+ "grad_norm": 0.5617425441741943,
457
+ "learning_rate": 9.724686281977146e-06,
458
+ "loss": 2.2681,
459
+ "step": 62
460
+ },
461
+ {
462
+ "epoch": 0.10693825588796944,
463
+ "grad_norm": 1.0360230207443237,
464
+ "learning_rate": 9.714098521798466e-06,
465
+ "loss": 2.1843,
466
+ "step": 63
467
+ },
468
+ {
469
+ "epoch": 0.10863568852111181,
470
+ "grad_norm": 1.1913357973098755,
471
+ "learning_rate": 9.703316983493414e-06,
472
+ "loss": 1.8541,
473
+ "step": 64
474
+ },
475
  {
476
  "epoch": 0.10863568852111181,
477
+ "eval_loss": 2.3317337036132812,
478
+ "eval_runtime": 13.7684,
479
+ "eval_samples_per_second": 18.085,
480
+ "eval_steps_per_second": 18.085,
481
  "step": 64
482
  },
483
  {
484
  "epoch": 0.11033312115425419,
485
+ "grad_norm": 0.8698362708091736,
486
  "learning_rate": 9.692342110248802e-06,
487
+ "loss": 1.8058,
488
  "step": 65
489
  },
490
+ {
491
+ "epoch": 0.11203055378739657,
492
+ "grad_norm": 1.156919240951538,
493
+ "learning_rate": 9.681174353198687e-06,
494
+ "loss": 2.5911,
495
+ "step": 66
496
+ },
497
+ {
498
+ "epoch": 0.11372798642053894,
499
+ "grad_norm": 0.9537479877471924,
500
+ "learning_rate": 9.669814171405818e-06,
501
+ "loss": 2.4091,
502
+ "step": 67
503
+ },
504
+ {
505
+ "epoch": 0.11542541905368131,
506
+ "grad_norm": 0.9246991276741028,
507
+ "learning_rate": 9.658262031842772e-06,
508
+ "loss": 2.2413,
509
+ "step": 68
510
+ },
511
+ {
512
+ "epoch": 0.11712285168682368,
513
+ "grad_norm": 0.9161209464073181,
514
+ "learning_rate": 9.64651840937276e-06,
515
+ "loss": 2.0002,
516
+ "step": 69
517
+ },
518
  {
519
  "epoch": 0.11882028431996605,
520
+ "grad_norm": 0.8550183773040771,
521
  "learning_rate": 9.63458378673011e-06,
522
+ "loss": 1.7941,
523
  "step": 70
524
  },
525
+ {
526
+ "epoch": 0.12051771695310842,
527
+ "grad_norm": 1.3188018798828125,
528
+ "learning_rate": 9.622458654500408e-06,
529
+ "loss": 2.5336,
530
+ "step": 71
531
+ },
532
+ {
533
+ "epoch": 0.12221514958625079,
534
+ "grad_norm": 0.7801106572151184,
535
+ "learning_rate": 9.610143511100354e-06,
536
+ "loss": 2.1945,
537
+ "step": 72
538
+ },
539
+ {
540
+ "epoch": 0.12391258221939316,
541
+ "grad_norm": 0.6405597925186157,
542
+ "learning_rate": 9.597638862757255e-06,
543
+ "loss": 2.3703,
544
+ "step": 73
545
+ },
546
+ {
547
+ "epoch": 0.12561001485253553,
548
+ "grad_norm": 1.1282563209533691,
549
+ "learning_rate": 9.584945223488227e-06,
550
+ "loss": 2.2197,
551
+ "step": 74
552
+ },
553
  {
554
  "epoch": 0.1273074474856779,
555
+ "grad_norm": 1.006836175918579,
556
  "learning_rate": 9.572063115079063e-06,
557
+ "loss": 2.1953,
558
  "step": 75
559
  },
560
+ {
561
+ "epoch": 0.12900488011882028,
562
+ "grad_norm": 0.9159533381462097,
563
+ "learning_rate": 9.558993067062785e-06,
564
+ "loss": 2.6987,
565
+ "step": 76
566
+ },
567
+ {
568
+ "epoch": 0.13070231275196265,
569
+ "grad_norm": 0.8953261375427246,
570
+ "learning_rate": 9.545735616697875e-06,
571
+ "loss": 2.6163,
572
+ "step": 77
573
+ },
574
+ {
575
+ "epoch": 0.13239974538510502,
576
+ "grad_norm": 0.9653823375701904,
577
+ "learning_rate": 9.532291308946191e-06,
578
+ "loss": 1.9761,
579
+ "step": 78
580
+ },
581
+ {
582
+ "epoch": 0.1340971780182474,
583
+ "grad_norm": 1.0957810878753662,
584
+ "learning_rate": 9.518660696450567e-06,
585
+ "loss": 2.3978,
586
+ "step": 79
587
+ },
588
  {
589
  "epoch": 0.13579461065138979,
590
+ "grad_norm": 0.9461221098899841,
591
  "learning_rate": 9.504844339512096e-06,
592
+ "loss": 2.4555,
593
  "step": 80
594
  },
595
+ {
596
+ "epoch": 0.13749204328453216,
597
+ "grad_norm": 1.2313168048858643,
598
+ "learning_rate": 9.490842806067095e-06,
599
+ "loss": 2.357,
600
+ "step": 81
601
+ },
602
+ {
603
+ "epoch": 0.13918947591767453,
604
+ "grad_norm": 1.202469825744629,
605
+ "learning_rate": 9.476656671663766e-06,
606
+ "loss": 2.4103,
607
+ "step": 82
608
+ },
609
+ {
610
+ "epoch": 0.1408869085508169,
611
+ "grad_norm": 0.9876028299331665,
612
+ "learning_rate": 9.462286519438531e-06,
613
+ "loss": 2.1357,
614
+ "step": 83
615
+ },
616
+ {
617
+ "epoch": 0.14258434118395927,
618
+ "grad_norm": 0.8315178751945496,
619
+ "learning_rate": 9.44773294009206e-06,
620
+ "loss": 2.0217,
621
+ "step": 84
622
+ },
623
  {
624
  "epoch": 0.14428177381710164,
625
+ "grad_norm": 0.6415848731994629,
626
  "learning_rate": 9.432996531865001e-06,
627
+ "loss": 2.1035,
628
  "step": 85
629
  },
630
+ {
631
+ "epoch": 0.145979206450244,
632
+ "grad_norm": 1.2223278284072876,
633
+ "learning_rate": 9.418077900513377e-06,
634
+ "loss": 2.2361,
635
+ "step": 86
636
+ },
637
+ {
638
+ "epoch": 0.14767663908338638,
639
+ "grad_norm": 0.5901211500167847,
640
+ "learning_rate": 9.40297765928369e-06,
641
+ "loss": 3.7592,
642
+ "step": 87
643
+ },
644
+ {
645
+ "epoch": 0.14937407171652875,
646
+ "grad_norm": 0.8656297922134399,
647
+ "learning_rate": 9.387696428887715e-06,
648
+ "loss": 1.8585,
649
+ "step": 88
650
+ },
651
+ {
652
+ "epoch": 0.15107150434967112,
653
+ "grad_norm": 0.7770694494247437,
654
+ "learning_rate": 9.372234837476979e-06,
655
+ "loss": 2.0877,
656
+ "step": 89
657
+ },
658
  {
659
  "epoch": 0.1527689369828135,
660
+ "grad_norm": 1.36960768699646,
661
  "learning_rate": 9.356593520616948e-06,
662
+ "loss": 2.1314,
663
  "step": 90
664
  },
665
+ {
666
+ "epoch": 0.15446636961595586,
667
+ "grad_norm": 1.1888093948364258,
668
+ "learning_rate": 9.340773121260893e-06,
669
+ "loss": 2.1648,
670
+ "step": 91
671
+ },
672
+ {
673
+ "epoch": 0.15616380224909823,
674
+ "grad_norm": 0.9230120778083801,
675
+ "learning_rate": 9.324774289723469e-06,
676
+ "loss": 2.2115,
677
+ "step": 92
678
+ },
679
+ {
680
+ "epoch": 0.1578612348822406,
681
+ "grad_norm": 1.374051570892334,
682
+ "learning_rate": 9.308597683653976e-06,
683
+ "loss": 2.1053,
684
+ "step": 93
685
+ },
686
+ {
687
+ "epoch": 0.15955866751538297,
688
+ "grad_norm": 0.6828913688659668,
689
+ "learning_rate": 9.292243968009332e-06,
690
+ "loss": 2.5306,
691
+ "step": 94
692
+ },
693
  {
694
  "epoch": 0.16125610014852534,
695
+ "grad_norm": 1.097485065460205,
696
  "learning_rate": 9.275713815026732e-06,
697
+ "loss": 2.1556,
698
  "step": 95
699
  },
700
  {
701
  "epoch": 0.16295353278166771,
702
+ "grad_norm": 0.6347125768661499,
703
+ "learning_rate": 9.259007904196023e-06,
704
+ "loss": 3.499,
705
+ "step": 96
706
+ },
707
+ {
708
+ "epoch": 0.16295353278166771,
709
+ "eval_loss": 2.2804582118988037,
710
+ "eval_runtime": 14.4804,
711
+ "eval_samples_per_second": 17.196,
712
+ "eval_steps_per_second": 17.196,
713
  "step": 96
714
  },
715
+ {
716
+ "epoch": 0.1646509654148101,
717
+ "grad_norm": 0.882883608341217,
718
+ "learning_rate": 9.242126922231763e-06,
719
+ "loss": 2.2631,
720
+ "step": 97
721
+ },
722
+ {
723
+ "epoch": 0.16634839804795248,
724
+ "grad_norm": 0.9639202356338501,
725
+ "learning_rate": 9.225071563045007e-06,
726
+ "loss": 2.2463,
727
+ "step": 98
728
+ },
729
+ {
730
+ "epoch": 0.16804583068109485,
731
+ "grad_norm": 0.818662703037262,
732
+ "learning_rate": 9.207842527714767e-06,
733
+ "loss": 2.5152,
734
+ "step": 99
735
+ },
736
  {
737
  "epoch": 0.16974326331423722,
738
+ "grad_norm": 0.8204854130744934,
739
  "learning_rate": 9.190440524459203e-06,
740
+ "loss": 1.9777,
741
  "step": 100
742
  },
743
+ {
744
+ "epoch": 0.1714406959473796,
745
+ "grad_norm": 1.0629301071166992,
746
+ "learning_rate": 9.172866268606514e-06,
747
+ "loss": 2.5928,
748
+ "step": 101
749
+ },
750
+ {
751
+ "epoch": 0.17313812858052197,
752
+ "grad_norm": 1.1597543954849243,
753
+ "learning_rate": 9.15512048256552e-06,
754
+ "loss": 2.3281,
755
+ "step": 102
756
+ },
757
+ {
758
+ "epoch": 0.17483556121366434,
759
+ "grad_norm": 1.2773549556732178,
760
+ "learning_rate": 9.137203895795983e-06,
761
+ "loss": 2.2066,
762
+ "step": 103
763
+ },
764
+ {
765
+ "epoch": 0.1765329938468067,
766
+ "grad_norm": 1.1871850490570068,
767
+ "learning_rate": 9.119117244778609e-06,
768
+ "loss": 2.008,
769
+ "step": 104
770
+ },
771
  {
772
  "epoch": 0.17823042647994908,
773
+ "grad_norm": 0.9358012080192566,
774
  "learning_rate": 9.10086127298478e-06,
775
+ "loss": 2.063,
776
  "step": 105
777
  },
778
+ {
779
+ "epoch": 0.17992785911309145,
780
+ "grad_norm": 0.770573616027832,
781
+ "learning_rate": 9.082436730845993e-06,
782
+ "loss": 2.4115,
783
+ "step": 106
784
+ },
785
+ {
786
+ "epoch": 0.18162529174623382,
787
+ "grad_norm": 0.5804722905158997,
788
+ "learning_rate": 9.063844375723014e-06,
789
+ "loss": 2.0023,
790
+ "step": 107
791
+ },
792
+ {
793
+ "epoch": 0.1833227243793762,
794
+ "grad_norm": 0.7254613637924194,
795
+ "learning_rate": 9.045084971874738e-06,
796
+ "loss": 2.4563,
797
+ "step": 108
798
+ },
799
+ {
800
+ "epoch": 0.18502015701251856,
801
+ "grad_norm": 0.9243117570877075,
802
+ "learning_rate": 9.026159290426782e-06,
803
+ "loss": 2.2037,
804
+ "step": 109
805
+ },
806
  {
807
  "epoch": 0.18671758964566093,
808
+ "grad_norm": 0.7356573939323425,
809
  "learning_rate": 9.007068109339783e-06,
810
+ "loss": 2.1914,
811
  "step": 110
812
  },
813
+ {
814
+ "epoch": 0.1884150222788033,
815
+ "grad_norm": 1.0659101009368896,
816
+ "learning_rate": 8.987812213377423e-06,
817
+ "loss": 2.3557,
818
+ "step": 111
819
+ },
820
+ {
821
+ "epoch": 0.19011245491194567,
822
+ "grad_norm": 0.8057276606559753,
823
+ "learning_rate": 8.968392394074164e-06,
824
+ "loss": 2.1884,
825
+ "step": 112
826
+ },
827
+ {
828
+ "epoch": 0.19180988754508804,
829
+ "grad_norm": 0.8869062662124634,
830
+ "learning_rate": 8.948809449702712e-06,
831
+ "loss": 2.2887,
832
+ "step": 113
833
+ },
834
+ {
835
+ "epoch": 0.1935073201782304,
836
+ "grad_norm": 0.8336088061332703,
837
+ "learning_rate": 8.929064185241214e-06,
838
+ "loss": 1.7894,
839
+ "step": 114
840
+ },
841
  {
842
  "epoch": 0.1952047528113728,
843
+ "grad_norm": 0.6759926080703735,
844
  "learning_rate": 8.90915741234015e-06,
845
+ "loss": 1.7366,
846
  "step": 115
847
  },
848
+ {
849
+ "epoch": 0.19690218544451518,
850
+ "grad_norm": 0.8624815940856934,
851
+ "learning_rate": 8.889089949288986e-06,
852
+ "loss": 2.014,
853
+ "step": 116
854
+ },
855
+ {
856
+ "epoch": 0.19859961807765755,
857
+ "grad_norm": 0.5995393395423889,
858
+ "learning_rate": 8.868862620982534e-06,
859
+ "loss": 1.7895,
860
+ "step": 117
861
+ },
862
+ {
863
+ "epoch": 0.20029705071079992,
864
+ "grad_norm": 1.1912142038345337,
865
+ "learning_rate": 8.84847625888703e-06,
866
+ "loss": 2.1547,
867
+ "step": 118
868
+ },
869
+ {
870
+ "epoch": 0.2019944833439423,
871
+ "grad_norm": 0.5600825548171997,
872
+ "learning_rate": 8.827931701005974e-06,
873
+ "loss": 3.6099,
874
+ "step": 119
875
+ },
876
  {
877
  "epoch": 0.20369191597708466,
878
+ "grad_norm": 0.7131045460700989,
879
  "learning_rate": 8.807229791845673e-06,
880
+ "loss": 1.9734,
881
  "step": 120
882
  },
883
+ {
884
+ "epoch": 0.20538934861022703,
885
+ "grad_norm": 0.793480634689331,
886
+ "learning_rate": 8.786371382380527e-06,
887
+ "loss": 2.3979,
888
+ "step": 121
889
+ },
890
+ {
891
+ "epoch": 0.2070867812433694,
892
+ "grad_norm": 1.2263506650924683,
893
+ "learning_rate": 8.765357330018056e-06,
894
+ "loss": 2.1874,
895
+ "step": 122
896
+ },
897
+ {
898
+ "epoch": 0.20878421387651178,
899
+ "grad_norm": 1.0434142351150513,
900
+ "learning_rate": 8.74418849856364e-06,
901
+ "loss": 2.3009,
902
+ "step": 123
903
+ },
904
+ {
905
+ "epoch": 0.21048164650965415,
906
+ "grad_norm": 0.851495087146759,
907
+ "learning_rate": 8.722865758185036e-06,
908
+ "loss": 2.0946,
909
+ "step": 124
910
+ },
911
  {
912
  "epoch": 0.21217907914279652,
913
+ "grad_norm": 0.7568338513374329,
914
  "learning_rate": 8.701389985376578e-06,
915
+ "loss": 2.0477,
916
  "step": 125
917
  },
918
+ {
919
+ "epoch": 0.2138765117759389,
920
+ "grad_norm": 1.1540439128875732,
921
+ "learning_rate": 8.679762062923176e-06,
922
+ "loss": 2.5352,
923
+ "step": 126
924
+ },
925
+ {
926
+ "epoch": 0.21557394440908126,
927
+ "grad_norm": 0.7867431044578552,
928
+ "learning_rate": 8.657982879864007e-06,
929
+ "loss": 2.0802,
930
+ "step": 127
931
+ },
932
+ {
933
+ "epoch": 0.21727137704222363,
934
+ "grad_norm": 0.8983986377716064,
935
+ "learning_rate": 8.636053331455986e-06,
936
+ "loss": 2.2621,
937
+ "step": 128
938
+ },
939
  {
940
  "epoch": 0.21727137704222363,
941
+ "eval_loss": 2.24617862701416,
942
+ "eval_runtime": 13.9497,
943
+ "eval_samples_per_second": 17.85,
944
+ "eval_steps_per_second": 17.85,
945
  "step": 128
946
  },
947
+ {
948
+ "epoch": 0.218968809675366,
949
+ "grad_norm": 1.0233252048492432,
950
+ "learning_rate": 8.613974319136959e-06,
951
+ "loss": 1.8431,
952
+ "step": 129
953
+ },
954
  {
955
  "epoch": 0.22066624230850837,
956
+ "grad_norm": 0.7929720282554626,
957
  "learning_rate": 8.591746750488639e-06,
958
+ "loss": 1.7856,
959
  "step": 130
960
  },
961
+ {
962
+ "epoch": 0.22236367494165074,
963
+ "grad_norm": 0.9897335767745972,
964
+ "learning_rate": 8.569371539199316e-06,
965
+ "loss": 2.2006,
966
+ "step": 131
967
+ },
968
+ {
969
+ "epoch": 0.22406110757479314,
970
+ "grad_norm": 0.6569939851760864,
971
+ "learning_rate": 8.54684960502629e-06,
972
+ "loss": 2.1989,
973
+ "step": 132
974
+ },
975
+ {
976
+ "epoch": 0.2257585402079355,
977
+ "grad_norm": 0.608720600605011,
978
+ "learning_rate": 8.52418187375806e-06,
979
+ "loss": 2.1275,
980
+ "step": 133
981
+ },
982
+ {
983
+ "epoch": 0.22745597284107788,
984
+ "grad_norm": 1.2562707662582397,
985
+ "learning_rate": 8.501369277176275e-06,
986
+ "loss": 2.4486,
987
+ "step": 134
988
+ },
989
  {
990
  "epoch": 0.22915340547422025,
991
+ "grad_norm": 1.4444576501846313,
992
  "learning_rate": 8.478412753017433e-06,
993
+ "loss": 2.2416,
994
  "step": 135
995
  },
996
+ {
997
+ "epoch": 0.23085083810736262,
998
+ "grad_norm": 0.6447165012359619,
999
+ "learning_rate": 8.455313244934324e-06,
1000
+ "loss": 1.4183,
1001
+ "step": 136
1002
+ },
1003
+ {
1004
+ "epoch": 0.232548270740505,
1005
+ "grad_norm": 0.7615664005279541,
1006
+ "learning_rate": 8.432071702457253e-06,
1007
+ "loss": 2.043,
1008
+ "step": 137
1009
+ },
1010
+ {
1011
+ "epoch": 0.23424570337364736,
1012
+ "grad_norm": 1.1442954540252686,
1013
+ "learning_rate": 8.408689080954997e-06,
1014
+ "loss": 2.452,
1015
+ "step": 138
1016
+ },
1017
+ {
1018
+ "epoch": 0.23594313600678973,
1019
+ "grad_norm": 0.9602837562561035,
1020
+ "learning_rate": 8.38516634159555e-06,
1021
+ "loss": 1.9689,
1022
+ "step": 139
1023
+ },
1024
  {
1025
  "epoch": 0.2376405686399321,
1026
+ "grad_norm": 1.0848236083984375,
1027
  "learning_rate": 8.361504451306585e-06,
1028
+ "loss": 2.4904,
1029
  "step": 140
1030
  },
1031
+ {
1032
+ "epoch": 0.23933800127307447,
1033
+ "grad_norm": 0.8649332523345947,
1034
+ "learning_rate": 8.337704382735741e-06,
1035
+ "loss": 2.2389,
1036
+ "step": 141
1037
+ },
1038
+ {
1039
+ "epoch": 0.24103543390621684,
1040
+ "grad_norm": 1.0953073501586914,
1041
+ "learning_rate": 8.313767114210615e-06,
1042
+ "loss": 1.7113,
1043
+ "step": 142
1044
+ },
1045
+ {
1046
+ "epoch": 0.24273286653935922,
1047
+ "grad_norm": 0.8301921486854553,
1048
+ "learning_rate": 8.289693629698564e-06,
1049
+ "loss": 2.5715,
1050
+ "step": 143
1051
+ },
1052
+ {
1053
+ "epoch": 0.24443029917250159,
1054
+ "grad_norm": 1.1218247413635254,
1055
+ "learning_rate": 8.265484918766243e-06,
1056
+ "loss": 2.1422,
1057
+ "step": 144
1058
+ },
1059
  {
1060
  "epoch": 0.24612773180564396,
1061
+ "grad_norm": 0.7811430096626282,
1062
  "learning_rate": 8.241141976538944e-06,
1063
+ "loss": 1.9302,
1064
  "step": 145
1065
  },
1066
+ {
1067
+ "epoch": 0.24782516443878633,
1068
+ "grad_norm": 0.7039507627487183,
1069
+ "learning_rate": 8.216665803659671e-06,
1070
+ "loss": 2.0113,
1071
+ "step": 146
1072
+ },
1073
+ {
1074
+ "epoch": 0.2495225970719287,
1075
+ "grad_norm": 0.8372006416320801,
1076
+ "learning_rate": 8.192057406248028e-06,
1077
+ "loss": 1.9525,
1078
+ "step": 147
1079
+ },
1080
+ {
1081
+ "epoch": 0.25122002970507107,
1082
+ "grad_norm": 0.8085992932319641,
1083
+ "learning_rate": 8.16731779585885e-06,
1084
+ "loss": 1.8218,
1085
+ "step": 148
1086
+ },
1087
+ {
1088
+ "epoch": 0.25291746233821344,
1089
+ "grad_norm": 0.9540099501609802,
1090
+ "learning_rate": 8.142447989440618e-06,
1091
+ "loss": 2.2575,
1092
+ "step": 149
1093
+ },
1094
  {
1095
  "epoch": 0.2546148949713558,
1096
+ "grad_norm": 0.6501407623291016,
1097
  "learning_rate": 8.117449009293668e-06,
1098
+ "loss": 2.1018,
1099
  "step": 150
1100
  },
1101
+ {
1102
+ "epoch": 0.2563123276044982,
1103
+ "grad_norm": 0.8084308505058289,
1104
+ "learning_rate": 8.092321883028157e-06,
1105
+ "loss": 1.8939,
1106
+ "step": 151
1107
+ },
1108
+ {
1109
+ "epoch": 0.25800976023764055,
1110
+ "grad_norm": 0.6410229802131653,
1111
+ "learning_rate": 8.067067643521834e-06,
1112
+ "loss": 3.5254,
1113
+ "step": 152
1114
+ },
1115
+ {
1116
+ "epoch": 0.2597071928707829,
1117
+ "grad_norm": 1.2251901626586914,
1118
+ "learning_rate": 8.041687328877566e-06,
1119
+ "loss": 2.4041,
1120
+ "step": 153
1121
+ },
1122
+ {
1123
+ "epoch": 0.2614046255039253,
1124
+ "grad_norm": 0.641666054725647,
1125
+ "learning_rate": 8.016181982380682e-06,
1126
+ "loss": 2.0584,
1127
+ "step": 154
1128
+ },
1129
  {
1130
  "epoch": 0.26310205813706766,
1131
+ "grad_norm": 0.8989869356155396,
1132
  "learning_rate": 7.99055265245608e-06,
1133
+ "loss": 2.0512,
1134
  "step": 155
1135
  },
1136
+ {
1137
+ "epoch": 0.26479949077021003,
1138
+ "grad_norm": 1.3281505107879639,
1139
+ "learning_rate": 7.96480039262513e-06,
1140
+ "loss": 2.3069,
1141
+ "step": 156
1142
+ },
1143
+ {
1144
+ "epoch": 0.2664969234033524,
1145
+ "grad_norm": 0.6267737746238708,
1146
+ "learning_rate": 7.938926261462366e-06,
1147
+ "loss": 2.1205,
1148
+ "step": 157
1149
+ },
1150
+ {
1151
+ "epoch": 0.2681943560364948,
1152
+ "grad_norm": 0.9214989542961121,
1153
+ "learning_rate": 7.912931322551981e-06,
1154
+ "loss": 1.8681,
1155
+ "step": 158
1156
+ },
1157
+ {
1158
+ "epoch": 0.2698917886696372,
1159
+ "grad_norm": 0.9009872674942017,
1160
+ "learning_rate": 7.886816644444099e-06,
1161
+ "loss": 1.9475,
1162
+ "step": 159
1163
+ },
1164
  {
1165
  "epoch": 0.27158922130277957,
1166
+ "grad_norm": 0.9399353861808777,
1167
  "learning_rate": 7.860583300610849e-06,
1168
+ "loss": 2.0988,
1169
  "step": 160
1170
  },
1171
  {
1172
  "epoch": 0.27158922130277957,
1173
+ "eval_loss": 2.2218072414398193,
1174
+ "eval_runtime": 14.2264,
1175
+ "eval_samples_per_second": 17.503,
1176
+ "eval_steps_per_second": 17.503,
1177
  "step": 160
1178
  },
1179
+ {
1180
+ "epoch": 0.27328665393592194,
1181
+ "grad_norm": 0.8981595635414124,
1182
+ "learning_rate": 7.83423236940225e-06,
1183
+ "loss": 1.9505,
1184
+ "step": 161
1185
+ },
1186
+ {
1187
+ "epoch": 0.2749840865690643,
1188
+ "grad_norm": 1.0000768899917603,
1189
+ "learning_rate": 7.807764934001875e-06,
1190
+ "loss": 2.0657,
1191
+ "step": 162
1192
+ },
1193
+ {
1194
+ "epoch": 0.2766815192022067,
1195
+ "grad_norm": 0.92807936668396,
1196
+ "learning_rate": 7.781182082382325e-06,
1197
+ "loss": 1.8255,
1198
+ "step": 163
1199
+ },
1200
+ {
1201
+ "epoch": 0.27837895183534905,
1202
+ "grad_norm": 0.857050895690918,
1203
+ "learning_rate": 7.754484907260513e-06,
1204
+ "loss": 1.9167,
1205
+ "step": 164
1206
+ },
1207
  {
1208
  "epoch": 0.2800763844684914,
1209
+ "grad_norm": 1.1543477773666382,
1210
  "learning_rate": 7.727674506052744e-06,
1211
+ "loss": 2.1098,
1212
  "step": 165
1213
  },
1214
+ {
1215
+ "epoch": 0.2817738171016338,
1216
+ "grad_norm": 0.9627947807312012,
1217
+ "learning_rate": 7.700751980829601e-06,
1218
+ "loss": 2.08,
1219
+ "step": 166
1220
+ },
1221
+ {
1222
+ "epoch": 0.28347124973477617,
1223
+ "grad_norm": 0.973473310470581,
1224
+ "learning_rate": 7.673718438270649e-06,
1225
+ "loss": 1.8338,
1226
+ "step": 167
1227
+ },
1228
+ {
1229
+ "epoch": 0.28516868236791854,
1230
+ "grad_norm": 0.8794575929641724,
1231
+ "learning_rate": 7.646574989618938e-06,
1232
+ "loss": 2.0059,
1233
+ "step": 168
1234
+ },
1235
+ {
1236
+ "epoch": 0.2868661150010609,
1237
+ "grad_norm": 0.8404746055603027,
1238
+ "learning_rate": 7.619322750635327e-06,
1239
+ "loss": 1.9873,
1240
+ "step": 169
1241
+ },
1242
  {
1243
  "epoch": 0.2885635476342033,
1244
+ "grad_norm": 0.8498474955558777,
1245
  "learning_rate": 7.591962841552627e-06,
1246
+ "loss": 1.9968,
1247
  "step": 170
1248
  },
1249
+ {
1250
+ "epoch": 0.29026098026734565,
1251
+ "grad_norm": 0.9468035101890564,
1252
+ "learning_rate": 7.564496387029532e-06,
1253
+ "loss": 2.0884,
1254
+ "step": 171
1255
+ },
1256
+ {
1257
+ "epoch": 0.291958412900488,
1258
+ "grad_norm": 0.9746489524841309,
1259
+ "learning_rate": 7.536924516104411e-06,
1260
+ "loss": 2.081,
1261
+ "step": 172
1262
+ },
1263
+ {
1264
+ "epoch": 0.2936558455336304,
1265
+ "grad_norm": 0.9985252618789673,
1266
+ "learning_rate": 7.509248362148889e-06,
1267
+ "loss": 2.275,
1268
+ "step": 173
1269
+ },
1270
+ {
1271
+ "epoch": 0.29535327816677276,
1272
+ "grad_norm": 0.7430412769317627,
1273
+ "learning_rate": 7.481469062821252e-06,
1274
+ "loss": 2.2651,
1275
+ "step": 174
1276
+ },
1277
  {
1278
  "epoch": 0.29705071079991513,
1279
+ "grad_norm": 0.6778978109359741,
1280
  "learning_rate": 7.453587760019691e-06,
1281
+ "loss": 1.897,
1282
  "step": 175
1283
+ },
1284
+ {
1285
+ "epoch": 0.2987481434330575,
1286
+ "grad_norm": 0.9426060914993286,
1287
+ "learning_rate": 7.42560559983536e-06,
1288
+ "loss": 2.0401,
1289
+ "step": 176
1290
+ },
1291
+ {
1292
+ "epoch": 0.30044557606619987,
1293
+ "grad_norm": 1.0092425346374512,
1294
+ "learning_rate": 7.39752373250527e-06,
1295
+ "loss": 1.6578,
1296
+ "step": 177
1297
+ },
1298
+ {
1299
+ "epoch": 0.30214300869934224,
1300
+ "grad_norm": 0.7923305034637451,
1301
+ "learning_rate": 7.369343312364994e-06,
1302
+ "loss": 2.5945,
1303
+ "step": 178
1304
+ },
1305
+ {
1306
+ "epoch": 0.3038404413324846,
1307
+ "grad_norm": 1.0854469537734985,
1308
+ "learning_rate": 7.34106549780123e-06,
1309
+ "loss": 2.2971,
1310
+ "step": 179
1311
+ },
1312
+ {
1313
+ "epoch": 0.305537873965627,
1314
+ "grad_norm": 0.9536014795303345,
1315
+ "learning_rate": 7.312691451204178e-06,
1316
+ "loss": 1.9666,
1317
+ "step": 180
1318
+ },
1319
+ {
1320
+ "epoch": 0.30723530659876935,
1321
+ "grad_norm": 1.0422824621200562,
1322
+ "learning_rate": 7.284222338919758e-06,
1323
+ "loss": 2.1869,
1324
+ "step": 181
1325
+ },
1326
+ {
1327
+ "epoch": 0.3089327392319117,
1328
+ "grad_norm": 1.1253349781036377,
1329
+ "learning_rate": 7.255659331201673e-06,
1330
+ "loss": 2.5139,
1331
+ "step": 182
1332
+ },
1333
+ {
1334
+ "epoch": 0.3106301718650541,
1335
+ "grad_norm": 0.8049088716506958,
1336
+ "learning_rate": 7.227003602163296e-06,
1337
+ "loss": 2.1705,
1338
+ "step": 183
1339
+ },
1340
+ {
1341
+ "epoch": 0.31232760449819646,
1342
+ "grad_norm": 0.674087643623352,
1343
+ "learning_rate": 7.198256329729412e-06,
1344
+ "loss": 2.0292,
1345
+ "step": 184
1346
+ },
1347
+ {
1348
+ "epoch": 0.31402503713133884,
1349
+ "grad_norm": 0.7477718591690063,
1350
+ "learning_rate": 7.169418695587791e-06,
1351
+ "loss": 3.4058,
1352
+ "step": 185
1353
+ },
1354
+ {
1355
+ "epoch": 0.3157224697644812,
1356
+ "grad_norm": 0.9587709903717041,
1357
+ "learning_rate": 7.140491885140629e-06,
1358
+ "loss": 1.8608,
1359
+ "step": 186
1360
+ },
1361
+ {
1362
+ "epoch": 0.3174199023976236,
1363
+ "grad_norm": 0.9873301386833191,
1364
+ "learning_rate": 7.1114770874558e-06,
1365
+ "loss": 2.3293,
1366
+ "step": 187
1367
+ },
1368
+ {
1369
+ "epoch": 0.31911733503076595,
1370
+ "grad_norm": 0.8506638407707214,
1371
+ "learning_rate": 7.082375495217996e-06,
1372
+ "loss": 1.9833,
1373
+ "step": 188
1374
+ },
1375
+ {
1376
+ "epoch": 0.3208147676639083,
1377
+ "grad_norm": 0.9029797315597534,
1378
+ "learning_rate": 7.053188304679691e-06,
1379
+ "loss": 2.074,
1380
+ "step": 189
1381
  }
1382
  ],
1383
+ "logging_steps": 1,
1384
  "max_steps": 500,
1385
  "num_input_tokens_seen": 0,
1386
  "num_train_epochs": 1,
1387
+ "save_steps": 63,
1388
  "stateful_callbacks": {
1389
  "TrainerControl": {
1390
  "args": {
 
1397
  "attributes": {}
1398
  }
1399
  },
1400
+ "total_flos": 6011357862297600.0,
1401
  "train_batch_size": 1,
1402
  "trial_name": null,
1403
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0e902904bbfeafcf7200b04696313449326cdab359ba6c8339db9eada6e4a62e
3
  size 6776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f03c4a46512f835c77b348026f2cd80ed944a8972c0abb5f37ba864154ef403f
3
  size 6776