mtzig commited on
Commit
dd1a6d5
1 Parent(s): e371b28

Training in progress, step 100, checkpoint

Browse files
.gitattributes CHANGED
@@ -33,3 +33,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ last-checkpoint/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
37
+ last-checkpoint/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
38
+ last-checkpoint/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text
39
+ last-checkpoint/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text
40
+ last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
41
+ last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
42
+ last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text
43
+ last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text
last-checkpoint/optimizer_0/.metadata ADDED
Binary file (369 kB). View file
 
last-checkpoint/optimizer_0/__0_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84cc6088d1a36caf682b4ba964cd893752db7233c9ac2731e0bdecbcdf7b5227
3
+ size 13934748
last-checkpoint/optimizer_0/__1_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:032757e766de4d9f4bbb31e72c0fc2f5016ed662bacbf4cdcb7e3308b6a0cac9
3
+ size 13999412
last-checkpoint/optimizer_0/__2_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70dfcfbc62023412081edce90fbfc6bb052f6d340a3c3b07f004f301584ee0c7
3
+ size 13990904
last-checkpoint/optimizer_0/__3_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d14f395311907bcfada3de9ddd795697a83c436721b6e3a2577a7a74cef92c4
3
+ size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata ADDED
Binary file (135 kB). View file
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2500194f21bc76538b2c1b940c338d34558bbab87fb9822b585aa3e1dd3bf6d3
3
+ size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d67ce69eb50c202ffe38a37d23ce8fbaad42cc3bb0f50546f25c3f334b359f2
3
+ size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f618b60099ee98077cb33d7ef25639a63a2d4d3d44e14c170bbcf863d1d7915
3
+ size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01392e604b9718dfb75b6aca4d767028a8d9e9db814f90dfae1a541d68f3bd92
3
+ size 6966784
last-checkpoint/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e3af1412bccc3eadfac8740ec9f3a4420a7c5f09e3fe3834916f8d1d62e32cc
3
+ size 14960
last-checkpoint/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:507c5ba79941654ef19efd04ea7687a98a5da5e324999e5c56ab7ef5a7874501
3
+ size 14960
last-checkpoint/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cbcca81eaf3e1fcd1ffc0e0811d0c9e7d0347da9661b13e7fa139d8134abcfcb
3
+ size 14960
last-checkpoint/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91b21db9b4b2ed193b6628ce22e61310c77f199ef08663c67ba353159845902c
3
+ size 14960
last-checkpoint/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cef0f333423fa3aff21a166b2bb6d2a55f2f3a173fc6ee99710701765969b439
3
+ size 1064
last-checkpoint/trainer_state.json ADDED
@@ -0,0 +1,805 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.04740459824602986,
5
+ "eval_steps": 20,
6
+ "global_step": 100,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0,
13
+ "eval_accuracy": 0.9098228663446055,
14
+ "eval_f1": 0.3,
15
+ "eval_loss": 0.39109358191490173,
16
+ "eval_precision": 0.2222222222222222,
17
+ "eval_recall": 0.46153846153846156,
18
+ "eval_runtime": 50.0411,
19
+ "eval_samples_per_second": 5.416,
20
+ "eval_steps_per_second": 0.18,
21
+ "step": 0
22
+ },
23
+ {
24
+ "epoch": 0.00047404598246029864,
25
+ "grad_norm": 2.736264228820801,
26
+ "learning_rate": 9.478672985781992e-08,
27
+ "loss": 0.6485,
28
+ "step": 1
29
+ },
30
+ {
31
+ "epoch": 0.0009480919649205973,
32
+ "grad_norm": 2.5090606212615967,
33
+ "learning_rate": 1.8957345971563984e-07,
34
+ "loss": 0.6663,
35
+ "step": 2
36
+ },
37
+ {
38
+ "epoch": 0.001422137947380896,
39
+ "grad_norm": 2.8418514728546143,
40
+ "learning_rate": 2.843601895734597e-07,
41
+ "loss": 0.6669,
42
+ "step": 3
43
+ },
44
+ {
45
+ "epoch": 0.0018961839298411946,
46
+ "grad_norm": 3.081920862197876,
47
+ "learning_rate": 3.791469194312797e-07,
48
+ "loss": 0.7195,
49
+ "step": 4
50
+ },
51
+ {
52
+ "epoch": 0.002370229912301493,
53
+ "grad_norm": 2.9263253211975098,
54
+ "learning_rate": 4.7393364928909956e-07,
55
+ "loss": 0.699,
56
+ "step": 5
57
+ },
58
+ {
59
+ "epoch": 0.002844275894761792,
60
+ "grad_norm": 2.3481531143188477,
61
+ "learning_rate": 5.687203791469194e-07,
62
+ "loss": 0.6352,
63
+ "step": 6
64
+ },
65
+ {
66
+ "epoch": 0.0033183218772220905,
67
+ "grad_norm": 2.840491533279419,
68
+ "learning_rate": 6.635071090047394e-07,
69
+ "loss": 0.6859,
70
+ "step": 7
71
+ },
72
+ {
73
+ "epoch": 0.003792367859682389,
74
+ "grad_norm": 2.8396637439727783,
75
+ "learning_rate": 7.582938388625594e-07,
76
+ "loss": 0.6912,
77
+ "step": 8
78
+ },
79
+ {
80
+ "epoch": 0.004266413842142688,
81
+ "grad_norm": 3.1124157905578613,
82
+ "learning_rate": 8.530805687203792e-07,
83
+ "loss": 0.6481,
84
+ "step": 9
85
+ },
86
+ {
87
+ "epoch": 0.004740459824602986,
88
+ "grad_norm": 2.768177032470703,
89
+ "learning_rate": 9.478672985781991e-07,
90
+ "loss": 0.6911,
91
+ "step": 10
92
+ },
93
+ {
94
+ "epoch": 0.0052145058070632855,
95
+ "grad_norm": 2.4753313064575195,
96
+ "learning_rate": 1.042654028436019e-06,
97
+ "loss": 0.5654,
98
+ "step": 11
99
+ },
100
+ {
101
+ "epoch": 0.005688551789523584,
102
+ "grad_norm": 3.73299241065979,
103
+ "learning_rate": 1.1374407582938388e-06,
104
+ "loss": 0.7249,
105
+ "step": 12
106
+ },
107
+ {
108
+ "epoch": 0.006162597771983883,
109
+ "grad_norm": 2.8647408485412598,
110
+ "learning_rate": 1.2322274881516587e-06,
111
+ "loss": 0.571,
112
+ "step": 13
113
+ },
114
+ {
115
+ "epoch": 0.006636643754444181,
116
+ "grad_norm": 1.9949300289154053,
117
+ "learning_rate": 1.3270142180094788e-06,
118
+ "loss": 0.5458,
119
+ "step": 14
120
+ },
121
+ {
122
+ "epoch": 0.00711068973690448,
123
+ "grad_norm": 2.8095905780792236,
124
+ "learning_rate": 1.4218009478672987e-06,
125
+ "loss": 0.6974,
126
+ "step": 15
127
+ },
128
+ {
129
+ "epoch": 0.007584735719364778,
130
+ "grad_norm": 2.3465747833251953,
131
+ "learning_rate": 1.5165876777251187e-06,
132
+ "loss": 0.5798,
133
+ "step": 16
134
+ },
135
+ {
136
+ "epoch": 0.008058781701825076,
137
+ "grad_norm": 2.2376415729522705,
138
+ "learning_rate": 1.6113744075829384e-06,
139
+ "loss": 0.6219,
140
+ "step": 17
141
+ },
142
+ {
143
+ "epoch": 0.008532827684285376,
144
+ "grad_norm": 2.2321646213531494,
145
+ "learning_rate": 1.7061611374407585e-06,
146
+ "loss": 0.6327,
147
+ "step": 18
148
+ },
149
+ {
150
+ "epoch": 0.009006873666745675,
151
+ "grad_norm": 2.9532177448272705,
152
+ "learning_rate": 1.8009478672985784e-06,
153
+ "loss": 0.7287,
154
+ "step": 19
155
+ },
156
+ {
157
+ "epoch": 0.009480919649205973,
158
+ "grad_norm": 2.7521305084228516,
159
+ "learning_rate": 1.8957345971563982e-06,
160
+ "loss": 0.7904,
161
+ "step": 20
162
+ },
163
+ {
164
+ "epoch": 0.009480919649205973,
165
+ "eval_accuracy": 0.9162640901771336,
166
+ "eval_f1": 0.3157894736842105,
167
+ "eval_loss": 0.3854508101940155,
168
+ "eval_precision": 0.24,
169
+ "eval_recall": 0.46153846153846156,
170
+ "eval_runtime": 50.1605,
171
+ "eval_samples_per_second": 5.403,
172
+ "eval_steps_per_second": 0.179,
173
+ "step": 20
174
+ },
175
+ {
176
+ "epoch": 0.009954965631666271,
177
+ "grad_norm": 2.5812387466430664,
178
+ "learning_rate": 1.990521327014218e-06,
179
+ "loss": 0.6582,
180
+ "step": 21
181
+ },
182
+ {
183
+ "epoch": 0.010429011614126571,
184
+ "grad_norm": 2.5878043174743652,
185
+ "learning_rate": 2.085308056872038e-06,
186
+ "loss": 0.5975,
187
+ "step": 22
188
+ },
189
+ {
190
+ "epoch": 0.01090305759658687,
191
+ "grad_norm": 2.4602837562561035,
192
+ "learning_rate": 2.180094786729858e-06,
193
+ "loss": 0.6356,
194
+ "step": 23
195
+ },
196
+ {
197
+ "epoch": 0.011377103579047167,
198
+ "grad_norm": 2.857377290725708,
199
+ "learning_rate": 2.2748815165876777e-06,
200
+ "loss": 0.6933,
201
+ "step": 24
202
+ },
203
+ {
204
+ "epoch": 0.011851149561507466,
205
+ "grad_norm": 2.478761911392212,
206
+ "learning_rate": 2.369668246445498e-06,
207
+ "loss": 0.6806,
208
+ "step": 25
209
+ },
210
+ {
211
+ "epoch": 0.012325195543967766,
212
+ "grad_norm": 2.6150331497192383,
213
+ "learning_rate": 2.4644549763033174e-06,
214
+ "loss": 0.6727,
215
+ "step": 26
216
+ },
217
+ {
218
+ "epoch": 0.012799241526428064,
219
+ "grad_norm": 2.4646215438842773,
220
+ "learning_rate": 2.5592417061611373e-06,
221
+ "loss": 0.7231,
222
+ "step": 27
223
+ },
224
+ {
225
+ "epoch": 0.013273287508888362,
226
+ "grad_norm": 2.3204421997070312,
227
+ "learning_rate": 2.6540284360189576e-06,
228
+ "loss": 0.715,
229
+ "step": 28
230
+ },
231
+ {
232
+ "epoch": 0.01374733349134866,
233
+ "grad_norm": 2.1901276111602783,
234
+ "learning_rate": 2.7488151658767775e-06,
235
+ "loss": 0.6207,
236
+ "step": 29
237
+ },
238
+ {
239
+ "epoch": 0.01422137947380896,
240
+ "grad_norm": 2.659156322479248,
241
+ "learning_rate": 2.8436018957345973e-06,
242
+ "loss": 0.6465,
243
+ "step": 30
244
+ },
245
+ {
246
+ "epoch": 0.014695425456269258,
247
+ "grad_norm": 3.0104305744171143,
248
+ "learning_rate": 2.938388625592417e-06,
249
+ "loss": 0.6848,
250
+ "step": 31
251
+ },
252
+ {
253
+ "epoch": 0.015169471438729557,
254
+ "grad_norm": 3.2612526416778564,
255
+ "learning_rate": 3.0331753554502375e-06,
256
+ "loss": 0.6094,
257
+ "step": 32
258
+ },
259
+ {
260
+ "epoch": 0.015643517421189856,
261
+ "grad_norm": 2.8630073070526123,
262
+ "learning_rate": 3.1279620853080574e-06,
263
+ "loss": 0.6679,
264
+ "step": 33
265
+ },
266
+ {
267
+ "epoch": 0.016117563403650153,
268
+ "grad_norm": 3.1366546154022217,
269
+ "learning_rate": 3.222748815165877e-06,
270
+ "loss": 0.6961,
271
+ "step": 34
272
+ },
273
+ {
274
+ "epoch": 0.016591609386110453,
275
+ "grad_norm": 2.5289793014526367,
276
+ "learning_rate": 3.3175355450236967e-06,
277
+ "loss": 0.6363,
278
+ "step": 35
279
+ },
280
+ {
281
+ "epoch": 0.017065655368570753,
282
+ "grad_norm": 1.996009111404419,
283
+ "learning_rate": 3.412322274881517e-06,
284
+ "loss": 0.6137,
285
+ "step": 36
286
+ },
287
+ {
288
+ "epoch": 0.01753970135103105,
289
+ "grad_norm": 2.309265613555908,
290
+ "learning_rate": 3.507109004739337e-06,
291
+ "loss": 0.5873,
292
+ "step": 37
293
+ },
294
+ {
295
+ "epoch": 0.01801374733349135,
296
+ "grad_norm": 2.2232859134674072,
297
+ "learning_rate": 3.6018957345971567e-06,
298
+ "loss": 0.6081,
299
+ "step": 38
300
+ },
301
+ {
302
+ "epoch": 0.018487793315951646,
303
+ "grad_norm": 2.608635902404785,
304
+ "learning_rate": 3.6966824644549766e-06,
305
+ "loss": 0.5946,
306
+ "step": 39
307
+ },
308
+ {
309
+ "epoch": 0.018961839298411946,
310
+ "grad_norm": 2.9958667755126953,
311
+ "learning_rate": 3.7914691943127964e-06,
312
+ "loss": 0.6496,
313
+ "step": 40
314
+ },
315
+ {
316
+ "epoch": 0.018961839298411946,
317
+ "eval_accuracy": 0.9299516908212561,
318
+ "eval_f1": 0.304,
319
+ "eval_loss": 0.357759952545166,
320
+ "eval_precision": 0.2602739726027397,
321
+ "eval_recall": 0.36538461538461536,
322
+ "eval_runtime": 50.7819,
323
+ "eval_samples_per_second": 5.337,
324
+ "eval_steps_per_second": 0.177,
325
+ "step": 40
326
+ },
327
+ {
328
+ "epoch": 0.019435885280872246,
329
+ "grad_norm": 2.9960222244262695,
330
+ "learning_rate": 3.886255924170616e-06,
331
+ "loss": 0.7012,
332
+ "step": 41
333
+ },
334
+ {
335
+ "epoch": 0.019909931263332542,
336
+ "grad_norm": 2.469219923019409,
337
+ "learning_rate": 3.981042654028436e-06,
338
+ "loss": 0.5172,
339
+ "step": 42
340
+ },
341
+ {
342
+ "epoch": 0.020383977245792842,
343
+ "grad_norm": 2.2367403507232666,
344
+ "learning_rate": 4.075829383886256e-06,
345
+ "loss": 0.6086,
346
+ "step": 43
347
+ },
348
+ {
349
+ "epoch": 0.020858023228253142,
350
+ "grad_norm": 2.455852746963501,
351
+ "learning_rate": 4.170616113744076e-06,
352
+ "loss": 0.6616,
353
+ "step": 44
354
+ },
355
+ {
356
+ "epoch": 0.02133206921071344,
357
+ "grad_norm": 2.6048426628112793,
358
+ "learning_rate": 4.265402843601897e-06,
359
+ "loss": 0.6319,
360
+ "step": 45
361
+ },
362
+ {
363
+ "epoch": 0.02180611519317374,
364
+ "grad_norm": 2.633476495742798,
365
+ "learning_rate": 4.360189573459716e-06,
366
+ "loss": 0.5807,
367
+ "step": 46
368
+ },
369
+ {
370
+ "epoch": 0.022280161175634035,
371
+ "grad_norm": 2.525595188140869,
372
+ "learning_rate": 4.4549763033175355e-06,
373
+ "loss": 0.566,
374
+ "step": 47
375
+ },
376
+ {
377
+ "epoch": 0.022754207158094335,
378
+ "grad_norm": 2.116396427154541,
379
+ "learning_rate": 4.549763033175355e-06,
380
+ "loss": 0.5222,
381
+ "step": 48
382
+ },
383
+ {
384
+ "epoch": 0.023228253140554635,
385
+ "grad_norm": 2.2869620323181152,
386
+ "learning_rate": 4.644549763033176e-06,
387
+ "loss": 0.6677,
388
+ "step": 49
389
+ },
390
+ {
391
+ "epoch": 0.02370229912301493,
392
+ "grad_norm": 3.656646966934204,
393
+ "learning_rate": 4.739336492890996e-06,
394
+ "loss": 0.6329,
395
+ "step": 50
396
+ },
397
+ {
398
+ "epoch": 0.02417634510547523,
399
+ "grad_norm": 2.4779574871063232,
400
+ "learning_rate": 4.834123222748816e-06,
401
+ "loss": 0.6179,
402
+ "step": 51
403
+ },
404
+ {
405
+ "epoch": 0.02465039108793553,
406
+ "grad_norm": 2.9239354133605957,
407
+ "learning_rate": 4.928909952606635e-06,
408
+ "loss": 0.5679,
409
+ "step": 52
410
+ },
411
+ {
412
+ "epoch": 0.025124437070395828,
413
+ "grad_norm": 2.596090793609619,
414
+ "learning_rate": 5.023696682464455e-06,
415
+ "loss": 0.5907,
416
+ "step": 53
417
+ },
418
+ {
419
+ "epoch": 0.025598483052856127,
420
+ "grad_norm": 2.4275245666503906,
421
+ "learning_rate": 5.118483412322275e-06,
422
+ "loss": 0.5432,
423
+ "step": 54
424
+ },
425
+ {
426
+ "epoch": 0.026072529035316427,
427
+ "grad_norm": 3.1805362701416016,
428
+ "learning_rate": 5.213270142180096e-06,
429
+ "loss": 0.6221,
430
+ "step": 55
431
+ },
432
+ {
433
+ "epoch": 0.026546575017776724,
434
+ "grad_norm": 2.3142030239105225,
435
+ "learning_rate": 5.308056872037915e-06,
436
+ "loss": 0.6459,
437
+ "step": 56
438
+ },
439
+ {
440
+ "epoch": 0.027020621000237024,
441
+ "grad_norm": 2.3154592514038086,
442
+ "learning_rate": 5.402843601895735e-06,
443
+ "loss": 0.5481,
444
+ "step": 57
445
+ },
446
+ {
447
+ "epoch": 0.02749466698269732,
448
+ "grad_norm": 2.70127272605896,
449
+ "learning_rate": 5.497630331753555e-06,
450
+ "loss": 0.5592,
451
+ "step": 58
452
+ },
453
+ {
454
+ "epoch": 0.02796871296515762,
455
+ "grad_norm": 2.5554442405700684,
456
+ "learning_rate": 5.592417061611375e-06,
457
+ "loss": 0.587,
458
+ "step": 59
459
+ },
460
+ {
461
+ "epoch": 0.02844275894761792,
462
+ "grad_norm": 2.4974448680877686,
463
+ "learning_rate": 5.687203791469195e-06,
464
+ "loss": 0.5209,
465
+ "step": 60
466
+ },
467
+ {
468
+ "epoch": 0.02844275894761792,
469
+ "eval_accuracy": 0.9468599033816425,
470
+ "eval_f1": 0.23255813953488372,
471
+ "eval_loss": 0.30114662647247314,
472
+ "eval_precision": 0.29411764705882354,
473
+ "eval_recall": 0.19230769230769232,
474
+ "eval_runtime": 50.6864,
475
+ "eval_samples_per_second": 5.347,
476
+ "eval_steps_per_second": 0.178,
477
+ "step": 60
478
+ },
479
+ {
480
+ "epoch": 0.028916804930078217,
481
+ "grad_norm": 2.1992790699005127,
482
+ "learning_rate": 5.7819905213270145e-06,
483
+ "loss": 0.6134,
484
+ "step": 61
485
+ },
486
+ {
487
+ "epoch": 0.029390850912538517,
488
+ "grad_norm": 2.135422468185425,
489
+ "learning_rate": 5.876777251184834e-06,
490
+ "loss": 0.5917,
491
+ "step": 62
492
+ },
493
+ {
494
+ "epoch": 0.029864896894998817,
495
+ "grad_norm": 1.9710865020751953,
496
+ "learning_rate": 5.971563981042654e-06,
497
+ "loss": 0.5341,
498
+ "step": 63
499
+ },
500
+ {
501
+ "epoch": 0.030338942877459113,
502
+ "grad_norm": 2.6831486225128174,
503
+ "learning_rate": 6.066350710900475e-06,
504
+ "loss": 0.5878,
505
+ "step": 64
506
+ },
507
+ {
508
+ "epoch": 0.030812988859919413,
509
+ "grad_norm": 2.277893543243408,
510
+ "learning_rate": 6.161137440758295e-06,
511
+ "loss": 0.5407,
512
+ "step": 65
513
+ },
514
+ {
515
+ "epoch": 0.03128703484237971,
516
+ "grad_norm": 2.153470993041992,
517
+ "learning_rate": 6.255924170616115e-06,
518
+ "loss": 0.5109,
519
+ "step": 66
520
+ },
521
+ {
522
+ "epoch": 0.03176108082484001,
523
+ "grad_norm": 2.458293914794922,
524
+ "learning_rate": 6.350710900473935e-06,
525
+ "loss": 0.5687,
526
+ "step": 67
527
+ },
528
+ {
529
+ "epoch": 0.032235126807300306,
530
+ "grad_norm": 1.6730012893676758,
531
+ "learning_rate": 6.445497630331754e-06,
532
+ "loss": 0.511,
533
+ "step": 68
534
+ },
535
+ {
536
+ "epoch": 0.032709172789760606,
537
+ "grad_norm": 2.294477939605713,
538
+ "learning_rate": 6.5402843601895735e-06,
539
+ "loss": 0.5163,
540
+ "step": 69
541
+ },
542
+ {
543
+ "epoch": 0.033183218772220906,
544
+ "grad_norm": 1.931765079498291,
545
+ "learning_rate": 6.635071090047393e-06,
546
+ "loss": 0.5463,
547
+ "step": 70
548
+ },
549
+ {
550
+ "epoch": 0.033657264754681206,
551
+ "grad_norm": 1.9582473039627075,
552
+ "learning_rate": 6.729857819905213e-06,
553
+ "loss": 0.5404,
554
+ "step": 71
555
+ },
556
+ {
557
+ "epoch": 0.034131310737141506,
558
+ "grad_norm": 2.352447986602783,
559
+ "learning_rate": 6.824644549763034e-06,
560
+ "loss": 0.5004,
561
+ "step": 72
562
+ },
563
+ {
564
+ "epoch": 0.0346053567196018,
565
+ "grad_norm": 2.5306575298309326,
566
+ "learning_rate": 6.919431279620854e-06,
567
+ "loss": 0.491,
568
+ "step": 73
569
+ },
570
+ {
571
+ "epoch": 0.0350794027020621,
572
+ "grad_norm": 1.954287052154541,
573
+ "learning_rate": 7.014218009478674e-06,
574
+ "loss": 0.5096,
575
+ "step": 74
576
+ },
577
+ {
578
+ "epoch": 0.0355534486845224,
579
+ "grad_norm": 1.7585203647613525,
580
+ "learning_rate": 7.1090047393364935e-06,
581
+ "loss": 0.4437,
582
+ "step": 75
583
+ },
584
+ {
585
+ "epoch": 0.0360274946669827,
586
+ "grad_norm": 1.9448845386505127,
587
+ "learning_rate": 7.203791469194313e-06,
588
+ "loss": 0.4902,
589
+ "step": 76
590
+ },
591
+ {
592
+ "epoch": 0.036501540649443,
593
+ "grad_norm": 2.1417629718780518,
594
+ "learning_rate": 7.298578199052133e-06,
595
+ "loss": 0.5599,
596
+ "step": 77
597
+ },
598
+ {
599
+ "epoch": 0.03697558663190329,
600
+ "grad_norm": 1.9677048921585083,
601
+ "learning_rate": 7.393364928909953e-06,
602
+ "loss": 0.5196,
603
+ "step": 78
604
+ },
605
+ {
606
+ "epoch": 0.03744963261436359,
607
+ "grad_norm": 4.773871421813965,
608
+ "learning_rate": 7.488151658767773e-06,
609
+ "loss": 0.7193,
610
+ "step": 79
611
+ },
612
+ {
613
+ "epoch": 0.03792367859682389,
614
+ "grad_norm": 1.7716329097747803,
615
+ "learning_rate": 7.582938388625593e-06,
616
+ "loss": 0.482,
617
+ "step": 80
618
+ },
619
+ {
620
+ "epoch": 0.03792367859682389,
621
+ "eval_accuracy": 0.9524959742351047,
622
+ "eval_f1": 0.21333333333333335,
623
+ "eval_loss": 0.2597336769104004,
624
+ "eval_precision": 0.34782608695652173,
625
+ "eval_recall": 0.15384615384615385,
626
+ "eval_runtime": 50.3051,
627
+ "eval_samples_per_second": 5.387,
628
+ "eval_steps_per_second": 0.179,
629
+ "step": 80
630
+ },
631
+ {
632
+ "epoch": 0.03839772457928419,
633
+ "grad_norm": 1.7248247861862183,
634
+ "learning_rate": 7.677725118483414e-06,
635
+ "loss": 0.4988,
636
+ "step": 81
637
+ },
638
+ {
639
+ "epoch": 0.03887177056174449,
640
+ "grad_norm": 2.6806564331054688,
641
+ "learning_rate": 7.772511848341233e-06,
642
+ "loss": 0.6173,
643
+ "step": 82
644
+ },
645
+ {
646
+ "epoch": 0.03934581654420479,
647
+ "grad_norm": 3.3090500831604004,
648
+ "learning_rate": 7.867298578199053e-06,
649
+ "loss": 0.6747,
650
+ "step": 83
651
+ },
652
+ {
653
+ "epoch": 0.039819862526665084,
654
+ "grad_norm": 1.7768396139144897,
655
+ "learning_rate": 7.962085308056872e-06,
656
+ "loss": 0.4246,
657
+ "step": 84
658
+ },
659
+ {
660
+ "epoch": 0.040293908509125384,
661
+ "grad_norm": 2.553398847579956,
662
+ "learning_rate": 8.056872037914693e-06,
663
+ "loss": 0.5885,
664
+ "step": 85
665
+ },
666
+ {
667
+ "epoch": 0.040767954491585684,
668
+ "grad_norm": 2.223745107650757,
669
+ "learning_rate": 8.151658767772512e-06,
670
+ "loss": 0.631,
671
+ "step": 86
672
+ },
673
+ {
674
+ "epoch": 0.041242000474045984,
675
+ "grad_norm": 2.303098440170288,
676
+ "learning_rate": 8.246445497630333e-06,
677
+ "loss": 0.4689,
678
+ "step": 87
679
+ },
680
+ {
681
+ "epoch": 0.041716046456506284,
682
+ "grad_norm": 1.8970552682876587,
683
+ "learning_rate": 8.341232227488152e-06,
684
+ "loss": 0.523,
685
+ "step": 88
686
+ },
687
+ {
688
+ "epoch": 0.04219009243896658,
689
+ "grad_norm": 2.505955934524536,
690
+ "learning_rate": 8.436018957345973e-06,
691
+ "loss": 0.4935,
692
+ "step": 89
693
+ },
694
+ {
695
+ "epoch": 0.04266413842142688,
696
+ "grad_norm": 1.875301718711853,
697
+ "learning_rate": 8.530805687203793e-06,
698
+ "loss": 0.4522,
699
+ "step": 90
700
+ },
701
+ {
702
+ "epoch": 0.04313818440388718,
703
+ "grad_norm": 1.900534749031067,
704
+ "learning_rate": 8.625592417061612e-06,
705
+ "loss": 0.4667,
706
+ "step": 91
707
+ },
708
+ {
709
+ "epoch": 0.04361223038634748,
710
+ "grad_norm": 3.142495632171631,
711
+ "learning_rate": 8.720379146919431e-06,
712
+ "loss": 0.7367,
713
+ "step": 92
714
+ },
715
+ {
716
+ "epoch": 0.04408627636880778,
717
+ "grad_norm": 2.096675395965576,
718
+ "learning_rate": 8.815165876777252e-06,
719
+ "loss": 0.5286,
720
+ "step": 93
721
+ },
722
+ {
723
+ "epoch": 0.04456032235126807,
724
+ "grad_norm": 2.4111526012420654,
725
+ "learning_rate": 8.909952606635071e-06,
726
+ "loss": 0.4165,
727
+ "step": 94
728
+ },
729
+ {
730
+ "epoch": 0.04503436833372837,
731
+ "grad_norm": 2.4553468227386475,
732
+ "learning_rate": 9.004739336492892e-06,
733
+ "loss": 0.5336,
734
+ "step": 95
735
+ },
736
+ {
737
+ "epoch": 0.04550841431618867,
738
+ "grad_norm": 2.3772170543670654,
739
+ "learning_rate": 9.09952606635071e-06,
740
+ "loss": 0.455,
741
+ "step": 96
742
+ },
743
+ {
744
+ "epoch": 0.04598246029864897,
745
+ "grad_norm": 2.652953863143921,
746
+ "learning_rate": 9.194312796208532e-06,
747
+ "loss": 0.5995,
748
+ "step": 97
749
+ },
750
+ {
751
+ "epoch": 0.04645650628110927,
752
+ "grad_norm": 1.7384384870529175,
753
+ "learning_rate": 9.289099526066352e-06,
754
+ "loss": 0.4296,
755
+ "step": 98
756
+ },
757
+ {
758
+ "epoch": 0.04693055226356957,
759
+ "grad_norm": 2.1251447200775146,
760
+ "learning_rate": 9.383886255924171e-06,
761
+ "loss": 0.5505,
762
+ "step": 99
763
+ },
764
+ {
765
+ "epoch": 0.04740459824602986,
766
+ "grad_norm": 1.7407325506210327,
767
+ "learning_rate": 9.478672985781992e-06,
768
+ "loss": 0.4165,
769
+ "step": 100
770
+ },
771
+ {
772
+ "epoch": 0.04740459824602986,
773
+ "eval_accuracy": 0.9549114331723028,
774
+ "eval_f1": 0.40425531914893614,
775
+ "eval_loss": 0.2602430284023285,
776
+ "eval_precision": 0.4523809523809524,
777
+ "eval_recall": 0.36538461538461536,
778
+ "eval_runtime": 50.5287,
779
+ "eval_samples_per_second": 5.363,
780
+ "eval_steps_per_second": 0.178,
781
+ "step": 100
782
+ }
783
+ ],
784
+ "logging_steps": 1,
785
+ "max_steps": 2109,
786
+ "num_input_tokens_seen": 0,
787
+ "num_train_epochs": 1,
788
+ "save_steps": 100,
789
+ "stateful_callbacks": {
790
+ "TrainerControl": {
791
+ "args": {
792
+ "should_epoch_stop": false,
793
+ "should_evaluate": false,
794
+ "should_log": false,
795
+ "should_save": true,
796
+ "should_training_stop": false
797
+ },
798
+ "attributes": {}
799
+ }
800
+ },
801
+ "total_flos": 2.677803521880883e+16,
802
+ "train_batch_size": 8,
803
+ "trial_name": null,
804
+ "trial_params": null
805
+ }