dmartincc committed on
Commit
40f86f6
·
verified ·
1 Parent(s): 7d2fcc5

StressTech/vedt-lg

Browse files
Files changed (4) hide show
  1. all_results.json +11 -11
  2. eval_results.json +7 -7
  3. train_results.json +4 -4
  4. trainer_state.json +160 -160
all_results.json CHANGED
@@ -1,14 +1,14 @@
1
  {
2
  "epoch": 4.98,
3
- "eval_accuracy": 0.94,
4
- "eval_f1": 0.95,
5
- "eval_loss": 0.1643325686454773,
6
- "eval_roc_auc": 0.96,
7
- "eval_runtime": 251.2629,
8
- "eval_samples_per_second": 2.754,
9
- "eval_steps_per_second": 0.175,
10
- "train_loss": 0.2953212790801877,
11
- "train_runtime": 23053.216,
12
- "train_samples_per_second": 0.849,
13
- "train_steps_per_second": 0.026
14
  }
 
1
  {
2
  "epoch": 4.98,
3
+ "eval_accuracy": 0.92,
4
+ "eval_f1": 0.93,
5
+ "eval_loss": 0.18165849149227142,
6
+ "eval_roc_auc": 0.95,
7
+ "eval_runtime": 267.8594,
8
+ "eval_samples_per_second": 2.583,
9
+ "eval_steps_per_second": 0.164,
10
+ "train_loss": 0.31025831718913843,
11
+ "train_runtime": 21358.5627,
12
+ "train_samples_per_second": 0.916,
13
+ "train_steps_per_second": 0.029
14
  }
eval_results.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "epoch": 4.98,
3
- "eval_accuracy": 0.94,
4
- "eval_f1": 0.95,
5
- "eval_loss": 0.1643325686454773,
6
- "eval_roc_auc": 0.96,
7
- "eval_runtime": 251.2629,
8
- "eval_samples_per_second": 2.754,
9
- "eval_steps_per_second": 0.175
10
  }
 
1
  {
2
  "epoch": 4.98,
3
+ "eval_accuracy": 0.92,
4
+ "eval_f1": 0.93,
5
+ "eval_loss": 0.18165849149227142,
6
+ "eval_roc_auc": 0.95,
7
+ "eval_runtime": 267.8594,
8
+ "eval_samples_per_second": 2.583,
9
+ "eval_steps_per_second": 0.164
10
  }
train_results.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "epoch": 4.98,
3
- "train_loss": 0.2953212790801877,
4
- "train_runtime": 23053.216,
5
- "train_samples_per_second": 0.849,
6
- "train_steps_per_second": 0.026
7
  }
 
1
  {
2
  "epoch": 4.98,
3
+ "train_loss": 0.31025831718913843,
4
+ "train_runtime": 21358.5627,
5
+ "train_samples_per_second": 0.916,
6
+ "train_steps_per_second": 0.029
7
  }
trainer_state.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "best_metric": 0.94,
3
  "best_model_checkpoint": "vedt-lg/checkpoint-610",
4
  "epoch": 4.979591836734694,
5
  "eval_steps": 500,
@@ -10,494 +10,494 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.08,
13
- "grad_norm": 0.6926315426826477,
14
  "learning_rate": 8.196721311475409e-06,
15
- "loss": 0.6858,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.16,
20
- "grad_norm": 0.6380481123924255,
21
  "learning_rate": 1.6393442622950818e-05,
22
- "loss": 0.6122,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.24,
27
- "grad_norm": 0.44903865456581116,
28
  "learning_rate": 2.459016393442623e-05,
29
- "loss": 0.5935,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.33,
34
- "grad_norm": 0.1756860464811325,
35
  "learning_rate": 3.2786885245901635e-05,
36
- "loss": 0.5547,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.41,
41
- "grad_norm": 0.43841540813446045,
42
  "learning_rate": 4.098360655737705e-05,
43
- "loss": 0.5486,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.49,
48
- "grad_norm": 0.3223252296447754,
49
  "learning_rate": 4.918032786885246e-05,
50
- "loss": 0.5816,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.57,
55
- "grad_norm": 0.42403605580329895,
56
  "learning_rate": 4.918032786885246e-05,
57
- "loss": 0.5797,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.65,
62
- "grad_norm": 0.6035967469215393,
63
  "learning_rate": 4.8269581056466304e-05,
64
- "loss": 0.5587,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.73,
69
- "grad_norm": 0.6372044086456299,
70
  "learning_rate": 4.7358834244080144e-05,
71
- "loss": 0.548,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.82,
76
- "grad_norm": 0.5376227498054504,
77
  "learning_rate": 4.644808743169399e-05,
78
- "loss": 0.5741,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.9,
83
- "grad_norm": 0.5992793440818787,
84
  "learning_rate": 4.553734061930783e-05,
85
- "loss": 0.5331,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.98,
90
- "grad_norm": 0.6592369675636292,
91
  "learning_rate": 4.462659380692168e-05,
92
- "loss": 0.523,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 1.0,
97
- "eval_accuracy": 0.45,
98
  "eval_f1": 0.53,
99
- "eval_loss": 0.5291920304298401,
100
  "eval_roc_auc": 0.67,
101
- "eval_runtime": 308.9689,
102
- "eval_samples_per_second": 2.24,
103
- "eval_steps_per_second": 0.142,
104
  "step": 122
105
  },
106
  {
107
  "epoch": 1.06,
108
- "grad_norm": 0.5247851610183716,
109
  "learning_rate": 4.371584699453552e-05,
110
- "loss": 0.508,
111
  "step": 130
112
  },
113
  {
114
  "epoch": 1.14,
115
- "grad_norm": 0.6510629653930664,
116
  "learning_rate": 4.280510018214937e-05,
117
- "loss": 0.5039,
118
  "step": 140
119
  },
120
  {
121
  "epoch": 1.22,
122
- "grad_norm": 0.8890066146850586,
123
  "learning_rate": 4.189435336976321e-05,
124
- "loss": 0.5184,
125
  "step": 150
126
  },
127
  {
128
  "epoch": 1.31,
129
- "grad_norm": 1.0160518884658813,
130
  "learning_rate": 4.098360655737705e-05,
131
- "loss": 0.4992,
132
  "step": 160
133
  },
134
  {
135
  "epoch": 1.39,
136
- "grad_norm": 1.0044326782226562,
137
  "learning_rate": 4.007285974499089e-05,
138
- "loss": 0.4835,
139
  "step": 170
140
  },
141
  {
142
  "epoch": 1.47,
143
- "grad_norm": 1.259665846824646,
144
  "learning_rate": 3.916211293260474e-05,
145
- "loss": 0.4324,
146
  "step": 180
147
  },
148
  {
149
  "epoch": 1.55,
150
- "grad_norm": 1.2308465242385864,
151
  "learning_rate": 3.825136612021858e-05,
152
- "loss": 0.4026,
153
  "step": 190
154
  },
155
  {
156
  "epoch": 1.63,
157
- "grad_norm": 1.2136099338531494,
158
  "learning_rate": 3.7340619307832425e-05,
159
- "loss": 0.4163,
160
  "step": 200
161
  },
162
  {
163
  "epoch": 1.71,
164
- "grad_norm": 1.1175576448440552,
165
  "learning_rate": 3.6429872495446266e-05,
166
- "loss": 0.3774,
167
  "step": 210
168
  },
169
  {
170
  "epoch": 1.8,
171
- "grad_norm": 2.0764663219451904,
172
  "learning_rate": 3.551912568306011e-05,
173
- "loss": 0.366,
174
  "step": 220
175
  },
176
  {
177
  "epoch": 1.88,
178
- "grad_norm": 1.3543003797531128,
179
  "learning_rate": 3.4608378870673954e-05,
180
- "loss": 0.332,
181
  "step": 230
182
  },
183
  {
184
  "epoch": 1.96,
185
- "grad_norm": 2.1058902740478516,
186
  "learning_rate": 3.36976320582878e-05,
187
- "loss": 0.3308,
188
  "step": 240
189
  },
190
  {
191
  "epoch": 2.0,
192
- "eval_accuracy": 0.79,
193
- "eval_f1": 0.82,
194
- "eval_loss": 0.33306625485420227,
195
- "eval_roc_auc": 0.86,
196
- "eval_runtime": 252.3349,
197
- "eval_samples_per_second": 2.742,
198
- "eval_steps_per_second": 0.174,
199
  "step": 245
200
  },
201
  {
202
  "epoch": 2.04,
203
- "grad_norm": 1.6840165853500366,
204
  "learning_rate": 3.2786885245901635e-05,
205
- "loss": 0.292,
206
  "step": 250
207
  },
208
  {
209
  "epoch": 2.12,
210
- "grad_norm": 1.5220506191253662,
211
  "learning_rate": 3.187613843351548e-05,
212
- "loss": 0.2798,
213
  "step": 260
214
  },
215
  {
216
  "epoch": 2.2,
217
- "grad_norm": 1.846103310585022,
218
  "learning_rate": 3.096539162112932e-05,
219
- "loss": 0.2568,
220
  "step": 270
221
  },
222
  {
223
  "epoch": 2.29,
224
- "grad_norm": 1.4235060214996338,
225
  "learning_rate": 3.005464480874317e-05,
226
- "loss": 0.2553,
227
  "step": 280
228
  },
229
  {
230
  "epoch": 2.37,
231
- "grad_norm": 0.7757242918014526,
232
  "learning_rate": 2.9143897996357018e-05,
233
- "loss": 0.2266,
234
  "step": 290
235
  },
236
  {
237
  "epoch": 2.45,
238
- "grad_norm": 0.7044312357902527,
239
  "learning_rate": 2.823315118397086e-05,
240
- "loss": 0.2444,
241
  "step": 300
242
  },
243
  {
244
  "epoch": 2.53,
245
- "grad_norm": 1.387548565864563,
246
  "learning_rate": 2.7322404371584703e-05,
247
- "loss": 0.2519,
248
  "step": 310
249
  },
250
  {
251
  "epoch": 2.61,
252
- "grad_norm": 1.388034462928772,
253
  "learning_rate": 2.6411657559198543e-05,
254
- "loss": 0.2271,
255
  "step": 320
256
  },
257
  {
258
  "epoch": 2.69,
259
- "grad_norm": 1.846086859703064,
260
  "learning_rate": 2.550091074681239e-05,
261
- "loss": 0.2093,
262
  "step": 330
263
  },
264
  {
265
  "epoch": 2.78,
266
- "grad_norm": 0.7766602039337158,
267
  "learning_rate": 2.459016393442623e-05,
268
- "loss": 0.2211,
269
  "step": 340
270
  },
271
  {
272
  "epoch": 2.86,
273
- "grad_norm": 0.8391594290733337,
274
  "learning_rate": 2.3679417122040072e-05,
275
- "loss": 0.204,
276
  "step": 350
277
  },
278
  {
279
  "epoch": 2.94,
280
- "grad_norm": 2.0943243503570557,
281
  "learning_rate": 2.2768670309653916e-05,
282
- "loss": 0.1989,
283
  "step": 360
284
  },
285
  {
286
  "epoch": 3.0,
287
- "eval_accuracy": 0.9,
288
- "eval_f1": 0.91,
289
- "eval_loss": 0.22652386128902435,
290
- "eval_roc_auc": 0.93,
291
- "eval_runtime": 245.4746,
292
- "eval_samples_per_second": 2.819,
293
- "eval_steps_per_second": 0.179,
294
  "step": 367
295
  },
296
  {
297
  "epoch": 3.02,
298
- "grad_norm": 1.7276263236999512,
299
  "learning_rate": 2.185792349726776e-05,
300
- "loss": 0.189,
301
  "step": 370
302
  },
303
  {
304
  "epoch": 3.1,
305
- "grad_norm": 0.5598946809768677,
306
  "learning_rate": 2.0947176684881604e-05,
307
- "loss": 0.1644,
308
  "step": 380
309
  },
310
  {
311
  "epoch": 3.18,
312
- "grad_norm": 0.7145459651947021,
313
  "learning_rate": 2.0036429872495445e-05,
314
- "loss": 0.139,
315
  "step": 390
316
  },
317
  {
318
  "epoch": 3.27,
319
- "grad_norm": 1.7051069736480713,
320
  "learning_rate": 1.912568306010929e-05,
321
- "loss": 0.136,
322
  "step": 400
323
  },
324
  {
325
  "epoch": 3.35,
326
- "grad_norm": 0.9204809665679932,
327
  "learning_rate": 1.8214936247723133e-05,
328
- "loss": 0.1481,
329
  "step": 410
330
  },
331
  {
332
  "epoch": 3.43,
333
- "grad_norm": 0.36123162508010864,
334
  "learning_rate": 1.7304189435336977e-05,
335
- "loss": 0.1329,
336
  "step": 420
337
  },
338
  {
339
  "epoch": 3.51,
340
- "grad_norm": 1.1382514238357544,
341
  "learning_rate": 1.6393442622950818e-05,
342
- "loss": 0.1317,
343
  "step": 430
344
  },
345
  {
346
  "epoch": 3.59,
347
- "grad_norm": 0.461958110332489,
348
  "learning_rate": 1.548269581056466e-05,
349
- "loss": 0.1353,
350
  "step": 440
351
  },
352
  {
353
  "epoch": 3.67,
354
- "grad_norm": 0.41453880071640015,
355
  "learning_rate": 1.4571948998178509e-05,
356
- "loss": 0.1446,
357
  "step": 450
358
  },
359
  {
360
  "epoch": 3.76,
361
- "grad_norm": 0.7464944124221802,
362
  "learning_rate": 1.3661202185792351e-05,
363
- "loss": 0.1263,
364
  "step": 460
365
  },
366
  {
367
  "epoch": 3.84,
368
- "grad_norm": 3.2197680473327637,
369
  "learning_rate": 1.2750455373406195e-05,
370
- "loss": 0.1328,
371
  "step": 470
372
  },
373
  {
374
  "epoch": 3.92,
375
- "grad_norm": 1.6838792562484741,
376
  "learning_rate": 1.1839708561020036e-05,
377
- "loss": 0.13,
378
  "step": 480
379
  },
380
  {
381
  "epoch": 4.0,
382
- "grad_norm": 1.1785459518432617,
383
  "learning_rate": 1.092896174863388e-05,
384
- "loss": 0.1182,
385
  "step": 490
386
  },
387
  {
388
  "epoch": 4.0,
389
- "eval_accuracy": 0.92,
390
- "eval_f1": 0.92,
391
- "eval_loss": 0.19490335881710052,
392
- "eval_roc_auc": 0.94,
393
- "eval_runtime": 244.4085,
394
- "eval_samples_per_second": 2.831,
395
- "eval_steps_per_second": 0.18,
396
  "step": 490
397
  },
398
  {
399
  "epoch": 4.08,
400
- "grad_norm": 1.0171513557434082,
401
  "learning_rate": 1.0018214936247722e-05,
402
- "loss": 0.1215,
403
  "step": 500
404
  },
405
  {
406
  "epoch": 4.16,
407
- "grad_norm": 0.2800655961036682,
408
  "learning_rate": 9.107468123861566e-06,
409
- "loss": 0.1079,
410
  "step": 510
411
  },
412
  {
413
  "epoch": 4.24,
414
- "grad_norm": 0.21738438308238983,
415
  "learning_rate": 8.196721311475409e-06,
416
- "loss": 0.1035,
417
  "step": 520
418
  },
419
  {
420
  "epoch": 4.33,
421
- "grad_norm": 1.4126193523406982,
422
  "learning_rate": 7.2859744990892545e-06,
423
- "loss": 0.0983,
424
  "step": 530
425
  },
426
  {
427
  "epoch": 4.41,
428
- "grad_norm": 0.337568074464798,
429
  "learning_rate": 6.375227686703098e-06,
430
- "loss": 0.097,
431
  "step": 540
432
  },
433
  {
434
  "epoch": 4.49,
435
- "grad_norm": 0.7930494546890259,
436
  "learning_rate": 5.46448087431694e-06,
437
- "loss": 0.0995,
438
  "step": 550
439
  },
440
  {
441
  "epoch": 4.57,
442
- "grad_norm": 3.2153773307800293,
443
  "learning_rate": 4.553734061930783e-06,
444
- "loss": 0.1041,
445
  "step": 560
446
  },
447
  {
448
  "epoch": 4.65,
449
- "grad_norm": 0.3979400098323822,
450
  "learning_rate": 3.6429872495446273e-06,
451
- "loss": 0.1106,
452
  "step": 570
453
  },
454
  {
455
  "epoch": 4.73,
456
- "grad_norm": 0.2404131442308426,
457
  "learning_rate": 2.73224043715847e-06,
458
- "loss": 0.1115,
459
  "step": 580
460
  },
461
  {
462
  "epoch": 4.82,
463
- "grad_norm": 2.076061248779297,
464
  "learning_rate": 1.8214936247723136e-06,
465
- "loss": 0.1026,
466
  "step": 590
467
  },
468
  {
469
  "epoch": 4.9,
470
- "grad_norm": 0.37801551818847656,
471
  "learning_rate": 9.107468123861568e-07,
472
- "loss": 0.1055,
473
  "step": 600
474
  },
475
  {
476
  "epoch": 4.98,
477
- "grad_norm": 0.21578004956245422,
478
  "learning_rate": 0.0,
479
- "loss": 0.0936,
480
  "step": 610
481
  },
482
  {
483
  "epoch": 4.98,
484
- "eval_accuracy": 0.94,
485
- "eval_f1": 0.95,
486
- "eval_loss": 0.1643325686454773,
487
- "eval_roc_auc": 0.96,
488
- "eval_runtime": 246.0383,
489
- "eval_samples_per_second": 2.813,
490
- "eval_steps_per_second": 0.179,
491
  "step": 610
492
  },
493
  {
494
  "epoch": 4.98,
495
  "step": 610,
496
  "total_flos": 1.511107340940288e+18,
497
- "train_loss": 0.2953212790801877,
498
- "train_runtime": 23053.216,
499
- "train_samples_per_second": 0.849,
500
- "train_steps_per_second": 0.026
501
  }
502
  ],
503
  "logging_steps": 10,
 
1
  {
2
+ "best_metric": 0.92,
3
  "best_model_checkpoint": "vedt-lg/checkpoint-610",
4
  "epoch": 4.979591836734694,
5
  "eval_steps": 500,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.08,
13
+ "grad_norm": 0.682578980922699,
14
  "learning_rate": 8.196721311475409e-06,
15
+ "loss": 0.6807,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.16,
20
+ "grad_norm": 0.5046971440315247,
21
  "learning_rate": 1.6393442622950818e-05,
22
+ "loss": 0.6364,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.24,
27
+ "grad_norm": 0.33521437644958496,
28
  "learning_rate": 2.459016393442623e-05,
29
+ "loss": 0.5854,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.33,
34
+ "grad_norm": 0.43238601088523865,
35
  "learning_rate": 3.2786885245901635e-05,
36
+ "loss": 0.5798,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.41,
41
+ "grad_norm": 0.4953717291355133,
42
  "learning_rate": 4.098360655737705e-05,
43
+ "loss": 0.566,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.49,
48
+ "grad_norm": 0.2957334816455841,
49
  "learning_rate": 4.918032786885246e-05,
50
+ "loss": 0.57,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.57,
55
+ "grad_norm": 0.5350973010063171,
56
  "learning_rate": 4.918032786885246e-05,
57
+ "loss": 0.5502,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.65,
62
+ "grad_norm": 0.4942874312400818,
63
  "learning_rate": 4.8269581056466304e-05,
64
+ "loss": 0.5717,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.73,
69
+ "grad_norm": 0.6144607067108154,
70
  "learning_rate": 4.7358834244080144e-05,
71
+ "loss": 0.5507,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.82,
76
+ "grad_norm": 0.5439937710762024,
77
  "learning_rate": 4.644808743169399e-05,
78
+ "loss": 0.5512,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.9,
83
+ "grad_norm": 1.0248185396194458,
84
  "learning_rate": 4.553734061930783e-05,
85
+ "loss": 0.5299,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.98,
90
+ "grad_norm": 0.7169288992881775,
91
  "learning_rate": 4.462659380692168e-05,
92
+ "loss": 0.5369,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 1.0,
97
+ "eval_accuracy": 0.41,
98
  "eval_f1": 0.53,
99
+ "eval_loss": 0.5338725447654724,
100
  "eval_roc_auc": 0.67,
101
+ "eval_runtime": 232.797,
102
+ "eval_samples_per_second": 2.973,
103
+ "eval_steps_per_second": 0.189,
104
  "step": 122
105
  },
106
  {
107
  "epoch": 1.06,
108
+ "grad_norm": 0.40648889541625977,
109
  "learning_rate": 4.371584699453552e-05,
110
+ "loss": 0.5107,
111
  "step": 130
112
  },
113
  {
114
  "epoch": 1.14,
115
+ "grad_norm": 0.8224670886993408,
116
  "learning_rate": 4.280510018214937e-05,
117
+ "loss": 0.5076,
118
  "step": 140
119
  },
120
  {
121
  "epoch": 1.22,
122
+ "grad_norm": 1.0490410327911377,
123
  "learning_rate": 4.189435336976321e-05,
124
+ "loss": 0.4701,
125
  "step": 150
126
  },
127
  {
128
  "epoch": 1.31,
129
+ "grad_norm": 0.9794094562530518,
130
  "learning_rate": 4.098360655737705e-05,
131
+ "loss": 0.4728,
132
  "step": 160
133
  },
134
  {
135
  "epoch": 1.39,
136
+ "grad_norm": 1.6970316171646118,
137
  "learning_rate": 4.007285974499089e-05,
138
+ "loss": 0.4583,
139
  "step": 170
140
  },
141
  {
142
  "epoch": 1.47,
143
+ "grad_norm": 0.9203856587409973,
144
  "learning_rate": 3.916211293260474e-05,
145
+ "loss": 0.4424,
146
  "step": 180
147
  },
148
  {
149
  "epoch": 1.55,
150
+ "grad_norm": 1.466894507408142,
151
  "learning_rate": 3.825136612021858e-05,
152
+ "loss": 0.4705,
153
  "step": 190
154
  },
155
  {
156
  "epoch": 1.63,
157
+ "grad_norm": 1.467274785041809,
158
  "learning_rate": 3.7340619307832425e-05,
159
+ "loss": 0.3944,
160
  "step": 200
161
  },
162
  {
163
  "epoch": 1.71,
164
+ "grad_norm": 2.0932650566101074,
165
  "learning_rate": 3.6429872495446266e-05,
166
+ "loss": 0.4124,
167
  "step": 210
168
  },
169
  {
170
  "epoch": 1.8,
171
+ "grad_norm": 1.7254140377044678,
172
  "learning_rate": 3.551912568306011e-05,
173
+ "loss": 0.3879,
174
  "step": 220
175
  },
176
  {
177
  "epoch": 1.88,
178
+ "grad_norm": 1.5929466485977173,
179
  "learning_rate": 3.4608378870673954e-05,
180
+ "loss": 0.3973,
181
  "step": 230
182
  },
183
  {
184
  "epoch": 1.96,
185
+ "grad_norm": 1.1917792558670044,
186
  "learning_rate": 3.36976320582878e-05,
187
+ "loss": 0.3995,
188
  "step": 240
189
  },
190
  {
191
  "epoch": 2.0,
192
+ "eval_accuracy": 0.73,
193
+ "eval_f1": 0.8,
194
+ "eval_loss": 0.35908573865890503,
195
+ "eval_roc_auc": 0.84,
196
+ "eval_runtime": 233.574,
197
+ "eval_samples_per_second": 2.963,
198
+ "eval_steps_per_second": 0.188,
199
  "step": 245
200
  },
201
  {
202
  "epoch": 2.04,
203
+ "grad_norm": 1.0108511447906494,
204
  "learning_rate": 3.2786885245901635e-05,
205
+ "loss": 0.3515,
206
  "step": 250
207
  },
208
  {
209
  "epoch": 2.12,
210
+ "grad_norm": 2.756605386734009,
211
  "learning_rate": 3.187613843351548e-05,
212
+ "loss": 0.3185,
213
  "step": 260
214
  },
215
  {
216
  "epoch": 2.2,
217
+ "grad_norm": 1.808558702468872,
218
  "learning_rate": 3.096539162112932e-05,
219
+ "loss": 0.2923,
220
  "step": 270
221
  },
222
  {
223
  "epoch": 2.29,
224
+ "grad_norm": 2.1279032230377197,
225
  "learning_rate": 3.005464480874317e-05,
226
+ "loss": 0.3079,
227
  "step": 280
228
  },
229
  {
230
  "epoch": 2.37,
231
+ "grad_norm": 2.7758231163024902,
232
  "learning_rate": 2.9143897996357018e-05,
233
+ "loss": 0.2895,
234
  "step": 290
235
  },
236
  {
237
  "epoch": 2.45,
238
+ "grad_norm": 3.106663465499878,
239
  "learning_rate": 2.823315118397086e-05,
240
+ "loss": 0.2637,
241
  "step": 300
242
  },
243
  {
244
  "epoch": 2.53,
245
+ "grad_norm": 2.2975656986236572,
246
  "learning_rate": 2.7322404371584703e-05,
247
+ "loss": 0.2647,
248
  "step": 310
249
  },
250
  {
251
  "epoch": 2.61,
252
+ "grad_norm": 2.8662543296813965,
253
  "learning_rate": 2.6411657559198543e-05,
254
+ "loss": 0.2605,
255
  "step": 320
256
  },
257
  {
258
  "epoch": 2.69,
259
+ "grad_norm": 1.3953065872192383,
260
  "learning_rate": 2.550091074681239e-05,
261
+ "loss": 0.2555,
262
  "step": 330
263
  },
264
  {
265
  "epoch": 2.78,
266
+ "grad_norm": 1.7778942584991455,
267
  "learning_rate": 2.459016393442623e-05,
268
+ "loss": 0.2632,
269
  "step": 340
270
  },
271
  {
272
  "epoch": 2.86,
273
+ "grad_norm": 1.884192943572998,
274
  "learning_rate": 2.3679417122040072e-05,
275
+ "loss": 0.2273,
276
  "step": 350
277
  },
278
  {
279
  "epoch": 2.94,
280
+ "grad_norm": 1.6550705432891846,
281
  "learning_rate": 2.2768670309653916e-05,
282
+ "loss": 0.2357,
283
  "step": 360
284
  },
285
  {
286
  "epoch": 3.0,
287
+ "eval_accuracy": 0.88,
288
+ "eval_f1": 0.89,
289
+ "eval_loss": 0.24918493628501892,
290
+ "eval_roc_auc": 0.92,
291
+ "eval_runtime": 229.7594,
292
+ "eval_samples_per_second": 3.012,
293
+ "eval_steps_per_second": 0.192,
294
  "step": 367
295
  },
296
  {
297
  "epoch": 3.02,
298
+ "grad_norm": 2.5777928829193115,
299
  "learning_rate": 2.185792349726776e-05,
300
+ "loss": 0.2088,
301
  "step": 370
302
  },
303
  {
304
  "epoch": 3.1,
305
+ "grad_norm": 0.8032457232475281,
306
  "learning_rate": 2.0947176684881604e-05,
307
+ "loss": 0.1768,
308
  "step": 380
309
  },
310
  {
311
  "epoch": 3.18,
312
+ "grad_norm": 0.6953706741333008,
313
  "learning_rate": 2.0036429872495445e-05,
314
+ "loss": 0.1507,
315
  "step": 390
316
  },
317
  {
318
  "epoch": 3.27,
319
+ "grad_norm": 2.908647060394287,
320
  "learning_rate": 1.912568306010929e-05,
321
+ "loss": 0.158,
322
  "step": 400
323
  },
324
  {
325
  "epoch": 3.35,
326
+ "grad_norm": 0.8021059036254883,
327
  "learning_rate": 1.8214936247723133e-05,
328
+ "loss": 0.1645,
329
  "step": 410
330
  },
331
  {
332
  "epoch": 3.43,
333
+ "grad_norm": 0.3419005274772644,
334
  "learning_rate": 1.7304189435336977e-05,
335
+ "loss": 0.1502,
336
  "step": 420
337
  },
338
  {
339
  "epoch": 3.51,
340
+ "grad_norm": 1.2981460094451904,
341
  "learning_rate": 1.6393442622950818e-05,
342
+ "loss": 0.1494,
343
  "step": 430
344
  },
345
  {
346
  "epoch": 3.59,
347
+ "grad_norm": 1.8636257648468018,
348
  "learning_rate": 1.548269581056466e-05,
349
+ "loss": 0.1536,
350
  "step": 440
351
  },
352
  {
353
  "epoch": 3.67,
354
+ "grad_norm": 2.633089780807495,
355
  "learning_rate": 1.4571948998178509e-05,
356
+ "loss": 0.1471,
357
  "step": 450
358
  },
359
  {
360
  "epoch": 3.76,
361
+ "grad_norm": 0.5719029903411865,
362
  "learning_rate": 1.3661202185792351e-05,
363
+ "loss": 0.1278,
364
  "step": 460
365
  },
366
  {
367
  "epoch": 3.84,
368
+ "grad_norm": 0.7035483121871948,
369
  "learning_rate": 1.2750455373406195e-05,
370
+ "loss": 0.1415,
371
  "step": 470
372
  },
373
  {
374
  "epoch": 3.92,
375
+ "grad_norm": 2.4820892810821533,
376
  "learning_rate": 1.1839708561020036e-05,
377
+ "loss": 0.1419,
378
  "step": 480
379
  },
380
  {
381
  "epoch": 4.0,
382
+ "grad_norm": 0.7846884727478027,
383
  "learning_rate": 1.092896174863388e-05,
384
+ "loss": 0.1409,
385
  "step": 490
386
  },
387
  {
388
  "epoch": 4.0,
389
+ "eval_accuracy": 0.9,
390
+ "eval_f1": 0.91,
391
+ "eval_loss": 0.20149800181388855,
392
+ "eval_roc_auc": 0.93,
393
+ "eval_runtime": 234.0099,
394
+ "eval_samples_per_second": 2.957,
395
+ "eval_steps_per_second": 0.188,
396
  "step": 490
397
  },
398
  {
399
  "epoch": 4.08,
400
+ "grad_norm": 1.5027194023132324,
401
  "learning_rate": 1.0018214936247722e-05,
402
+ "loss": 0.1238,
403
  "step": 500
404
  },
405
  {
406
  "epoch": 4.16,
407
+ "grad_norm": 0.26064255833625793,
408
  "learning_rate": 9.107468123861566e-06,
409
+ "loss": 0.1198,
410
  "step": 510
411
  },
412
  {
413
  "epoch": 4.24,
414
+ "grad_norm": 1.2067747116088867,
415
  "learning_rate": 8.196721311475409e-06,
416
+ "loss": 0.1172,
417
  "step": 520
418
  },
419
  {
420
  "epoch": 4.33,
421
+ "grad_norm": 1.3866766691207886,
422
  "learning_rate": 7.2859744990892545e-06,
423
+ "loss": 0.122,
424
  "step": 530
425
  },
426
  {
427
  "epoch": 4.41,
428
+ "grad_norm": 1.4562671184539795,
429
  "learning_rate": 6.375227686703098e-06,
430
+ "loss": 0.1041,
431
  "step": 540
432
  },
433
  {
434
  "epoch": 4.49,
435
+ "grad_norm": 0.4889439344406128,
436
  "learning_rate": 5.46448087431694e-06,
437
+ "loss": 0.1097,
438
  "step": 550
439
  },
440
  {
441
  "epoch": 4.57,
442
+ "grad_norm": 0.21975009143352509,
443
  "learning_rate": 4.553734061930783e-06,
444
+ "loss": 0.1122,
445
  "step": 560
446
  },
447
  {
448
  "epoch": 4.65,
449
+ "grad_norm": 0.25279122591018677,
450
  "learning_rate": 3.6429872495446273e-06,
451
+ "loss": 0.102,
452
  "step": 570
453
  },
454
  {
455
  "epoch": 4.73,
456
+ "grad_norm": 1.0802370309829712,
457
  "learning_rate": 2.73224043715847e-06,
458
+ "loss": 0.1086,
459
  "step": 580
460
  },
461
  {
462
  "epoch": 4.82,
463
+ "grad_norm": 0.4069725275039673,
464
  "learning_rate": 1.8214936247723136e-06,
465
+ "loss": 0.1145,
466
  "step": 590
467
  },
468
  {
469
  "epoch": 4.9,
470
+ "grad_norm": 1.47029709815979,
471
  "learning_rate": 9.107468123861568e-07,
472
+ "loss": 0.104,
473
  "step": 600
474
  },
475
  {
476
  "epoch": 4.98,
477
+ "grad_norm": 0.2243444174528122,
478
  "learning_rate": 0.0,
479
+ "loss": 0.1137,
480
  "step": 610
481
  },
482
  {
483
  "epoch": 4.98,
484
+ "eval_accuracy": 0.92,
485
+ "eval_f1": 0.93,
486
+ "eval_loss": 0.18165849149227142,
487
+ "eval_roc_auc": 0.95,
488
+ "eval_runtime": 241.8614,
489
+ "eval_samples_per_second": 2.861,
490
+ "eval_steps_per_second": 0.182,
491
  "step": 610
492
  },
493
  {
494
  "epoch": 4.98,
495
  "step": 610,
496
  "total_flos": 1.511107340940288e+18,
497
+ "train_loss": 0.31025831718913843,
498
+ "train_runtime": 21358.5627,
499
+ "train_samples_per_second": 0.916,
500
+ "train_steps_per_second": 0.029
501
  }
502
  ],
503
  "logging_steps": 10,