selmamalak commited on
Commit
63b281e
·
verified ·
1 Parent(s): e11e64a

End of training

Browse files
Files changed (5) hide show
  1. README.md +5 -5
  2. all_results.json +12 -12
  3. eval_results.json +8 -8
  4. train_results.json +4 -4
  5. trainer_state.json +633 -633
README.md CHANGED
@@ -23,11 +23,11 @@ should probably proofread and complete it, then remove this comment. -->
23
 
24
  This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the medmnist-v2 dataset.
25
  It achieves the following results on the evaluation set:
26
- - Loss: 0.0732
27
- - Accuracy: 0.9808
28
- - Precision: 0.9830
29
- - Recall: 0.9826
30
- - F1: 0.9825
31
 
32
  ## Model description
33
 
 
23
 
24
  This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the medmnist-v2 dataset.
25
  It achieves the following results on the evaluation set:
26
+ - Loss: 0.2714
27
+ - Accuracy: 0.9141
28
+ - Precision: 0.9095
29
+ - Recall: 0.9007
30
+ - F1: 0.9042
31
 
32
  ## Model description
33
 
all_results.json CHANGED
@@ -1,16 +1,16 @@
1
  {
2
  "epoch": 9.99,
3
- "eval_accuracy": 0.9127963231736816,
4
- "eval_f1": 0.9018775351313549,
5
- "eval_loss": 0.26074379682540894,
6
- "eval_precision": 0.9094064911689247,
7
- "eval_recall": 0.8975649435800629,
8
- "eval_runtime": 44.5141,
9
- "eval_samples_per_second": 185.739,
10
- "eval_steps_per_second": 11.614,
11
  "total_flos": 1.0133154899356189e+19,
12
- "train_loss": 0.5219255947714369,
13
- "train_runtime": 1565.9689,
14
- "train_samples_per_second": 83.016,
15
- "train_steps_per_second": 1.296
16
  }
 
1
  {
2
  "epoch": 9.99,
3
+ "eval_accuracy": 0.9141267537493952,
4
+ "eval_f1": 0.9041810830566877,
5
+ "eval_loss": 0.2713584899902344,
6
+ "eval_precision": 0.9094870552968058,
7
+ "eval_recall": 0.9007044159397597,
8
+ "eval_runtime": 44.7123,
9
+ "eval_samples_per_second": 184.915,
10
+ "eval_steps_per_second": 11.563,
11
  "total_flos": 1.0133154899356189e+19,
12
+ "train_loss": 0.5615053875692959,
13
+ "train_runtime": 1549.2005,
14
+ "train_samples_per_second": 83.914,
15
+ "train_steps_per_second": 1.31
16
  }
eval_results.json CHANGED
@@ -1,11 +1,11 @@
1
  {
2
  "epoch": 9.99,
3
- "eval_accuracy": 0.9127963231736816,
4
- "eval_f1": 0.9018775351313549,
5
- "eval_loss": 0.26074379682540894,
6
- "eval_precision": 0.9094064911689247,
7
- "eval_recall": 0.8975649435800629,
8
- "eval_runtime": 44.5141,
9
- "eval_samples_per_second": 185.739,
10
- "eval_steps_per_second": 11.614
11
  }
 
1
  {
2
  "epoch": 9.99,
3
+ "eval_accuracy": 0.9141267537493952,
4
+ "eval_f1": 0.9041810830566877,
5
+ "eval_loss": 0.2713584899902344,
6
+ "eval_precision": 0.9094870552968058,
7
+ "eval_recall": 0.9007044159397597,
8
+ "eval_runtime": 44.7123,
9
+ "eval_samples_per_second": 184.915,
10
+ "eval_steps_per_second": 11.563
11
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 9.99,
3
  "total_flos": 1.0133154899356189e+19,
4
- "train_loss": 0.5219255947714369,
5
- "train_runtime": 1565.9689,
6
- "train_samples_per_second": 83.016,
7
- "train_steps_per_second": 1.296
8
  }
 
1
  {
2
  "epoch": 9.99,
3
  "total_flos": 1.0133154899356189e+19,
4
+ "train_loss": 0.5615053875692959,
5
+ "train_runtime": 1549.2005,
6
+ "train_samples_per_second": 83.914,
7
+ "train_steps_per_second": 1.31
8
  }
trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
- "best_metric": 0.9870401337792643,
3
- "best_model_checkpoint": "vit-base-patch16-224-in21k-finetuned-lora-medmnistv2/checkpoint-1626",
4
  "epoch": 9.98769987699877,
5
  "eval_steps": 500,
6
  "global_step": 2030,
@@ -10,1553 +10,1553 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.05,
13
- "grad_norm": 1.1422470808029175,
14
  "learning_rate": 0.004975369458128079,
15
- "loss": 1.9628,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.1,
20
- "grad_norm": 1.255900502204895,
21
  "learning_rate": 0.004950738916256157,
22
- "loss": 1.3552,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.15,
27
- "grad_norm": 1.5407381057739258,
28
- "learning_rate": 0.00493103448275862,
29
- "loss": 1.0858,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.2,
34
- "grad_norm": 2.6224355697631836,
35
- "learning_rate": 0.0049064039408866994,
36
- "loss": 1.0843,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.25,
41
- "grad_norm": 0.8357070088386536,
42
- "learning_rate": 0.0048817733990147785,
43
- "loss": 0.96,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.3,
48
- "grad_norm": 1.080548644065857,
49
- "learning_rate": 0.004857142857142858,
50
- "loss": 0.8978,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.34,
55
- "grad_norm": 1.4522780179977417,
56
- "learning_rate": 0.004832512315270936,
57
- "loss": 0.8698,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.39,
62
- "grad_norm": 0.9388962388038635,
63
- "learning_rate": 0.004807881773399015,
64
- "loss": 0.9796,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.44,
69
- "grad_norm": 1.6392161846160889,
70
- "learning_rate": 0.004783251231527094,
71
- "loss": 0.814,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.49,
76
- "grad_norm": 0.7927560210227966,
77
- "learning_rate": 0.004758620689655172,
78
- "loss": 0.8127,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.54,
83
- "grad_norm": 0.9725190997123718,
84
- "learning_rate": 0.004733990147783251,
85
- "loss": 0.7416,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.59,
90
- "grad_norm": 1.2956442832946777,
91
- "learning_rate": 0.00470935960591133,
92
- "loss": 0.7281,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.64,
97
- "grad_norm": 1.2394323348999023,
98
- "learning_rate": 0.0046847290640394095,
99
- "loss": 0.7632,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.69,
104
- "grad_norm": 1.1683493852615356,
105
- "learning_rate": 0.004660098522167488,
106
- "loss": 0.7589,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.74,
111
- "grad_norm": 0.8499715328216553,
112
  "learning_rate": 0.004635467980295567,
113
- "loss": 0.6864,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.79,
118
- "grad_norm": 0.9673293232917786,
119
  "learning_rate": 0.004610837438423646,
120
- "loss": 0.6827,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.84,
125
- "grad_norm": 0.7566954493522644,
126
- "learning_rate": 0.0045886699507389165,
127
- "loss": 0.667,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.89,
132
- "grad_norm": 1.0029590129852295,
133
  "learning_rate": 0.004564039408866995,
134
- "loss": 0.7118,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.93,
139
- "grad_norm": 0.9083508849143982,
140
  "learning_rate": 0.004539408866995074,
141
- "loss": 0.6388,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.98,
146
- "grad_norm": 0.9889862537384033,
147
  "learning_rate": 0.004514778325123153,
148
- "loss": 0.7007,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 1.0,
153
- "eval_accuracy": 0.9113712374581939,
154
- "eval_f1": 0.8529028766456601,
155
- "eval_loss": 0.2457016110420227,
156
- "eval_precision": 0.9018735870596898,
157
- "eval_recall": 0.8643919959588723,
158
- "eval_runtime": 12.8411,
159
- "eval_samples_per_second": 186.276,
160
- "eval_steps_per_second": 11.681,
161
  "step": 203
162
  },
163
  {
164
  "epoch": 1.03,
165
- "grad_norm": 1.352356195449829,
166
  "learning_rate": 0.004490147783251232,
167
- "loss": 0.7043,
168
  "step": 210
169
  },
170
  {
171
  "epoch": 1.08,
172
- "grad_norm": 0.9853256940841675,
173
  "learning_rate": 0.00446551724137931,
174
- "loss": 0.5867,
175
  "step": 220
176
  },
177
  {
178
  "epoch": 1.13,
179
- "grad_norm": 1.2557787895202637,
180
  "learning_rate": 0.004440886699507389,
181
- "loss": 0.6101,
182
  "step": 230
183
  },
184
  {
185
  "epoch": 1.18,
186
- "grad_norm": 0.9615167379379272,
187
  "learning_rate": 0.004416256157635468,
188
- "loss": 0.7143,
189
  "step": 240
190
  },
191
  {
192
  "epoch": 1.23,
193
- "grad_norm": 0.9772002696990967,
194
  "learning_rate": 0.004391625615763547,
195
- "loss": 0.6388,
196
  "step": 250
197
  },
198
  {
199
  "epoch": 1.28,
200
- "grad_norm": 1.6855661869049072,
201
  "learning_rate": 0.004366995073891626,
202
- "loss": 0.5621,
203
  "step": 260
204
  },
205
  {
206
  "epoch": 1.33,
207
- "grad_norm": 2.439969062805176,
208
  "learning_rate": 0.004342364532019705,
209
- "loss": 0.6839,
210
  "step": 270
211
  },
212
  {
213
  "epoch": 1.38,
214
- "grad_norm": 1.0421786308288574,
215
  "learning_rate": 0.004317733990147784,
216
- "loss": 0.6545,
217
  "step": 280
218
  },
219
  {
220
  "epoch": 1.43,
221
- "grad_norm": 1.445427417755127,
222
  "learning_rate": 0.004293103448275862,
223
- "loss": 0.6383,
224
  "step": 290
225
  },
226
  {
227
  "epoch": 1.48,
228
- "grad_norm": 1.585347056388855,
229
  "learning_rate": 0.00426847290640394,
230
- "loss": 0.6446,
231
  "step": 300
232
  },
233
  {
234
  "epoch": 1.53,
235
- "grad_norm": 1.3732099533081055,
236
  "learning_rate": 0.004243842364532019,
237
- "loss": 0.5853,
238
  "step": 310
239
  },
240
  {
241
  "epoch": 1.57,
242
- "grad_norm": 1.9869935512542725,
243
  "learning_rate": 0.0042192118226600985,
244
- "loss": 0.6442,
245
  "step": 320
246
  },
247
  {
248
  "epoch": 1.62,
249
- "grad_norm": 2.5270144939422607,
250
  "learning_rate": 0.004194581280788178,
251
- "loss": 0.6744,
252
  "step": 330
253
  },
254
  {
255
  "epoch": 1.67,
256
- "grad_norm": 1.2350406646728516,
257
  "learning_rate": 0.004169950738916256,
258
- "loss": 0.711,
259
  "step": 340
260
  },
261
  {
262
  "epoch": 1.72,
263
- "grad_norm": 2.2132787704467773,
264
  "learning_rate": 0.004145320197044335,
265
- "loss": 0.7377,
266
  "step": 350
267
  },
268
  {
269
  "epoch": 1.77,
270
- "grad_norm": 1.414409875869751,
271
  "learning_rate": 0.004120689655172414,
272
- "loss": 0.6735,
273
  "step": 360
274
  },
275
  {
276
  "epoch": 1.82,
277
- "grad_norm": 1.505163311958313,
278
  "learning_rate": 0.004096059113300492,
279
- "loss": 0.6941,
280
  "step": 370
281
  },
282
  {
283
  "epoch": 1.87,
284
- "grad_norm": 1.286877155303955,
285
  "learning_rate": 0.004071428571428571,
286
- "loss": 0.5817,
287
  "step": 380
288
  },
289
  {
290
  "epoch": 1.92,
291
- "grad_norm": 2.4780466556549072,
292
  "learning_rate": 0.00404679802955665,
293
- "loss": 0.6474,
294
  "step": 390
295
  },
296
  {
297
  "epoch": 1.97,
298
- "grad_norm": 1.735863208770752,
299
  "learning_rate": 0.0040221674876847295,
300
- "loss": 0.6322,
301
  "step": 400
302
  },
303
  {
304
  "epoch": 2.0,
305
- "eval_accuracy": 0.9423076923076923,
306
- "eval_f1": 0.9340154565022668,
307
- "eval_loss": 0.21482966840267181,
308
- "eval_precision": 0.9423994421185976,
309
- "eval_recall": 0.9291886861477701,
310
- "eval_runtime": 12.9083,
311
- "eval_samples_per_second": 185.306,
312
- "eval_steps_per_second": 11.62,
313
  "step": 406
314
  },
315
  {
316
  "epoch": 2.02,
317
- "grad_norm": 1.8992809057235718,
318
  "learning_rate": 0.003997536945812808,
319
- "loss": 0.7819,
320
  "step": 410
321
  },
322
  {
323
  "epoch": 2.07,
324
- "grad_norm": 3.8816006183624268,
325
  "learning_rate": 0.003972906403940887,
326
- "loss": 0.7579,
327
  "step": 420
328
  },
329
  {
330
  "epoch": 2.12,
331
- "grad_norm": 2.3438549041748047,
332
  "learning_rate": 0.003948275862068966,
333
- "loss": 0.6899,
334
  "step": 430
335
  },
336
  {
337
  "epoch": 2.16,
338
- "grad_norm": 2.111189126968384,
339
  "learning_rate": 0.003923645320197044,
340
- "loss": 0.6711,
341
  "step": 440
342
  },
343
  {
344
  "epoch": 2.21,
345
- "grad_norm": 2.7600784301757812,
346
  "learning_rate": 0.0038990147783251232,
347
- "loss": 0.6031,
348
  "step": 450
349
  },
350
  {
351
  "epoch": 2.26,
352
- "grad_norm": 1.7545028924942017,
353
  "learning_rate": 0.0038743842364532023,
354
- "loss": 0.6801,
355
  "step": 460
356
  },
357
  {
358
  "epoch": 2.31,
359
- "grad_norm": 3.5373642444610596,
360
  "learning_rate": 0.003849753694581281,
361
- "loss": 0.6683,
362
  "step": 470
363
  },
364
  {
365
  "epoch": 2.36,
366
- "grad_norm": 2.0872020721435547,
367
  "learning_rate": 0.00382512315270936,
368
- "loss": 0.5974,
369
  "step": 480
370
  },
371
  {
372
  "epoch": 2.41,
373
- "grad_norm": 2.178804636001587,
374
  "learning_rate": 0.0038004926108374383,
375
- "loss": 0.5688,
376
  "step": 490
377
  },
378
  {
379
  "epoch": 2.46,
380
- "grad_norm": 2.1402218341827393,
381
  "learning_rate": 0.003775862068965517,
382
- "loss": 0.6617,
383
  "step": 500
384
  },
385
  {
386
  "epoch": 2.51,
387
- "grad_norm": 3.2831871509552,
388
  "learning_rate": 0.003751231527093596,
389
- "loss": 0.6706,
390
  "step": 510
391
  },
392
  {
393
  "epoch": 2.56,
394
- "grad_norm": 1.9515079259872437,
395
  "learning_rate": 0.0037266009852216747,
396
- "loss": 0.6616,
397
  "step": 520
398
  },
399
  {
400
  "epoch": 2.61,
401
- "grad_norm": 1.7052913904190063,
402
  "learning_rate": 0.003701970443349754,
403
- "loss": 0.6816,
404
  "step": 530
405
  },
406
  {
407
  "epoch": 2.66,
408
- "grad_norm": 1.4746874570846558,
409
  "learning_rate": 0.0036773399014778324,
410
- "loss": 0.613,
411
  "step": 540
412
  },
413
  {
414
  "epoch": 2.71,
415
- "grad_norm": 1.1124660968780518,
416
  "learning_rate": 0.0036527093596059115,
417
- "loss": 0.6295,
418
  "step": 550
419
  },
420
  {
421
  "epoch": 2.76,
422
- "grad_norm": 1.4814788103103638,
423
  "learning_rate": 0.00362807881773399,
424
- "loss": 0.618,
425
  "step": 560
426
  },
427
  {
428
  "epoch": 2.8,
429
- "grad_norm": 1.1870466470718384,
430
  "learning_rate": 0.003603448275862069,
431
- "loss": 0.6529,
432
  "step": 570
433
  },
434
  {
435
  "epoch": 2.85,
436
- "grad_norm": 1.1089264154434204,
437
  "learning_rate": 0.003578817733990148,
438
- "loss": 0.5042,
439
  "step": 580
440
  },
441
  {
442
  "epoch": 2.9,
443
- "grad_norm": 2.6037137508392334,
444
  "learning_rate": 0.0035541871921182266,
445
- "loss": 0.7198,
446
  "step": 590
447
  },
448
  {
449
  "epoch": 2.95,
450
- "grad_norm": 1.4666210412979126,
451
  "learning_rate": 0.0035295566502463057,
452
- "loss": 0.6353,
453
  "step": 600
454
  },
455
  {
456
  "epoch": 3.0,
457
- "eval_accuracy": 0.9632107023411371,
458
- "eval_f1": 0.9529335555710525,
459
- "eval_loss": 0.121844083070755,
460
- "eval_precision": 0.9546285621864314,
461
- "eval_recall": 0.9542450303236223,
462
- "eval_runtime": 12.9205,
463
- "eval_samples_per_second": 185.132,
464
- "eval_steps_per_second": 11.609,
465
  "step": 609
466
  },
467
  {
468
  "epoch": 3.0,
469
- "grad_norm": 1.863990068435669,
470
  "learning_rate": 0.0035049261083743843,
471
- "loss": 0.6389,
472
  "step": 610
473
  },
474
  {
475
  "epoch": 3.05,
476
- "grad_norm": 1.3299729824066162,
477
  "learning_rate": 0.0034802955665024634,
478
- "loss": 0.6781,
479
  "step": 620
480
  },
481
  {
482
  "epoch": 3.1,
483
- "grad_norm": 2.5126378536224365,
484
  "learning_rate": 0.003455665024630542,
485
- "loss": 0.6492,
486
  "step": 630
487
  },
488
  {
489
  "epoch": 3.15,
490
- "grad_norm": 1.380492925643921,
491
  "learning_rate": 0.0034310344827586207,
492
- "loss": 0.631,
493
  "step": 640
494
  },
495
  {
496
  "epoch": 3.2,
497
- "grad_norm": 2.03764009475708,
498
  "learning_rate": 0.0034064039408867,
499
- "loss": 0.6221,
500
  "step": 650
501
  },
502
  {
503
  "epoch": 3.25,
504
- "grad_norm": 1.1895209550857544,
505
  "learning_rate": 0.0033817733990147785,
506
- "loss": 0.5881,
507
  "step": 660
508
  },
509
  {
510
  "epoch": 3.3,
511
- "grad_norm": 1.354785442352295,
512
  "learning_rate": 0.003357142857142857,
513
- "loss": 0.584,
514
  "step": 670
515
  },
516
  {
517
  "epoch": 3.35,
518
- "grad_norm": 1.9887776374816895,
519
  "learning_rate": 0.003332512315270936,
520
- "loss": 0.5914,
521
  "step": 680
522
  },
523
  {
524
  "epoch": 3.39,
525
- "grad_norm": 1.7261571884155273,
526
  "learning_rate": 0.0033078817733990145,
527
- "loss": 0.5608,
528
  "step": 690
529
  },
530
  {
531
  "epoch": 3.44,
532
- "grad_norm": 1.3888462781906128,
533
  "learning_rate": 0.0032832512315270936,
534
- "loss": 0.5832,
535
  "step": 700
536
  },
537
  {
538
  "epoch": 3.49,
539
- "grad_norm": 1.6422044038772583,
540
  "learning_rate": 0.003258620689655172,
541
- "loss": 0.5759,
542
  "step": 710
543
  },
544
  {
545
  "epoch": 3.54,
546
- "grad_norm": 1.2814769744873047,
547
  "learning_rate": 0.0032339901477832513,
548
- "loss": 0.5845,
549
  "step": 720
550
  },
551
  {
552
  "epoch": 3.59,
553
- "grad_norm": 1.835681676864624,
554
  "learning_rate": 0.00320935960591133,
555
- "loss": 0.5756,
556
  "step": 730
557
  },
558
  {
559
  "epoch": 3.64,
560
- "grad_norm": 1.3922501802444458,
561
  "learning_rate": 0.003184729064039409,
562
- "loss": 0.5878,
563
  "step": 740
564
  },
565
  {
566
  "epoch": 3.69,
567
- "grad_norm": 1.3808457851409912,
568
  "learning_rate": 0.0031600985221674877,
569
- "loss": 0.5593,
570
  "step": 750
571
  },
572
  {
573
  "epoch": 3.74,
574
- "grad_norm": 1.3295152187347412,
575
- "learning_rate": 0.0031379310344827587,
576
- "loss": 0.5563,
577
  "step": 760
578
  },
579
  {
580
  "epoch": 3.79,
581
- "grad_norm": 2.4613001346588135,
582
- "learning_rate": 0.003113300492610838,
583
- "loss": 0.5341,
584
  "step": 770
585
  },
586
  {
587
  "epoch": 3.84,
588
- "grad_norm": 1.1632391214370728,
589
- "learning_rate": 0.0030886699507389165,
590
- "loss": 0.6108,
591
  "step": 780
592
  },
593
  {
594
  "epoch": 3.89,
595
- "grad_norm": 1.0384527444839478,
596
- "learning_rate": 0.0030640394088669956,
597
- "loss": 0.5597,
598
  "step": 790
599
  },
600
  {
601
  "epoch": 3.94,
602
- "grad_norm": 1.5166749954223633,
603
- "learning_rate": 0.0030394088669950738,
604
- "loss": 0.5153,
605
  "step": 800
606
  },
607
  {
608
  "epoch": 3.99,
609
- "grad_norm": 1.5253658294677734,
610
- "learning_rate": 0.0030147783251231524,
611
- "loss": 0.6176,
612
  "step": 810
613
  },
614
  {
615
  "epoch": 4.0,
616
- "eval_accuracy": 0.979933110367893,
617
- "eval_f1": 0.9781756712736359,
618
- "eval_loss": 0.08385530859231949,
619
- "eval_precision": 0.9775461189264221,
620
- "eval_recall": 0.979303313253159,
621
- "eval_runtime": 12.9634,
622
- "eval_samples_per_second": 184.519,
623
- "eval_steps_per_second": 11.571,
624
  "step": 813
625
  },
626
  {
627
  "epoch": 4.03,
628
- "grad_norm": 1.8829090595245361,
629
- "learning_rate": 0.0029901477832512315,
630
- "loss": 0.5208,
631
  "step": 820
632
  },
633
  {
634
  "epoch": 4.08,
635
- "grad_norm": 1.4730515480041504,
636
- "learning_rate": 0.00296551724137931,
637
- "loss": 0.5205,
638
  "step": 830
639
  },
640
  {
641
  "epoch": 4.13,
642
- "grad_norm": 0.6617820858955383,
643
- "learning_rate": 0.0029408866995073893,
644
- "loss": 0.5029,
645
  "step": 840
646
  },
647
  {
648
  "epoch": 4.18,
649
- "grad_norm": 0.9558489918708801,
650
- "learning_rate": 0.002916256157635468,
651
- "loss": 0.533,
652
  "step": 850
653
  },
654
  {
655
  "epoch": 4.23,
656
- "grad_norm": 2.0359411239624023,
657
- "learning_rate": 0.0028916256157635466,
658
- "loss": 0.4633,
659
  "step": 860
660
  },
661
  {
662
  "epoch": 4.28,
663
- "grad_norm": 2.325270414352417,
664
- "learning_rate": 0.0028669950738916257,
665
- "loss": 0.5877,
666
  "step": 870
667
  },
668
  {
669
  "epoch": 4.33,
670
- "grad_norm": 1.3358855247497559,
671
- "learning_rate": 0.0028423645320197043,
672
- "loss": 0.4447,
673
  "step": 880
674
  },
675
  {
676
  "epoch": 4.38,
677
- "grad_norm": 1.4927520751953125,
678
- "learning_rate": 0.0028177339901477834,
679
- "loss": 0.5783,
680
  "step": 890
681
  },
682
  {
683
  "epoch": 4.43,
684
- "grad_norm": 1.231078028678894,
685
- "learning_rate": 0.002793103448275862,
686
- "loss": 0.5606,
687
  "step": 900
688
  },
689
  {
690
  "epoch": 4.48,
691
- "grad_norm": 1.4861023426055908,
692
- "learning_rate": 0.002768472906403941,
693
- "loss": 0.5006,
694
  "step": 910
695
  },
696
  {
697
  "epoch": 4.53,
698
- "grad_norm": 1.0326859951019287,
699
- "learning_rate": 0.00274384236453202,
700
- "loss": 0.4989,
701
  "step": 920
702
  },
703
  {
704
  "epoch": 4.58,
705
- "grad_norm": 1.290980577468872,
706
- "learning_rate": 0.0027192118226600985,
707
- "loss": 0.514,
708
  "step": 930
709
  },
710
  {
711
  "epoch": 4.62,
712
- "grad_norm": 1.3768541812896729,
713
- "learning_rate": 0.0026945812807881776,
714
- "loss": 0.5221,
715
  "step": 940
716
  },
717
  {
718
  "epoch": 4.67,
719
- "grad_norm": 4.485782623291016,
720
- "learning_rate": 0.0026699507389162562,
721
- "loss": 0.4992,
722
  "step": 950
723
  },
724
  {
725
  "epoch": 4.72,
726
- "grad_norm": 1.4199550151824951,
727
- "learning_rate": 0.0026453201970443353,
728
- "loss": 0.5256,
729
  "step": 960
730
  },
731
  {
732
  "epoch": 4.77,
733
- "grad_norm": 1.4900827407836914,
734
- "learning_rate": 0.002620689655172414,
735
- "loss": 0.5111,
736
  "step": 970
737
  },
738
  {
739
  "epoch": 4.82,
740
- "grad_norm": 1.874714970588684,
741
- "learning_rate": 0.002596059113300493,
742
- "loss": 0.4774,
743
  "step": 980
744
  },
745
  {
746
  "epoch": 4.87,
747
- "grad_norm": 1.254228115081787,
748
- "learning_rate": 0.0025714285714285713,
749
- "loss": 0.4789,
750
  "step": 990
751
  },
752
  {
753
  "epoch": 4.92,
754
- "grad_norm": 2.317281723022461,
755
- "learning_rate": 0.00254679802955665,
756
- "loss": 0.5338,
757
  "step": 1000
758
  },
759
  {
760
  "epoch": 4.97,
761
- "grad_norm": 1.482914686203003,
762
- "learning_rate": 0.002522167487684729,
763
- "loss": 0.4913,
764
  "step": 1010
765
  },
766
  {
767
  "epoch": 5.0,
768
- "eval_accuracy": 0.9711538461538461,
769
- "eval_f1": 0.9707062805947578,
770
- "eval_loss": 0.10083355009555817,
771
- "eval_precision": 0.9712614809682641,
772
- "eval_recall": 0.9717429310005482,
773
- "eval_runtime": 12.9897,
774
- "eval_samples_per_second": 184.146,
775
- "eval_steps_per_second": 11.548,
776
  "step": 1016
777
  },
778
  {
779
  "epoch": 5.02,
780
- "grad_norm": 1.070559024810791,
781
- "learning_rate": 0.002497536945812808,
782
- "loss": 0.4719,
783
  "step": 1020
784
  },
785
  {
786
  "epoch": 5.07,
787
- "grad_norm": 1.206444263458252,
788
- "learning_rate": 0.002472906403940887,
789
- "loss": 0.4973,
790
  "step": 1030
791
  },
792
  {
793
  "epoch": 5.12,
794
- "grad_norm": 1.2651773691177368,
795
- "learning_rate": 0.0024482758620689654,
796
- "loss": 0.5203,
797
  "step": 1040
798
  },
799
  {
800
  "epoch": 5.17,
801
- "grad_norm": 1.162040114402771,
802
- "learning_rate": 0.002423645320197044,
803
- "loss": 0.4781,
804
  "step": 1050
805
  },
806
  {
807
  "epoch": 5.22,
808
- "grad_norm": 1.2628921270370483,
809
- "learning_rate": 0.002399014778325123,
810
- "loss": 0.558,
811
  "step": 1060
812
  },
813
  {
814
  "epoch": 5.26,
815
- "grad_norm": 0.6029661297798157,
816
- "learning_rate": 0.002374384236453202,
817
- "loss": 0.4991,
818
  "step": 1070
819
  },
820
  {
821
  "epoch": 5.31,
822
- "grad_norm": 1.7291909456253052,
823
- "learning_rate": 0.002349753694581281,
824
- "loss": 0.5331,
825
  "step": 1080
826
  },
827
  {
828
  "epoch": 5.36,
829
- "grad_norm": 0.8556851148605347,
830
- "learning_rate": 0.0023251231527093596,
831
- "loss": 0.4594,
832
  "step": 1090
833
  },
834
  {
835
  "epoch": 5.41,
836
- "grad_norm": 0.9938213229179382,
837
- "learning_rate": 0.0023004926108374387,
838
- "loss": 0.4841,
839
  "step": 1100
840
  },
841
  {
842
  "epoch": 5.46,
843
- "grad_norm": 2.558023452758789,
844
- "learning_rate": 0.0022758620689655173,
845
- "loss": 0.5241,
846
  "step": 1110
847
  },
848
  {
849
  "epoch": 5.51,
850
- "grad_norm": 1.3117694854736328,
851
- "learning_rate": 0.0022512315270935964,
852
- "loss": 0.4821,
853
  "step": 1120
854
  },
855
  {
856
  "epoch": 5.56,
857
- "grad_norm": 1.2305413484573364,
858
- "learning_rate": 0.0022266009852216747,
859
- "loss": 0.4234,
860
  "step": 1130
861
  },
862
  {
863
  "epoch": 5.61,
864
- "grad_norm": 1.0140172243118286,
865
- "learning_rate": 0.0022019704433497538,
866
- "loss": 0.4008,
867
  "step": 1140
868
  },
869
  {
870
  "epoch": 5.66,
871
- "grad_norm": 1.2372822761535645,
872
- "learning_rate": 0.0021773399014778324,
873
- "loss": 0.4507,
874
  "step": 1150
875
  },
876
  {
877
  "epoch": 5.71,
878
- "grad_norm": 0.8806868195533752,
879
- "learning_rate": 0.0021527093596059115,
880
- "loss": 0.5398,
881
  "step": 1160
882
  },
883
  {
884
  "epoch": 5.76,
885
- "grad_norm": 0.9182419180870056,
886
- "learning_rate": 0.00212807881773399,
887
- "loss": 0.4085,
888
  "step": 1170
889
  },
890
  {
891
  "epoch": 5.81,
892
- "grad_norm": 1.4466331005096436,
893
- "learning_rate": 0.0021034482758620692,
894
- "loss": 0.4966,
895
  "step": 1180
896
  },
897
  {
898
  "epoch": 5.85,
899
- "grad_norm": 1.1195909976959229,
900
- "learning_rate": 0.002078817733990148,
901
- "loss": 0.4349,
902
  "step": 1190
903
  },
904
  {
905
  "epoch": 5.9,
906
- "grad_norm": 1.7363073825836182,
907
- "learning_rate": 0.0020541871921182266,
908
- "loss": 0.4528,
909
  "step": 1200
910
  },
911
  {
912
  "epoch": 5.95,
913
- "grad_norm": 1.3727185726165771,
914
- "learning_rate": 0.0020295566502463057,
915
- "loss": 0.4943,
916
  "step": 1210
917
  },
918
  {
919
  "epoch": 6.0,
920
- "eval_accuracy": 0.979933110367893,
921
- "eval_f1": 0.9828335162681155,
922
- "eval_loss": 0.08049997687339783,
923
- "eval_precision": 0.9843107832686134,
924
- "eval_recall": 0.9819606365532088,
925
- "eval_runtime": 12.9302,
926
- "eval_samples_per_second": 184.993,
927
- "eval_steps_per_second": 11.601,
928
  "step": 1219
929
  },
930
  {
931
  "epoch": 6.0,
932
- "grad_norm": 0.9201492667198181,
933
- "learning_rate": 0.0020049261083743843,
934
- "loss": 0.4137,
935
  "step": 1220
936
  },
937
  {
938
  "epoch": 6.05,
939
- "grad_norm": 1.0448100566864014,
940
- "learning_rate": 0.001980295566502463,
941
- "loss": 0.4731,
942
  "step": 1230
943
  },
944
  {
945
  "epoch": 6.1,
946
- "grad_norm": 7.53134298324585,
947
- "learning_rate": 0.001955665024630542,
948
- "loss": 0.4018,
949
  "step": 1240
950
  },
951
  {
952
  "epoch": 6.15,
953
- "grad_norm": 1.0926949977874756,
954
- "learning_rate": 0.0019310344827586207,
955
- "loss": 0.4643,
956
  "step": 1250
957
  },
958
  {
959
  "epoch": 6.2,
960
- "grad_norm": 1.1615904569625854,
961
- "learning_rate": 0.0019064039408866996,
962
- "loss": 0.4885,
963
  "step": 1260
964
  },
965
  {
966
  "epoch": 6.25,
967
- "grad_norm": 1.0068614482879639,
968
- "learning_rate": 0.0018817733990147785,
969
- "loss": 0.4001,
970
  "step": 1270
971
  },
972
  {
973
  "epoch": 6.3,
974
- "grad_norm": 1.080955982208252,
975
- "learning_rate": 0.0018571428571428573,
976
- "loss": 0.4136,
977
  "step": 1280
978
  },
979
  {
980
  "epoch": 6.35,
981
- "grad_norm": 1.4813597202301025,
982
- "learning_rate": 0.001832512315270936,
983
- "loss": 0.3908,
984
  "step": 1290
985
  },
986
  {
987
  "epoch": 6.4,
988
- "grad_norm": 0.8198271989822388,
989
- "learning_rate": 0.0018078817733990149,
990
- "loss": 0.4096,
991
  "step": 1300
992
  },
993
  {
994
  "epoch": 6.45,
995
- "grad_norm": 0.9625017642974854,
996
- "learning_rate": 0.0017832512315270935,
997
- "loss": 0.408,
998
  "step": 1310
999
  },
1000
  {
1001
  "epoch": 6.49,
1002
- "grad_norm": 0.969749391078949,
1003
- "learning_rate": 0.0017586206896551724,
1004
- "loss": 0.4371,
1005
  "step": 1320
1006
  },
1007
  {
1008
  "epoch": 6.54,
1009
- "grad_norm": 1.013454794883728,
1010
- "learning_rate": 0.0017339901477832513,
1011
- "loss": 0.4635,
1012
  "step": 1330
1013
  },
1014
  {
1015
  "epoch": 6.59,
1016
- "grad_norm": 1.1573213338851929,
1017
- "learning_rate": 0.0017093596059113301,
1018
- "loss": 0.3503,
1019
  "step": 1340
1020
  },
1021
  {
1022
  "epoch": 6.64,
1023
- "grad_norm": 0.9733229875564575,
1024
- "learning_rate": 0.0016847290640394088,
1025
- "loss": 0.447,
1026
  "step": 1350
1027
  },
1028
  {
1029
  "epoch": 6.69,
1030
- "grad_norm": 0.6445940136909485,
1031
- "learning_rate": 0.0016600985221674877,
1032
- "loss": 0.4405,
1033
  "step": 1360
1034
  },
1035
  {
1036
  "epoch": 6.74,
1037
- "grad_norm": 1.0442399978637695,
1038
- "learning_rate": 0.0016354679802955666,
1039
- "loss": 0.3957,
1040
  "step": 1370
1041
  },
1042
  {
1043
  "epoch": 6.79,
1044
- "grad_norm": 1.03929603099823,
1045
- "learning_rate": 0.0016108374384236454,
1046
- "loss": 0.4509,
1047
  "step": 1380
1048
  },
1049
  {
1050
  "epoch": 6.84,
1051
- "grad_norm": 0.7856884598731995,
1052
- "learning_rate": 0.0015862068965517243,
1053
- "loss": 0.3689,
1054
  "step": 1390
1055
  },
1056
  {
1057
  "epoch": 6.89,
1058
- "grad_norm": 0.6504011750221252,
1059
- "learning_rate": 0.001561576354679803,
1060
- "loss": 0.4059,
1061
  "step": 1400
1062
  },
1063
  {
1064
  "epoch": 6.94,
1065
- "grad_norm": 1.0724025964736938,
1066
- "learning_rate": 0.0015369458128078816,
1067
- "loss": 0.4188,
1068
  "step": 1410
1069
  },
1070
  {
1071
  "epoch": 6.99,
1072
- "grad_norm": 0.8927863836288452,
1073
- "learning_rate": 0.0015123152709359605,
1074
- "loss": 0.426,
1075
  "step": 1420
1076
  },
1077
  {
1078
  "epoch": 7.0,
1079
- "eval_accuracy": 0.979933110367893,
1080
- "eval_f1": 0.9780623859702803,
1081
- "eval_loss": 0.06709808856248856,
1082
- "eval_precision": 0.9782146710296142,
1083
- "eval_recall": 0.9786844286441465,
1084
- "eval_runtime": 12.9674,
1085
- "eval_samples_per_second": 184.462,
1086
- "eval_steps_per_second": 11.567,
1087
  "step": 1422
1088
  },
1089
  {
1090
  "epoch": 7.04,
1091
- "grad_norm": 0.7571507096290588,
1092
- "learning_rate": 0.0014876847290640394,
1093
- "loss": 0.3513,
1094
  "step": 1430
1095
  },
1096
  {
1097
  "epoch": 7.08,
1098
- "grad_norm": 0.9653282761573792,
1099
- "learning_rate": 0.0014630541871921182,
1100
- "loss": 0.3661,
1101
  "step": 1440
1102
  },
1103
  {
1104
  "epoch": 7.13,
1105
- "grad_norm": 1.1139851808547974,
1106
- "learning_rate": 0.001438423645320197,
1107
- "loss": 0.3978,
1108
  "step": 1450
1109
  },
1110
  {
1111
  "epoch": 7.18,
1112
- "grad_norm": 1.7360243797302246,
1113
- "learning_rate": 0.001413793103448276,
1114
- "loss": 0.3504,
1115
  "step": 1460
1116
  },
1117
  {
1118
  "epoch": 7.23,
1119
- "grad_norm": 1.009930968284607,
1120
- "learning_rate": 0.0013891625615763549,
1121
- "loss": 0.3718,
1122
  "step": 1470
1123
  },
1124
  {
1125
  "epoch": 7.28,
1126
- "grad_norm": 1.2110596895217896,
1127
- "learning_rate": 0.0013645320197044337,
1128
- "loss": 0.4109,
1129
  "step": 1480
1130
  },
1131
  {
1132
  "epoch": 7.33,
1133
- "grad_norm": 1.1028022766113281,
1134
- "learning_rate": 0.0013399014778325122,
1135
- "loss": 0.4052,
1136
  "step": 1490
1137
  },
1138
  {
1139
  "epoch": 7.38,
1140
- "grad_norm": 1.0521297454833984,
1141
- "learning_rate": 0.001315270935960591,
1142
- "loss": 0.4152,
1143
  "step": 1500
1144
  },
1145
  {
1146
  "epoch": 7.43,
1147
- "grad_norm": 0.8621806502342224,
1148
- "learning_rate": 0.00129064039408867,
1149
- "loss": 0.3573,
1150
  "step": 1510
1151
  },
1152
  {
1153
  "epoch": 7.48,
1154
- "grad_norm": 1.0334779024124146,
1155
- "learning_rate": 0.0012660098522167488,
1156
- "loss": 0.3673,
1157
  "step": 1520
1158
  },
1159
  {
1160
  "epoch": 7.53,
1161
- "grad_norm": 0.7882916331291199,
1162
- "learning_rate": 0.0012413793103448277,
1163
- "loss": 0.3391,
1164
  "step": 1530
1165
  },
1166
  {
1167
  "epoch": 7.58,
1168
- "grad_norm": 1.2786757946014404,
1169
- "learning_rate": 0.0012167487684729065,
1170
- "loss": 0.3939,
1171
  "step": 1540
1172
  },
1173
  {
1174
  "epoch": 7.63,
1175
- "grad_norm": 1.4354298114776611,
1176
- "learning_rate": 0.0011921182266009852,
1177
- "loss": 0.4541,
1178
  "step": 1550
1179
  },
1180
  {
1181
  "epoch": 7.68,
1182
- "grad_norm": 1.0900424718856812,
1183
- "learning_rate": 0.001167487684729064,
1184
- "loss": 0.3183,
1185
  "step": 1560
1186
  },
1187
  {
1188
  "epoch": 7.72,
1189
- "grad_norm": 0.8424840569496155,
1190
- "learning_rate": 0.001142857142857143,
1191
- "loss": 0.3408,
1192
  "step": 1570
1193
  },
1194
  {
1195
  "epoch": 7.77,
1196
- "grad_norm": 0.6967119574546814,
1197
- "learning_rate": 0.0011182266009852216,
1198
- "loss": 0.4029,
1199
  "step": 1580
1200
  },
1201
  {
1202
  "epoch": 7.82,
1203
- "grad_norm": 0.7286412119865417,
1204
- "learning_rate": 0.0010935960591133005,
1205
- "loss": 0.3717,
1206
  "step": 1590
1207
  },
1208
  {
1209
  "epoch": 7.87,
1210
- "grad_norm": 0.9532930254936218,
1211
- "learning_rate": 0.0010689655172413793,
1212
- "loss": 0.3215,
1213
  "step": 1600
1214
  },
1215
  {
1216
  "epoch": 7.92,
1217
- "grad_norm": 1.0920195579528809,
1218
- "learning_rate": 0.001044334975369458,
1219
- "loss": 0.3885,
1220
  "step": 1610
1221
  },
1222
  {
1223
  "epoch": 7.97,
1224
- "grad_norm": 0.8902508020401001,
1225
- "learning_rate": 0.0010197044334975369,
1226
- "loss": 0.4152,
1227
  "step": 1620
1228
  },
1229
  {
1230
  "epoch": 8.0,
1231
- "eval_accuracy": 0.9870401337792643,
1232
- "eval_f1": 0.9877879659430536,
1233
- "eval_loss": 0.056644052267074585,
1234
- "eval_precision": 0.9862146260055851,
1235
- "eval_recall": 0.9896401486739793,
1236
- "eval_runtime": 12.9254,
1237
- "eval_samples_per_second": 185.062,
1238
- "eval_steps_per_second": 11.605,
1239
  "step": 1626
1240
  },
1241
  {
1242
  "epoch": 8.02,
1243
- "grad_norm": 0.998824954032898,
1244
- "learning_rate": 0.0009950738916256158,
1245
- "loss": 0.289,
1246
  "step": 1630
1247
  },
1248
  {
1249
  "epoch": 8.07,
1250
- "grad_norm": 0.6882240772247314,
1251
- "learning_rate": 0.0009704433497536946,
1252
- "loss": 0.3786,
1253
  "step": 1640
1254
  },
1255
  {
1256
  "epoch": 8.12,
1257
- "grad_norm": 0.8674384951591492,
1258
- "learning_rate": 0.0009458128078817735,
1259
- "loss": 0.4118,
1260
  "step": 1650
1261
  },
1262
  {
1263
  "epoch": 8.17,
1264
- "grad_norm": 1.109112024307251,
1265
- "learning_rate": 0.0009211822660098522,
1266
- "loss": 0.3298,
1267
  "step": 1660
1268
  },
1269
  {
1270
  "epoch": 8.22,
1271
- "grad_norm": 0.8515803217887878,
1272
- "learning_rate": 0.000896551724137931,
1273
- "loss": 0.2702,
1274
  "step": 1670
1275
  },
1276
  {
1277
  "epoch": 8.27,
1278
- "grad_norm": 1.003696322441101,
1279
- "learning_rate": 0.0008719211822660099,
1280
- "loss": 0.3509,
1281
  "step": 1680
1282
  },
1283
  {
1284
  "epoch": 8.31,
1285
- "grad_norm": 0.8540720343589783,
1286
- "learning_rate": 0.0008472906403940888,
1287
- "loss": 0.31,
1288
  "step": 1690
1289
  },
1290
  {
1291
  "epoch": 8.36,
1292
- "grad_norm": 1.6798268556594849,
1293
- "learning_rate": 0.0008226600985221674,
1294
- "loss": 0.3727,
1295
  "step": 1700
1296
  },
1297
  {
1298
  "epoch": 8.41,
1299
- "grad_norm": 0.8054636120796204,
1300
- "learning_rate": 0.0007980295566502463,
1301
- "loss": 0.3159,
1302
  "step": 1710
1303
  },
1304
  {
1305
  "epoch": 8.46,
1306
- "grad_norm": 1.4890103340148926,
1307
- "learning_rate": 0.0007733990147783252,
1308
- "loss": 0.3387,
1309
  "step": 1720
1310
  },
1311
  {
1312
  "epoch": 8.51,
1313
- "grad_norm": 0.7350850105285645,
1314
- "learning_rate": 0.000748768472906404,
1315
- "loss": 0.3237,
1316
  "step": 1730
1317
  },
1318
  {
1319
  "epoch": 8.56,
1320
- "grad_norm": 1.4844990968704224,
1321
- "learning_rate": 0.0007241379310344828,
1322
- "loss": 0.3395,
1323
  "step": 1740
1324
  },
1325
  {
1326
  "epoch": 8.61,
1327
- "grad_norm": 0.887062132358551,
1328
- "learning_rate": 0.0006995073891625616,
1329
- "loss": 0.3418,
1330
  "step": 1750
1331
  },
1332
  {
1333
  "epoch": 8.66,
1334
- "grad_norm": 0.7276543378829956,
1335
- "learning_rate": 0.0006748768472906404,
1336
- "loss": 0.3433,
1337
  "step": 1760
1338
  },
1339
  {
1340
  "epoch": 8.71,
1341
- "grad_norm": 0.6962388753890991,
1342
- "learning_rate": 0.0006502463054187192,
1343
- "loss": 0.3588,
1344
  "step": 1770
1345
  },
1346
  {
1347
  "epoch": 8.76,
1348
- "grad_norm": 1.764138102531433,
1349
- "learning_rate": 0.0006256157635467981,
1350
- "loss": 0.3565,
1351
  "step": 1780
1352
  },
1353
  {
1354
  "epoch": 8.81,
1355
- "grad_norm": 0.7240989208221436,
1356
- "learning_rate": 0.0006009852216748769,
1357
- "loss": 0.3361,
1358
  "step": 1790
1359
  },
1360
  {
1361
  "epoch": 8.86,
1362
- "grad_norm": 0.9319175481796265,
1363
- "learning_rate": 0.0005763546798029557,
1364
- "loss": 0.3313,
1365
  "step": 1800
1366
  },
1367
  {
1368
  "epoch": 8.91,
1369
- "grad_norm": 0.8015260100364685,
1370
- "learning_rate": 0.0005517241379310345,
1371
- "loss": 0.3776,
1372
  "step": 1810
1373
  },
1374
  {
1375
  "epoch": 8.95,
1376
- "grad_norm": 0.6193283200263977,
1377
- "learning_rate": 0.0005270935960591134,
1378
- "loss": 0.2927,
1379
  "step": 1820
1380
  },
1381
  {
1382
  "epoch": 9.0,
1383
- "eval_accuracy": 0.9836956521739131,
1384
- "eval_f1": 0.9847638806045893,
1385
- "eval_loss": 0.06579139083623886,
1386
- "eval_precision": 0.9850023004987147,
1387
- "eval_recall": 0.9849849759564541,
1388
- "eval_runtime": 12.7582,
1389
- "eval_samples_per_second": 187.488,
1390
- "eval_steps_per_second": 11.757,
1391
  "step": 1829
1392
  },
1393
  {
1394
  "epoch": 9.0,
1395
- "grad_norm": 0.6149892210960388,
1396
- "learning_rate": 0.0005024630541871921,
1397
- "loss": 0.3451,
1398
  "step": 1830
1399
  },
1400
  {
1401
  "epoch": 9.05,
1402
- "grad_norm": 0.7571848034858704,
1403
- "learning_rate": 0.00047783251231527096,
1404
- "loss": 0.3187,
1405
  "step": 1840
1406
  },
1407
  {
1408
  "epoch": 9.1,
1409
- "grad_norm": 0.8765355348587036,
1410
- "learning_rate": 0.00045320197044334973,
1411
- "loss": 0.3447,
1412
  "step": 1850
1413
  },
1414
  {
1415
  "epoch": 9.15,
1416
- "grad_norm": 0.5563300848007202,
1417
- "learning_rate": 0.0004285714285714286,
1418
- "loss": 0.3529,
1419
  "step": 1860
1420
  },
1421
  {
1422
  "epoch": 9.2,
1423
- "grad_norm": 0.7253230810165405,
1424
- "learning_rate": 0.0004039408866995074,
1425
- "loss": 0.2695,
1426
  "step": 1870
1427
  },
1428
  {
1429
  "epoch": 9.25,
1430
- "grad_norm": 0.6665922403335571,
1431
- "learning_rate": 0.0003793103448275862,
1432
- "loss": 0.3091,
1433
  "step": 1880
1434
  },
1435
  {
1436
  "epoch": 9.3,
1437
- "grad_norm": 1.43943452835083,
1438
- "learning_rate": 0.00035467980295566506,
1439
- "loss": 0.2942,
1440
  "step": 1890
1441
  },
1442
  {
1443
  "epoch": 9.35,
1444
- "grad_norm": 2.071869373321533,
1445
- "learning_rate": 0.00033004926108374383,
1446
- "loss": 0.3097,
1447
  "step": 1900
1448
  },
1449
  {
1450
  "epoch": 9.4,
1451
- "grad_norm": 0.6491404175758362,
1452
- "learning_rate": 0.0003054187192118227,
1453
- "loss": 0.2895,
1454
  "step": 1910
1455
  },
1456
  {
1457
  "epoch": 9.45,
1458
- "grad_norm": 0.9849537014961243,
1459
- "learning_rate": 0.0002807881773399015,
1460
- "loss": 0.3157,
1461
  "step": 1920
1462
  },
1463
  {
1464
  "epoch": 9.5,
1465
- "grad_norm": 8.456196784973145,
1466
- "learning_rate": 0.0002561576354679803,
1467
- "loss": 0.322,
1468
  "step": 1930
1469
  },
1470
  {
1471
  "epoch": 9.54,
1472
- "grad_norm": 0.5972003936767578,
1473
- "learning_rate": 0.0002315270935960591,
1474
- "loss": 0.2867,
1475
  "step": 1940
1476
  },
1477
  {
1478
  "epoch": 9.59,
1479
- "grad_norm": 0.42175132036209106,
1480
- "learning_rate": 0.00020689655172413793,
1481
- "loss": 0.2804,
1482
  "step": 1950
1483
  },
1484
  {
1485
  "epoch": 9.64,
1486
- "grad_norm": 1.2799464464187622,
1487
- "learning_rate": 0.00018226600985221675,
1488
- "loss": 0.2941,
1489
  "step": 1960
1490
  },
1491
  {
1492
  "epoch": 9.69,
1493
- "grad_norm": 0.8577640056610107,
1494
- "learning_rate": 0.00015763546798029557,
1495
- "loss": 0.3202,
1496
  "step": 1970
1497
  },
1498
  {
1499
  "epoch": 9.74,
1500
- "grad_norm": 0.9166315197944641,
1501
- "learning_rate": 0.00013300492610837438,
1502
- "loss": 0.3404,
1503
  "step": 1980
1504
  },
1505
  {
1506
  "epoch": 9.79,
1507
- "grad_norm": 1.3336009979248047,
1508
- "learning_rate": 0.0001083743842364532,
1509
- "loss": 0.3071,
1510
  "step": 1990
1511
  },
1512
  {
1513
  "epoch": 9.84,
1514
- "grad_norm": 0.7033586502075195,
1515
- "learning_rate": 8.374384236453201e-05,
1516
- "loss": 0.3037,
1517
  "step": 2000
1518
  },
1519
  {
1520
  "epoch": 9.89,
1521
- "grad_norm": 0.48131078481674194,
1522
- "learning_rate": 5.9113300492610844e-05,
1523
- "loss": 0.289,
1524
  "step": 2010
1525
  },
1526
  {
1527
  "epoch": 9.94,
1528
- "grad_norm": 0.8269424438476562,
1529
- "learning_rate": 3.4482758620689657e-05,
1530
- "loss": 0.3021,
1531
  "step": 2020
1532
  },
1533
  {
1534
  "epoch": 9.99,
1535
- "grad_norm": 1.1306647062301636,
1536
- "learning_rate": 9.852216748768473e-06,
1537
- "loss": 0.3154,
1538
  "step": 2030
1539
  },
1540
  {
1541
  "epoch": 9.99,
1542
- "eval_accuracy": 0.9841137123745819,
1543
- "eval_f1": 0.9852639526448546,
1544
- "eval_loss": 0.0536968968808651,
1545
- "eval_precision": 0.9855005493602992,
1546
- "eval_recall": 0.9854135689141749,
1547
- "eval_runtime": 12.8562,
1548
- "eval_samples_per_second": 186.058,
1549
- "eval_steps_per_second": 11.668,
1550
  "step": 2030
1551
  },
1552
  {
1553
  "epoch": 9.99,
1554
  "step": 2030,
1555
  "total_flos": 1.0133154899356189e+19,
1556
- "train_loss": 0.5219255947714369,
1557
- "train_runtime": 1565.9689,
1558
- "train_samples_per_second": 83.016,
1559
- "train_steps_per_second": 1.296
1560
  }
1561
  ],
1562
  "logging_steps": 10,
 
1
  {
2
+ "best_metric": 0.9807692307692307,
3
+ "best_model_checkpoint": "vit-base-patch16-224-in21k-finetuned-lora-medmnistv2/checkpoint-2030",
4
  "epoch": 9.98769987699877,
5
  "eval_steps": 500,
6
  "global_step": 2030,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.05,
13
+ "grad_norm": 1.854022741317749,
14
  "learning_rate": 0.004975369458128079,
15
+ "loss": 0.5157,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.1,
20
+ "grad_norm": 2.006150245666504,
21
  "learning_rate": 0.004950738916256157,
22
+ "loss": 0.645,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.15,
27
+ "grad_norm": 1.8034334182739258,
28
+ "learning_rate": 0.0049261083743842365,
29
+ "loss": 0.5907,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.2,
34
+ "grad_norm": 1.7398325204849243,
35
+ "learning_rate": 0.004903940886699507,
36
+ "loss": 0.6186,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.25,
41
+ "grad_norm": 1.3694934844970703,
42
+ "learning_rate": 0.004879310344827586,
43
+ "loss": 0.5955,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.3,
48
+ "grad_norm": 1.134981632232666,
49
+ "learning_rate": 0.004854679802955665,
50
+ "loss": 0.5948,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.34,
55
+ "grad_norm": 1.2749730348587036,
56
+ "learning_rate": 0.0048300492610837435,
57
+ "loss": 0.598,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.39,
62
+ "grad_norm": 2.5001230239868164,
63
+ "learning_rate": 0.004805418719211823,
64
+ "loss": 0.6648,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.44,
69
+ "grad_norm": 1.3467851877212524,
70
+ "learning_rate": 0.004780788177339902,
71
+ "loss": 0.6488,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.49,
76
+ "grad_norm": 1.319006323814392,
77
+ "learning_rate": 0.00475615763546798,
78
+ "loss": 0.6469,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.54,
83
+ "grad_norm": 1.564982295036316,
84
+ "learning_rate": 0.004731527093596059,
85
+ "loss": 0.5583,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.59,
90
+ "grad_norm": 1.2751026153564453,
91
+ "learning_rate": 0.004706896551724138,
92
+ "loss": 0.5683,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.64,
97
+ "grad_norm": 1.8151675462722778,
98
+ "learning_rate": 0.004682266009852217,
99
+ "loss": 0.6268,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.69,
104
+ "grad_norm": 1.337044358253479,
105
+ "learning_rate": 0.004657635467980295,
106
+ "loss": 0.6423,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.74,
111
+ "grad_norm": 1.5720871686935425,
112
  "learning_rate": 0.004635467980295567,
113
+ "loss": 0.6581,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.79,
118
+ "grad_norm": 1.5531283617019653,
119
  "learning_rate": 0.004610837438423646,
120
+ "loss": 0.6095,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.84,
125
+ "grad_norm": 3.048377513885498,
126
+ "learning_rate": 0.004586206896551724,
127
+ "loss": 0.661,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.89,
132
+ "grad_norm": 1.9743188619613647,
133
  "learning_rate": 0.004564039408866995,
134
+ "loss": 0.6506,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.93,
139
+ "grad_norm": 1.9349859952926636,
140
  "learning_rate": 0.004539408866995074,
141
+ "loss": 0.6681,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.98,
146
+ "grad_norm": 1.7585110664367676,
147
  "learning_rate": 0.004514778325123153,
148
+ "loss": 0.6525,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 1.0,
153
+ "eval_accuracy": 0.9326923076923077,
154
+ "eval_f1": 0.9090572948739559,
155
+ "eval_loss": 0.20250679552555084,
156
+ "eval_precision": 0.9259619040327134,
157
+ "eval_recall": 0.9130050125624415,
158
+ "eval_runtime": 12.8179,
159
+ "eval_samples_per_second": 186.614,
160
+ "eval_steps_per_second": 11.702,
161
  "step": 203
162
  },
163
  {
164
  "epoch": 1.03,
165
+ "grad_norm": 1.5931968688964844,
166
  "learning_rate": 0.004490147783251232,
167
+ "loss": 0.7042,
168
  "step": 210
169
  },
170
  {
171
  "epoch": 1.08,
172
+ "grad_norm": 4.101116180419922,
173
  "learning_rate": 0.00446551724137931,
174
+ "loss": 0.4773,
175
  "step": 220
176
  },
177
  {
178
  "epoch": 1.13,
179
+ "grad_norm": 3.1857903003692627,
180
  "learning_rate": 0.004440886699507389,
181
+ "loss": 0.5874,
182
  "step": 230
183
  },
184
  {
185
  "epoch": 1.18,
186
+ "grad_norm": 1.9867714643478394,
187
  "learning_rate": 0.004416256157635468,
188
+ "loss": 0.6094,
189
  "step": 240
190
  },
191
  {
192
  "epoch": 1.23,
193
+ "grad_norm": 2.3485002517700195,
194
  "learning_rate": 0.004391625615763547,
195
+ "loss": 0.589,
196
  "step": 250
197
  },
198
  {
199
  "epoch": 1.28,
200
+ "grad_norm": 2.4279944896698,
201
  "learning_rate": 0.004366995073891626,
202
+ "loss": 0.6718,
203
  "step": 260
204
  },
205
  {
206
  "epoch": 1.33,
207
+ "grad_norm": 2.5905745029449463,
208
  "learning_rate": 0.004342364532019705,
209
+ "loss": 0.6379,
210
  "step": 270
211
  },
212
  {
213
  "epoch": 1.38,
214
+ "grad_norm": 1.4976040124893188,
215
  "learning_rate": 0.004317733990147784,
216
+ "loss": 0.6386,
217
  "step": 280
218
  },
219
  {
220
  "epoch": 1.43,
221
+ "grad_norm": 2.4124257564544678,
222
  "learning_rate": 0.004293103448275862,
223
+ "loss": 0.6483,
224
  "step": 290
225
  },
226
  {
227
  "epoch": 1.48,
228
+ "grad_norm": 1.5323392152786255,
229
  "learning_rate": 0.00426847290640394,
230
+ "loss": 0.639,
231
  "step": 300
232
  },
233
  {
234
  "epoch": 1.53,
235
+ "grad_norm": 2.58333683013916,
236
  "learning_rate": 0.004243842364532019,
237
+ "loss": 0.5527,
238
  "step": 310
239
  },
240
  {
241
  "epoch": 1.57,
242
+ "grad_norm": 2.3336663246154785,
243
  "learning_rate": 0.0042192118226600985,
244
+ "loss": 0.7489,
245
  "step": 320
246
  },
247
  {
248
  "epoch": 1.62,
249
+ "grad_norm": 3.3794748783111572,
250
  "learning_rate": 0.004194581280788178,
251
+ "loss": 0.6175,
252
  "step": 330
253
  },
254
  {
255
  "epoch": 1.67,
256
+ "grad_norm": 2.078139543533325,
257
  "learning_rate": 0.004169950738916256,
258
+ "loss": 0.6445,
259
  "step": 340
260
  },
261
  {
262
  "epoch": 1.72,
263
+ "grad_norm": 3.8369834423065186,
264
  "learning_rate": 0.004145320197044335,
265
+ "loss": 0.7733,
266
  "step": 350
267
  },
268
  {
269
  "epoch": 1.77,
270
+ "grad_norm": 5.295024871826172,
271
  "learning_rate": 0.004120689655172414,
272
+ "loss": 0.8105,
273
  "step": 360
274
  },
275
  {
276
  "epoch": 1.82,
277
+ "grad_norm": 6.0653181076049805,
278
  "learning_rate": 0.004096059113300492,
279
+ "loss": 0.8405,
280
  "step": 370
281
  },
282
  {
283
  "epoch": 1.87,
284
+ "grad_norm": 3.94065260887146,
285
  "learning_rate": 0.004071428571428571,
286
+ "loss": 0.8379,
287
  "step": 380
288
  },
289
  {
290
  "epoch": 1.92,
291
+ "grad_norm": 4.302937030792236,
292
  "learning_rate": 0.00404679802955665,
293
+ "loss": 0.8095,
294
  "step": 390
295
  },
296
  {
297
  "epoch": 1.97,
298
+ "grad_norm": 4.810948371887207,
299
  "learning_rate": 0.0040221674876847295,
300
+ "loss": 0.765,
301
  "step": 400
302
  },
303
  {
304
  "epoch": 2.0,
305
+ "eval_accuracy": 0.9377090301003345,
306
+ "eval_f1": 0.9344003800244746,
307
+ "eval_loss": 0.2109694927930832,
308
+ "eval_precision": 0.9440505095684633,
309
+ "eval_recall": 0.9288923374288455,
310
+ "eval_runtime": 12.7867,
311
+ "eval_samples_per_second": 187.069,
312
+ "eval_steps_per_second": 11.731,
313
  "step": 406
314
  },
315
  {
316
  "epoch": 2.02,
317
+ "grad_norm": 9.29123306274414,
318
  "learning_rate": 0.003997536945812808,
319
+ "loss": 0.8761,
320
  "step": 410
321
  },
322
  {
323
  "epoch": 2.07,
324
+ "grad_norm": 3.074866533279419,
325
  "learning_rate": 0.003972906403940887,
326
+ "loss": 1.0319,
327
  "step": 420
328
  },
329
  {
330
  "epoch": 2.12,
331
+ "grad_norm": 1.6303837299346924,
332
  "learning_rate": 0.003948275862068966,
333
+ "loss": 0.7979,
334
  "step": 430
335
  },
336
  {
337
  "epoch": 2.16,
338
+ "grad_norm": 1.928175687789917,
339
  "learning_rate": 0.003923645320197044,
340
+ "loss": 0.7132,
341
  "step": 440
342
  },
343
  {
344
  "epoch": 2.21,
345
+ "grad_norm": 4.943911552429199,
346
  "learning_rate": 0.0038990147783251232,
347
+ "loss": 0.7863,
348
  "step": 450
349
  },
350
  {
351
  "epoch": 2.26,
352
+ "grad_norm": 2.6557776927948,
353
  "learning_rate": 0.0038743842364532023,
354
+ "loss": 0.6522,
355
  "step": 460
356
  },
357
  {
358
  "epoch": 2.31,
359
+ "grad_norm": 15.654020309448242,
360
  "learning_rate": 0.003849753694581281,
361
+ "loss": 0.8122,
362
  "step": 470
363
  },
364
  {
365
  "epoch": 2.36,
366
+ "grad_norm": 4.983061790466309,
367
  "learning_rate": 0.00382512315270936,
368
+ "loss": 0.757,
369
  "step": 480
370
  },
371
  {
372
  "epoch": 2.41,
373
+ "grad_norm": 2.0351402759552,
374
  "learning_rate": 0.0038004926108374383,
375
+ "loss": 0.5953,
376
  "step": 490
377
  },
378
  {
379
  "epoch": 2.46,
380
+ "grad_norm": 4.033576488494873,
381
  "learning_rate": 0.003775862068965517,
382
+ "loss": 0.6497,
383
  "step": 500
384
  },
385
  {
386
  "epoch": 2.51,
387
+ "grad_norm": 3.6152243614196777,
388
  "learning_rate": 0.003751231527093596,
389
+ "loss": 0.6615,
390
  "step": 510
391
  },
392
  {
393
  "epoch": 2.56,
394
+ "grad_norm": 3.6705260276794434,
395
  "learning_rate": 0.0037266009852216747,
396
+ "loss": 0.6813,
397
  "step": 520
398
  },
399
  {
400
  "epoch": 2.61,
401
+ "grad_norm": 4.159302234649658,
402
  "learning_rate": 0.003701970443349754,
403
+ "loss": 0.6579,
404
  "step": 530
405
  },
406
  {
407
  "epoch": 2.66,
408
+ "grad_norm": 2.04949951171875,
409
  "learning_rate": 0.0036773399014778324,
410
+ "loss": 0.6976,
411
  "step": 540
412
  },
413
  {
414
  "epoch": 2.71,
415
+ "grad_norm": 1.9712501764297485,
416
  "learning_rate": 0.0036527093596059115,
417
+ "loss": 0.6729,
418
  "step": 550
419
  },
420
  {
421
  "epoch": 2.76,
422
+ "grad_norm": 2.4759750366210938,
423
  "learning_rate": 0.00362807881773399,
424
+ "loss": 0.7195,
425
  "step": 560
426
  },
427
  {
428
  "epoch": 2.8,
429
+ "grad_norm": 1.463779091835022,
430
  "learning_rate": 0.003603448275862069,
431
+ "loss": 0.7682,
432
  "step": 570
433
  },
434
  {
435
  "epoch": 2.85,
436
+ "grad_norm": 3.0022284984588623,
437
  "learning_rate": 0.003578817733990148,
438
+ "loss": 0.5556,
439
  "step": 580
440
  },
441
  {
442
  "epoch": 2.9,
443
+ "grad_norm": 2.585496187210083,
444
  "learning_rate": 0.0035541871921182266,
445
+ "loss": 0.7779,
446
  "step": 590
447
  },
448
  {
449
  "epoch": 2.95,
450
+ "grad_norm": 1.619903326034546,
451
  "learning_rate": 0.0035295566502463057,
452
+ "loss": 0.6514,
453
  "step": 600
454
  },
455
  {
456
  "epoch": 3.0,
457
+ "eval_accuracy": 0.9489966555183946,
458
+ "eval_f1": 0.9427879032940832,
459
+ "eval_loss": 0.20264820754528046,
460
+ "eval_precision": 0.9456850153895534,
461
+ "eval_recall": 0.9442313976349834,
462
+ "eval_runtime": 12.7657,
463
+ "eval_samples_per_second": 187.377,
464
+ "eval_steps_per_second": 11.75,
465
  "step": 609
466
  },
467
  {
468
  "epoch": 3.0,
469
+ "grad_norm": 1.8233991861343384,
470
  "learning_rate": 0.0035049261083743843,
471
+ "loss": 0.7235,
472
  "step": 610
473
  },
474
  {
475
  "epoch": 3.05,
476
+ "grad_norm": 2.941433906555176,
477
  "learning_rate": 0.0034802955665024634,
478
+ "loss": 0.8571,
479
  "step": 620
480
  },
481
  {
482
  "epoch": 3.1,
483
+ "grad_norm": 2.526165723800659,
484
  "learning_rate": 0.003455665024630542,
485
+ "loss": 0.7949,
486
  "step": 630
487
  },
488
  {
489
  "epoch": 3.15,
490
+ "grad_norm": 2.683561086654663,
491
  "learning_rate": 0.0034310344827586207,
492
+ "loss": 0.811,
493
  "step": 640
494
  },
495
  {
496
  "epoch": 3.2,
497
+ "grad_norm": 4.099231243133545,
498
  "learning_rate": 0.0034064039408867,
499
+ "loss": 0.694,
500
  "step": 650
501
  },
502
  {
503
  "epoch": 3.25,
504
+ "grad_norm": 7.0789289474487305,
505
  "learning_rate": 0.0033817733990147785,
506
+ "loss": 0.6623,
507
  "step": 660
508
  },
509
  {
510
  "epoch": 3.3,
511
+ "grad_norm": 4.69476842880249,
512
  "learning_rate": 0.003357142857142857,
513
+ "loss": 0.6777,
514
  "step": 670
515
  },
516
  {
517
  "epoch": 3.35,
518
+ "grad_norm": 3.9503209590911865,
519
  "learning_rate": 0.003332512315270936,
520
+ "loss": 0.6973,
521
  "step": 680
522
  },
523
  {
524
  "epoch": 3.39,
525
+ "grad_norm": 2.756246328353882,
526
  "learning_rate": 0.0033078817733990145,
527
+ "loss": 0.6769,
528
  "step": 690
529
  },
530
  {
531
  "epoch": 3.44,
532
+ "grad_norm": 2.2053091526031494,
533
  "learning_rate": 0.0032832512315270936,
534
+ "loss": 0.694,
535
  "step": 700
536
  },
537
  {
538
  "epoch": 3.49,
539
+ "grad_norm": 2.942899465560913,
540
  "learning_rate": 0.003258620689655172,
541
+ "loss": 0.6559,
542
  "step": 710
543
  },
544
  {
545
  "epoch": 3.54,
546
+ "grad_norm": 3.347179651260376,
547
  "learning_rate": 0.0032339901477832513,
548
+ "loss": 0.7284,
549
  "step": 720
550
  },
551
  {
552
  "epoch": 3.59,
553
+ "grad_norm": 3.367220163345337,
554
  "learning_rate": 0.00320935960591133,
555
+ "loss": 0.6454,
556
  "step": 730
557
  },
558
  {
559
  "epoch": 3.64,
560
+ "grad_norm": 3.6450581550598145,
561
  "learning_rate": 0.003184729064039409,
562
+ "loss": 0.6734,
563
  "step": 740
564
  },
565
  {
566
  "epoch": 3.69,
567
+ "grad_norm": 2.7060301303863525,
568
  "learning_rate": 0.0031600985221674877,
569
+ "loss": 0.682,
570
  "step": 750
571
  },
572
  {
573
  "epoch": 3.74,
574
+ "grad_norm": 2.4949233531951904,
575
+ "learning_rate": 0.0031354679802955664,
576
+ "loss": 0.6368,
577
  "step": 760
578
  },
579
  {
580
  "epoch": 3.79,
581
+ "grad_norm": 2.195974826812744,
582
+ "learning_rate": 0.0031108374384236455,
583
+ "loss": 0.5631,
584
  "step": 770
585
  },
586
  {
587
  "epoch": 3.84,
588
+ "grad_norm": 2.8718490600585938,
589
+ "learning_rate": 0.003086206896551724,
590
+ "loss": 0.6699,
591
  "step": 780
592
  },
593
  {
594
  "epoch": 3.89,
595
+ "grad_norm": 2.896594524383545,
596
+ "learning_rate": 0.003061576354679803,
597
+ "loss": 0.6374,
598
  "step": 790
599
  },
600
  {
601
  "epoch": 3.94,
602
+ "grad_norm": 2.3888590335845947,
603
+ "learning_rate": 0.003036945812807882,
604
+ "loss": 0.6104,
605
  "step": 800
606
  },
607
  {
608
  "epoch": 3.99,
609
+ "grad_norm": 3.1360650062561035,
610
+ "learning_rate": 0.003012315270935961,
611
+ "loss": 0.6405,
612
  "step": 810
613
  },
614
  {
615
  "epoch": 4.0,
616
+ "eval_accuracy": 0.9289297658862876,
617
+ "eval_f1": 0.9267094039271597,
618
+ "eval_loss": 0.20555561780929565,
619
+ "eval_precision": 0.9480713468179534,
620
+ "eval_recall": 0.9174817160045543,
621
+ "eval_runtime": 12.7217,
622
+ "eval_samples_per_second": 188.025,
623
+ "eval_steps_per_second": 11.791,
624
  "step": 813
625
  },
626
  {
627
  "epoch": 4.03,
628
+ "grad_norm": 2.944392681121826,
629
+ "learning_rate": 0.0029876847290640396,
630
+ "loss": 0.6386,
631
  "step": 820
632
  },
633
  {
634
  "epoch": 4.08,
635
+ "grad_norm": 2.1877291202545166,
636
+ "learning_rate": 0.0029630541871921187,
637
+ "loss": 0.554,
638
  "step": 830
639
  },
640
  {
641
  "epoch": 4.13,
642
+ "grad_norm": 2.3481550216674805,
643
+ "learning_rate": 0.0029384236453201974,
644
+ "loss": 0.6096,
645
  "step": 840
646
  },
647
  {
648
  "epoch": 4.18,
649
+ "grad_norm": 1.2819880247116089,
650
+ "learning_rate": 0.002913793103448276,
651
+ "loss": 0.6056,
652
  "step": 850
653
  },
654
  {
655
  "epoch": 4.23,
656
+ "grad_norm": 2.8151116371154785,
657
+ "learning_rate": 0.0028891625615763547,
658
+ "loss": 0.529,
659
  "step": 860
660
  },
661
  {
662
  "epoch": 4.28,
663
+ "grad_norm": 1.4771710634231567,
664
+ "learning_rate": 0.0028645320197044333,
665
+ "loss": 0.6179,
666
  "step": 870
667
  },
668
  {
669
  "epoch": 4.33,
670
+ "grad_norm": 1.6470296382904053,
671
+ "learning_rate": 0.002839901477832512,
672
+ "loss": 0.5378,
673
  "step": 880
674
  },
675
  {
676
  "epoch": 4.38,
677
+ "grad_norm": 1.7947298288345337,
678
+ "learning_rate": 0.002815270935960591,
679
+ "loss": 0.6106,
680
  "step": 890
681
  },
682
  {
683
  "epoch": 4.43,
684
+ "grad_norm": 1.3656386137008667,
685
+ "learning_rate": 0.0027906403940886697,
686
+ "loss": 0.559,
687
  "step": 900
688
  },
689
  {
690
  "epoch": 4.48,
691
+ "grad_norm": 1.5877206325531006,
692
+ "learning_rate": 0.002766009852216749,
693
+ "loss": 0.6011,
694
  "step": 910
695
  },
696
  {
697
  "epoch": 4.53,
698
+ "grad_norm": 1.7223550081253052,
699
+ "learning_rate": 0.0027413793103448275,
700
+ "loss": 0.569,
701
  "step": 920
702
  },
703
  {
704
  "epoch": 4.58,
705
+ "grad_norm": 3.068004608154297,
706
+ "learning_rate": 0.0027167487684729066,
707
+ "loss": 0.579,
708
  "step": 930
709
  },
710
  {
711
  "epoch": 4.62,
712
+ "grad_norm": 2.741926908493042,
713
+ "learning_rate": 0.0026921182266009852,
714
+ "loss": 0.632,
715
  "step": 940
716
  },
717
  {
718
  "epoch": 4.67,
719
+ "grad_norm": 2.4220759868621826,
720
+ "learning_rate": 0.0026674876847290643,
721
+ "loss": 0.676,
722
  "step": 950
723
  },
724
  {
725
  "epoch": 4.72,
726
+ "grad_norm": 2.406053066253662,
727
+ "learning_rate": 0.002642857142857143,
728
+ "loss": 0.6331,
729
  "step": 960
730
  },
731
  {
732
  "epoch": 4.77,
733
+ "grad_norm": 3.668957471847534,
734
+ "learning_rate": 0.0026182266009852216,
735
+ "loss": 0.5617,
736
  "step": 970
737
  },
738
  {
739
  "epoch": 4.82,
740
+ "grad_norm": 2.904611825942993,
741
+ "learning_rate": 0.0025935960591133007,
742
+ "loss": 0.5911,
743
  "step": 980
744
  },
745
  {
746
  "epoch": 4.87,
747
+ "grad_norm": 2.3104119300842285,
748
+ "learning_rate": 0.0025689655172413794,
749
+ "loss": 0.6124,
750
  "step": 990
751
  },
752
  {
753
  "epoch": 4.92,
754
+ "grad_norm": 3.4378082752227783,
755
+ "learning_rate": 0.0025443349753694585,
756
+ "loss": 0.6497,
757
  "step": 1000
758
  },
759
  {
760
  "epoch": 4.97,
761
+ "grad_norm": 3.1942694187164307,
762
+ "learning_rate": 0.002519704433497537,
763
+ "loss": 0.6514,
764
  "step": 1010
765
  },
766
  {
767
  "epoch": 5.0,
768
+ "eval_accuracy": 0.9523411371237458,
769
+ "eval_f1": 0.9382471321690677,
770
+ "eval_loss": 0.1361980438232422,
771
+ "eval_precision": 0.9459238480466342,
772
+ "eval_recall": 0.9385166217680354,
773
+ "eval_runtime": 12.797,
774
+ "eval_samples_per_second": 186.918,
775
+ "eval_steps_per_second": 11.721,
776
  "step": 1016
777
  },
778
  {
779
  "epoch": 5.02,
780
+ "grad_norm": 2.186038017272949,
781
+ "learning_rate": 0.002495073891625616,
782
+ "loss": 0.6427,
783
  "step": 1020
784
  },
785
  {
786
  "epoch": 5.07,
787
+ "grad_norm": 1.6945507526397705,
788
+ "learning_rate": 0.0024704433497536944,
789
+ "loss": 0.6153,
790
  "step": 1030
791
  },
792
  {
793
  "epoch": 5.12,
794
+ "grad_norm": 1.6330933570861816,
795
+ "learning_rate": 0.0024458128078817735,
796
+ "loss": 0.5831,
797
  "step": 1040
798
  },
799
  {
800
  "epoch": 5.17,
801
+ "grad_norm": 5.190056800842285,
802
+ "learning_rate": 0.002421182266009852,
803
+ "loss": 0.5627,
804
  "step": 1050
805
  },
806
  {
807
  "epoch": 5.22,
808
+ "grad_norm": 1.9557955265045166,
809
+ "learning_rate": 0.0023965517241379313,
810
+ "loss": 0.6279,
811
  "step": 1060
812
  },
813
  {
814
  "epoch": 5.26,
815
+ "grad_norm": 2.5824782848358154,
816
+ "learning_rate": 0.00237192118226601,
817
+ "loss": 0.6097,
818
  "step": 1070
819
  },
820
  {
821
  "epoch": 5.31,
822
+ "grad_norm": 2.0462582111358643,
823
+ "learning_rate": 0.002347290640394089,
824
+ "loss": 0.6527,
825
  "step": 1080
826
  },
827
  {
828
  "epoch": 5.36,
829
+ "grad_norm": 2.071093797683716,
830
+ "learning_rate": 0.0023226600985221672,
831
+ "loss": 0.6164,
832
  "step": 1090
833
  },
834
  {
835
  "epoch": 5.41,
836
+ "grad_norm": 2.4309191703796387,
837
+ "learning_rate": 0.0022980295566502463,
838
+ "loss": 0.5927,
839
  "step": 1100
840
  },
841
  {
842
  "epoch": 5.46,
843
+ "grad_norm": 2.204646110534668,
844
+ "learning_rate": 0.002273399014778325,
845
+ "loss": 0.5524,
846
  "step": 1110
847
  },
848
  {
849
  "epoch": 5.51,
850
+ "grad_norm": 2.399622917175293,
851
+ "learning_rate": 0.002248768472906404,
852
+ "loss": 0.493,
853
  "step": 1120
854
  },
855
  {
856
  "epoch": 5.56,
857
+ "grad_norm": 2.0351855754852295,
858
+ "learning_rate": 0.0022241379310344827,
859
+ "loss": 0.4818,
860
  "step": 1130
861
  },
862
  {
863
  "epoch": 5.61,
864
+ "grad_norm": 1.677395224571228,
865
+ "learning_rate": 0.002199507389162562,
866
+ "loss": 0.523,
867
  "step": 1140
868
  },
869
  {
870
  "epoch": 5.66,
871
+ "grad_norm": 3.4297292232513428,
872
+ "learning_rate": 0.0021748768472906405,
873
+ "loss": 0.5055,
874
  "step": 1150
875
  },
876
  {
877
  "epoch": 5.71,
878
+ "grad_norm": 1.9771558046340942,
879
+ "learning_rate": 0.002150246305418719,
880
+ "loss": 0.5898,
881
  "step": 1160
882
  },
883
  {
884
  "epoch": 5.76,
885
+ "grad_norm": 1.6579110622406006,
886
+ "learning_rate": 0.0021256157635467982,
887
+ "loss": 0.5073,
888
  "step": 1170
889
  },
890
  {
891
  "epoch": 5.81,
892
+ "grad_norm": 2.7034878730773926,
893
+ "learning_rate": 0.002100985221674877,
894
+ "loss": 0.6095,
895
  "step": 1180
896
  },
897
  {
898
  "epoch": 5.85,
899
+ "grad_norm": 2.2419373989105225,
900
+ "learning_rate": 0.0020763546798029556,
901
+ "loss": 0.5336,
902
  "step": 1190
903
  },
904
  {
905
  "epoch": 5.9,
906
+ "grad_norm": 5.060290813446045,
907
+ "learning_rate": 0.0020517241379310346,
908
+ "loss": 0.5068,
909
  "step": 1200
910
  },
911
  {
912
  "epoch": 5.95,
913
+ "grad_norm": 2.3167221546173096,
914
+ "learning_rate": 0.0020270935960591133,
915
+ "loss": 0.5778,
916
  "step": 1210
917
  },
918
  {
919
  "epoch": 6.0,
920
+ "eval_accuracy": 0.9770066889632107,
921
+ "eval_f1": 0.9737037280009198,
922
+ "eval_loss": 0.07868464291095734,
923
+ "eval_precision": 0.9739305124429692,
924
+ "eval_recall": 0.9746007095898861,
925
+ "eval_runtime": 12.8063,
926
+ "eval_samples_per_second": 186.783,
927
+ "eval_steps_per_second": 11.713,
928
  "step": 1219
929
  },
930
  {
931
  "epoch": 6.0,
932
+ "grad_norm": 2.2392728328704834,
933
+ "learning_rate": 0.002002463054187192,
934
+ "loss": 0.4562,
935
  "step": 1220
936
  },
937
  {
938
  "epoch": 6.05,
939
+ "grad_norm": 1.9161555767059326,
940
+ "learning_rate": 0.001977832512315271,
941
+ "loss": 0.5549,
942
  "step": 1230
943
  },
944
  {
945
  "epoch": 6.1,
946
+ "grad_norm": 3.461010694503784,
947
+ "learning_rate": 0.0019532019704433497,
948
+ "loss": 0.4686,
949
  "step": 1240
950
  },
951
  {
952
  "epoch": 6.15,
953
+ "grad_norm": 2.4330320358276367,
954
+ "learning_rate": 0.0019285714285714288,
955
+ "loss": 0.5092,
956
  "step": 1250
957
  },
958
  {
959
  "epoch": 6.2,
960
+ "grad_norm": 3.475228786468506,
961
+ "learning_rate": 0.0019039408866995075,
962
+ "loss": 0.5897,
963
  "step": 1260
964
  },
965
  {
966
  "epoch": 6.25,
967
+ "grad_norm": 2.0373263359069824,
968
+ "learning_rate": 0.0018793103448275861,
969
+ "loss": 0.488,
970
  "step": 1270
971
  },
972
  {
973
  "epoch": 6.3,
974
+ "grad_norm": 2.285243272781372,
975
+ "learning_rate": 0.001854679802955665,
976
+ "loss": 0.5263,
977
  "step": 1280
978
  },
979
  {
980
  "epoch": 6.35,
981
+ "grad_norm": 2.0365989208221436,
982
+ "learning_rate": 0.0018300492610837439,
983
+ "loss": 0.4668,
984
  "step": 1290
985
  },
986
  {
987
  "epoch": 6.4,
988
+ "grad_norm": 1.457801103591919,
989
+ "learning_rate": 0.0018054187192118227,
990
+ "loss": 0.4518,
991
  "step": 1300
992
  },
993
  {
994
  "epoch": 6.45,
995
+ "grad_norm": 1.9687854051589966,
996
+ "learning_rate": 0.0017807881773399016,
997
+ "loss": 0.4986,
998
  "step": 1310
999
  },
1000
  {
1001
  "epoch": 6.49,
1002
+ "grad_norm": 3.3556230068206787,
1003
+ "learning_rate": 0.0017561576354679803,
1004
+ "loss": 0.4955,
1005
  "step": 1320
1006
  },
1007
  {
1008
  "epoch": 6.54,
1009
+ "grad_norm": 2.372751474380493,
1010
+ "learning_rate": 0.0017315270935960591,
1011
+ "loss": 0.542,
1012
  "step": 1330
1013
  },
1014
  {
1015
  "epoch": 6.59,
1016
+ "grad_norm": 2.2720744609832764,
1017
+ "learning_rate": 0.001706896551724138,
1018
+ "loss": 0.4917,
1019
  "step": 1340
1020
  },
1021
  {
1022
  "epoch": 6.64,
1023
+ "grad_norm": 1.2895252704620361,
1024
+ "learning_rate": 0.0016822660098522169,
1025
+ "loss": 0.5833,
1026
  "step": 1350
1027
  },
1028
  {
1029
  "epoch": 6.69,
1030
+ "grad_norm": 2.9902422428131104,
1031
+ "learning_rate": 0.0016576354679802955,
1032
+ "loss": 0.5227,
1033
  "step": 1360
1034
  },
1035
  {
1036
  "epoch": 6.74,
1037
+ "grad_norm": 1.3934229612350464,
1038
+ "learning_rate": 0.0016330049261083744,
1039
+ "loss": 0.503,
1040
  "step": 1370
1041
  },
1042
  {
1043
  "epoch": 6.79,
1044
+ "grad_norm": 1.479251503944397,
1045
+ "learning_rate": 0.001608374384236453,
1046
+ "loss": 0.506,
1047
  "step": 1380
1048
  },
1049
  {
1050
  "epoch": 6.84,
1051
+ "grad_norm": 2.6870193481445312,
1052
+ "learning_rate": 0.001583743842364532,
1053
+ "loss": 0.4654,
1054
  "step": 1390
1055
  },
1056
  {
1057
  "epoch": 6.89,
1058
+ "grad_norm": 1.2971521615982056,
1059
+ "learning_rate": 0.0015591133004926108,
1060
+ "loss": 0.4906,
1061
  "step": 1400
1062
  },
1063
  {
1064
  "epoch": 6.94,
1065
+ "grad_norm": 1.7734407186508179,
1066
+ "learning_rate": 0.0015344827586206897,
1067
+ "loss": 0.4829,
1068
  "step": 1410
1069
  },
1070
  {
1071
  "epoch": 6.99,
1072
+ "grad_norm": 1.2960278987884521,
1073
+ "learning_rate": 0.0015098522167487686,
1074
+ "loss": 0.4759,
1075
  "step": 1420
1076
  },
1077
  {
1078
  "epoch": 7.0,
1079
+ "eval_accuracy": 0.9724080267558528,
1080
+ "eval_f1": 0.9714127059477171,
1081
+ "eval_loss": 0.09588505327701569,
1082
+ "eval_precision": 0.9743979044505011,
1083
+ "eval_recall": 0.9692755583510763,
1084
+ "eval_runtime": 12.8496,
1085
+ "eval_samples_per_second": 186.154,
1086
+ "eval_steps_per_second": 11.674,
1087
  "step": 1422
1088
  },
1089
  {
1090
  "epoch": 7.04,
1091
+ "grad_norm": 0.9376949071884155,
1092
+ "learning_rate": 0.0014852216748768474,
1093
+ "loss": 0.3886,
1094
  "step": 1430
1095
  },
1096
  {
1097
  "epoch": 7.08,
1098
+ "grad_norm": 1.449591040611267,
1099
+ "learning_rate": 0.0014605911330049263,
1100
+ "loss": 0.4327,
1101
  "step": 1440
1102
  },
1103
  {
1104
  "epoch": 7.13,
1105
+ "grad_norm": 1.4483599662780762,
1106
+ "learning_rate": 0.0014359605911330052,
1107
+ "loss": 0.4035,
1108
  "step": 1450
1109
  },
1110
  {
1111
  "epoch": 7.18,
1112
+ "grad_norm": 1.8822619915008545,
1113
+ "learning_rate": 0.0014113300492610836,
1114
+ "loss": 0.4234,
1115
  "step": 1460
1116
  },
1117
  {
1118
  "epoch": 7.23,
1119
+ "grad_norm": 2.542346239089966,
1120
+ "learning_rate": 0.0013866995073891625,
1121
+ "loss": 0.4178,
1122
  "step": 1470
1123
  },
1124
  {
1125
  "epoch": 7.28,
1126
+ "grad_norm": 2.098578929901123,
1127
+ "learning_rate": 0.0013620689655172414,
1128
+ "loss": 0.4904,
1129
  "step": 1480
1130
  },
1131
  {
1132
  "epoch": 7.33,
1133
+ "grad_norm": 2.8648364543914795,
1134
+ "learning_rate": 0.0013374384236453203,
1135
+ "loss": 0.4314,
1136
  "step": 1490
1137
  },
1138
  {
1139
  "epoch": 7.38,
1140
+ "grad_norm": 2.2441766262054443,
1141
+ "learning_rate": 0.0013128078817733991,
1142
+ "loss": 0.4915,
1143
  "step": 1500
1144
  },
1145
  {
1146
  "epoch": 7.43,
1147
+ "grad_norm": 2.867161989212036,
1148
+ "learning_rate": 0.001288177339901478,
1149
+ "loss": 0.4285,
1150
  "step": 1510
1151
  },
1152
  {
1153
  "epoch": 7.48,
1154
+ "grad_norm": 2.074552297592163,
1155
+ "learning_rate": 0.0012635467980295567,
1156
+ "loss": 0.4875,
1157
  "step": 1520
1158
  },
1159
  {
1160
  "epoch": 7.53,
1161
+ "grad_norm": 1.6760947704315186,
1162
+ "learning_rate": 0.0012389162561576355,
1163
+ "loss": 0.4438,
1164
  "step": 1530
1165
  },
1166
  {
1167
  "epoch": 7.58,
1168
+ "grad_norm": 3.0626227855682373,
1169
+ "learning_rate": 0.0012142857142857144,
1170
+ "loss": 0.4202,
1171
  "step": 1540
1172
  },
1173
  {
1174
  "epoch": 7.63,
1175
+ "grad_norm": 1.759521722793579,
1176
+ "learning_rate": 0.001189655172413793,
1177
+ "loss": 0.6058,
1178
  "step": 1550
1179
  },
1180
  {
1181
  "epoch": 7.68,
1182
+ "grad_norm": 1.6488869190216064,
1183
+ "learning_rate": 0.001165024630541872,
1184
+ "loss": 0.4106,
1185
  "step": 1560
1186
  },
1187
  {
1188
  "epoch": 7.72,
1189
+ "grad_norm": 1.598101019859314,
1190
+ "learning_rate": 0.0011403940886699508,
1191
+ "loss": 0.3934,
1192
  "step": 1570
1193
  },
1194
  {
1195
  "epoch": 7.77,
1196
+ "grad_norm": 1.9214413166046143,
1197
+ "learning_rate": 0.0011157635467980295,
1198
+ "loss": 0.4484,
1199
  "step": 1580
1200
  },
1201
  {
1202
  "epoch": 7.82,
1203
+ "grad_norm": 1.8111754655838013,
1204
+ "learning_rate": 0.0010911330049261083,
1205
+ "loss": 0.4292,
1206
  "step": 1590
1207
  },
1208
  {
1209
  "epoch": 7.87,
1210
+ "grad_norm": 1.7007110118865967,
1211
+ "learning_rate": 0.0010665024630541872,
1212
+ "loss": 0.3767,
1213
  "step": 1600
1214
  },
1215
  {
1216
  "epoch": 7.92,
1217
+ "grad_norm": 1.5511890649795532,
1218
+ "learning_rate": 0.001041871921182266,
1219
+ "loss": 0.4394,
1220
  "step": 1610
1221
  },
1222
  {
1223
  "epoch": 7.97,
1224
+ "grad_norm": 1.2566757202148438,
1225
+ "learning_rate": 0.0010172413793103447,
1226
+ "loss": 0.482,
1227
  "step": 1620
1228
  },
1229
  {
1230
  "epoch": 8.0,
1231
+ "eval_accuracy": 0.9761705685618729,
1232
+ "eval_f1": 0.9732825494984669,
1233
+ "eval_loss": 0.07430998235940933,
1234
+ "eval_precision": 0.9737460251788703,
1235
+ "eval_recall": 0.9737015525572723,
1236
+ "eval_runtime": 12.8992,
1237
+ "eval_samples_per_second": 185.438,
1238
+ "eval_steps_per_second": 11.629,
1239
  "step": 1626
1240
  },
1241
  {
1242
  "epoch": 8.02,
1243
+ "grad_norm": 2.213076114654541,
1244
+ "learning_rate": 0.0009926108374384236,
1245
+ "loss": 0.3732,
1246
  "step": 1630
1247
  },
1248
  {
1249
  "epoch": 8.07,
1250
+ "grad_norm": 1.5906304121017456,
1251
+ "learning_rate": 0.0009679802955665025,
1252
+ "loss": 0.4375,
1253
  "step": 1640
1254
  },
1255
  {
1256
  "epoch": 8.12,
1257
+ "grad_norm": 1.559348464012146,
1258
+ "learning_rate": 0.0009433497536945814,
1259
+ "loss": 0.4711,
1260
  "step": 1650
1261
  },
1262
  {
1263
  "epoch": 8.17,
1264
+ "grad_norm": 1.3519264459609985,
1265
+ "learning_rate": 0.00091871921182266,
1266
+ "loss": 0.4236,
1267
  "step": 1660
1268
  },
1269
  {
1270
  "epoch": 8.22,
1271
+ "grad_norm": 1.181723952293396,
1272
+ "learning_rate": 0.0008940886699507389,
1273
+ "loss": 0.3528,
1274
  "step": 1670
1275
  },
1276
  {
1277
  "epoch": 8.27,
1278
+ "grad_norm": 1.1211256980895996,
1279
+ "learning_rate": 0.0008694581280788178,
1280
+ "loss": 0.4419,
1281
  "step": 1680
1282
  },
1283
  {
1284
  "epoch": 8.31,
1285
+ "grad_norm": 1.1640053987503052,
1286
+ "learning_rate": 0.0008448275862068966,
1287
+ "loss": 0.3888,
1288
  "step": 1690
1289
  },
1290
  {
1291
  "epoch": 8.36,
1292
+ "grad_norm": 2.279167652130127,
1293
+ "learning_rate": 0.0008201970443349754,
1294
+ "loss": 0.4163,
1295
  "step": 1700
1296
  },
1297
  {
1298
  "epoch": 8.41,
1299
+ "grad_norm": 1.6173532009124756,
1300
+ "learning_rate": 0.0007955665024630542,
1301
+ "loss": 0.4028,
1302
  "step": 1710
1303
  },
1304
  {
1305
  "epoch": 8.46,
1306
+ "grad_norm": 2.8598406314849854,
1307
+ "learning_rate": 0.000770935960591133,
1308
+ "loss": 0.4218,
1309
  "step": 1720
1310
  },
1311
  {
1312
  "epoch": 8.51,
1313
+ "grad_norm": 1.727655053138733,
1314
+ "learning_rate": 0.0007463054187192118,
1315
+ "loss": 0.4014,
1316
  "step": 1730
1317
  },
1318
  {
1319
  "epoch": 8.56,
1320
+ "grad_norm": 6.330834865570068,
1321
+ "learning_rate": 0.0007216748768472907,
1322
+ "loss": 0.3819,
1323
  "step": 1740
1324
  },
1325
  {
1326
  "epoch": 8.61,
1327
+ "grad_norm": 2.008993625640869,
1328
+ "learning_rate": 0.0006970443349753696,
1329
+ "loss": 0.4146,
1330
  "step": 1750
1331
  },
1332
  {
1333
  "epoch": 8.66,
1334
+ "grad_norm": 1.3601949214935303,
1335
+ "learning_rate": 0.0006724137931034482,
1336
+ "loss": 0.4091,
1337
  "step": 1760
1338
  },
1339
  {
1340
  "epoch": 8.71,
1341
+ "grad_norm": 1.5751508474349976,
1342
+ "learning_rate": 0.0006477832512315271,
1343
+ "loss": 0.4721,
1344
  "step": 1770
1345
  },
1346
  {
1347
  "epoch": 8.76,
1348
+ "grad_norm": 2.352508544921875,
1349
+ "learning_rate": 0.000623152709359606,
1350
+ "loss": 0.4087,
1351
  "step": 1780
1352
  },
1353
  {
1354
  "epoch": 8.81,
1355
+ "grad_norm": 1.4975714683532715,
1356
+ "learning_rate": 0.0005985221674876847,
1357
+ "loss": 0.3899,
1358
  "step": 1790
1359
  },
1360
  {
1361
  "epoch": 8.86,
1362
+ "grad_norm": 1.4178757667541504,
1363
+ "learning_rate": 0.0005738916256157636,
1364
+ "loss": 0.3813,
1365
  "step": 1800
1366
  },
1367
  {
1368
  "epoch": 8.91,
1369
+ "grad_norm": 2.137474536895752,
1370
+ "learning_rate": 0.0005492610837438424,
1371
+ "loss": 0.4743,
1372
  "step": 1810
1373
  },
1374
  {
1375
  "epoch": 8.95,
1376
+ "grad_norm": 0.9968273639678955,
1377
+ "learning_rate": 0.0005246305418719212,
1378
+ "loss": 0.3729,
1379
  "step": 1820
1380
  },
1381
  {
1382
  "epoch": 9.0,
1383
+ "eval_accuracy": 0.975752508361204,
1384
+ "eval_f1": 0.9761552417357652,
1385
+ "eval_loss": 0.090341717004776,
1386
+ "eval_precision": 0.9777538442850773,
1387
+ "eval_recall": 0.9753874426406991,
1388
+ "eval_runtime": 12.9146,
1389
+ "eval_samples_per_second": 185.217,
1390
+ "eval_steps_per_second": 11.615,
1391
  "step": 1829
1392
  },
1393
  {
1394
  "epoch": 9.0,
1395
+ "grad_norm": 1.4125550985336304,
1396
+ "learning_rate": 0.0005,
1397
+ "loss": 0.4285,
1398
  "step": 1830
1399
  },
1400
  {
1401
  "epoch": 9.05,
1402
+ "grad_norm": 1.411934494972229,
1403
+ "learning_rate": 0.00047536945812807883,
1404
+ "loss": 0.4119,
1405
  "step": 1840
1406
  },
1407
  {
1408
  "epoch": 9.1,
1409
+ "grad_norm": 1.0983872413635254,
1410
+ "learning_rate": 0.00045073891625615765,
1411
+ "loss": 0.4449,
1412
  "step": 1850
1413
  },
1414
  {
1415
  "epoch": 9.15,
1416
+ "grad_norm": 1.1691962480545044,
1417
+ "learning_rate": 0.0004261083743842364,
1418
+ "loss": 0.4178,
1419
  "step": 1860
1420
  },
1421
  {
1422
  "epoch": 9.2,
1423
+ "grad_norm": 1.168599247932434,
1424
+ "learning_rate": 0.0004014778325123153,
1425
+ "loss": 0.356,
1426
  "step": 1870
1427
  },
1428
  {
1429
  "epoch": 9.25,
1430
+ "grad_norm": 2.360109329223633,
1431
+ "learning_rate": 0.00037684729064039405,
1432
+ "loss": 0.3666,
1433
  "step": 1880
1434
  },
1435
  {
1436
  "epoch": 9.3,
1437
+ "grad_norm": 1.499463438987732,
1438
+ "learning_rate": 0.00035221674876847293,
1439
+ "loss": 0.3928,
1440
  "step": 1890
1441
  },
1442
  {
1443
  "epoch": 9.35,
1444
+ "grad_norm": 1.5823447704315186,
1445
+ "learning_rate": 0.00032758620689655175,
1446
+ "loss": 0.3461,
1447
  "step": 1900
1448
  },
1449
  {
1450
  "epoch": 9.4,
1451
+ "grad_norm": 1.3279311656951904,
1452
+ "learning_rate": 0.0003029556650246305,
1453
+ "loss": 0.4023,
1454
  "step": 1910
1455
  },
1456
  {
1457
  "epoch": 9.45,
1458
+ "grad_norm": 1.4666024446487427,
1459
+ "learning_rate": 0.00027832512315270933,
1460
+ "loss": 0.3929,
1461
  "step": 1920
1462
  },
1463
  {
1464
  "epoch": 9.5,
1465
+ "grad_norm": 1.5968170166015625,
1466
+ "learning_rate": 0.0002536945812807882,
1467
+ "loss": 0.3738,
1468
  "step": 1930
1469
  },
1470
  {
1471
  "epoch": 9.54,
1472
+ "grad_norm": 0.8691931962966919,
1473
+ "learning_rate": 0.000229064039408867,
1474
+ "loss": 0.3323,
1475
  "step": 1940
1476
  },
1477
  {
1478
  "epoch": 9.59,
1479
+ "grad_norm": 1.582350254058838,
1480
+ "learning_rate": 0.00020443349753694582,
1481
+ "loss": 0.327,
1482
  "step": 1950
1483
  },
1484
  {
1485
  "epoch": 9.64,
1486
+ "grad_norm": 1.616529107093811,
1487
+ "learning_rate": 0.00017980295566502464,
1488
+ "loss": 0.3701,
1489
  "step": 1960
1490
  },
1491
  {
1492
  "epoch": 9.69,
1493
+ "grad_norm": 2.23984432220459,
1494
+ "learning_rate": 0.00015517241379310346,
1495
+ "loss": 0.3796,
1496
  "step": 1970
1497
  },
1498
  {
1499
  "epoch": 9.74,
1500
+ "grad_norm": 1.5370508432388306,
1501
+ "learning_rate": 0.00013054187192118225,
1502
+ "loss": 0.405,
1503
  "step": 1980
1504
  },
1505
  {
1506
  "epoch": 9.79,
1507
+ "grad_norm": 2.253962755203247,
1508
+ "learning_rate": 0.0001059113300492611,
1509
+ "loss": 0.3793,
1510
  "step": 1990
1511
  },
1512
  {
1513
  "epoch": 9.84,
1514
+ "grad_norm": 1.1481009721755981,
1515
+ "learning_rate": 8.12807881773399e-05,
1516
+ "loss": 0.3496,
1517
  "step": 2000
1518
  },
1519
  {
1520
  "epoch": 9.89,
1521
+ "grad_norm": 0.9593034982681274,
1522
+ "learning_rate": 5.6650246305418716e-05,
1523
+ "loss": 0.3418,
1524
  "step": 2010
1525
  },
1526
  {
1527
  "epoch": 9.94,
1528
+ "grad_norm": 1.4056353569030762,
1529
+ "learning_rate": 3.2019704433497536e-05,
1530
+ "loss": 0.3796,
1531
  "step": 2020
1532
  },
1533
  {
1534
  "epoch": 9.99,
1535
+ "grad_norm": 1.489027500152588,
1536
+ "learning_rate": 7.3891625615763555e-06,
1537
+ "loss": 0.3705,
1538
  "step": 2030
1539
  },
1540
  {
1541
  "epoch": 9.99,
1542
+ "eval_accuracy": 0.9807692307692307,
1543
+ "eval_f1": 0.9825245324491525,
1544
+ "eval_loss": 0.07319223880767822,
1545
+ "eval_precision": 0.9829956117956687,
1546
+ "eval_recall": 0.9825747689517654,
1547
+ "eval_runtime": 12.8452,
1548
+ "eval_samples_per_second": 186.217,
1549
+ "eval_steps_per_second": 11.677,
1550
  "step": 2030
1551
  },
1552
  {
1553
  "epoch": 9.99,
1554
  "step": 2030,
1555
  "total_flos": 1.0133154899356189e+19,
1556
+ "train_loss": 0.5615053875692959,
1557
+ "train_runtime": 1549.2005,
1558
+ "train_samples_per_second": 83.914,
1559
+ "train_steps_per_second": 1.31
1560
  }
1561
  ],
1562
  "logging_steps": 10,