dmartincc committed on
Commit
fc851ef
·
verified ·
1 Parent(s): 1a8c905

StressTech/vet-sm

Browse files
Files changed (5) hide show
  1. README.md +0 -1
  2. all_results.json +9 -9
  3. eval_results.json +5 -5
  4. train_results.json +4 -4
  5. trainer_state.json +234 -234
README.md CHANGED
@@ -1,5 +1,4 @@
1
  ---
2
- license: apache-2.0
3
  base_model: google/vit-base-patch16-224-in21k
4
  tags:
5
  - generated_from_trainer
 
1
  ---
 
2
  base_model: google/vit-base-patch16-224-in21k
3
  tags:
4
  - generated_from_trainer
all_results.json CHANGED
@@ -1,12 +1,12 @@
1
  {
2
  "epoch": 4.99,
3
- "eval_accuracy": 0.6643894107600341,
4
- "eval_loss": 1.0087940692901611,
5
- "eval_runtime": 400.6761,
6
- "eval_samples_per_second": 2.923,
7
- "eval_steps_per_second": 0.185,
8
- "train_loss": 0.9699792993241462,
9
- "train_runtime": 35269.8788,
10
- "train_samples_per_second": 0.941,
11
- "train_steps_per_second": 0.029
12
  }
 
1
  {
2
  "epoch": 4.99,
3
+ "eval_accuracy": 0.6857386848847139,
4
+ "eval_loss": 0.9283789396286011,
5
+ "eval_runtime": 411.7956,
6
+ "eval_samples_per_second": 2.844,
7
+ "eval_steps_per_second": 0.18,
8
+ "train_loss": 0.9461189188243111,
9
+ "train_runtime": 36633.1283,
10
+ "train_samples_per_second": 0.906,
11
+ "train_steps_per_second": 0.028
12
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 4.99,
3
- "eval_accuracy": 0.6643894107600341,
4
- "eval_loss": 1.0087940692901611,
5
- "eval_runtime": 400.6761,
6
- "eval_samples_per_second": 2.923,
7
- "eval_steps_per_second": 0.185
8
  }
 
1
  {
2
  "epoch": 4.99,
3
+ "eval_accuracy": 0.6857386848847139,
4
+ "eval_loss": 0.9283789396286011,
5
+ "eval_runtime": 411.7956,
6
+ "eval_samples_per_second": 2.844,
7
+ "eval_steps_per_second": 0.18
8
  }
train_results.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "epoch": 4.99,
3
- "train_loss": 0.9699792993241462,
4
- "train_runtime": 35269.8788,
5
- "train_samples_per_second": 0.941,
6
- "train_steps_per_second": 0.029
7
  }
 
1
  {
2
  "epoch": 4.99,
3
+ "train_loss": 0.9461189188243111,
4
+ "train_runtime": 36633.1283,
5
+ "train_samples_per_second": 0.906,
6
+ "train_steps_per_second": 0.028
7
  }
trainer_state.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "best_metric": 0.6643894107600341,
3
  "best_model_checkpoint": "vet-sm/checkpoint-1035",
4
  "epoch": 4.9879518072289155,
5
  "eval_steps": 500,
@@ -10,778 +10,778 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.05,
13
- "grad_norm": 1.4342966079711914,
14
  "learning_rate": 4.807692307692308e-06,
15
- "loss": 2.0812,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.1,
20
- "grad_norm": 1.0532125234603882,
21
  "learning_rate": 9.615384615384616e-06,
22
- "loss": 2.0532,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.14,
27
- "grad_norm": 2.140876293182373,
28
  "learning_rate": 1.4423076923076923e-05,
29
- "loss": 1.9972,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.19,
34
- "grad_norm": 1.4464075565338135,
35
  "learning_rate": 1.923076923076923e-05,
36
- "loss": 1.9533,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.24,
41
- "grad_norm": 2.417072296142578,
42
  "learning_rate": 2.4038461538461542e-05,
43
- "loss": 1.9359,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.29,
48
- "grad_norm": 1.399131417274475,
49
  "learning_rate": 2.8846153846153845e-05,
50
- "loss": 1.9036,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.34,
55
- "grad_norm": 2.0284457206726074,
56
  "learning_rate": 3.365384615384616e-05,
57
- "loss": 1.8338,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.39,
62
- "grad_norm": 2.100050449371338,
63
  "learning_rate": 3.846153846153846e-05,
64
- "loss": 1.8059,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.43,
69
- "grad_norm": 1.7329950332641602,
70
  "learning_rate": 4.326923076923077e-05,
71
- "loss": 1.7837,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.48,
76
- "grad_norm": 1.6801304817199707,
77
  "learning_rate": 4.8076923076923084e-05,
78
- "loss": 1.7468,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.53,
83
- "grad_norm": 1.8338978290557861,
84
  "learning_rate": 4.967776584317938e-05,
85
- "loss": 1.6753,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.58,
90
- "grad_norm": 3.069058895111084,
91
  "learning_rate": 4.9140708915145005e-05,
92
- "loss": 1.6138,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.63,
97
- "grad_norm": 3.1650478839874268,
98
  "learning_rate": 4.860365198711064e-05,
99
- "loss": 1.6267,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.67,
104
- "grad_norm": 2.2301580905914307,
105
  "learning_rate": 4.806659505907626e-05,
106
- "loss": 1.5998,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.72,
111
- "grad_norm": 2.113203763961792,
112
  "learning_rate": 4.7529538131041896e-05,
113
- "loss": 1.4803,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.77,
118
- "grad_norm": 2.676020622253418,
119
  "learning_rate": 4.699248120300752e-05,
120
- "loss": 1.5458,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.82,
125
- "grad_norm": 2.697316884994507,
126
  "learning_rate": 4.645542427497315e-05,
127
- "loss": 1.5337,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.87,
132
- "grad_norm": 2.024156093597412,
133
  "learning_rate": 4.591836734693878e-05,
134
- "loss": 1.46,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.92,
139
- "grad_norm": 2.946873426437378,
140
  "learning_rate": 4.5381310418904406e-05,
141
- "loss": 1.4879,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.96,
146
- "grad_norm": 3.515526294708252,
147
  "learning_rate": 4.484425349087004e-05,
148
- "loss": 1.4639,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 1.0,
153
- "eval_accuracy": 0.47651579846285225,
154
- "eval_loss": 1.4477003812789917,
155
- "eval_runtime": 402.0376,
156
- "eval_samples_per_second": 2.913,
157
  "eval_steps_per_second": 0.184,
158
  "step": 207
159
  },
160
  {
161
  "epoch": 1.01,
162
- "grad_norm": 2.6239469051361084,
163
  "learning_rate": 4.4307196562835664e-05,
164
- "loss": 1.4386,
165
  "step": 210
166
  },
167
  {
168
  "epoch": 1.06,
169
- "grad_norm": 2.535055160522461,
170
  "learning_rate": 4.3770139634801297e-05,
171
- "loss": 1.2969,
172
  "step": 220
173
  },
174
  {
175
  "epoch": 1.11,
176
- "grad_norm": 2.1774981021881104,
177
  "learning_rate": 4.323308270676692e-05,
178
- "loss": 1.2833,
179
  "step": 230
180
  },
181
  {
182
  "epoch": 1.16,
183
- "grad_norm": 3.0875790119171143,
184
  "learning_rate": 4.269602577873255e-05,
185
- "loss": 1.3138,
186
  "step": 240
187
  },
188
  {
189
  "epoch": 1.2,
190
- "grad_norm": 3.3526086807250977,
191
  "learning_rate": 4.215896885069818e-05,
192
- "loss": 1.3177,
193
  "step": 250
194
  },
195
  {
196
  "epoch": 1.25,
197
- "grad_norm": 2.8646128177642822,
198
  "learning_rate": 4.1621911922663806e-05,
199
- "loss": 1.2305,
200
  "step": 260
201
  },
202
  {
203
  "epoch": 1.3,
204
- "grad_norm": 2.4031918048858643,
205
  "learning_rate": 4.108485499462943e-05,
206
- "loss": 1.2459,
207
  "step": 270
208
  },
209
  {
210
  "epoch": 1.35,
211
- "grad_norm": 4.963438510894775,
212
  "learning_rate": 4.054779806659506e-05,
213
- "loss": 1.2425,
214
  "step": 280
215
  },
216
  {
217
  "epoch": 1.4,
218
- "grad_norm": 4.87284517288208,
219
  "learning_rate": 4.0010741138560684e-05,
220
- "loss": 1.2287,
221
  "step": 290
222
  },
223
  {
224
  "epoch": 1.45,
225
- "grad_norm": 3.761154890060425,
226
  "learning_rate": 3.9473684210526316e-05,
227
- "loss": 1.2937,
228
  "step": 300
229
  },
230
  {
231
  "epoch": 1.49,
232
- "grad_norm": 3.1150503158569336,
233
  "learning_rate": 3.893662728249194e-05,
234
- "loss": 1.2641,
235
  "step": 310
236
  },
237
  {
238
  "epoch": 1.54,
239
- "grad_norm": 4.133061408996582,
240
  "learning_rate": 3.8399570354457575e-05,
241
- "loss": 1.1355,
242
  "step": 320
243
  },
244
  {
245
  "epoch": 1.59,
246
- "grad_norm": 4.659736633300781,
247
  "learning_rate": 3.78625134264232e-05,
248
- "loss": 1.2126,
249
  "step": 330
250
  },
251
  {
252
  "epoch": 1.64,
253
- "grad_norm": 4.9169535636901855,
254
  "learning_rate": 3.732545649838883e-05,
255
- "loss": 1.1749,
256
  "step": 340
257
  },
258
  {
259
  "epoch": 1.69,
260
- "grad_norm": 3.7532241344451904,
261
  "learning_rate": 3.678839957035446e-05,
262
- "loss": 1.1279,
263
  "step": 350
264
  },
265
  {
266
  "epoch": 1.73,
267
- "grad_norm": 3.892486572265625,
268
  "learning_rate": 3.6251342642320084e-05,
269
- "loss": 1.1878,
270
  "step": 360
271
  },
272
  {
273
  "epoch": 1.78,
274
- "grad_norm": 2.520615339279175,
275
  "learning_rate": 3.571428571428572e-05,
276
- "loss": 1.1508,
277
  "step": 370
278
  },
279
  {
280
  "epoch": 1.83,
281
- "grad_norm": 4.7209930419921875,
282
  "learning_rate": 3.517722878625134e-05,
283
- "loss": 1.0644,
284
  "step": 380
285
  },
286
  {
287
  "epoch": 1.88,
288
- "grad_norm": 3.167203664779663,
289
  "learning_rate": 3.4640171858216975e-05,
290
- "loss": 1.1183,
291
  "step": 390
292
  },
293
  {
294
  "epoch": 1.93,
295
- "grad_norm": 6.361824989318848,
296
  "learning_rate": 3.41031149301826e-05,
297
- "loss": 1.0865,
298
  "step": 400
299
  },
300
  {
301
  "epoch": 1.98,
302
- "grad_norm": 3.040663957595825,
303
  "learning_rate": 3.3566058002148234e-05,
304
- "loss": 1.1673,
305
  "step": 410
306
  },
307
  {
308
  "epoch": 2.0,
309
- "eval_accuracy": 0.5824081981212639,
310
- "eval_loss": 1.1657731533050537,
311
- "eval_runtime": 430.9042,
312
- "eval_samples_per_second": 2.718,
313
- "eval_steps_per_second": 0.172,
314
  "step": 415
315
  },
316
  {
317
  "epoch": 2.02,
318
- "grad_norm": 4.152471542358398,
319
  "learning_rate": 3.302900107411386e-05,
320
- "loss": 1.0763,
321
  "step": 420
322
  },
323
  {
324
  "epoch": 2.07,
325
- "grad_norm": 2.840665817260742,
326
  "learning_rate": 3.2491944146079485e-05,
327
- "loss": 0.9653,
328
  "step": 430
329
  },
330
  {
331
  "epoch": 2.12,
332
- "grad_norm": 2.5569868087768555,
333
  "learning_rate": 3.195488721804512e-05,
334
- "loss": 0.9141,
335
  "step": 440
336
  },
337
  {
338
  "epoch": 2.17,
339
- "grad_norm": 4.580764293670654,
340
  "learning_rate": 3.1417830290010743e-05,
341
- "loss": 0.8512,
342
  "step": 450
343
  },
344
  {
345
  "epoch": 2.22,
346
- "grad_norm": 4.854053497314453,
347
  "learning_rate": 3.0880773361976376e-05,
348
- "loss": 0.9275,
349
  "step": 460
350
  },
351
  {
352
  "epoch": 2.27,
353
- "grad_norm": 2.7004103660583496,
354
  "learning_rate": 3.0343716433942e-05,
355
- "loss": 0.868,
356
  "step": 470
357
  },
358
  {
359
  "epoch": 2.31,
360
- "grad_norm": 4.4803996086120605,
361
  "learning_rate": 2.980665950590763e-05,
362
- "loss": 0.8957,
363
  "step": 480
364
  },
365
  {
366
  "epoch": 2.36,
367
- "grad_norm": 4.851820468902588,
368
  "learning_rate": 2.9269602577873257e-05,
369
- "loss": 0.8972,
370
  "step": 490
371
  },
372
  {
373
  "epoch": 2.41,
374
- "grad_norm": 5.090771675109863,
375
  "learning_rate": 2.8732545649838882e-05,
376
- "loss": 0.891,
377
  "step": 500
378
  },
379
  {
380
  "epoch": 2.46,
381
- "grad_norm": 5.008318901062012,
382
  "learning_rate": 2.8195488721804515e-05,
383
- "loss": 0.8807,
384
  "step": 510
385
  },
386
  {
387
  "epoch": 2.51,
388
- "grad_norm": 5.043464183807373,
389
  "learning_rate": 2.765843179377014e-05,
390
- "loss": 0.8187,
391
  "step": 520
392
  },
393
  {
394
  "epoch": 2.55,
395
- "grad_norm": 3.1173977851867676,
396
  "learning_rate": 2.712137486573577e-05,
397
- "loss": 0.8188,
398
  "step": 530
399
  },
400
  {
401
  "epoch": 2.6,
402
- "grad_norm": 4.679303169250488,
403
  "learning_rate": 2.6584317937701396e-05,
404
- "loss": 0.8636,
405
  "step": 540
406
  },
407
  {
408
  "epoch": 2.65,
409
- "grad_norm": 4.363391876220703,
410
  "learning_rate": 2.6047261009667025e-05,
411
- "loss": 0.8755,
412
  "step": 550
413
  },
414
  {
415
  "epoch": 2.7,
416
- "grad_norm": 3.9236409664154053,
417
  "learning_rate": 2.5510204081632654e-05,
418
- "loss": 0.8461,
419
  "step": 560
420
  },
421
  {
422
  "epoch": 2.75,
423
- "grad_norm": 4.52059268951416,
424
  "learning_rate": 2.4973147153598283e-05,
425
- "loss": 0.8537,
426
  "step": 570
427
  },
428
  {
429
  "epoch": 2.8,
430
- "grad_norm": 3.85107421875,
431
  "learning_rate": 2.443609022556391e-05,
432
- "loss": 0.8863,
433
  "step": 580
434
  },
435
  {
436
  "epoch": 2.84,
437
- "grad_norm": 4.591523170471191,
438
  "learning_rate": 2.3899033297529538e-05,
439
- "loss": 0.8945,
440
  "step": 590
441
  },
442
  {
443
  "epoch": 2.89,
444
- "grad_norm": 8.31893253326416,
445
  "learning_rate": 2.3361976369495167e-05,
446
- "loss": 0.8145,
447
  "step": 600
448
  },
449
  {
450
  "epoch": 2.94,
451
- "grad_norm": 4.312867641448975,
452
  "learning_rate": 2.2824919441460796e-05,
453
- "loss": 0.9139,
454
  "step": 610
455
  },
456
  {
457
  "epoch": 2.99,
458
- "grad_norm": 3.7743523120880127,
459
  "learning_rate": 2.2287862513426426e-05,
460
- "loss": 0.875,
461
  "step": 620
462
  },
463
  {
464
  "epoch": 3.0,
465
- "eval_accuracy": 0.6157130657557643,
466
- "eval_loss": 1.0804731845855713,
467
- "eval_runtime": 399.3798,
468
- "eval_samples_per_second": 2.932,
469
- "eval_steps_per_second": 0.185,
470
  "step": 622
471
  },
472
  {
473
  "epoch": 3.04,
474
- "grad_norm": 3.2989518642425537,
475
  "learning_rate": 2.1750805585392055e-05,
476
- "loss": 0.6775,
477
  "step": 630
478
  },
479
  {
480
  "epoch": 3.08,
481
- "grad_norm": 5.40819787979126,
482
  "learning_rate": 2.121374865735768e-05,
483
- "loss": 0.6652,
484
  "step": 640
485
  },
486
  {
487
  "epoch": 3.13,
488
- "grad_norm": 6.9740986824035645,
489
  "learning_rate": 2.067669172932331e-05,
490
- "loss": 0.7004,
491
  "step": 650
492
  },
493
  {
494
  "epoch": 3.18,
495
- "grad_norm": 4.180629730224609,
496
  "learning_rate": 2.0139634801288935e-05,
497
- "loss": 0.6034,
498
  "step": 660
499
  },
500
  {
501
  "epoch": 3.23,
502
- "grad_norm": 3.861298084259033,
503
  "learning_rate": 1.9602577873254565e-05,
504
- "loss": 0.595,
505
  "step": 670
506
  },
507
  {
508
  "epoch": 3.28,
509
- "grad_norm": 4.639169216156006,
510
  "learning_rate": 1.9065520945220194e-05,
511
- "loss": 0.596,
512
  "step": 680
513
  },
514
  {
515
  "epoch": 3.33,
516
- "grad_norm": 4.842735767364502,
517
  "learning_rate": 1.8528464017185823e-05,
518
- "loss": 0.5874,
519
  "step": 690
520
  },
521
  {
522
  "epoch": 3.37,
523
- "grad_norm": 4.9960222244262695,
524
  "learning_rate": 1.7991407089151452e-05,
525
- "loss": 0.7005,
526
  "step": 700
527
  },
528
  {
529
  "epoch": 3.42,
530
- "grad_norm": 7.5468268394470215,
531
  "learning_rate": 1.7454350161117078e-05,
532
- "loss": 0.6543,
533
  "step": 710
534
  },
535
  {
536
  "epoch": 3.47,
537
- "grad_norm": 5.474305629730225,
538
  "learning_rate": 1.6917293233082707e-05,
539
- "loss": 0.6795,
540
  "step": 720
541
  },
542
  {
543
  "epoch": 3.52,
544
- "grad_norm": 6.760745048522949,
545
  "learning_rate": 1.6380236305048336e-05,
546
- "loss": 0.5787,
547
  "step": 730
548
  },
549
  {
550
  "epoch": 3.57,
551
- "grad_norm": 7.216871738433838,
552
  "learning_rate": 1.5843179377013965e-05,
553
- "loss": 0.6581,
554
  "step": 740
555
  },
556
  {
557
  "epoch": 3.61,
558
- "grad_norm": 3.083861827850342,
559
  "learning_rate": 1.5306122448979594e-05,
560
- "loss": 0.5928,
561
  "step": 750
562
  },
563
  {
564
  "epoch": 3.66,
565
- "grad_norm": 3.188052177429199,
566
  "learning_rate": 1.4769065520945222e-05,
567
- "loss": 0.5587,
568
  "step": 760
569
  },
570
  {
571
  "epoch": 3.71,
572
- "grad_norm": 4.39857292175293,
573
  "learning_rate": 1.4232008592910851e-05,
574
- "loss": 0.5376,
575
  "step": 770
576
  },
577
  {
578
  "epoch": 3.76,
579
- "grad_norm": 5.987706661224365,
580
  "learning_rate": 1.3694951664876477e-05,
581
- "loss": 0.6057,
582
  "step": 780
583
  },
584
  {
585
  "epoch": 3.81,
586
- "grad_norm": 5.265353679656982,
587
  "learning_rate": 1.3157894736842106e-05,
588
- "loss": 0.5441,
589
  "step": 790
590
  },
591
  {
592
  "epoch": 3.86,
593
- "grad_norm": 5.077045440673828,
594
  "learning_rate": 1.2620837808807733e-05,
595
- "loss": 0.6288,
596
  "step": 800
597
  },
598
  {
599
  "epoch": 3.9,
600
- "grad_norm": 4.107018947601318,
601
  "learning_rate": 1.2083780880773363e-05,
602
- "loss": 0.6222,
603
  "step": 810
604
  },
605
  {
606
  "epoch": 3.95,
607
- "grad_norm": 4.016980171203613,
608
  "learning_rate": 1.1546723952738992e-05,
609
- "loss": 0.6185,
610
  "step": 820
611
  },
612
  {
613
  "epoch": 4.0,
614
- "grad_norm": 4.403396129608154,
615
  "learning_rate": 1.100966702470462e-05,
616
- "loss": 0.5449,
617
  "step": 830
618
  },
619
  {
620
  "epoch": 4.0,
621
- "eval_accuracy": 0.64133219470538,
622
- "eval_loss": 1.0399043560028076,
623
- "eval_runtime": 397.3852,
624
- "eval_samples_per_second": 2.947,
625
  "eval_steps_per_second": 0.186,
626
  "step": 830
627
  },
628
  {
629
  "epoch": 4.05,
630
- "grad_norm": 2.6687538623809814,
631
  "learning_rate": 1.0472610096670248e-05,
632
- "loss": 0.438,
633
  "step": 840
634
  },
635
  {
636
  "epoch": 4.1,
637
- "grad_norm": 3.949495315551758,
638
  "learning_rate": 9.935553168635876e-06,
639
- "loss": 0.4268,
640
  "step": 850
641
  },
642
  {
643
  "epoch": 4.14,
644
- "grad_norm": 6.026305198669434,
645
  "learning_rate": 9.398496240601503e-06,
646
- "loss": 0.4416,
647
  "step": 860
648
  },
649
  {
650
  "epoch": 4.19,
651
- "grad_norm": 4.212298393249512,
652
  "learning_rate": 8.861439312567132e-06,
653
- "loss": 0.4729,
654
  "step": 870
655
  },
656
  {
657
  "epoch": 4.24,
658
- "grad_norm": 4.4485883712768555,
659
  "learning_rate": 8.324382384532762e-06,
660
- "loss": 0.4277,
661
  "step": 880
662
  },
663
  {
664
  "epoch": 4.29,
665
- "grad_norm": 3.780879497528076,
666
  "learning_rate": 7.787325456498389e-06,
667
- "loss": 0.3816,
668
  "step": 890
669
  },
670
  {
671
  "epoch": 4.34,
672
- "grad_norm": 3.81066632270813,
673
  "learning_rate": 7.250268528464017e-06,
674
- "loss": 0.4598,
675
  "step": 900
676
  },
677
  {
678
  "epoch": 4.39,
679
- "grad_norm": 4.870803356170654,
680
  "learning_rate": 6.713211600429646e-06,
681
- "loss": 0.3641,
682
  "step": 910
683
  },
684
  {
685
  "epoch": 4.43,
686
- "grad_norm": 3.519808769226074,
687
  "learning_rate": 6.176154672395274e-06,
688
- "loss": 0.3854,
689
  "step": 920
690
  },
691
  {
692
  "epoch": 4.48,
693
- "grad_norm": 4.55733060836792,
694
  "learning_rate": 5.639097744360902e-06,
695
- "loss": 0.4251,
696
  "step": 930
697
  },
698
  {
699
  "epoch": 4.53,
700
- "grad_norm": 4.121368885040283,
701
  "learning_rate": 5.102040816326531e-06,
702
- "loss": 0.4341,
703
  "step": 940
704
  },
705
  {
706
  "epoch": 4.58,
707
- "grad_norm": 4.274528980255127,
708
  "learning_rate": 4.564983888292159e-06,
709
- "loss": 0.3847,
710
  "step": 950
711
  },
712
  {
713
  "epoch": 4.63,
714
- "grad_norm": 5.41162109375,
715
  "learning_rate": 4.027926960257788e-06,
716
- "loss": 0.3867,
717
  "step": 960
718
  },
719
  {
720
  "epoch": 4.67,
721
- "grad_norm": 5.056694030761719,
722
  "learning_rate": 3.490870032223416e-06,
723
- "loss": 0.4833,
724
  "step": 970
725
  },
726
  {
727
  "epoch": 4.72,
728
- "grad_norm": 4.149148941040039,
729
  "learning_rate": 2.9538131041890443e-06,
730
- "loss": 0.3655,
731
  "step": 980
732
  },
733
  {
734
  "epoch": 4.77,
735
- "grad_norm": 4.390629291534424,
736
  "learning_rate": 2.4167561761546726e-06,
737
- "loss": 0.4394,
738
  "step": 990
739
  },
740
  {
741
  "epoch": 4.82,
742
- "grad_norm": 2.702885389328003,
743
  "learning_rate": 1.8796992481203007e-06,
744
- "loss": 0.4119,
745
  "step": 1000
746
  },
747
  {
748
  "epoch": 4.87,
749
- "grad_norm": 4.201266288757324,
750
  "learning_rate": 1.3426423200859292e-06,
751
- "loss": 0.431,
752
  "step": 1010
753
  },
754
  {
755
  "epoch": 4.92,
756
- "grad_norm": 2.323716640472412,
757
  "learning_rate": 8.055853920515575e-07,
758
- "loss": 0.3856,
759
  "step": 1020
760
  },
761
  {
762
  "epoch": 4.96,
763
- "grad_norm": 3.6192519664764404,
764
  "learning_rate": 2.6852846401718585e-07,
765
- "loss": 0.4905,
766
  "step": 1030
767
  },
768
  {
769
  "epoch": 4.99,
770
- "eval_accuracy": 0.6643894107600341,
771
- "eval_loss": 1.0087940692901611,
772
- "eval_runtime": 392.6338,
773
- "eval_samples_per_second": 2.982,
774
- "eval_steps_per_second": 0.188,
775
  "step": 1035
776
  },
777
  {
778
  "epoch": 4.99,
779
  "step": 1035,
780
  "total_flos": 2.5651227911307264e+18,
781
- "train_loss": 0.9699792993241462,
782
- "train_runtime": 35269.8788,
783
- "train_samples_per_second": 0.941,
784
- "train_steps_per_second": 0.029
785
  }
786
  ],
787
  "logging_steps": 10,
 
1
  {
2
+ "best_metric": 0.6857386848847139,
3
  "best_model_checkpoint": "vet-sm/checkpoint-1035",
4
  "epoch": 4.9879518072289155,
5
  "eval_steps": 500,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.05,
13
+ "grad_norm": 1.454635500907898,
14
  "learning_rate": 4.807692307692308e-06,
15
+ "loss": 2.0699,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.1,
20
+ "grad_norm": 1.6190325021743774,
21
  "learning_rate": 9.615384615384616e-06,
22
+ "loss": 2.0402,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.14,
27
+ "grad_norm": 1.361482858657837,
28
  "learning_rate": 1.4423076923076923e-05,
29
+ "loss": 1.9857,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.19,
34
+ "grad_norm": 1.3599398136138916,
35
  "learning_rate": 1.923076923076923e-05,
36
+ "loss": 1.9319,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.24,
41
+ "grad_norm": 1.6369996070861816,
42
  "learning_rate": 2.4038461538461542e-05,
43
+ "loss": 1.8814,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.29,
48
+ "grad_norm": 1.192236065864563,
49
  "learning_rate": 2.8846153846153845e-05,
50
+ "loss": 1.8459,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.34,
55
+ "grad_norm": 1.7909432649612427,
56
  "learning_rate": 3.365384615384616e-05,
57
+ "loss": 1.8128,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.39,
62
+ "grad_norm": 1.8163617849349976,
63
  "learning_rate": 3.846153846153846e-05,
64
+ "loss": 1.7424,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.43,
69
+ "grad_norm": 2.151453733444214,
70
  "learning_rate": 4.326923076923077e-05,
71
+ "loss": 1.6964,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.48,
76
+ "grad_norm": 1.8711392879486084,
77
  "learning_rate": 4.8076923076923084e-05,
78
+ "loss": 1.6367,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.53,
83
+ "grad_norm": 1.7750871181488037,
84
  "learning_rate": 4.967776584317938e-05,
85
+ "loss": 1.6045,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.58,
90
+ "grad_norm": 1.7674392461776733,
91
  "learning_rate": 4.9140708915145005e-05,
92
+ "loss": 1.626,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.63,
97
+ "grad_norm": 1.8520715236663818,
98
  "learning_rate": 4.860365198711064e-05,
99
+ "loss": 1.5904,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.67,
104
+ "grad_norm": 2.2221274375915527,
105
  "learning_rate": 4.806659505907626e-05,
106
+ "loss": 1.5845,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.72,
111
+ "grad_norm": 2.5263822078704834,
112
  "learning_rate": 4.7529538131041896e-05,
113
+ "loss": 1.5236,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.77,
118
+ "grad_norm": 3.3485989570617676,
119
  "learning_rate": 4.699248120300752e-05,
120
+ "loss": 1.4698,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.82,
125
+ "grad_norm": 2.506840944290161,
126
  "learning_rate": 4.645542427497315e-05,
127
+ "loss": 1.4167,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.87,
132
+ "grad_norm": 3.2644572257995605,
133
  "learning_rate": 4.591836734693878e-05,
134
+ "loss": 1.4426,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.92,
139
+ "grad_norm": 2.8360097408294678,
140
  "learning_rate": 4.5381310418904406e-05,
141
+ "loss": 1.3924,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.96,
146
+ "grad_norm": 2.6382710933685303,
147
  "learning_rate": 4.484425349087004e-05,
148
+ "loss": 1.3437,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 1.0,
153
+ "eval_accuracy": 0.5456874466268147,
154
+ "eval_loss": 1.3443350791931152,
155
+ "eval_runtime": 402.2174,
156
+ "eval_samples_per_second": 2.911,
157
  "eval_steps_per_second": 0.184,
158
  "step": 207
159
  },
160
  {
161
  "epoch": 1.01,
162
+ "grad_norm": 3.3896210193634033,
163
  "learning_rate": 4.4307196562835664e-05,
164
+ "loss": 1.413,
165
  "step": 210
166
  },
167
  {
168
  "epoch": 1.06,
169
+ "grad_norm": 6.438076972961426,
170
  "learning_rate": 4.3770139634801297e-05,
171
+ "loss": 1.3462,
172
  "step": 220
173
  },
174
  {
175
  "epoch": 1.11,
176
+ "grad_norm": 3.4236090183258057,
177
  "learning_rate": 4.323308270676692e-05,
178
+ "loss": 1.2821,
179
  "step": 230
180
  },
181
  {
182
  "epoch": 1.16,
183
+ "grad_norm": 1.8709990978240967,
184
  "learning_rate": 4.269602577873255e-05,
185
+ "loss": 1.2731,
186
  "step": 240
187
  },
188
  {
189
  "epoch": 1.2,
190
+ "grad_norm": 2.8092925548553467,
191
  "learning_rate": 4.215896885069818e-05,
192
+ "loss": 1.2207,
193
  "step": 250
194
  },
195
  {
196
  "epoch": 1.25,
197
+ "grad_norm": 3.147840976715088,
198
  "learning_rate": 4.1621911922663806e-05,
199
+ "loss": 1.2131,
200
  "step": 260
201
  },
202
  {
203
  "epoch": 1.3,
204
+ "grad_norm": 3.0452332496643066,
205
  "learning_rate": 4.108485499462943e-05,
206
+ "loss": 1.1956,
207
  "step": 270
208
  },
209
  {
210
  "epoch": 1.35,
211
+ "grad_norm": 3.2367162704467773,
212
  "learning_rate": 4.054779806659506e-05,
213
+ "loss": 1.1639,
214
  "step": 280
215
  },
216
  {
217
  "epoch": 1.4,
218
+ "grad_norm": 3.8542873859405518,
219
  "learning_rate": 4.0010741138560684e-05,
220
+ "loss": 1.1577,
221
  "step": 290
222
  },
223
  {
224
  "epoch": 1.45,
225
+ "grad_norm": 2.901261568069458,
226
  "learning_rate": 3.9473684210526316e-05,
227
+ "loss": 1.1535,
228
  "step": 300
229
  },
230
  {
231
  "epoch": 1.49,
232
+ "grad_norm": 4.736692905426025,
233
  "learning_rate": 3.893662728249194e-05,
234
+ "loss": 1.1863,
235
  "step": 310
236
  },
237
  {
238
  "epoch": 1.54,
239
+ "grad_norm": 2.820009708404541,
240
  "learning_rate": 3.8399570354457575e-05,
241
+ "loss": 1.2104,
242
  "step": 320
243
  },
244
  {
245
  "epoch": 1.59,
246
+ "grad_norm": 4.279903888702393,
247
  "learning_rate": 3.78625134264232e-05,
248
+ "loss": 1.162,
249
  "step": 330
250
  },
251
  {
252
  "epoch": 1.64,
253
+ "grad_norm": 4.341091156005859,
254
  "learning_rate": 3.732545649838883e-05,
255
+ "loss": 1.1758,
256
  "step": 340
257
  },
258
  {
259
  "epoch": 1.69,
260
+ "grad_norm": 3.7722089290618896,
261
  "learning_rate": 3.678839957035446e-05,
262
+ "loss": 1.153,
263
  "step": 350
264
  },
265
  {
266
  "epoch": 1.73,
267
+ "grad_norm": 2.462113857269287,
268
  "learning_rate": 3.6251342642320084e-05,
269
+ "loss": 1.1653,
270
  "step": 360
271
  },
272
  {
273
  "epoch": 1.78,
274
+ "grad_norm": 3.2860846519470215,
275
  "learning_rate": 3.571428571428572e-05,
276
+ "loss": 1.0895,
277
  "step": 370
278
  },
279
  {
280
  "epoch": 1.83,
281
+ "grad_norm": 4.360082626342773,
282
  "learning_rate": 3.517722878625134e-05,
283
+ "loss": 1.0896,
284
  "step": 380
285
  },
286
  {
287
  "epoch": 1.88,
288
+ "grad_norm": 3.952023506164551,
289
  "learning_rate": 3.4640171858216975e-05,
290
+ "loss": 1.1883,
291
  "step": 390
292
  },
293
  {
294
  "epoch": 1.93,
295
+ "grad_norm": 4.6793742179870605,
296
  "learning_rate": 3.41031149301826e-05,
297
+ "loss": 1.0039,
298
  "step": 400
299
  },
300
  {
301
  "epoch": 1.98,
302
+ "grad_norm": 5.783022403717041,
303
  "learning_rate": 3.3566058002148234e-05,
304
+ "loss": 1.0892,
305
  "step": 410
306
  },
307
  {
308
  "epoch": 2.0,
309
+ "eval_accuracy": 0.627668659265585,
310
+ "eval_loss": 1.0833462476730347,
311
+ "eval_runtime": 386.7346,
312
+ "eval_samples_per_second": 3.028,
313
+ "eval_steps_per_second": 0.191,
314
  "step": 415
315
  },
316
  {
317
  "epoch": 2.02,
318
+ "grad_norm": 2.7940785884857178,
319
  "learning_rate": 3.302900107411386e-05,
320
+ "loss": 1.0866,
321
  "step": 420
322
  },
323
  {
324
  "epoch": 2.07,
325
+ "grad_norm": 2.254930257797241,
326
  "learning_rate": 3.2491944146079485e-05,
327
+ "loss": 0.9175,
328
  "step": 430
329
  },
330
  {
331
  "epoch": 2.12,
332
+ "grad_norm": 4.631977558135986,
333
  "learning_rate": 3.195488721804512e-05,
334
+ "loss": 0.8898,
335
  "step": 440
336
  },
337
  {
338
  "epoch": 2.17,
339
+ "grad_norm": 4.229618549346924,
340
  "learning_rate": 3.1417830290010743e-05,
341
+ "loss": 0.9022,
342
  "step": 450
343
  },
344
  {
345
  "epoch": 2.22,
346
+ "grad_norm": 2.8936877250671387,
347
  "learning_rate": 3.0880773361976376e-05,
348
+ "loss": 0.902,
349
  "step": 460
350
  },
351
  {
352
  "epoch": 2.27,
353
+ "grad_norm": 3.5928826332092285,
354
  "learning_rate": 3.0343716433942e-05,
355
+ "loss": 0.8681,
356
  "step": 470
357
  },
358
  {
359
  "epoch": 2.31,
360
+ "grad_norm": 3.8170666694641113,
361
  "learning_rate": 2.980665950590763e-05,
362
+ "loss": 0.7539,
363
  "step": 480
364
  },
365
  {
366
  "epoch": 2.36,
367
+ "grad_norm": 4.207719326019287,
368
  "learning_rate": 2.9269602577873257e-05,
369
+ "loss": 0.9404,
370
  "step": 490
371
  },
372
  {
373
  "epoch": 2.41,
374
+ "grad_norm": 5.372262001037598,
375
  "learning_rate": 2.8732545649838882e-05,
376
+ "loss": 0.8323,
377
  "step": 500
378
  },
379
  {
380
  "epoch": 2.46,
381
+ "grad_norm": 4.851659297943115,
382
  "learning_rate": 2.8195488721804515e-05,
383
+ "loss": 0.8612,
384
  "step": 510
385
  },
386
  {
387
  "epoch": 2.51,
388
+ "grad_norm": 4.7718682289123535,
389
  "learning_rate": 2.765843179377014e-05,
390
+ "loss": 0.8427,
391
  "step": 520
392
  },
393
  {
394
  "epoch": 2.55,
395
+ "grad_norm": 5.901302337646484,
396
  "learning_rate": 2.712137486573577e-05,
397
+ "loss": 0.8489,
398
  "step": 530
399
  },
400
  {
401
  "epoch": 2.6,
402
+ "grad_norm": 5.132293224334717,
403
  "learning_rate": 2.6584317937701396e-05,
404
+ "loss": 0.8522,
405
  "step": 540
406
  },
407
  {
408
  "epoch": 2.65,
409
+ "grad_norm": 4.27319860458374,
410
  "learning_rate": 2.6047261009667025e-05,
411
+ "loss": 0.8314,
412
  "step": 550
413
  },
414
  {
415
  "epoch": 2.7,
416
+ "grad_norm": 5.610922336578369,
417
  "learning_rate": 2.5510204081632654e-05,
418
+ "loss": 0.8804,
419
  "step": 560
420
  },
421
  {
422
  "epoch": 2.75,
423
+ "grad_norm": 2.8495535850524902,
424
  "learning_rate": 2.4973147153598283e-05,
425
+ "loss": 0.7591,
426
  "step": 570
427
  },
428
  {
429
  "epoch": 2.8,
430
+ "grad_norm": 4.183429718017578,
431
  "learning_rate": 2.443609022556391e-05,
432
+ "loss": 0.8702,
433
  "step": 580
434
  },
435
  {
436
  "epoch": 2.84,
437
+ "grad_norm": 5.251755237579346,
438
  "learning_rate": 2.3899033297529538e-05,
439
+ "loss": 0.8275,
440
  "step": 590
441
  },
442
  {
443
  "epoch": 2.89,
444
+ "grad_norm": 4.696440696716309,
445
  "learning_rate": 2.3361976369495167e-05,
446
+ "loss": 0.8131,
447
  "step": 600
448
  },
449
  {
450
  "epoch": 2.94,
451
+ "grad_norm": 3.233407735824585,
452
  "learning_rate": 2.2824919441460796e-05,
453
+ "loss": 0.7995,
454
  "step": 610
455
  },
456
  {
457
  "epoch": 2.99,
458
+ "grad_norm": 6.206390857696533,
459
  "learning_rate": 2.2287862513426426e-05,
460
+ "loss": 0.883,
461
  "step": 620
462
  },
463
  {
464
  "epoch": 3.0,
465
+ "eval_accuracy": 0.6567036720751495,
466
+ "eval_loss": 0.99440997838974,
467
+ "eval_runtime": 391.9131,
468
+ "eval_samples_per_second": 2.988,
469
+ "eval_steps_per_second": 0.189,
470
  "step": 622
471
  },
472
  {
473
  "epoch": 3.04,
474
+ "grad_norm": 3.8997249603271484,
475
  "learning_rate": 2.1750805585392055e-05,
476
+ "loss": 0.7696,
477
  "step": 630
478
  },
479
  {
480
  "epoch": 3.08,
481
+ "grad_norm": 3.5104353427886963,
482
  "learning_rate": 2.121374865735768e-05,
483
+ "loss": 0.6644,
484
  "step": 640
485
  },
486
  {
487
  "epoch": 3.13,
488
+ "grad_norm": 4.623937129974365,
489
  "learning_rate": 2.067669172932331e-05,
490
+ "loss": 0.7019,
491
  "step": 650
492
  },
493
  {
494
  "epoch": 3.18,
495
+ "grad_norm": 3.191859722137451,
496
  "learning_rate": 2.0139634801288935e-05,
497
+ "loss": 0.6541,
498
  "step": 660
499
  },
500
  {
501
  "epoch": 3.23,
502
+ "grad_norm": 3.7715423107147217,
503
  "learning_rate": 1.9602577873254565e-05,
504
+ "loss": 0.5893,
505
  "step": 670
506
  },
507
  {
508
  "epoch": 3.28,
509
+ "grad_norm": 3.1906309127807617,
510
  "learning_rate": 1.9065520945220194e-05,
511
+ "loss": 0.5868,
512
  "step": 680
513
  },
514
  {
515
  "epoch": 3.33,
516
+ "grad_norm": 6.5846099853515625,
517
  "learning_rate": 1.8528464017185823e-05,
518
+ "loss": 0.6708,
519
  "step": 690
520
  },
521
  {
522
  "epoch": 3.37,
523
+ "grad_norm": 3.970404863357544,
524
  "learning_rate": 1.7991407089151452e-05,
525
+ "loss": 0.6117,
526
  "step": 700
527
  },
528
  {
529
  "epoch": 3.42,
530
+ "grad_norm": 4.3821845054626465,
531
  "learning_rate": 1.7454350161117078e-05,
532
+ "loss": 0.6104,
533
  "step": 710
534
  },
535
  {
536
  "epoch": 3.47,
537
+ "grad_norm": 5.36595344543457,
538
  "learning_rate": 1.6917293233082707e-05,
539
+ "loss": 0.5853,
540
  "step": 720
541
  },
542
  {
543
  "epoch": 3.52,
544
+ "grad_norm": 4.7466301918029785,
545
  "learning_rate": 1.6380236305048336e-05,
546
+ "loss": 0.5884,
547
  "step": 730
548
  },
549
  {
550
  "epoch": 3.57,
551
+ "grad_norm": 3.9577629566192627,
552
  "learning_rate": 1.5843179377013965e-05,
553
+ "loss": 0.5961,
554
  "step": 740
555
  },
556
  {
557
  "epoch": 3.61,
558
+ "grad_norm": 5.096789360046387,
559
  "learning_rate": 1.5306122448979594e-05,
560
+ "loss": 0.6155,
561
  "step": 750
562
  },
563
  {
564
  "epoch": 3.66,
565
+ "grad_norm": 6.176750183105469,
566
  "learning_rate": 1.4769065520945222e-05,
567
+ "loss": 0.5,
568
  "step": 760
569
  },
570
  {
571
  "epoch": 3.71,
572
+ "grad_norm": 3.7734062671661377,
573
  "learning_rate": 1.4232008592910851e-05,
574
+ "loss": 0.5895,
575
  "step": 770
576
  },
577
  {
578
  "epoch": 3.76,
579
+ "grad_norm": 3.7732903957366943,
580
  "learning_rate": 1.3694951664876477e-05,
581
+ "loss": 0.5221,
582
  "step": 780
583
  },
584
  {
585
  "epoch": 3.81,
586
+ "grad_norm": 5.260674953460693,
587
  "learning_rate": 1.3157894736842106e-05,
588
+ "loss": 0.6303,
589
  "step": 790
590
  },
591
  {
592
  "epoch": 3.86,
593
+ "grad_norm": 4.469279766082764,
594
  "learning_rate": 1.2620837808807733e-05,
595
+ "loss": 0.6151,
596
  "step": 800
597
  },
598
  {
599
  "epoch": 3.9,
600
+ "grad_norm": 4.325742244720459,
601
  "learning_rate": 1.2083780880773363e-05,
602
+ "loss": 0.5958,
603
  "step": 810
604
  },
605
  {
606
  "epoch": 3.95,
607
+ "grad_norm": 3.5742740631103516,
608
  "learning_rate": 1.1546723952738992e-05,
609
+ "loss": 0.5177,
610
  "step": 820
611
  },
612
  {
613
  "epoch": 4.0,
614
+ "grad_norm": 6.369570732116699,
615
  "learning_rate": 1.100966702470462e-05,
616
+ "loss": 0.5199,
617
  "step": 830
618
  },
619
  {
620
  "epoch": 4.0,
621
+ "eval_accuracy": 0.6754910333048676,
622
+ "eval_loss": 0.9295239448547363,
623
+ "eval_runtime": 398.874,
624
+ "eval_samples_per_second": 2.936,
625
  "eval_steps_per_second": 0.186,
626
  "step": 830
627
  },
628
  {
629
  "epoch": 4.05,
630
+ "grad_norm": 4.601716041564941,
631
  "learning_rate": 1.0472610096670248e-05,
632
+ "loss": 0.4864,
633
  "step": 840
634
  },
635
  {
636
  "epoch": 4.1,
637
+ "grad_norm": 2.6591596603393555,
638
  "learning_rate": 9.935553168635876e-06,
639
+ "loss": 0.4057,
640
  "step": 850
641
  },
642
  {
643
  "epoch": 4.14,
644
+ "grad_norm": 4.309298038482666,
645
  "learning_rate": 9.398496240601503e-06,
646
+ "loss": 0.4228,
647
  "step": 860
648
  },
649
  {
650
  "epoch": 4.19,
651
+ "grad_norm": 4.057296276092529,
652
  "learning_rate": 8.861439312567132e-06,
653
+ "loss": 0.4299,
654
  "step": 870
655
  },
656
  {
657
  "epoch": 4.24,
658
+ "grad_norm": 3.2583279609680176,
659
  "learning_rate": 8.324382384532762e-06,
660
+ "loss": 0.4938,
661
  "step": 880
662
  },
663
  {
664
  "epoch": 4.29,
665
+ "grad_norm": 4.6558451652526855,
666
  "learning_rate": 7.787325456498389e-06,
667
+ "loss": 0.4269,
668
  "step": 890
669
  },
670
  {
671
  "epoch": 4.34,
672
+ "grad_norm": 3.8659329414367676,
673
  "learning_rate": 7.250268528464017e-06,
674
+ "loss": 0.4189,
675
  "step": 900
676
  },
677
  {
678
  "epoch": 4.39,
679
+ "grad_norm": 2.4085872173309326,
680
  "learning_rate": 6.713211600429646e-06,
681
+ "loss": 0.4376,
682
  "step": 910
683
  },
684
  {
685
  "epoch": 4.43,
686
+ "grad_norm": 4.497287750244141,
687
  "learning_rate": 6.176154672395274e-06,
688
+ "loss": 0.4347,
689
  "step": 920
690
  },
691
  {
692
  "epoch": 4.48,
693
+ "grad_norm": 5.4745259284973145,
694
  "learning_rate": 5.639097744360902e-06,
695
+ "loss": 0.3766,
696
  "step": 930
697
  },
698
  {
699
  "epoch": 4.53,
700
+ "grad_norm": 3.8638925552368164,
701
  "learning_rate": 5.102040816326531e-06,
702
+ "loss": 0.4002,
703
  "step": 940
704
  },
705
  {
706
  "epoch": 4.58,
707
+ "grad_norm": 2.668515682220459,
708
  "learning_rate": 4.564983888292159e-06,
709
+ "loss": 0.4154,
710
  "step": 950
711
  },
712
  {
713
  "epoch": 4.63,
714
+ "grad_norm": 2.7704601287841797,
715
  "learning_rate": 4.027926960257788e-06,
716
+ "loss": 0.377,
717
  "step": 960
718
  },
719
  {
720
  "epoch": 4.67,
721
+ "grad_norm": 4.1865949630737305,
722
  "learning_rate": 3.490870032223416e-06,
723
+ "loss": 0.375,
724
  "step": 970
725
  },
726
  {
727
  "epoch": 4.72,
728
+ "grad_norm": 4.669921398162842,
729
  "learning_rate": 2.9538131041890443e-06,
730
+ "loss": 0.3799,
731
  "step": 980
732
  },
733
  {
734
  "epoch": 4.77,
735
+ "grad_norm": 4.289856433868408,
736
  "learning_rate": 2.4167561761546726e-06,
737
+ "loss": 0.3652,
738
  "step": 990
739
  },
740
  {
741
  "epoch": 4.82,
742
+ "grad_norm": 2.9988179206848145,
743
  "learning_rate": 1.8796992481203007e-06,
744
+ "loss": 0.389,
745
  "step": 1000
746
  },
747
  {
748
  "epoch": 4.87,
749
+ "grad_norm": 1.7334405183792114,
750
  "learning_rate": 1.3426423200859292e-06,
751
+ "loss": 0.3715,
752
  "step": 1010
753
  },
754
  {
755
  "epoch": 4.92,
756
+ "grad_norm": 4.97418737411499,
757
  "learning_rate": 8.055853920515575e-07,
758
+ "loss": 0.4106,
759
  "step": 1020
760
  },
761
  {
762
  "epoch": 4.96,
763
+ "grad_norm": 6.089245319366455,
764
  "learning_rate": 2.6852846401718585e-07,
765
+ "loss": 0.4526,
766
  "step": 1030
767
  },
768
  {
769
  "epoch": 4.99,
770
+ "eval_accuracy": 0.6857386848847139,
771
+ "eval_loss": 0.9283789396286011,
772
+ "eval_runtime": 402.142,
773
+ "eval_samples_per_second": 2.912,
774
+ "eval_steps_per_second": 0.184,
775
  "step": 1035
776
  },
777
  {
778
  "epoch": 4.99,
779
  "step": 1035,
780
  "total_flos": 2.5651227911307264e+18,
781
+ "train_loss": 0.9461189188243111,
782
+ "train_runtime": 36633.1283,
783
+ "train_samples_per_second": 0.906,
784
+ "train_steps_per_second": 0.028
785
  }
786
  ],
787
  "logging_steps": 10,