Encore02 commited on
Commit
b2c0689
·
verified ·
1 Parent(s): f00c9de

🍻 cheers

Browse files
README.md CHANGED
@@ -3,6 +3,7 @@ library_name: transformers
3
  license: apache-2.0
4
  base_model: google/vit-base-patch16-224-in21k
5
  tags:
 
6
  - generated_from_trainer
7
  datasets:
8
  - imagefolder
@@ -23,7 +24,7 @@ model-index:
23
  metrics:
24
  - name: Accuracy
25
  type: accuracy
26
- value: 0.9388489208633094
27
  ---
28
 
29
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -33,8 +34,8 @@ should probably proofread and complete it, then remove this comment. -->
33
 
34
  This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the imagefolder dataset.
35
  It achieves the following results on the evaluation set:
36
- - Loss: 0.3090
37
- - Accuracy: 0.9388
38
 
39
  ## Model description
40
 
 
3
  license: apache-2.0
4
  base_model: google/vit-base-patch16-224-in21k
5
  tags:
6
+ - image-classification
7
  - generated_from_trainer
8
  datasets:
9
  - imagefolder
 
24
  metrics:
25
  - name: Accuracy
26
  type: accuracy
27
+ value: 0.9172661870503597
28
  ---
29
 
30
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
34
 
35
  This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the imagefolder dataset.
36
  It achieves the following results on the evaluation set:
37
+ - Loss: 0.2732
38
+ - Accuracy: 0.9173
39
 
40
  ## Model description
41
 
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 15.0,
3
- "eval_accuracy": 0.9244604316546763,
4
- "eval_loss": 0.3162367641925812,
5
- "eval_runtime": 2.9069,
6
- "eval_samples_per_second": 95.635,
7
- "eval_steps_per_second": 12.04,
8
- "total_flos": 2.900189697360077e+18,
9
- "train_loss": 0.14551133991808146,
10
- "train_runtime": 927.2479,
11
- "train_samples_per_second": 40.361,
12
- "train_steps_per_second": 2.524
13
  }
 
1
  {
2
+ "epoch": 10.0,
3
+ "eval_accuracy": 0.9172661870503597,
4
+ "eval_loss": 0.2732398509979248,
5
+ "eval_runtime": 2.9789,
6
+ "eval_samples_per_second": 93.324,
7
+ "eval_steps_per_second": 11.749,
8
+ "total_flos": 1.9334597982400512e+18,
9
+ "train_loss": 0.21612570865485722,
10
+ "train_runtime": 723.2579,
11
+ "train_samples_per_second": 34.497,
12
+ "train_steps_per_second": 2.157
13
  }
data/events.out.tfevents.1730702860.22f267685eef.199.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7f1b4a0aa1f7876e53631c3fa0300fafe5ec6b86ac99ec7e730cddc1a34bad2
3
+ size 411
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 15.0,
3
- "eval_accuracy": 0.9244604316546763,
4
- "eval_loss": 0.3162367641925812,
5
- "eval_runtime": 2.9069,
6
- "eval_samples_per_second": 95.635,
7
- "eval_steps_per_second": 12.04
8
  }
 
1
  {
2
+ "epoch": 10.0,
3
+ "eval_accuracy": 0.9172661870503597,
4
+ "eval_loss": 0.2732398509979248,
5
+ "eval_runtime": 2.9789,
6
+ "eval_samples_per_second": 93.324,
7
+ "eval_steps_per_second": 11.749
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 15.0,
3
- "total_flos": 2.900189697360077e+18,
4
- "train_loss": 0.14551133991808146,
5
- "train_runtime": 927.2479,
6
- "train_samples_per_second": 40.361,
7
- "train_steps_per_second": 2.524
8
  }
 
1
  {
2
+ "epoch": 10.0,
3
+ "total_flos": 1.9334597982400512e+18,
4
+ "train_loss": 0.21612570865485722,
5
+ "train_runtime": 723.2579,
6
+ "train_samples_per_second": 34.497,
7
+ "train_steps_per_second": 2.157
8
  }
trainer_state.json CHANGED
@@ -1,1872 +1,1254 @@
1
  {
2
- "best_metric": 0.3162367641925812,
3
- "best_model_checkpoint": "vit-weldclassifyv4/checkpoint-1000",
4
- "epoch": 15.0,
5
  "eval_steps": 100,
6
- "global_step": 2340,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0641025641025641,
13
- "grad_norm": 1.9132781028747559,
14
- "learning_rate": 0.00019914529914529915,
15
- "loss": 1.2054,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.1282051282051282,
20
- "grad_norm": 1.633124589920044,
21
- "learning_rate": 0.0001982905982905983,
22
- "loss": 1.0748,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.19230769230769232,
27
- "grad_norm": 2.4126267433166504,
28
- "learning_rate": 0.00019743589743589744,
29
- "loss": 1.0973,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.2564102564102564,
34
- "grad_norm": 3.881457567214966,
35
- "learning_rate": 0.00019658119658119659,
36
- "loss": 1.0609,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.32051282051282054,
41
- "grad_norm": 3.1995434761047363,
42
- "learning_rate": 0.00019572649572649573,
43
- "loss": 1.0024,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.38461538461538464,
48
- "grad_norm": 2.410505533218384,
49
- "learning_rate": 0.00019487179487179487,
50
- "loss": 0.8658,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.44871794871794873,
55
- "grad_norm": 2.05910325050354,
56
- "learning_rate": 0.00019401709401709402,
57
- "loss": 0.9616,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.5128205128205128,
62
- "grad_norm": 4.032101154327393,
63
- "learning_rate": 0.00019316239316239316,
64
- "loss": 0.9391,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.5769230769230769,
69
- "grad_norm": 2.779008150100708,
70
- "learning_rate": 0.00019230769230769233,
71
- "loss": 0.9206,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.6410256410256411,
76
- "grad_norm": 2.771672010421753,
77
- "learning_rate": 0.00019145299145299148,
78
- "loss": 0.8146,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.6410256410256411,
83
- "eval_accuracy": 0.6834532374100719,
84
- "eval_loss": 0.7348725199699402,
85
- "eval_runtime": 2.6163,
86
- "eval_samples_per_second": 106.259,
87
- "eval_steps_per_second": 13.378,
88
  "step": 100
89
  },
90
  {
91
  "epoch": 0.7051282051282052,
92
- "grad_norm": 2.4781739711761475,
93
- "learning_rate": 0.0001905982905982906,
94
- "loss": 0.7542,
95
  "step": 110
96
  },
97
  {
98
  "epoch": 0.7692307692307693,
99
- "grad_norm": 2.8731400966644287,
100
- "learning_rate": 0.00018974358974358974,
101
- "loss": 0.7726,
102
  "step": 120
103
  },
104
  {
105
  "epoch": 0.8333333333333334,
106
- "grad_norm": 2.480015277862549,
107
- "learning_rate": 0.00018888888888888888,
108
- "loss": 0.6705,
109
  "step": 130
110
  },
111
  {
112
  "epoch": 0.8974358974358975,
113
- "grad_norm": 3.7568867206573486,
114
- "learning_rate": 0.00018803418803418803,
115
- "loss": 0.6004,
116
  "step": 140
117
  },
118
  {
119
  "epoch": 0.9615384615384616,
120
- "grad_norm": 2.720820903778076,
121
- "learning_rate": 0.0001871794871794872,
122
- "loss": 0.8144,
123
  "step": 150
124
  },
125
  {
126
  "epoch": 1.0256410256410255,
127
- "grad_norm": 1.9295154809951782,
128
- "learning_rate": 0.00018632478632478634,
129
- "loss": 0.6479,
130
  "step": 160
131
  },
132
  {
133
  "epoch": 1.0897435897435896,
134
- "grad_norm": 3.0400049686431885,
135
- "learning_rate": 0.0001854700854700855,
136
- "loss": 0.522,
137
  "step": 170
138
  },
139
  {
140
  "epoch": 1.1538461538461537,
141
- "grad_norm": 3.7371773719787598,
142
- "learning_rate": 0.00018461538461538463,
143
- "loss": 0.5998,
144
  "step": 180
145
  },
146
  {
147
  "epoch": 1.217948717948718,
148
- "grad_norm": 2.992065668106079,
149
- "learning_rate": 0.00018376068376068375,
150
- "loss": 0.6268,
151
  "step": 190
152
  },
153
  {
154
  "epoch": 1.282051282051282,
155
- "grad_norm": 2.213074207305908,
156
- "learning_rate": 0.00018290598290598292,
157
- "loss": 0.6048,
158
  "step": 200
159
  },
160
  {
161
  "epoch": 1.282051282051282,
162
- "eval_accuracy": 0.697841726618705,
163
- "eval_loss": 0.6820898056030273,
164
- "eval_runtime": 2.7768,
165
- "eval_samples_per_second": 100.114,
166
- "eval_steps_per_second": 12.604,
167
  "step": 200
168
  },
169
  {
170
  "epoch": 1.3461538461538463,
171
- "grad_norm": 3.2302353382110596,
172
- "learning_rate": 0.00018205128205128207,
173
- "loss": 0.6054,
174
  "step": 210
175
  },
176
  {
177
  "epoch": 1.4102564102564101,
178
- "grad_norm": 3.9419608116149902,
179
- "learning_rate": 0.0001811965811965812,
180
- "loss": 0.5863,
181
  "step": 220
182
  },
183
  {
184
  "epoch": 1.4743589743589745,
185
- "grad_norm": 2.5351428985595703,
186
- "learning_rate": 0.00018034188034188035,
187
- "loss": 0.4328,
188
  "step": 230
189
  },
190
  {
191
  "epoch": 1.5384615384615383,
192
- "grad_norm": 2.677548885345459,
193
- "learning_rate": 0.0001794871794871795,
194
- "loss": 0.4744,
195
  "step": 240
196
  },
197
  {
198
  "epoch": 1.6025641025641026,
199
- "grad_norm": 2.3627212047576904,
200
- "learning_rate": 0.00017863247863247864,
201
- "loss": 0.3523,
202
  "step": 250
203
  },
204
  {
205
  "epoch": 1.6666666666666665,
206
- "grad_norm": 6.175805568695068,
207
- "learning_rate": 0.00017777777777777779,
208
- "loss": 0.5934,
209
  "step": 260
210
  },
211
  {
212
  "epoch": 1.7307692307692308,
213
- "grad_norm": 2.920872211456299,
214
- "learning_rate": 0.00017692307692307693,
215
- "loss": 0.5758,
216
  "step": 270
217
  },
218
  {
219
  "epoch": 1.7948717948717947,
220
- "grad_norm": 2.1116819381713867,
221
- "learning_rate": 0.00017606837606837607,
222
- "loss": 0.5086,
223
  "step": 280
224
  },
225
  {
226
  "epoch": 1.858974358974359,
227
- "grad_norm": 3.4553894996643066,
228
- "learning_rate": 0.00017521367521367522,
229
- "loss": 0.4474,
230
  "step": 290
231
  },
232
  {
233
  "epoch": 1.9230769230769231,
234
- "grad_norm": 2.4064671993255615,
235
- "learning_rate": 0.00017435897435897436,
236
- "loss": 0.4796,
237
  "step": 300
238
  },
239
  {
240
  "epoch": 1.9230769230769231,
241
- "eval_accuracy": 0.8129496402877698,
242
- "eval_loss": 0.48327746987342834,
243
- "eval_runtime": 2.565,
244
- "eval_samples_per_second": 108.383,
245
- "eval_steps_per_second": 13.645,
246
  "step": 300
247
  },
248
  {
249
  "epoch": 1.9871794871794872,
250
- "grad_norm": 3.8495571613311768,
251
- "learning_rate": 0.0001735042735042735,
252
- "loss": 0.4925,
253
  "step": 310
254
  },
255
  {
256
  "epoch": 2.051282051282051,
257
- "grad_norm": 2.036381244659424,
258
- "learning_rate": 0.00017264957264957268,
259
- "loss": 0.3936,
260
  "step": 320
261
  },
262
  {
263
  "epoch": 2.1153846153846154,
264
- "grad_norm": 3.381953001022339,
265
- "learning_rate": 0.0001717948717948718,
266
- "loss": 0.3416,
267
  "step": 330
268
  },
269
  {
270
  "epoch": 2.1794871794871793,
271
- "grad_norm": 5.715399265289307,
272
- "learning_rate": 0.00017094017094017094,
273
- "loss": 0.4081,
274
  "step": 340
275
  },
276
  {
277
  "epoch": 2.2435897435897436,
278
- "grad_norm": 2.237466335296631,
279
- "learning_rate": 0.00017008547008547008,
280
- "loss": 0.3313,
281
  "step": 350
282
  },
283
  {
284
  "epoch": 2.3076923076923075,
285
- "grad_norm": 5.658877372741699,
286
- "learning_rate": 0.00016923076923076923,
287
- "loss": 0.3926,
288
  "step": 360
289
  },
290
  {
291
  "epoch": 2.371794871794872,
292
- "grad_norm": 3.633448362350464,
293
- "learning_rate": 0.00016837606837606837,
294
- "loss": 0.4664,
295
  "step": 370
296
  },
297
  {
298
  "epoch": 2.435897435897436,
299
- "grad_norm": 4.460226058959961,
300
- "learning_rate": 0.00016752136752136754,
301
- "loss": 0.4206,
302
  "step": 380
303
  },
304
  {
305
  "epoch": 2.5,
306
- "grad_norm": 1.1033204793930054,
307
- "learning_rate": 0.0001666666666666667,
308
- "loss": 0.2678,
309
  "step": 390
310
  },
311
  {
312
  "epoch": 2.564102564102564,
313
- "grad_norm": 6.69362735748291,
314
- "learning_rate": 0.00016581196581196583,
315
- "loss": 0.4532,
316
  "step": 400
317
  },
318
  {
319
  "epoch": 2.564102564102564,
320
- "eval_accuracy": 0.802158273381295,
321
- "eval_loss": 0.5380275249481201,
322
- "eval_runtime": 3.354,
323
- "eval_samples_per_second": 82.885,
324
- "eval_steps_per_second": 10.435,
325
  "step": 400
326
  },
327
  {
328
  "epoch": 2.628205128205128,
329
- "grad_norm": 2.1671810150146484,
330
- "learning_rate": 0.00016495726495726495,
331
- "loss": 0.2966,
332
  "step": 410
333
  },
334
  {
335
  "epoch": 2.6923076923076925,
336
- "grad_norm": 4.671816825866699,
337
- "learning_rate": 0.0001641025641025641,
338
- "loss": 0.3559,
339
  "step": 420
340
  },
341
  {
342
  "epoch": 2.7564102564102564,
343
- "grad_norm": 2.6795239448547363,
344
- "learning_rate": 0.00016324786324786327,
345
- "loss": 0.2111,
346
  "step": 430
347
  },
348
  {
349
  "epoch": 2.8205128205128203,
350
- "grad_norm": 1.8368570804595947,
351
- "learning_rate": 0.0001623931623931624,
352
- "loss": 0.2227,
353
  "step": 440
354
  },
355
  {
356
  "epoch": 2.8846153846153846,
357
- "grad_norm": 0.7519993185997009,
358
- "learning_rate": 0.00016153846153846155,
359
- "loss": 0.2185,
360
  "step": 450
361
  },
362
  {
363
  "epoch": 2.948717948717949,
364
- "grad_norm": 4.014621734619141,
365
- "learning_rate": 0.0001606837606837607,
366
- "loss": 0.2038,
367
  "step": 460
368
  },
369
  {
370
  "epoch": 3.0128205128205128,
371
- "grad_norm": 2.412414073944092,
372
- "learning_rate": 0.00015982905982905984,
373
- "loss": 0.1874,
374
  "step": 470
375
  },
376
  {
377
  "epoch": 3.076923076923077,
378
- "grad_norm": 3.7715134620666504,
379
- "learning_rate": 0.00015897435897435896,
380
- "loss": 0.1546,
381
  "step": 480
382
  },
383
  {
384
  "epoch": 3.141025641025641,
385
- "grad_norm": 1.5307694673538208,
386
- "learning_rate": 0.00015811965811965813,
387
- "loss": 0.1115,
388
  "step": 490
389
  },
390
  {
391
  "epoch": 3.2051282051282053,
392
- "grad_norm": 2.7572405338287354,
393
- "learning_rate": 0.00015726495726495727,
394
- "loss": 0.1242,
395
  "step": 500
396
  },
397
  {
398
  "epoch": 3.2051282051282053,
399
- "eval_accuracy": 0.8741007194244604,
400
- "eval_loss": 0.3899326026439667,
401
- "eval_runtime": 2.8072,
402
- "eval_samples_per_second": 99.033,
403
- "eval_steps_per_second": 12.468,
404
  "step": 500
405
  },
406
  {
407
  "epoch": 3.269230769230769,
408
- "grad_norm": 4.4104390144348145,
409
- "learning_rate": 0.00015641025641025642,
410
- "loss": 0.1986,
411
  "step": 510
412
  },
413
  {
414
  "epoch": 3.3333333333333335,
415
- "grad_norm": 0.8930767774581909,
416
- "learning_rate": 0.00015555555555555556,
417
- "loss": 0.0582,
418
  "step": 520
419
  },
420
  {
421
  "epoch": 3.3974358974358974,
422
- "grad_norm": 8.353619575500488,
423
- "learning_rate": 0.0001547008547008547,
424
- "loss": 0.2485,
425
  "step": 530
426
  },
427
  {
428
  "epoch": 3.4615384615384617,
429
- "grad_norm": 0.09837932884693146,
430
- "learning_rate": 0.00015384615384615385,
431
- "loss": 0.1007,
432
  "step": 540
433
  },
434
  {
435
  "epoch": 3.5256410256410255,
436
- "grad_norm": 3.90265154838562,
437
- "learning_rate": 0.000152991452991453,
438
- "loss": 0.2279,
439
  "step": 550
440
  },
441
  {
442
  "epoch": 3.58974358974359,
443
- "grad_norm": 6.65275764465332,
444
- "learning_rate": 0.00015213675213675214,
445
- "loss": 0.1781,
446
  "step": 560
447
  },
448
  {
449
  "epoch": 3.6538461538461537,
450
- "grad_norm": 3.493739604949951,
451
- "learning_rate": 0.00015128205128205128,
452
- "loss": 0.2098,
453
  "step": 570
454
  },
455
  {
456
  "epoch": 3.717948717948718,
457
- "grad_norm": 4.4887614250183105,
458
- "learning_rate": 0.00015042735042735043,
459
- "loss": 0.1899,
460
  "step": 580
461
  },
462
  {
463
  "epoch": 3.782051282051282,
464
- "grad_norm": 1.8387681245803833,
465
- "learning_rate": 0.00014957264957264957,
466
- "loss": 0.1831,
467
  "step": 590
468
  },
469
  {
470
  "epoch": 3.8461538461538463,
471
- "grad_norm": 5.91892671585083,
472
- "learning_rate": 0.00014871794871794872,
473
- "loss": 0.124,
474
  "step": 600
475
  },
476
  {
477
  "epoch": 3.8461538461538463,
478
- "eval_accuracy": 0.8273381294964028,
479
- "eval_loss": 0.523663341999054,
480
- "eval_runtime": 2.6143,
481
- "eval_samples_per_second": 106.339,
482
- "eval_steps_per_second": 13.388,
483
  "step": 600
484
  },
485
  {
486
  "epoch": 3.91025641025641,
487
- "grad_norm": 0.5099517703056335,
488
- "learning_rate": 0.0001478632478632479,
489
- "loss": 0.1566,
490
  "step": 610
491
  },
492
  {
493
  "epoch": 3.9743589743589745,
494
- "grad_norm": 0.4991530179977417,
495
- "learning_rate": 0.00014700854700854703,
496
- "loss": 0.1168,
497
  "step": 620
498
  },
499
  {
500
  "epoch": 4.038461538461538,
501
- "grad_norm": 4.448193550109863,
502
- "learning_rate": 0.00014615384615384615,
503
- "loss": 0.1044,
504
  "step": 630
505
  },
506
  {
507
  "epoch": 4.102564102564102,
508
- "grad_norm": 0.5183725357055664,
509
- "learning_rate": 0.0001452991452991453,
510
- "loss": 0.1078,
511
  "step": 640
512
  },
513
  {
514
  "epoch": 4.166666666666667,
515
- "grad_norm": 0.9525802135467529,
516
- "learning_rate": 0.00014444444444444444,
517
- "loss": 0.0967,
518
  "step": 650
519
  },
520
  {
521
  "epoch": 4.230769230769231,
522
- "grad_norm": 0.5207259058952332,
523
- "learning_rate": 0.0001435897435897436,
524
- "loss": 0.0973,
525
  "step": 660
526
  },
527
  {
528
  "epoch": 4.294871794871795,
529
- "grad_norm": 1.3160842657089233,
530
- "learning_rate": 0.00014273504273504275,
531
- "loss": 0.1256,
532
  "step": 670
533
  },
534
  {
535
  "epoch": 4.358974358974359,
536
- "grad_norm": 2.892195463180542,
537
- "learning_rate": 0.0001418803418803419,
538
- "loss": 0.1178,
539
  "step": 680
540
  },
541
  {
542
  "epoch": 4.423076923076923,
543
- "grad_norm": 3.8142576217651367,
544
- "learning_rate": 0.00014102564102564104,
545
- "loss": 0.0594,
546
  "step": 690
547
  },
548
  {
549
  "epoch": 4.487179487179487,
550
- "grad_norm": 0.11079952865839005,
551
- "learning_rate": 0.00014017094017094016,
552
- "loss": 0.1239,
553
  "step": 700
554
  },
555
  {
556
  "epoch": 4.487179487179487,
557
- "eval_accuracy": 0.8848920863309353,
558
- "eval_loss": 0.4221162796020508,
559
- "eval_runtime": 2.7273,
560
- "eval_samples_per_second": 101.932,
561
- "eval_steps_per_second": 12.833,
562
  "step": 700
563
  },
564
  {
565
  "epoch": 4.551282051282051,
566
- "grad_norm": 9.544878959655762,
567
- "learning_rate": 0.0001393162393162393,
568
- "loss": 0.1185,
569
  "step": 710
570
  },
571
  {
572
  "epoch": 4.615384615384615,
573
- "grad_norm": 0.06085001304745674,
574
- "learning_rate": 0.00013846153846153847,
575
- "loss": 0.0499,
576
  "step": 720
577
  },
578
  {
579
  "epoch": 4.67948717948718,
580
- "grad_norm": 12.285767555236816,
581
- "learning_rate": 0.00013760683760683762,
582
- "loss": 0.1623,
583
  "step": 730
584
  },
585
  {
586
  "epoch": 4.743589743589744,
587
- "grad_norm": 1.4333381652832031,
588
- "learning_rate": 0.00013675213675213676,
589
- "loss": 0.0903,
590
  "step": 740
591
  },
592
  {
593
  "epoch": 4.8076923076923075,
594
- "grad_norm": 0.37026920914649963,
595
- "learning_rate": 0.0001358974358974359,
596
- "loss": 0.082,
597
  "step": 750
598
  },
599
  {
600
  "epoch": 4.871794871794872,
601
- "grad_norm": 7.013845443725586,
602
- "learning_rate": 0.00013504273504273505,
603
- "loss": 0.0443,
604
  "step": 760
605
  },
606
  {
607
  "epoch": 4.935897435897436,
608
- "grad_norm": 0.3148520588874817,
609
- "learning_rate": 0.0001341880341880342,
610
- "loss": 0.1237,
611
  "step": 770
612
  },
613
  {
614
  "epoch": 5.0,
615
- "grad_norm": 0.5136359930038452,
616
- "learning_rate": 0.00013333333333333334,
617
- "loss": 0.0592,
618
  "step": 780
619
  },
620
  {
621
  "epoch": 5.064102564102564,
622
- "grad_norm": 7.698183536529541,
623
- "learning_rate": 0.00013247863247863248,
624
- "loss": 0.0742,
625
  "step": 790
626
  },
627
  {
628
  "epoch": 5.128205128205128,
629
- "grad_norm": 0.05358889326453209,
630
- "learning_rate": 0.00013162393162393163,
631
- "loss": 0.0785,
632
  "step": 800
633
  },
634
  {
635
  "epoch": 5.128205128205128,
636
- "eval_accuracy": 0.9136690647482014,
637
- "eval_loss": 0.36830753087997437,
638
- "eval_runtime": 3.1493,
639
- "eval_samples_per_second": 88.273,
640
- "eval_steps_per_second": 11.114,
641
  "step": 800
642
  },
643
  {
644
  "epoch": 5.1923076923076925,
645
- "grad_norm": 0.045300308614969254,
646
- "learning_rate": 0.00013076923076923077,
647
- "loss": 0.1987,
648
  "step": 810
649
  },
650
  {
651
  "epoch": 5.256410256410256,
652
- "grad_norm": 6.118052959442139,
653
- "learning_rate": 0.00012991452991452992,
654
- "loss": 0.0708,
655
  "step": 820
656
  },
657
  {
658
  "epoch": 5.32051282051282,
659
- "grad_norm": 0.36830875277519226,
660
- "learning_rate": 0.00012905982905982906,
661
- "loss": 0.0329,
662
  "step": 830
663
  },
664
  {
665
  "epoch": 5.384615384615385,
666
- "grad_norm": 0.5043929219245911,
667
- "learning_rate": 0.00012820512820512823,
668
- "loss": 0.0546,
669
  "step": 840
670
  },
671
  {
672
  "epoch": 5.448717948717949,
673
- "grad_norm": 5.8541035652160645,
674
- "learning_rate": 0.00012735042735042735,
675
- "loss": 0.0589,
676
  "step": 850
677
  },
678
  {
679
  "epoch": 5.512820512820513,
680
- "grad_norm": 0.09965494275093079,
681
- "learning_rate": 0.0001264957264957265,
682
- "loss": 0.0257,
683
  "step": 860
684
  },
685
  {
686
  "epoch": 5.576923076923077,
687
- "grad_norm": 0.03202090039849281,
688
- "learning_rate": 0.00012564102564102564,
689
- "loss": 0.0349,
690
  "step": 870
691
  },
692
  {
693
  "epoch": 5.641025641025641,
694
- "grad_norm": 7.21024751663208,
695
- "learning_rate": 0.00012478632478632478,
696
- "loss": 0.081,
697
  "step": 880
698
  },
699
  {
700
  "epoch": 5.705128205128205,
701
- "grad_norm": 0.03198171406984329,
702
- "learning_rate": 0.00012393162393162395,
703
- "loss": 0.037,
704
  "step": 890
705
  },
706
  {
707
  "epoch": 5.769230769230769,
708
- "grad_norm": 1.1413763761520386,
709
- "learning_rate": 0.0001230769230769231,
710
- "loss": 0.093,
711
  "step": 900
712
  },
713
  {
714
  "epoch": 5.769230769230769,
715
- "eval_accuracy": 0.8597122302158273,
716
- "eval_loss": 0.6375630497932434,
717
- "eval_runtime": 2.5134,
718
- "eval_samples_per_second": 110.607,
719
- "eval_steps_per_second": 13.925,
720
  "step": 900
721
  },
722
  {
723
  "epoch": 5.833333333333333,
724
- "grad_norm": 0.030478307977318764,
725
- "learning_rate": 0.00012222222222222224,
726
- "loss": 0.1015,
727
  "step": 910
728
  },
729
  {
730
  "epoch": 5.897435897435898,
731
- "grad_norm": 7.971870422363281,
732
- "learning_rate": 0.00012136752136752136,
733
- "loss": 0.0421,
734
  "step": 920
735
  },
736
  {
737
  "epoch": 5.961538461538462,
738
- "grad_norm": 0.7655214667320251,
739
- "learning_rate": 0.00012051282051282052,
740
- "loss": 0.0315,
741
  "step": 930
742
  },
743
  {
744
  "epoch": 6.0256410256410255,
745
- "grad_norm": 0.10178599506616592,
746
- "learning_rate": 0.00011965811965811966,
747
- "loss": 0.008,
748
  "step": 940
749
  },
750
  {
751
  "epoch": 6.089743589743589,
752
- "grad_norm": 0.024569841101765633,
753
- "learning_rate": 0.0001188034188034188,
754
- "loss": 0.0054,
755
  "step": 950
756
  },
757
  {
758
  "epoch": 6.153846153846154,
759
- "grad_norm": 0.36783352494239807,
760
- "learning_rate": 0.00011794871794871796,
761
- "loss": 0.0752,
762
  "step": 960
763
  },
764
  {
765
  "epoch": 6.217948717948718,
766
- "grad_norm": 0.04280726611614227,
767
- "learning_rate": 0.00011709401709401711,
768
- "loss": 0.0286,
769
  "step": 970
770
  },
771
  {
772
  "epoch": 6.282051282051282,
773
- "grad_norm": 14.323381423950195,
774
- "learning_rate": 0.00011623931623931625,
775
- "loss": 0.0559,
776
  "step": 980
777
  },
778
  {
779
  "epoch": 6.346153846153846,
780
- "grad_norm": 0.025405921041965485,
781
- "learning_rate": 0.00011538461538461538,
782
- "loss": 0.0238,
783
  "step": 990
784
  },
785
  {
786
  "epoch": 6.410256410256411,
787
- "grad_norm": 0.02457793429493904,
788
- "learning_rate": 0.00011452991452991453,
789
- "loss": 0.0056,
790
  "step": 1000
791
  },
792
  {
793
  "epoch": 6.410256410256411,
794
  "eval_accuracy": 0.9244604316546763,
795
- "eval_loss": 0.3162367641925812,
796
- "eval_runtime": 3.2283,
797
- "eval_samples_per_second": 86.114,
798
- "eval_steps_per_second": 10.842,
799
  "step": 1000
800
  },
801
  {
802
  "epoch": 6.4743589743589745,
803
- "grad_norm": 0.020618008449673653,
804
- "learning_rate": 0.00011367521367521367,
805
- "loss": 0.0113,
806
  "step": 1010
807
  },
808
  {
809
  "epoch": 6.538461538461538,
810
- "grad_norm": 0.021217485889792442,
811
- "learning_rate": 0.00011282051282051283,
812
- "loss": 0.0115,
813
  "step": 1020
814
  },
815
  {
816
  "epoch": 6.602564102564102,
817
- "grad_norm": 0.028808822855353355,
818
- "learning_rate": 0.00011196581196581197,
819
- "loss": 0.022,
820
  "step": 1030
821
  },
822
  {
823
  "epoch": 6.666666666666667,
824
- "grad_norm": 3.5672314167022705,
825
- "learning_rate": 0.00011111111111111112,
826
- "loss": 0.0706,
827
  "step": 1040
828
  },
829
  {
830
  "epoch": 6.730769230769231,
831
- "grad_norm": 0.25913771986961365,
832
- "learning_rate": 0.00011025641025641027,
833
- "loss": 0.021,
834
  "step": 1050
835
  },
836
  {
837
  "epoch": 6.794871794871795,
838
- "grad_norm": 12.088153839111328,
839
- "learning_rate": 0.00010940170940170942,
840
- "loss": 0.0914,
841
  "step": 1060
842
  },
843
  {
844
  "epoch": 6.858974358974359,
845
- "grad_norm": 0.7027952671051025,
846
- "learning_rate": 0.00010854700854700855,
847
- "loss": 0.0766,
848
  "step": 1070
849
  },
850
  {
851
  "epoch": 6.923076923076923,
852
- "grad_norm": 6.911967754364014,
853
- "learning_rate": 0.0001076923076923077,
854
- "loss": 0.0881,
855
  "step": 1080
856
  },
857
  {
858
  "epoch": 6.987179487179487,
859
- "grad_norm": 0.03289846330881119,
860
- "learning_rate": 0.00010683760683760684,
861
- "loss": 0.0125,
862
  "step": 1090
863
  },
864
  {
865
  "epoch": 7.051282051282051,
866
- "grad_norm": 0.025492513552308083,
867
- "learning_rate": 0.000105982905982906,
868
- "loss": 0.0472,
869
  "step": 1100
870
  },
871
  {
872
  "epoch": 7.051282051282051,
873
- "eval_accuracy": 0.8884892086330936,
874
- "eval_loss": 0.5225415825843811,
875
- "eval_runtime": 3.7495,
876
- "eval_samples_per_second": 74.144,
877
- "eval_steps_per_second": 9.335,
878
  "step": 1100
879
  },
880
  {
881
  "epoch": 7.115384615384615,
882
- "grad_norm": 0.022507918998599052,
883
- "learning_rate": 0.00010512820512820514,
884
- "loss": 0.0386,
885
  "step": 1110
886
  },
887
  {
888
  "epoch": 7.17948717948718,
889
- "grad_norm": 0.020967524498701096,
890
- "learning_rate": 0.00010427350427350428,
891
- "loss": 0.0289,
892
  "step": 1120
893
  },
894
  {
895
  "epoch": 7.243589743589744,
896
- "grad_norm": 0.5489076972007751,
897
- "learning_rate": 0.00010341880341880343,
898
- "loss": 0.0041,
899
  "step": 1130
900
  },
901
  {
902
  "epoch": 7.3076923076923075,
903
- "grad_norm": 0.12584710121154785,
904
- "learning_rate": 0.00010256410256410256,
905
- "loss": 0.0697,
906
  "step": 1140
907
  },
908
  {
909
  "epoch": 7.371794871794872,
910
- "grad_norm": 0.022198162972927094,
911
- "learning_rate": 0.0001017094017094017,
912
- "loss": 0.0039,
913
  "step": 1150
914
  },
915
  {
916
  "epoch": 7.435897435897436,
917
- "grad_norm": 11.968843460083008,
918
- "learning_rate": 0.00010085470085470086,
919
- "loss": 0.0855,
920
  "step": 1160
921
  },
922
  {
923
  "epoch": 7.5,
924
- "grad_norm": 0.779564380645752,
925
- "learning_rate": 0.0001,
926
- "loss": 0.0585,
927
  "step": 1170
928
  },
929
  {
930
  "epoch": 7.564102564102564,
931
- "grad_norm": 0.23576153814792633,
932
- "learning_rate": 9.914529914529915e-05,
933
- "loss": 0.0559,
934
  "step": 1180
935
  },
936
  {
937
  "epoch": 7.628205128205128,
938
- "grad_norm": 0.020965000614523888,
939
- "learning_rate": 9.829059829059829e-05,
940
- "loss": 0.0785,
941
  "step": 1190
942
  },
943
  {
944
  "epoch": 7.6923076923076925,
945
- "grad_norm": 6.914454936981201,
946
- "learning_rate": 9.743589743589744e-05,
947
- "loss": 0.0234,
948
  "step": 1200
949
  },
950
  {
951
  "epoch": 7.6923076923076925,
952
- "eval_accuracy": 0.8597122302158273,
953
- "eval_loss": 0.6095559000968933,
954
- "eval_runtime": 2.5288,
955
- "eval_samples_per_second": 109.933,
956
- "eval_steps_per_second": 13.84,
957
  "step": 1200
958
  },
959
  {
960
  "epoch": 7.756410256410256,
961
- "grad_norm": 1.3177701234817505,
962
- "learning_rate": 9.658119658119658e-05,
963
- "loss": 0.0768,
964
  "step": 1210
965
  },
966
  {
967
  "epoch": 7.82051282051282,
968
- "grad_norm": 4.212278842926025,
969
- "learning_rate": 9.572649572649574e-05,
970
- "loss": 0.0914,
971
  "step": 1220
972
  },
973
  {
974
  "epoch": 7.884615384615385,
975
- "grad_norm": 0.02418905310332775,
976
- "learning_rate": 9.487179487179487e-05,
977
- "loss": 0.0348,
978
  "step": 1230
979
  },
980
  {
981
  "epoch": 7.948717948717949,
982
- "grad_norm": 3.008629322052002,
983
- "learning_rate": 9.401709401709401e-05,
984
- "loss": 0.0624,
985
  "step": 1240
986
  },
987
  {
988
  "epoch": 8.012820512820513,
989
- "grad_norm": 0.052931949496269226,
990
- "learning_rate": 9.316239316239317e-05,
991
- "loss": 0.0076,
992
  "step": 1250
993
  },
994
  {
995
  "epoch": 8.076923076923077,
996
- "grad_norm": 7.994688034057617,
997
- "learning_rate": 9.230769230769232e-05,
998
- "loss": 0.0326,
999
  "step": 1260
1000
  },
1001
  {
1002
  "epoch": 8.14102564102564,
1003
- "grad_norm": 0.026721293106675148,
1004
- "learning_rate": 9.145299145299146e-05,
1005
- "loss": 0.0324,
1006
  "step": 1270
1007
  },
1008
  {
1009
  "epoch": 8.205128205128204,
1010
- "grad_norm": 0.049855004996061325,
1011
- "learning_rate": 9.05982905982906e-05,
1012
- "loss": 0.0057,
1013
  "step": 1280
1014
  },
1015
  {
1016
  "epoch": 8.26923076923077,
1017
- "grad_norm": 0.014473488554358482,
1018
- "learning_rate": 8.974358974358975e-05,
1019
- "loss": 0.0237,
1020
  "step": 1290
1021
  },
1022
  {
1023
  "epoch": 8.333333333333334,
1024
- "grad_norm": 0.03150290250778198,
1025
- "learning_rate": 8.888888888888889e-05,
1026
- "loss": 0.0354,
1027
  "step": 1300
1028
  },
1029
  {
1030
  "epoch": 8.333333333333334,
1031
- "eval_accuracy": 0.8776978417266187,
1032
- "eval_loss": 0.5520122647285461,
1033
- "eval_runtime": 3.3151,
1034
- "eval_samples_per_second": 83.858,
1035
- "eval_steps_per_second": 10.558,
1036
  "step": 1300
1037
  },
1038
  {
1039
  "epoch": 8.397435897435898,
1040
- "grad_norm": 0.013332781381905079,
1041
- "learning_rate": 8.803418803418804e-05,
1042
- "loss": 0.0049,
1043
  "step": 1310
1044
  },
1045
  {
1046
  "epoch": 8.461538461538462,
1047
- "grad_norm": 0.013341937214136124,
1048
- "learning_rate": 8.717948717948718e-05,
1049
- "loss": 0.0026,
1050
  "step": 1320
1051
  },
1052
  {
1053
  "epoch": 8.525641025641026,
1054
- "grad_norm": 0.012689488008618355,
1055
- "learning_rate": 8.632478632478634e-05,
1056
- "loss": 0.0051,
1057
  "step": 1330
1058
  },
1059
  {
1060
  "epoch": 8.58974358974359,
1061
- "grad_norm": 0.014231017790734768,
1062
- "learning_rate": 8.547008547008547e-05,
1063
- "loss": 0.0027,
1064
  "step": 1340
1065
  },
1066
  {
1067
  "epoch": 8.653846153846153,
1068
- "grad_norm": 0.052165694534778595,
1069
- "learning_rate": 8.461538461538461e-05,
1070
- "loss": 0.0036,
1071
  "step": 1350
1072
  },
1073
  {
1074
  "epoch": 8.717948717948717,
1075
- "grad_norm": 0.11598876118659973,
1076
- "learning_rate": 8.376068376068377e-05,
1077
- "loss": 0.0281,
1078
  "step": 1360
1079
  },
1080
  {
1081
  "epoch": 8.782051282051283,
1082
- "grad_norm": 0.01339508593082428,
1083
- "learning_rate": 8.290598290598292e-05,
1084
- "loss": 0.0026,
1085
  "step": 1370
1086
  },
1087
  {
1088
  "epoch": 8.846153846153847,
1089
- "grad_norm": 0.011919701471924782,
1090
- "learning_rate": 8.205128205128205e-05,
1091
- "loss": 0.0025,
1092
  "step": 1380
1093
  },
1094
  {
1095
  "epoch": 8.91025641025641,
1096
- "grad_norm": 3.217728614807129,
1097
- "learning_rate": 8.11965811965812e-05,
1098
- "loss": 0.0379,
1099
  "step": 1390
1100
  },
1101
  {
1102
  "epoch": 8.974358974358974,
1103
- "grad_norm": 0.01331857219338417,
1104
- "learning_rate": 8.034188034188035e-05,
1105
- "loss": 0.026,
1106
  "step": 1400
1107
  },
1108
  {
1109
  "epoch": 8.974358974358974,
1110
- "eval_accuracy": 0.8992805755395683,
1111
- "eval_loss": 0.49377354979515076,
1112
- "eval_runtime": 2.8453,
1113
- "eval_samples_per_second": 97.704,
1114
- "eval_steps_per_second": 12.301,
1115
  "step": 1400
1116
  },
1117
  {
1118
  "epoch": 9.038461538461538,
1119
- "grad_norm": 0.1426580250263214,
1120
- "learning_rate": 7.948717948717948e-05,
1121
- "loss": 0.0072,
1122
  "step": 1410
1123
  },
1124
  {
1125
  "epoch": 9.102564102564102,
1126
- "grad_norm": 0.021561838686466217,
1127
- "learning_rate": 7.863247863247864e-05,
1128
- "loss": 0.0025,
1129
  "step": 1420
1130
  },
1131
  {
1132
  "epoch": 9.166666666666666,
1133
- "grad_norm": 0.010494213551282883,
1134
- "learning_rate": 7.777777777777778e-05,
1135
- "loss": 0.0025,
1136
  "step": 1430
1137
  },
1138
  {
1139
  "epoch": 9.23076923076923,
1140
- "grad_norm": 0.015301249921321869,
1141
- "learning_rate": 7.692307692307693e-05,
1142
- "loss": 0.0051,
1143
  "step": 1440
1144
  },
1145
  {
1146
  "epoch": 9.294871794871796,
1147
- "grad_norm": 0.013643044047057629,
1148
- "learning_rate": 7.606837606837607e-05,
1149
- "loss": 0.0136,
1150
  "step": 1450
1151
  },
1152
  {
1153
  "epoch": 9.35897435897436,
1154
- "grad_norm": 0.02054368518292904,
1155
- "learning_rate": 7.521367521367521e-05,
1156
- "loss": 0.0025,
1157
  "step": 1460
1158
  },
1159
  {
1160
  "epoch": 9.423076923076923,
1161
- "grad_norm": 0.011097296141088009,
1162
- "learning_rate": 7.435897435897436e-05,
1163
- "loss": 0.0025,
1164
  "step": 1470
1165
  },
1166
  {
1167
  "epoch": 9.487179487179487,
1168
- "grad_norm": 0.5705698132514954,
1169
- "learning_rate": 7.350427350427352e-05,
1170
- "loss": 0.0048,
1171
  "step": 1480
1172
  },
1173
  {
1174
  "epoch": 9.551282051282051,
1175
- "grad_norm": 0.009772556833922863,
1176
- "learning_rate": 7.264957264957265e-05,
1177
- "loss": 0.002,
1178
  "step": 1490
1179
  },
1180
  {
1181
  "epoch": 9.615384615384615,
1182
- "grad_norm": 0.011127009056508541,
1183
- "learning_rate": 7.17948717948718e-05,
1184
- "loss": 0.002,
1185
  "step": 1500
1186
  },
1187
  {
1188
  "epoch": 9.615384615384615,
1189
- "eval_accuracy": 0.9172661870503597,
1190
- "eval_loss": 0.43497270345687866,
1191
- "eval_runtime": 2.5545,
1192
- "eval_samples_per_second": 108.826,
1193
- "eval_steps_per_second": 13.701,
1194
  "step": 1500
1195
  },
1196
  {
1197
  "epoch": 9.679487179487179,
1198
- "grad_norm": 0.013090673834085464,
1199
- "learning_rate": 7.094017094017095e-05,
1200
- "loss": 0.002,
1201
  "step": 1510
1202
  },
1203
  {
1204
  "epoch": 9.743589743589745,
1205
- "grad_norm": 0.00843009538948536,
1206
- "learning_rate": 7.008547008547008e-05,
1207
- "loss": 0.0021,
1208
  "step": 1520
1209
  },
1210
  {
1211
  "epoch": 9.807692307692308,
1212
- "grad_norm": 0.00981289241462946,
1213
- "learning_rate": 6.923076923076924e-05,
1214
- "loss": 0.0018,
1215
  "step": 1530
1216
  },
1217
  {
1218
  "epoch": 9.871794871794872,
1219
- "grad_norm": 0.01045091450214386,
1220
- "learning_rate": 6.837606837606838e-05,
1221
- "loss": 0.0047,
1222
  "step": 1540
1223
  },
1224
  {
1225
  "epoch": 9.935897435897436,
1226
- "grad_norm": 0.07456047832965851,
1227
- "learning_rate": 6.752136752136753e-05,
1228
- "loss": 0.0021,
1229
  "step": 1550
1230
  },
1231
  {
1232
  "epoch": 10.0,
1233
- "grad_norm": 0.010238745249807835,
1234
- "learning_rate": 6.666666666666667e-05,
1235
- "loss": 0.0268,
1236
- "step": 1560
1237
- },
1238
- {
1239
- "epoch": 10.064102564102564,
1240
- "grad_norm": 0.009372313506901264,
1241
- "learning_rate": 6.581196581196581e-05,
1242
- "loss": 0.0018,
1243
- "step": 1570
1244
- },
1245
- {
1246
- "epoch": 10.128205128205128,
1247
- "grad_norm": 0.009544081054627895,
1248
- "learning_rate": 6.495726495726496e-05,
1249
- "loss": 0.0017,
1250
- "step": 1580
1251
- },
1252
- {
1253
- "epoch": 10.192307692307692,
1254
- "grad_norm": 0.014180944301187992,
1255
- "learning_rate": 6.410256410256412e-05,
1256
- "loss": 0.0055,
1257
- "step": 1590
1258
- },
1259
- {
1260
- "epoch": 10.256410256410255,
1261
- "grad_norm": 0.19402286410331726,
1262
- "learning_rate": 6.324786324786325e-05,
1263
- "loss": 0.0021,
1264
- "step": 1600
1265
- },
1266
- {
1267
- "epoch": 10.256410256410255,
1268
- "eval_accuracy": 0.9172661870503597,
1269
- "eval_loss": 0.4223933219909668,
1270
- "eval_runtime": 3.538,
1271
- "eval_samples_per_second": 78.575,
1272
- "eval_steps_per_second": 9.893,
1273
- "step": 1600
1274
- },
1275
- {
1276
- "epoch": 10.320512820512821,
1277
- "grad_norm": 0.008879870176315308,
1278
- "learning_rate": 6.239316239316239e-05,
1279
- "loss": 0.0018,
1280
- "step": 1610
1281
- },
1282
- {
1283
- "epoch": 10.384615384615385,
1284
- "grad_norm": 0.008063827641308308,
1285
- "learning_rate": 6.153846153846155e-05,
1286
- "loss": 0.002,
1287
- "step": 1620
1288
- },
1289
- {
1290
- "epoch": 10.448717948717949,
1291
- "grad_norm": 0.008835590444505215,
1292
- "learning_rate": 6.068376068376068e-05,
1293
- "loss": 0.0018,
1294
- "step": 1630
1295
- },
1296
- {
1297
- "epoch": 10.512820512820513,
1298
- "grad_norm": 0.008632234297692776,
1299
- "learning_rate": 5.982905982905983e-05,
1300
- "loss": 0.0017,
1301
- "step": 1640
1302
- },
1303
- {
1304
- "epoch": 10.576923076923077,
1305
- "grad_norm": 0.00828844029456377,
1306
- "learning_rate": 5.897435897435898e-05,
1307
- "loss": 0.0016,
1308
- "step": 1650
1309
- },
1310
- {
1311
- "epoch": 10.64102564102564,
1312
- "grad_norm": 0.0323554202914238,
1313
- "learning_rate": 5.8119658119658126e-05,
1314
- "loss": 0.0016,
1315
- "step": 1660
1316
- },
1317
- {
1318
- "epoch": 10.705128205128204,
1319
- "grad_norm": 0.008372778072953224,
1320
- "learning_rate": 5.726495726495726e-05,
1321
- "loss": 0.0016,
1322
- "step": 1670
1323
- },
1324
- {
1325
- "epoch": 10.76923076923077,
1326
- "grad_norm": 0.007286165375262499,
1327
- "learning_rate": 5.6410256410256414e-05,
1328
- "loss": 0.0016,
1329
- "step": 1680
1330
- },
1331
- {
1332
- "epoch": 10.833333333333334,
1333
- "grad_norm": 0.012557004578411579,
1334
- "learning_rate": 5.555555555555556e-05,
1335
- "loss": 0.0017,
1336
- "step": 1690
1337
- },
1338
- {
1339
- "epoch": 10.897435897435898,
1340
- "grad_norm": 0.007122470065951347,
1341
- "learning_rate": 5.470085470085471e-05,
1342
- "loss": 0.0016,
1343
- "step": 1700
1344
- },
1345
- {
1346
- "epoch": 10.897435897435898,
1347
- "eval_accuracy": 0.9280575539568345,
1348
- "eval_loss": 0.38381046056747437,
1349
- "eval_runtime": 2.8652,
1350
- "eval_samples_per_second": 97.028,
1351
- "eval_steps_per_second": 12.216,
1352
- "step": 1700
1353
- },
1354
- {
1355
- "epoch": 10.961538461538462,
1356
- "grad_norm": 0.013124003075063229,
1357
- "learning_rate": 5.384615384615385e-05,
1358
- "loss": 0.0015,
1359
- "step": 1710
1360
- },
1361
- {
1362
- "epoch": 11.025641025641026,
1363
- "grad_norm": 0.007307114545255899,
1364
- "learning_rate": 5.2991452991453e-05,
1365
- "loss": 0.0015,
1366
- "step": 1720
1367
- },
1368
- {
1369
- "epoch": 11.08974358974359,
1370
- "grad_norm": 0.007222812157124281,
1371
- "learning_rate": 5.213675213675214e-05,
1372
- "loss": 0.0014,
1373
- "step": 1730
1374
- },
1375
- {
1376
- "epoch": 11.153846153846153,
1377
- "grad_norm": 0.007087068632245064,
1378
- "learning_rate": 5.128205128205128e-05,
1379
- "loss": 0.0015,
1380
- "step": 1740
1381
- },
1382
- {
1383
- "epoch": 11.217948717948717,
1384
- "grad_norm": 0.010747412219643593,
1385
- "learning_rate": 5.042735042735043e-05,
1386
- "loss": 0.0016,
1387
- "step": 1750
1388
- },
1389
- {
1390
- "epoch": 11.282051282051283,
1391
- "grad_norm": 0.007549288682639599,
1392
- "learning_rate": 4.9572649572649575e-05,
1393
- "loss": 0.0014,
1394
- "step": 1760
1395
- },
1396
- {
1397
- "epoch": 11.346153846153847,
1398
- "grad_norm": 0.007861124351620674,
1399
- "learning_rate": 4.871794871794872e-05,
1400
- "loss": 0.0015,
1401
- "step": 1770
1402
- },
1403
- {
1404
- "epoch": 11.41025641025641,
1405
- "grad_norm": 0.008711726404726505,
1406
- "learning_rate": 4.786324786324787e-05,
1407
- "loss": 0.0014,
1408
- "step": 1780
1409
- },
1410
- {
1411
- "epoch": 11.474358974358974,
1412
- "grad_norm": 0.006650915369391441,
1413
- "learning_rate": 4.700854700854701e-05,
1414
- "loss": 0.0014,
1415
- "step": 1790
1416
- },
1417
- {
1418
- "epoch": 11.538461538461538,
1419
- "grad_norm": 0.009336930699646473,
1420
- "learning_rate": 4.615384615384616e-05,
1421
- "loss": 0.0014,
1422
- "step": 1800
1423
- },
1424
- {
1425
- "epoch": 11.538461538461538,
1426
- "eval_accuracy": 0.9280575539568345,
1427
- "eval_loss": 0.3943016529083252,
1428
- "eval_runtime": 2.5492,
1429
- "eval_samples_per_second": 109.052,
1430
- "eval_steps_per_second": 13.73,
1431
- "step": 1800
1432
- },
1433
- {
1434
- "epoch": 11.602564102564102,
1435
- "grad_norm": 0.0071232253685593605,
1436
- "learning_rate": 4.52991452991453e-05,
1437
- "loss": 0.0015,
1438
- "step": 1810
1439
- },
1440
- {
1441
- "epoch": 11.666666666666666,
1442
- "grad_norm": 0.0070044491440057755,
1443
- "learning_rate": 4.4444444444444447e-05,
1444
- "loss": 0.0014,
1445
- "step": 1820
1446
- },
1447
- {
1448
- "epoch": 11.73076923076923,
1449
- "grad_norm": 0.00735941668972373,
1450
- "learning_rate": 4.358974358974359e-05,
1451
- "loss": 0.0014,
1452
- "step": 1830
1453
- },
1454
- {
1455
- "epoch": 11.794871794871796,
1456
- "grad_norm": 0.006876462604850531,
1457
- "learning_rate": 4.2735042735042735e-05,
1458
- "loss": 0.0013,
1459
- "step": 1840
1460
- },
1461
- {
1462
- "epoch": 11.85897435897436,
1463
- "grad_norm": 0.008532642386853695,
1464
- "learning_rate": 4.1880341880341886e-05,
1465
- "loss": 0.0014,
1466
- "step": 1850
1467
- },
1468
- {
1469
- "epoch": 11.923076923076923,
1470
- "grad_norm": 0.007169618736952543,
1471
- "learning_rate": 4.1025641025641023e-05,
1472
- "loss": 0.0013,
1473
- "step": 1860
1474
- },
1475
- {
1476
- "epoch": 11.987179487179487,
1477
- "grad_norm": 0.006206741090863943,
1478
- "learning_rate": 4.0170940170940174e-05,
1479
- "loss": 0.0012,
1480
- "step": 1870
1481
- },
1482
- {
1483
- "epoch": 12.051282051282051,
1484
- "grad_norm": 0.006877180654555559,
1485
- "learning_rate": 3.931623931623932e-05,
1486
- "loss": 0.0013,
1487
- "step": 1880
1488
- },
1489
- {
1490
- "epoch": 12.115384615384615,
1491
- "grad_norm": 0.006645900197327137,
1492
- "learning_rate": 3.846153846153846e-05,
1493
- "loss": 0.0013,
1494
- "step": 1890
1495
- },
1496
- {
1497
- "epoch": 12.179487179487179,
1498
- "grad_norm": 0.007376631256192923,
1499
- "learning_rate": 3.760683760683761e-05,
1500
- "loss": 0.0013,
1501
- "step": 1900
1502
- },
1503
- {
1504
- "epoch": 12.179487179487179,
1505
- "eval_accuracy": 0.9280575539568345,
1506
- "eval_loss": 0.40119558572769165,
1507
- "eval_runtime": 2.73,
1508
- "eval_samples_per_second": 101.833,
1509
- "eval_steps_per_second": 12.821,
1510
- "step": 1900
1511
- },
1512
- {
1513
- "epoch": 12.243589743589743,
1514
- "grad_norm": 0.007013231050223112,
1515
- "learning_rate": 3.675213675213676e-05,
1516
- "loss": 0.0013,
1517
- "step": 1910
1518
- },
1519
- {
1520
- "epoch": 12.307692307692308,
1521
- "grad_norm": 0.006970913149416447,
1522
- "learning_rate": 3.58974358974359e-05,
1523
- "loss": 0.0013,
1524
- "step": 1920
1525
- },
1526
- {
1527
- "epoch": 12.371794871794872,
1528
- "grad_norm": 0.006338655948638916,
1529
- "learning_rate": 3.504273504273504e-05,
1530
- "loss": 0.0013,
1531
- "step": 1930
1532
- },
1533
- {
1534
- "epoch": 12.435897435897436,
1535
- "grad_norm": 0.007881653495132923,
1536
- "learning_rate": 3.418803418803419e-05,
1537
- "loss": 0.0013,
1538
- "step": 1940
1539
- },
1540
- {
1541
- "epoch": 12.5,
1542
- "grad_norm": 0.005947918631136417,
1543
- "learning_rate": 3.3333333333333335e-05,
1544
- "loss": 0.0012,
1545
- "step": 1950
1546
- },
1547
- {
1548
- "epoch": 12.564102564102564,
1549
- "grad_norm": 0.005899305455386639,
1550
- "learning_rate": 3.247863247863248e-05,
1551
- "loss": 0.0012,
1552
- "step": 1960
1553
- },
1554
- {
1555
- "epoch": 12.628205128205128,
1556
- "grad_norm": 0.0061206454411149025,
1557
- "learning_rate": 3.162393162393162e-05,
1558
- "loss": 0.0013,
1559
- "step": 1970
1560
- },
1561
- {
1562
- "epoch": 12.692307692307692,
1563
- "grad_norm": 0.00656491843983531,
1564
- "learning_rate": 3.0769230769230774e-05,
1565
- "loss": 0.0012,
1566
- "step": 1980
1567
- },
1568
- {
1569
- "epoch": 12.756410256410255,
1570
- "grad_norm": 0.006318471394479275,
1571
- "learning_rate": 2.9914529914529915e-05,
1572
- "loss": 0.0012,
1573
- "step": 1990
1574
- },
1575
- {
1576
- "epoch": 12.820512820512821,
1577
- "grad_norm": 0.00670122355222702,
1578
- "learning_rate": 2.9059829059829063e-05,
1579
- "loss": 0.0012,
1580
- "step": 2000
1581
- },
1582
- {
1583
- "epoch": 12.820512820512821,
1584
- "eval_accuracy": 0.9280575539568345,
1585
- "eval_loss": 0.4066712558269501,
1586
- "eval_runtime": 3.6428,
1587
- "eval_samples_per_second": 76.315,
1588
- "eval_steps_per_second": 9.608,
1589
- "step": 2000
1590
- },
1591
- {
1592
- "epoch": 12.884615384615385,
1593
- "grad_norm": 0.005691882688552141,
1594
- "learning_rate": 2.8205128205128207e-05,
1595
- "loss": 0.0012,
1596
- "step": 2010
1597
- },
1598
- {
1599
- "epoch": 12.948717948717949,
1600
- "grad_norm": 0.005753946490585804,
1601
- "learning_rate": 2.7350427350427355e-05,
1602
- "loss": 0.0012,
1603
- "step": 2020
1604
- },
1605
- {
1606
- "epoch": 13.012820512820513,
1607
- "grad_norm": 0.006568002514541149,
1608
- "learning_rate": 2.64957264957265e-05,
1609
- "loss": 0.0013,
1610
- "step": 2030
1611
- },
1612
- {
1613
- "epoch": 13.076923076923077,
1614
- "grad_norm": 0.005731898359954357,
1615
- "learning_rate": 2.564102564102564e-05,
1616
- "loss": 0.0012,
1617
- "step": 2040
1618
- },
1619
- {
1620
- "epoch": 13.14102564102564,
1621
- "grad_norm": 0.005868157371878624,
1622
- "learning_rate": 2.4786324786324787e-05,
1623
- "loss": 0.0011,
1624
- "step": 2050
1625
- },
1626
- {
1627
- "epoch": 13.205128205128204,
1628
- "grad_norm": 0.006337730213999748,
1629
- "learning_rate": 2.3931623931623935e-05,
1630
- "loss": 0.0012,
1631
- "step": 2060
1632
- },
1633
- {
1634
- "epoch": 13.26923076923077,
1635
- "grad_norm": 0.006973997224122286,
1636
- "learning_rate": 2.307692307692308e-05,
1637
- "loss": 0.0011,
1638
- "step": 2070
1639
- },
1640
- {
1641
- "epoch": 13.333333333333334,
1642
- "grad_norm": 0.00554188247770071,
1643
- "learning_rate": 2.2222222222222223e-05,
1644
- "loss": 0.0011,
1645
- "step": 2080
1646
- },
1647
- {
1648
- "epoch": 13.397435897435898,
1649
- "grad_norm": 0.006806936115026474,
1650
- "learning_rate": 2.1367521367521368e-05,
1651
- "loss": 0.0011,
1652
- "step": 2090
1653
- },
1654
- {
1655
- "epoch": 13.461538461538462,
1656
- "grad_norm": 0.005874712951481342,
1657
- "learning_rate": 2.0512820512820512e-05,
1658
- "loss": 0.0011,
1659
- "step": 2100
1660
- },
1661
- {
1662
- "epoch": 13.461538461538462,
1663
- "eval_accuracy": 0.9280575539568345,
1664
- "eval_loss": 0.41012829542160034,
1665
- "eval_runtime": 2.5393,
1666
- "eval_samples_per_second": 109.477,
1667
- "eval_steps_per_second": 13.783,
1668
- "step": 2100
1669
- },
1670
- {
1671
- "epoch": 13.525641025641026,
1672
- "grad_norm": 0.006118403282016516,
1673
- "learning_rate": 1.965811965811966e-05,
1674
- "loss": 0.0012,
1675
- "step": 2110
1676
- },
1677
- {
1678
- "epoch": 13.58974358974359,
1679
- "grad_norm": 0.005724759306758642,
1680
- "learning_rate": 1.8803418803418804e-05,
1681
- "loss": 0.0012,
1682
- "step": 2120
1683
- },
1684
- {
1685
- "epoch": 13.653846153846153,
1686
- "grad_norm": 0.005641784518957138,
1687
- "learning_rate": 1.794871794871795e-05,
1688
- "loss": 0.0011,
1689
- "step": 2130
1690
- },
1691
- {
1692
- "epoch": 13.717948717948717,
1693
- "grad_norm": 0.006412914022803307,
1694
- "learning_rate": 1.7094017094017095e-05,
1695
- "loss": 0.0012,
1696
- "step": 2140
1697
- },
1698
- {
1699
- "epoch": 13.782051282051283,
1700
- "grad_norm": 0.0061592236161231995,
1701
- "learning_rate": 1.623931623931624e-05,
1702
- "loss": 0.0012,
1703
- "step": 2150
1704
- },
1705
- {
1706
- "epoch": 13.846153846153847,
1707
- "grad_norm": 0.006390335038304329,
1708
- "learning_rate": 1.5384615384615387e-05,
1709
- "loss": 0.0012,
1710
- "step": 2160
1711
- },
1712
- {
1713
- "epoch": 13.91025641025641,
1714
- "grad_norm": 0.006186114624142647,
1715
- "learning_rate": 1.4529914529914531e-05,
1716
- "loss": 0.0012,
1717
- "step": 2170
1718
- },
1719
- {
1720
- "epoch": 13.974358974358974,
1721
- "grad_norm": 0.006987506989389658,
1722
- "learning_rate": 1.3675213675213677e-05,
1723
- "loss": 0.0013,
1724
- "step": 2180
1725
- },
1726
- {
1727
- "epoch": 14.038461538461538,
1728
- "grad_norm": 0.0060087586753070354,
1729
- "learning_rate": 1.282051282051282e-05,
1730
- "loss": 0.0011,
1731
- "step": 2190
1732
- },
1733
- {
1734
- "epoch": 14.102564102564102,
1735
- "grad_norm": 0.005536227021366358,
1736
- "learning_rate": 1.1965811965811967e-05,
1737
- "loss": 0.0011,
1738
- "step": 2200
1739
- },
1740
- {
1741
- "epoch": 14.102564102564102,
1742
- "eval_accuracy": 0.9280575539568345,
1743
- "eval_loss": 0.41235998272895813,
1744
- "eval_runtime": 3.6965,
1745
- "eval_samples_per_second": 75.206,
1746
- "eval_steps_per_second": 9.468,
1747
- "step": 2200
1748
- },
1749
- {
1750
- "epoch": 14.166666666666666,
1751
- "grad_norm": 0.00747127179056406,
1752
- "learning_rate": 1.1111111111111112e-05,
1753
- "loss": 0.0011,
1754
- "step": 2210
1755
- },
1756
- {
1757
- "epoch": 14.23076923076923,
1758
- "grad_norm": 0.006075258832424879,
1759
- "learning_rate": 1.0256410256410256e-05,
1760
- "loss": 0.0012,
1761
- "step": 2220
1762
- },
1763
- {
1764
- "epoch": 14.294871794871796,
1765
- "grad_norm": 0.005355818197131157,
1766
- "learning_rate": 9.401709401709402e-06,
1767
- "loss": 0.0011,
1768
- "step": 2230
1769
- },
1770
- {
1771
- "epoch": 14.35897435897436,
1772
- "grad_norm": 0.006171481683850288,
1773
- "learning_rate": 8.547008547008548e-06,
1774
- "loss": 0.0012,
1775
- "step": 2240
1776
- },
1777
- {
1778
- "epoch": 14.423076923076923,
1779
- "grad_norm": 0.006203506141901016,
1780
- "learning_rate": 7.692307692307694e-06,
1781
- "loss": 0.0011,
1782
- "step": 2250
1783
- },
1784
- {
1785
- "epoch": 14.487179487179487,
1786
- "grad_norm": 0.0053332289680838585,
1787
- "learning_rate": 6.837606837606839e-06,
1788
- "loss": 0.0011,
1789
- "step": 2260
1790
- },
1791
- {
1792
- "epoch": 14.551282051282051,
1793
- "grad_norm": 0.006036951672285795,
1794
- "learning_rate": 5.982905982905984e-06,
1795
- "loss": 0.0012,
1796
- "step": 2270
1797
- },
1798
- {
1799
- "epoch": 14.615384615384615,
1800
- "grad_norm": 0.006114748306572437,
1801
- "learning_rate": 5.128205128205128e-06,
1802
- "loss": 0.0012,
1803
- "step": 2280
1804
- },
1805
- {
1806
- "epoch": 14.679487179487179,
1807
- "grad_norm": 0.0059860167093575,
1808
- "learning_rate": 4.273504273504274e-06,
1809
- "loss": 0.0011,
1810
- "step": 2290
1811
- },
1812
- {
1813
- "epoch": 14.743589743589745,
1814
- "grad_norm": 0.005834794137626886,
1815
- "learning_rate": 3.4188034188034193e-06,
1816
- "loss": 0.0012,
1817
- "step": 2300
1818
- },
1819
- {
1820
- "epoch": 14.743589743589745,
1821
- "eval_accuracy": 0.9280575539568345,
1822
- "eval_loss": 0.4135644733905792,
1823
- "eval_runtime": 2.5193,
1824
- "eval_samples_per_second": 110.348,
1825
- "eval_steps_per_second": 13.893,
1826
- "step": 2300
1827
- },
1828
- {
1829
- "epoch": 14.807692307692308,
1830
- "grad_norm": 0.005964505951851606,
1831
- "learning_rate": 2.564102564102564e-06,
1832
- "loss": 0.0012,
1833
- "step": 2310
1834
- },
1835
- {
1836
- "epoch": 14.871794871794872,
1837
- "grad_norm": 0.005720064975321293,
1838
- "learning_rate": 1.7094017094017097e-06,
1839
- "loss": 0.0011,
1840
- "step": 2320
1841
- },
1842
- {
1843
- "epoch": 14.935897435897436,
1844
- "grad_norm": 0.005913382861763239,
1845
- "learning_rate": 8.547008547008548e-07,
1846
- "loss": 0.0011,
1847
- "step": 2330
1848
- },
1849
- {
1850
- "epoch": 15.0,
1851
- "grad_norm": 0.005278407130390406,
1852
  "learning_rate": 0.0,
1853
- "loss": 0.0011,
1854
- "step": 2340
1855
  },
1856
  {
1857
- "epoch": 15.0,
1858
- "step": 2340,
1859
- "total_flos": 2.900189697360077e+18,
1860
- "train_loss": 0.14551133991808146,
1861
- "train_runtime": 927.2479,
1862
- "train_samples_per_second": 40.361,
1863
- "train_steps_per_second": 2.524
1864
  }
1865
  ],
1866
  "logging_steps": 10,
1867
- "max_steps": 2340,
1868
  "num_input_tokens_seen": 0,
1869
- "num_train_epochs": 15,
1870
  "save_steps": 100,
1871
  "stateful_callbacks": {
1872
  "TrainerControl": {
@@ -1880,7 +1262,7 @@
1880
  "attributes": {}
1881
  }
1882
  },
1883
- "total_flos": 2.900189697360077e+18,
1884
  "train_batch_size": 16,
1885
  "trial_name": null,
1886
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.2732398509979248,
3
+ "best_model_checkpoint": "vit-weldclassifyv4/checkpoint-500",
4
+ "epoch": 10.0,
5
  "eval_steps": 100,
6
+ "global_step": 1560,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0641025641025641,
13
+ "grad_norm": 1.2882777452468872,
14
+ "learning_rate": 0.00019871794871794874,
15
+ "loss": 1.2572,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.1282051282051282,
20
+ "grad_norm": 1.0699219703674316,
21
+ "learning_rate": 0.00019743589743589744,
22
+ "loss": 1.1322,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.19230769230769232,
27
+ "grad_norm": 1.5904221534729004,
28
+ "learning_rate": 0.00019615384615384615,
29
+ "loss": 1.1501,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.2564102564102564,
34
+ "grad_norm": 1.3816825151443481,
35
+ "learning_rate": 0.00019487179487179487,
36
+ "loss": 1.0852,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.32051282051282054,
41
+ "grad_norm": 2.104780435562134,
42
+ "learning_rate": 0.0001935897435897436,
43
+ "loss": 1.1374,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.38461538461538464,
48
+ "grad_norm": 4.023139476776123,
49
+ "learning_rate": 0.00019230769230769233,
50
+ "loss": 0.9742,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.44871794871794873,
55
+ "grad_norm": 2.541919469833374,
56
+ "learning_rate": 0.00019102564102564104,
57
+ "loss": 0.8644,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.5128205128205128,
62
+ "grad_norm": 1.7673051357269287,
63
+ "learning_rate": 0.00018974358974358974,
64
+ "loss": 0.928,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.5769230769230769,
69
+ "grad_norm": 2.5402064323425293,
70
+ "learning_rate": 0.00018846153846153847,
71
+ "loss": 0.8873,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.6410256410256411,
76
+ "grad_norm": 2.0333411693573,
77
+ "learning_rate": 0.0001871794871794872,
78
+ "loss": 0.8791,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.6410256410256411,
83
+ "eval_accuracy": 0.7014388489208633,
84
+ "eval_loss": 0.7506538033485413,
85
+ "eval_runtime": 5.5468,
86
+ "eval_samples_per_second": 50.119,
87
+ "eval_steps_per_second": 6.31,
88
  "step": 100
89
  },
90
  {
91
  "epoch": 0.7051282051282052,
92
+ "grad_norm": 3.6588566303253174,
93
+ "learning_rate": 0.0001858974358974359,
94
+ "loss": 0.8667,
95
  "step": 110
96
  },
97
  {
98
  "epoch": 0.7692307692307693,
99
+ "grad_norm": 4.510610580444336,
100
+ "learning_rate": 0.00018461538461538463,
101
+ "loss": 0.7618,
102
  "step": 120
103
  },
104
  {
105
  "epoch": 0.8333333333333334,
106
+ "grad_norm": 3.297905445098877,
107
+ "learning_rate": 0.00018333333333333334,
108
+ "loss": 0.814,
109
  "step": 130
110
  },
111
  {
112
  "epoch": 0.8974358974358975,
113
+ "grad_norm": 2.0982401371002197,
114
+ "learning_rate": 0.00018205128205128207,
115
+ "loss": 1.0144,
116
  "step": 140
117
  },
118
  {
119
  "epoch": 0.9615384615384616,
120
+ "grad_norm": 2.0412075519561768,
121
+ "learning_rate": 0.00018076923076923077,
122
+ "loss": 0.6803,
123
  "step": 150
124
  },
125
  {
126
  "epoch": 1.0256410256410255,
127
+ "grad_norm": 1.8364259004592896,
128
+ "learning_rate": 0.0001794871794871795,
129
+ "loss": 0.6764,
130
  "step": 160
131
  },
132
  {
133
  "epoch": 1.0897435897435896,
134
+ "grad_norm": 2.415219306945801,
135
+ "learning_rate": 0.00017820512820512823,
136
+ "loss": 0.6038,
137
  "step": 170
138
  },
139
  {
140
  "epoch": 1.1538461538461537,
141
+ "grad_norm": 2.8083627223968506,
142
+ "learning_rate": 0.00017692307692307693,
143
+ "loss": 0.609,
144
  "step": 180
145
  },
146
  {
147
  "epoch": 1.217948717948718,
148
+ "grad_norm": 2.6611833572387695,
149
+ "learning_rate": 0.00017564102564102566,
150
+ "loss": 0.4588,
151
  "step": 190
152
  },
153
  {
154
  "epoch": 1.282051282051282,
155
+ "grad_norm": 3.2890446186065674,
156
+ "learning_rate": 0.00017435897435897436,
157
+ "loss": 0.7436,
158
  "step": 200
159
  },
160
  {
161
  "epoch": 1.282051282051282,
162
+ "eval_accuracy": 0.7697841726618705,
163
+ "eval_loss": 0.5399633049964905,
164
+ "eval_runtime": 6.1673,
165
+ "eval_samples_per_second": 45.076,
166
+ "eval_steps_per_second": 5.675,
167
  "step": 200
168
  },
169
  {
170
  "epoch": 1.3461538461538463,
171
+ "grad_norm": 1.8595383167266846,
172
+ "learning_rate": 0.0001730769230769231,
173
+ "loss": 0.5508,
174
  "step": 210
175
  },
176
  {
177
  "epoch": 1.4102564102564101,
178
+ "grad_norm": 2.8622944355010986,
179
+ "learning_rate": 0.0001717948717948718,
180
+ "loss": 0.5182,
181
  "step": 220
182
  },
183
  {
184
  "epoch": 1.4743589743589745,
185
+ "grad_norm": 1.320397138595581,
186
+ "learning_rate": 0.00017051282051282053,
187
+ "loss": 0.5577,
188
  "step": 230
189
  },
190
  {
191
  "epoch": 1.5384615384615383,
192
+ "grad_norm": 2.9527108669281006,
193
+ "learning_rate": 0.00016923076923076923,
194
+ "loss": 0.5081,
195
  "step": 240
196
  },
197
  {
198
  "epoch": 1.6025641025641026,
199
+ "grad_norm": 6.604541301727295,
200
+ "learning_rate": 0.00016794871794871796,
201
+ "loss": 0.7143,
202
  "step": 250
203
  },
204
  {
205
  "epoch": 1.6666666666666665,
206
+ "grad_norm": 3.27010440826416,
207
+ "learning_rate": 0.0001666666666666667,
208
+ "loss": 0.5838,
209
  "step": 260
210
  },
211
  {
212
  "epoch": 1.7307692307692308,
213
+ "grad_norm": 2.723151922225952,
214
+ "learning_rate": 0.0001653846153846154,
215
+ "loss": 0.4859,
216
  "step": 270
217
  },
218
  {
219
  "epoch": 1.7948717948717947,
220
+ "grad_norm": 2.4719648361206055,
221
+ "learning_rate": 0.0001641025641025641,
222
+ "loss": 0.4969,
223
  "step": 280
224
  },
225
  {
226
  "epoch": 1.858974358974359,
227
+ "grad_norm": 1.9341636896133423,
228
+ "learning_rate": 0.00016282051282051282,
229
+ "loss": 0.4678,
230
  "step": 290
231
  },
232
  {
233
  "epoch": 1.9230769230769231,
234
+ "grad_norm": 2.635348320007324,
235
+ "learning_rate": 0.00016153846153846155,
236
+ "loss": 0.4783,
237
  "step": 300
238
  },
239
  {
240
  "epoch": 1.9230769230769231,
241
+ "eval_accuracy": 0.8345323741007195,
242
+ "eval_loss": 0.48316845297813416,
243
+ "eval_runtime": 6.0017,
244
+ "eval_samples_per_second": 46.32,
245
+ "eval_steps_per_second": 5.832,
246
  "step": 300
247
  },
248
  {
249
  "epoch": 1.9871794871794872,
250
+ "grad_norm": 3.740190267562866,
251
+ "learning_rate": 0.00016025641025641028,
252
+ "loss": 0.8054,
253
  "step": 310
254
  },
255
  {
256
  "epoch": 2.051282051282051,
257
+ "grad_norm": 3.149517059326172,
258
+ "learning_rate": 0.00015897435897435896,
259
+ "loss": 0.4985,
260
  "step": 320
261
  },
262
  {
263
  "epoch": 2.1153846153846154,
264
+ "grad_norm": 2.627647638320923,
265
+ "learning_rate": 0.0001576923076923077,
266
+ "loss": 0.3483,
267
  "step": 330
268
  },
269
  {
270
  "epoch": 2.1794871794871793,
271
+ "grad_norm": 4.553584575653076,
272
+ "learning_rate": 0.00015641025641025642,
273
+ "loss": 0.3432,
274
  "step": 340
275
  },
276
  {
277
  "epoch": 2.2435897435897436,
278
+ "grad_norm": 1.0300644636154175,
279
+ "learning_rate": 0.00015512820512820515,
280
+ "loss": 0.2012,
281
  "step": 350
282
  },
283
  {
284
  "epoch": 2.3076923076923075,
285
+ "grad_norm": 3.4395174980163574,
286
+ "learning_rate": 0.00015384615384615385,
287
+ "loss": 0.276,
288
  "step": 360
289
  },
290
  {
291
  "epoch": 2.371794871794872,
292
+ "grad_norm": 5.862714767456055,
293
+ "learning_rate": 0.00015256410256410255,
294
+ "loss": 0.2513,
295
  "step": 370
296
  },
297
  {
298
  "epoch": 2.435897435897436,
299
+ "grad_norm": 3.349158763885498,
300
+ "learning_rate": 0.00015128205128205128,
301
+ "loss": 0.3182,
302
  "step": 380
303
  },
304
  {
305
  "epoch": 2.5,
306
+ "grad_norm": 4.547815799713135,
307
+ "learning_rate": 0.00015000000000000001,
308
+ "loss": 0.3878,
309
  "step": 390
310
  },
311
  {
312
  "epoch": 2.564102564102564,
313
+ "grad_norm": 12.249879837036133,
314
+ "learning_rate": 0.00014871794871794872,
315
+ "loss": 0.3055,
316
  "step": 400
317
  },
318
  {
319
  "epoch": 2.564102564102564,
320
+ "eval_accuracy": 0.9100719424460432,
321
+ "eval_loss": 0.2733714282512665,
322
+ "eval_runtime": 3.0904,
323
+ "eval_samples_per_second": 89.955,
324
+ "eval_steps_per_second": 11.325,
325
  "step": 400
326
  },
327
  {
328
  "epoch": 2.628205128205128,
329
+ "grad_norm": 3.7284679412841797,
330
+ "learning_rate": 0.00014743589743589745,
331
+ "loss": 0.2669,
332
  "step": 410
333
  },
334
  {
335
  "epoch": 2.6923076923076925,
336
+ "grad_norm": 2.6539294719696045,
337
+ "learning_rate": 0.00014615384615384615,
338
+ "loss": 0.2455,
339
  "step": 420
340
  },
341
  {
342
  "epoch": 2.7564102564102564,
343
+ "grad_norm": 0.7099685668945312,
344
+ "learning_rate": 0.00014487179487179488,
345
+ "loss": 0.3793,
346
  "step": 430
347
  },
348
  {
349
  "epoch": 2.8205128205128203,
350
+ "grad_norm": 3.6865317821502686,
351
+ "learning_rate": 0.0001435897435897436,
352
+ "loss": 0.2432,
353
  "step": 440
354
  },
355
  {
356
  "epoch": 2.8846153846153846,
357
+ "grad_norm": 5.858238697052002,
358
+ "learning_rate": 0.0001423076923076923,
359
+ "loss": 0.2655,
360
  "step": 450
361
  },
362
  {
363
  "epoch": 2.948717948717949,
364
+ "grad_norm": 0.48953068256378174,
365
+ "learning_rate": 0.00014102564102564104,
366
+ "loss": 0.2394,
367
  "step": 460
368
  },
369
  {
370
  "epoch": 3.0128205128205128,
371
+ "grad_norm": 0.3400702476501465,
372
+ "learning_rate": 0.00013974358974358974,
373
+ "loss": 0.2037,
374
  "step": 470
375
  },
376
  {
377
  "epoch": 3.076923076923077,
378
+ "grad_norm": 6.050922393798828,
379
+ "learning_rate": 0.00013846153846153847,
380
+ "loss": 0.1398,
381
  "step": 480
382
  },
383
  {
384
  "epoch": 3.141025641025641,
385
+ "grad_norm": 3.962301731109619,
386
+ "learning_rate": 0.00013717948717948718,
387
+ "loss": 0.2337,
388
  "step": 490
389
  },
390
  {
391
  "epoch": 3.2051282051282053,
392
+ "grad_norm": 0.4021244943141937,
393
+ "learning_rate": 0.0001358974358974359,
394
+ "loss": 0.2407,
395
  "step": 500
396
  },
397
  {
398
  "epoch": 3.2051282051282053,
399
+ "eval_accuracy": 0.9172661870503597,
400
+ "eval_loss": 0.2732398509979248,
401
+ "eval_runtime": 3.5213,
402
+ "eval_samples_per_second": 78.948,
403
+ "eval_steps_per_second": 9.94,
404
  "step": 500
405
  },
406
  {
407
  "epoch": 3.269230769230769,
408
+ "grad_norm": 0.1918177306652069,
409
+ "learning_rate": 0.00013461538461538464,
410
+ "loss": 0.0846,
411
  "step": 510
412
  },
413
  {
414
  "epoch": 3.3333333333333335,
415
+ "grad_norm": 0.13242504000663757,
416
+ "learning_rate": 0.00013333333333333334,
417
+ "loss": 0.1978,
418
  "step": 520
419
  },
420
  {
421
  "epoch": 3.3974358974358974,
422
+ "grad_norm": 6.946755409240723,
423
+ "learning_rate": 0.00013205128205128204,
424
+ "loss": 0.228,
425
  "step": 530
426
  },
427
  {
428
  "epoch": 3.4615384615384617,
429
+ "grad_norm": 7.6276421546936035,
430
+ "learning_rate": 0.00013076923076923077,
431
+ "loss": 0.18,
432
  "step": 540
433
  },
434
  {
435
  "epoch": 3.5256410256410255,
436
+ "grad_norm": 0.35523688793182373,
437
+ "learning_rate": 0.0001294871794871795,
438
+ "loss": 0.2565,
439
  "step": 550
440
  },
441
  {
442
  "epoch": 3.58974358974359,
443
+ "grad_norm": 4.415125370025635,
444
+ "learning_rate": 0.00012820512820512823,
445
+ "loss": 0.2047,
446
  "step": 560
447
  },
448
  {
449
  "epoch": 3.6538461538461537,
450
+ "grad_norm": 0.24991707503795624,
451
+ "learning_rate": 0.00012692307692307693,
452
+ "loss": 0.0867,
453
  "step": 570
454
  },
455
  {
456
  "epoch": 3.717948717948718,
457
+ "grad_norm": 3.979609489440918,
458
+ "learning_rate": 0.00012564102564102564,
459
+ "loss": 0.2181,
460
  "step": 580
461
  },
462
  {
463
  "epoch": 3.782051282051282,
464
+ "grad_norm": 2.4911768436431885,
465
+ "learning_rate": 0.00012435897435897437,
466
+ "loss": 0.2627,
467
  "step": 590
468
  },
469
  {
470
  "epoch": 3.8461538461538463,
471
+ "grad_norm": 0.17239753901958466,
472
+ "learning_rate": 0.0001230769230769231,
473
+ "loss": 0.1367,
474
  "step": 600
475
  },
476
  {
477
  "epoch": 3.8461538461538463,
478
+ "eval_accuracy": 0.8525179856115108,
479
+ "eval_loss": 0.40619969367980957,
480
+ "eval_runtime": 3.7715,
481
+ "eval_samples_per_second": 73.71,
482
+ "eval_steps_per_second": 9.28,
483
  "step": 600
484
  },
485
  {
486
  "epoch": 3.91025641025641,
487
+ "grad_norm": 5.575563907623291,
488
+ "learning_rate": 0.00012179487179487179,
489
+ "loss": 0.1483,
490
  "step": 610
491
  },
492
  {
493
  "epoch": 3.9743589743589745,
494
+ "grad_norm": 6.226486682891846,
495
+ "learning_rate": 0.00012051282051282052,
496
+ "loss": 0.1488,
497
  "step": 620
498
  },
499
  {
500
  "epoch": 4.038461538461538,
501
+ "grad_norm": 4.32499885559082,
502
+ "learning_rate": 0.00011923076923076923,
503
+ "loss": 0.1551,
504
  "step": 630
505
  },
506
  {
507
  "epoch": 4.102564102564102,
508
+ "grad_norm": 1.007263422012329,
509
+ "learning_rate": 0.00011794871794871796,
510
+ "loss": 0.0952,
511
  "step": 640
512
  },
513
  {
514
  "epoch": 4.166666666666667,
515
+ "grad_norm": 0.3149818480014801,
516
+ "learning_rate": 0.00011666666666666668,
517
+ "loss": 0.0885,
518
  "step": 650
519
  },
520
  {
521
  "epoch": 4.230769230769231,
522
+ "grad_norm": 0.10396721214056015,
523
+ "learning_rate": 0.00011538461538461538,
524
+ "loss": 0.0646,
525
  "step": 660
526
  },
527
  {
528
  "epoch": 4.294871794871795,
529
+ "grad_norm": 1.3003551959991455,
530
+ "learning_rate": 0.0001141025641025641,
531
+ "loss": 0.1037,
532
  "step": 670
533
  },
534
  {
535
  "epoch": 4.358974358974359,
536
+ "grad_norm": 0.16181178390979767,
537
+ "learning_rate": 0.00011282051282051283,
538
+ "loss": 0.0971,
539
  "step": 680
540
  },
541
  {
542
  "epoch": 4.423076923076923,
543
+ "grad_norm": 0.11648831516504288,
544
+ "learning_rate": 0.00011153846153846154,
545
+ "loss": 0.1578,
546
  "step": 690
547
  },
548
  {
549
  "epoch": 4.487179487179487,
550
+ "grad_norm": 7.352902889251709,
551
+ "learning_rate": 0.00011025641025641027,
552
+ "loss": 0.0943,
553
  "step": 700
554
  },
555
  {
556
  "epoch": 4.487179487179487,
557
+ "eval_accuracy": 0.9136690647482014,
558
+ "eval_loss": 0.3187541365623474,
559
+ "eval_runtime": 3.5063,
560
+ "eval_samples_per_second": 79.287,
561
+ "eval_steps_per_second": 9.982,
562
  "step": 700
563
  },
564
  {
565
  "epoch": 4.551282051282051,
566
+ "grad_norm": 0.08271457999944687,
567
+ "learning_rate": 0.00010897435897435896,
568
+ "loss": 0.1492,
569
  "step": 710
570
  },
571
  {
572
  "epoch": 4.615384615384615,
573
+ "grad_norm": 3.449918270111084,
574
+ "learning_rate": 0.0001076923076923077,
575
+ "loss": 0.045,
576
  "step": 720
577
  },
578
  {
579
  "epoch": 4.67948717948718,
580
+ "grad_norm": 13.875882148742676,
581
+ "learning_rate": 0.00010641025641025641,
582
+ "loss": 0.0635,
583
  "step": 730
584
  },
585
  {
586
  "epoch": 4.743589743589744,
587
+ "grad_norm": 0.1391572505235672,
588
+ "learning_rate": 0.00010512820512820514,
589
+ "loss": 0.0433,
590
  "step": 740
591
  },
592
  {
593
  "epoch": 4.8076923076923075,
594
+ "grad_norm": 0.06808628886938095,
595
+ "learning_rate": 0.00010384615384615386,
596
+ "loss": 0.0359,
597
  "step": 750
598
  },
599
  {
600
  "epoch": 4.871794871794872,
601
+ "grad_norm": 0.17365112900733948,
602
+ "learning_rate": 0.00010256410256410256,
603
+ "loss": 0.0235,
604
  "step": 760
605
  },
606
  {
607
  "epoch": 4.935897435897436,
608
+ "grad_norm": 0.15800704061985016,
609
+ "learning_rate": 0.00010128205128205129,
610
+ "loss": 0.0686,
611
  "step": 770
612
  },
613
  {
614
  "epoch": 5.0,
615
+ "grad_norm": 0.584504246711731,
616
+ "learning_rate": 0.0001,
617
+ "loss": 0.1028,
618
  "step": 780
619
  },
620
  {
621
  "epoch": 5.064102564102564,
622
+ "grad_norm": 5.587973594665527,
623
+ "learning_rate": 9.871794871794872e-05,
624
+ "loss": 0.0321,
625
  "step": 790
626
  },
627
  {
628
  "epoch": 5.128205128205128,
629
+ "grad_norm": 11.099601745605469,
630
+ "learning_rate": 9.743589743589744e-05,
631
+ "loss": 0.0938,
632
  "step": 800
633
  },
634
  {
635
  "epoch": 5.128205128205128,
636
+ "eval_accuracy": 0.9172661870503597,
637
+ "eval_loss": 0.32109296321868896,
638
+ "eval_runtime": 2.5127,
639
+ "eval_samples_per_second": 110.638,
640
+ "eval_steps_per_second": 13.929,
641
  "step": 800
642
  },
643
  {
644
  "epoch": 5.1923076923076925,
645
+ "grad_norm": 5.029045104980469,
646
+ "learning_rate": 9.615384615384617e-05,
647
+ "loss": 0.0282,
648
  "step": 810
649
  },
650
  {
651
  "epoch": 5.256410256410256,
652
+ "grad_norm": 0.06672952324151993,
653
+ "learning_rate": 9.487179487179487e-05,
654
+ "loss": 0.0204,
655
  "step": 820
656
  },
657
  {
658
  "epoch": 5.32051282051282,
659
+ "grad_norm": 0.3987838923931122,
660
+ "learning_rate": 9.35897435897436e-05,
661
+ "loss": 0.0289,
662
  "step": 830
663
  },
664
  {
665
  "epoch": 5.384615384615385,
666
+ "grad_norm": 0.05064750835299492,
667
+ "learning_rate": 9.230769230769232e-05,
668
+ "loss": 0.0088,
669
  "step": 840
670
  },
671
  {
672
  "epoch": 5.448717948717949,
673
+ "grad_norm": 9.289505004882812,
674
+ "learning_rate": 9.102564102564103e-05,
675
+ "loss": 0.0158,
676
  "step": 850
677
  },
678
  {
679
  "epoch": 5.512820512820513,
680
+ "grad_norm": 0.046484652906656265,
681
+ "learning_rate": 8.974358974358975e-05,
682
+ "loss": 0.0354,
683
  "step": 860
684
  },
685
  {
686
  "epoch": 5.576923076923077,
687
+ "grad_norm": 2.9247965812683105,
688
+ "learning_rate": 8.846153846153847e-05,
689
+ "loss": 0.0583,
690
  "step": 870
691
  },
692
  {
693
  "epoch": 5.641025641025641,
694
+ "grad_norm": 18.817678451538086,
695
+ "learning_rate": 8.717948717948718e-05,
696
+ "loss": 0.2382,
697
  "step": 880
698
  },
699
  {
700
  "epoch": 5.705128205128205,
701
+ "grad_norm": 0.03485775738954544,
702
+ "learning_rate": 8.58974358974359e-05,
703
+ "loss": 0.03,
704
  "step": 890
705
  },
706
  {
707
  "epoch": 5.769230769230769,
708
+ "grad_norm": 0.03652678430080414,
709
+ "learning_rate": 8.461538461538461e-05,
710
+ "loss": 0.0352,
711
  "step": 900
712
  },
713
  {
714
  "epoch": 5.769230769230769,
715
+ "eval_accuracy": 0.9280575539568345,
716
+ "eval_loss": 0.30018866062164307,
717
+ "eval_runtime": 2.4539,
718
+ "eval_samples_per_second": 113.29,
719
+ "eval_steps_per_second": 14.263,
720
  "step": 900
721
  },
722
  {
723
  "epoch": 5.833333333333333,
724
+ "grad_norm": 0.0848119929432869,
725
+ "learning_rate": 8.333333333333334e-05,
726
+ "loss": 0.0352,
727
  "step": 910
728
  },
729
  {
730
  "epoch": 5.897435897435898,
731
+ "grad_norm": 0.04110792279243469,
732
+ "learning_rate": 8.205128205128205e-05,
733
+ "loss": 0.009,
734
  "step": 920
735
  },
736
  {
737
  "epoch": 5.961538461538462,
738
+ "grad_norm": 0.0703083947300911,
739
+ "learning_rate": 8.076923076923078e-05,
740
+ "loss": 0.0361,
741
  "step": 930
742
  },
743
  {
744
  "epoch": 6.0256410256410255,
745
+ "grad_norm": 0.0352785587310791,
746
+ "learning_rate": 7.948717948717948e-05,
747
+ "loss": 0.0073,
748
  "step": 940
749
  },
750
  {
751
  "epoch": 6.089743589743589,
752
+ "grad_norm": 0.03985786810517311,
753
+ "learning_rate": 7.820512820512821e-05,
754
+ "loss": 0.0062,
755
  "step": 950
756
  },
757
  {
758
  "epoch": 6.153846153846154,
759
+ "grad_norm": 1.1201144456863403,
760
+ "learning_rate": 7.692307692307693e-05,
761
+ "loss": 0.0108,
762
  "step": 960
763
  },
764
  {
765
  "epoch": 6.217948717948718,
766
+ "grad_norm": 0.028453074395656586,
767
+ "learning_rate": 7.564102564102564e-05,
768
+ "loss": 0.0067,
769
  "step": 970
770
  },
771
  {
772
  "epoch": 6.282051282051282,
773
+ "grad_norm": 0.11250611394643784,
774
+ "learning_rate": 7.435897435897436e-05,
775
+ "loss": 0.0061,
776
  "step": 980
777
  },
778
  {
779
  "epoch": 6.346153846153846,
780
+ "grad_norm": 0.09943367540836334,
781
+ "learning_rate": 7.307692307692307e-05,
782
+ "loss": 0.0234,
783
  "step": 990
784
  },
785
  {
786
  "epoch": 6.410256410256411,
787
+ "grad_norm": 0.031824991106987,
788
+ "learning_rate": 7.17948717948718e-05,
789
+ "loss": 0.0054,
790
  "step": 1000
791
  },
792
  {
793
  "epoch": 6.410256410256411,
794
  "eval_accuracy": 0.9244604316546763,
795
+ "eval_loss": 0.3863399028778076,
796
+ "eval_runtime": 2.6052,
797
+ "eval_samples_per_second": 106.709,
798
+ "eval_steps_per_second": 13.435,
799
  "step": 1000
800
  },
801
  {
802
  "epoch": 6.4743589743589745,
803
+ "grad_norm": 0.0259853545576334,
804
+ "learning_rate": 7.051282051282052e-05,
805
+ "loss": 0.0134,
806
  "step": 1010
807
  },
808
  {
809
  "epoch": 6.538461538461538,
810
+ "grad_norm": 0.07778998464345932,
811
+ "learning_rate": 6.923076923076924e-05,
812
+ "loss": 0.0055,
813
  "step": 1020
814
  },
815
  {
816
  "epoch": 6.602564102564102,
817
+ "grad_norm": 0.023598596453666687,
818
+ "learning_rate": 6.794871794871795e-05,
819
+ "loss": 0.009,
820
  "step": 1030
821
  },
822
  {
823
  "epoch": 6.666666666666667,
824
+ "grad_norm": 0.02402086742222309,
825
+ "learning_rate": 6.666666666666667e-05,
826
+ "loss": 0.0294,
827
  "step": 1040
828
  },
829
  {
830
  "epoch": 6.730769230769231,
831
+ "grad_norm": 0.02628781832754612,
832
+ "learning_rate": 6.538461538461539e-05,
833
+ "loss": 0.0055,
834
  "step": 1050
835
  },
836
  {
837
  "epoch": 6.794871794871795,
838
+ "grad_norm": 0.021159937605261803,
839
+ "learning_rate": 6.410256410256412e-05,
840
+ "loss": 0.0051,
841
  "step": 1060
842
  },
843
  {
844
  "epoch": 6.858974358974359,
845
+ "grad_norm": 0.027991948649287224,
846
+ "learning_rate": 6.282051282051282e-05,
847
+ "loss": 0.0044,
848
  "step": 1070
849
  },
850
  {
851
  "epoch": 6.923076923076923,
852
+ "grad_norm": 0.021519368514418602,
853
+ "learning_rate": 6.153846153846155e-05,
854
+ "loss": 0.0481,
855
  "step": 1080
856
  },
857
  {
858
  "epoch": 6.987179487179487,
859
+ "grad_norm": 0.02785063348710537,
860
+ "learning_rate": 6.025641025641026e-05,
861
+ "loss": 0.0048,
862
  "step": 1090
863
  },
864
  {
865
  "epoch": 7.051282051282051,
866
+ "grad_norm": 0.029052695259451866,
867
+ "learning_rate": 5.897435897435898e-05,
868
+ "loss": 0.0397,
869
  "step": 1100
870
  },
871
  {
872
  "epoch": 7.051282051282051,
873
+ "eval_accuracy": 0.9316546762589928,
874
+ "eval_loss": 0.37904202938079834,
875
+ "eval_runtime": 3.1468,
876
+ "eval_samples_per_second": 88.343,
877
+ "eval_steps_per_second": 11.122,
878
  "step": 1100
879
  },
880
  {
881
  "epoch": 7.115384615384615,
882
+ "grad_norm": 0.02662370726466179,
883
+ "learning_rate": 5.769230769230769e-05,
884
+ "loss": 0.0052,
885
  "step": 1110
886
  },
887
  {
888
  "epoch": 7.17948717948718,
889
+ "grad_norm": 0.0247439406812191,
890
+ "learning_rate": 5.6410256410256414e-05,
891
+ "loss": 0.0051,
892
  "step": 1120
893
  },
894
  {
895
  "epoch": 7.243589743589744,
896
+ "grad_norm": 0.025030823424458504,
897
+ "learning_rate": 5.512820512820514e-05,
898
+ "loss": 0.0044,
899
  "step": 1130
900
  },
901
  {
902
  "epoch": 7.3076923076923075,
903
+ "grad_norm": 0.030788838863372803,
904
+ "learning_rate": 5.384615384615385e-05,
905
+ "loss": 0.0044,
906
  "step": 1140
907
  },
908
  {
909
  "epoch": 7.371794871794872,
910
+ "grad_norm": 0.07316362112760544,
911
+ "learning_rate": 5.256410256410257e-05,
912
+ "loss": 0.0043,
913
  "step": 1150
914
  },
915
  {
916
  "epoch": 7.435897435897436,
917
+ "grad_norm": 0.024025099352002144,
918
+ "learning_rate": 5.128205128205128e-05,
919
+ "loss": 0.0041,
920
  "step": 1160
921
  },
922
  {
923
  "epoch": 7.5,
924
+ "grad_norm": 0.019680393859744072,
925
+ "learning_rate": 5e-05,
926
+ "loss": 0.0044,
927
  "step": 1170
928
  },
929
  {
930
  "epoch": 7.564102564102564,
931
+ "grad_norm": 0.02100115269422531,
932
+ "learning_rate": 4.871794871794872e-05,
933
+ "loss": 0.004,
934
  "step": 1180
935
  },
936
  {
937
  "epoch": 7.628205128205128,
938
+ "grad_norm": 0.01914617232978344,
939
+ "learning_rate": 4.7435897435897435e-05,
940
+ "loss": 0.0037,
941
  "step": 1190
942
  },
943
  {
944
  "epoch": 7.6923076923076925,
945
+ "grad_norm": 0.018958982080221176,
946
+ "learning_rate": 4.615384615384616e-05,
947
+ "loss": 0.0038,
948
  "step": 1200
949
  },
950
  {
951
  "epoch": 7.6923076923076925,
952
+ "eval_accuracy": 0.9388489208633094,
953
+ "eval_loss": 0.29680585861206055,
954
+ "eval_runtime": 3.2229,
955
+ "eval_samples_per_second": 86.259,
956
+ "eval_steps_per_second": 10.86,
957
  "step": 1200
958
  },
959
  {
960
  "epoch": 7.756410256410256,
961
+ "grad_norm": 0.01917438395321369,
962
+ "learning_rate": 4.4871794871794874e-05,
963
+ "loss": 0.0038,
964
  "step": 1210
965
  },
966
  {
967
  "epoch": 7.82051282051282,
968
+ "grad_norm": 0.01674094796180725,
969
+ "learning_rate": 4.358974358974359e-05,
970
+ "loss": 0.004,
971
  "step": 1220
972
  },
973
  {
974
  "epoch": 7.884615384615385,
975
+ "grad_norm": 0.01729915663599968,
976
+ "learning_rate": 4.230769230769231e-05,
977
+ "loss": 0.0037,
978
  "step": 1230
979
  },
980
  {
981
  "epoch": 7.948717948717949,
982
+ "grad_norm": 0.02169210836291313,
983
+ "learning_rate": 4.1025641025641023e-05,
984
+ "loss": 0.0036,
985
  "step": 1240
986
  },
987
  {
988
  "epoch": 8.012820512820513,
989
+ "grad_norm": 0.017752377316355705,
990
+ "learning_rate": 3.974358974358974e-05,
991
+ "loss": 0.0036,
992
  "step": 1250
993
  },
994
  {
995
  "epoch": 8.076923076923077,
996
+ "grad_norm": 0.017430851235985756,
997
+ "learning_rate": 3.846153846153846e-05,
998
+ "loss": 0.0035,
999
  "step": 1260
1000
  },
1001
  {
1002
  "epoch": 8.14102564102564,
1003
+ "grad_norm": 0.017766138538718224,
1004
+ "learning_rate": 3.717948717948718e-05,
1005
+ "loss": 0.0036,
1006
  "step": 1270
1007
  },
1008
  {
1009
  "epoch": 8.205128205128204,
1010
+ "grad_norm": 0.016512656584382057,
1011
+ "learning_rate": 3.58974358974359e-05,
1012
+ "loss": 0.0034,
1013
  "step": 1280
1014
  },
1015
  {
1016
  "epoch": 8.26923076923077,
1017
+ "grad_norm": 0.018264977261424065,
1018
+ "learning_rate": 3.461538461538462e-05,
1019
+ "loss": 0.0034,
1020
  "step": 1290
1021
  },
1022
  {
1023
  "epoch": 8.333333333333334,
1024
+ "grad_norm": 0.017350930720567703,
1025
+ "learning_rate": 3.3333333333333335e-05,
1026
+ "loss": 0.0035,
1027
  "step": 1300
1028
  },
1029
  {
1030
  "epoch": 8.333333333333334,
1031
+ "eval_accuracy": 0.935251798561151,
1032
+ "eval_loss": 0.2936682403087616,
1033
+ "eval_runtime": 2.3785,
1034
+ "eval_samples_per_second": 116.881,
1035
+ "eval_steps_per_second": 14.715,
1036
  "step": 1300
1037
  },
1038
  {
1039
  "epoch": 8.397435897435898,
1040
+ "grad_norm": 0.017029928043484688,
1041
+ "learning_rate": 3.205128205128206e-05,
1042
+ "loss": 0.0032,
1043
  "step": 1310
1044
  },
1045
  {
1046
  "epoch": 8.461538461538462,
1047
+ "grad_norm": 0.015666915103793144,
1048
+ "learning_rate": 3.0769230769230774e-05,
1049
+ "loss": 0.0032,
1050
  "step": 1320
1051
  },
1052
  {
1053
  "epoch": 8.525641025641026,
1054
+ "grad_norm": 0.01642964966595173,
1055
+ "learning_rate": 2.948717948717949e-05,
1056
+ "loss": 0.0034,
1057
  "step": 1330
1058
  },
1059
  {
1060
  "epoch": 8.58974358974359,
1061
+ "grad_norm": 0.016315065324306488,
1062
+ "learning_rate": 2.8205128205128207e-05,
1063
+ "loss": 0.0033,
1064
  "step": 1340
1065
  },
1066
  {
1067
  "epoch": 8.653846153846153,
1068
+ "grad_norm": 0.014955148100852966,
1069
+ "learning_rate": 2.6923076923076923e-05,
1070
+ "loss": 0.0032,
1071
  "step": 1350
1072
  },
1073
  {
1074
  "epoch": 8.717948717948717,
1075
+ "grad_norm": 0.016276659443974495,
1076
+ "learning_rate": 2.564102564102564e-05,
1077
+ "loss": 0.0031,
1078
  "step": 1360
1079
  },
1080
  {
1081
  "epoch": 8.782051282051283,
1082
+ "grad_norm": 0.015878435224294662,
1083
+ "learning_rate": 2.435897435897436e-05,
1084
+ "loss": 0.0034,
1085
  "step": 1370
1086
  },
1087
  {
1088
  "epoch": 8.846153846153847,
1089
+ "grad_norm": 0.015775034204125404,
1090
+ "learning_rate": 2.307692307692308e-05,
1091
+ "loss": 0.0029,
1092
  "step": 1380
1093
  },
1094
  {
1095
  "epoch": 8.91025641025641,
1096
+ "grad_norm": 0.025302419438958168,
1097
+ "learning_rate": 2.1794871794871795e-05,
1098
+ "loss": 0.0031,
1099
  "step": 1390
1100
  },
1101
  {
1102
  "epoch": 8.974358974358974,
1103
+ "grad_norm": 0.01465103030204773,
1104
+ "learning_rate": 2.0512820512820512e-05,
1105
+ "loss": 0.003,
1106
  "step": 1400
1107
  },
1108
  {
1109
  "epoch": 8.974358974358974,
1110
+ "eval_accuracy": 0.9388489208633094,
1111
+ "eval_loss": 0.3025781810283661,
1112
+ "eval_runtime": 2.9166,
1113
+ "eval_samples_per_second": 95.318,
1114
+ "eval_steps_per_second": 12.0,
1115
  "step": 1400
1116
  },
1117
  {
1118
  "epoch": 9.038461538461538,
1119
+ "grad_norm": 0.016850067302584648,
1120
+ "learning_rate": 1.923076923076923e-05,
1121
+ "loss": 0.0032,
1122
  "step": 1410
1123
  },
1124
  {
1125
  "epoch": 9.102564102564102,
1126
+ "grad_norm": 0.017839446663856506,
1127
+ "learning_rate": 1.794871794871795e-05,
1128
+ "loss": 0.0033,
1129
  "step": 1420
1130
  },
1131
  {
1132
  "epoch": 9.166666666666666,
1133
+ "grad_norm": 0.019131546840071678,
1134
+ "learning_rate": 1.6666666666666667e-05,
1135
+ "loss": 0.0032,
1136
  "step": 1430
1137
  },
1138
  {
1139
  "epoch": 9.23076923076923,
1140
+ "grad_norm": 0.014986937865614891,
1141
+ "learning_rate": 1.5384615384615387e-05,
1142
+ "loss": 0.003,
1143
  "step": 1440
1144
  },
1145
  {
1146
  "epoch": 9.294871794871796,
1147
+ "grad_norm": 0.014039918780326843,
1148
+ "learning_rate": 1.4102564102564104e-05,
1149
+ "loss": 0.0029,
1150
  "step": 1450
1151
  },
1152
  {
1153
  "epoch": 9.35897435897436,
1154
+ "grad_norm": 0.015353621914982796,
1155
+ "learning_rate": 1.282051282051282e-05,
1156
+ "loss": 0.0029,
1157
  "step": 1460
1158
  },
1159
  {
1160
  "epoch": 9.423076923076923,
1161
+ "grad_norm": 0.01887168549001217,
1162
+ "learning_rate": 1.153846153846154e-05,
1163
+ "loss": 0.0031,
1164
  "step": 1470
1165
  },
1166
  {
1167
  "epoch": 9.487179487179487,
1168
+ "grad_norm": 0.01612565666437149,
1169
+ "learning_rate": 1.0256410256410256e-05,
1170
+ "loss": 0.003,
1171
  "step": 1480
1172
  },
1173
  {
1174
  "epoch": 9.551282051282051,
1175
+ "grad_norm": 0.014622623100876808,
1176
+ "learning_rate": 8.974358974358976e-06,
1177
+ "loss": 0.003,
1178
  "step": 1490
1179
  },
1180
  {
1181
  "epoch": 9.615384615384615,
1182
+ "grad_norm": 0.015224343165755272,
1183
+ "learning_rate": 7.692307692307694e-06,
1184
+ "loss": 0.0031,
1185
  "step": 1500
1186
  },
1187
  {
1188
  "epoch": 9.615384615384615,
1189
+ "eval_accuracy": 0.9388489208633094,
1190
+ "eval_loss": 0.3090037703514099,
1191
+ "eval_runtime": 2.3915,
1192
+ "eval_samples_per_second": 116.245,
1193
+ "eval_steps_per_second": 14.635,
1194
  "step": 1500
1195
  },
1196
  {
1197
  "epoch": 9.679487179487179,
1198
+ "grad_norm": 0.013898891396820545,
1199
+ "learning_rate": 6.41025641025641e-06,
1200
+ "loss": 0.0029,
1201
  "step": 1510
1202
  },
1203
  {
1204
  "epoch": 9.743589743589745,
1205
+ "grad_norm": 0.014191491529345512,
1206
+ "learning_rate": 5.128205128205128e-06,
1207
+ "loss": 0.003,
1208
  "step": 1520
1209
  },
1210
  {
1211
  "epoch": 9.807692307692308,
1212
+ "grad_norm": 0.01462490577250719,
1213
+ "learning_rate": 3.846153846153847e-06,
1214
+ "loss": 0.0029,
1215
  "step": 1530
1216
  },
1217
  {
1218
  "epoch": 9.871794871794872,
1219
+ "grad_norm": 0.019841287285089493,
1220
+ "learning_rate": 2.564102564102564e-06,
1221
+ "loss": 0.0029,
1222
  "step": 1540
1223
  },
1224
  {
1225
  "epoch": 9.935897435897436,
1226
+ "grad_norm": 0.015159196220338345,
1227
+ "learning_rate": 1.282051282051282e-06,
1228
+ "loss": 0.003,
1229
  "step": 1550
1230
  },
1231
  {
1232
  "epoch": 10.0,
1233
+ "grad_norm": 0.014964770525693893,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1234
  "learning_rate": 0.0,
1235
+ "loss": 0.0031,
1236
+ "step": 1560
1237
  },
1238
  {
1239
+ "epoch": 10.0,
1240
+ "step": 1560,
1241
+ "total_flos": 1.9334597982400512e+18,
1242
+ "train_loss": 0.21612570865485722,
1243
+ "train_runtime": 723.2579,
1244
+ "train_samples_per_second": 34.497,
1245
+ "train_steps_per_second": 2.157
1246
  }
1247
  ],
1248
  "logging_steps": 10,
1249
+ "max_steps": 1560,
1250
  "num_input_tokens_seen": 0,
1251
+ "num_train_epochs": 10,
1252
  "save_steps": 100,
1253
  "stateful_callbacks": {
1254
  "TrainerControl": {
 
1262
  "attributes": {}
1263
  }
1264
  },
1265
+ "total_flos": 1.9334597982400512e+18,
1266
  "train_batch_size": 16,
1267
  "trial_name": null,
1268
  "trial_params": null