Encore02 commited on
Commit
aabbf6e
·
verified ·
1 Parent(s): fdfc589

🍻 cheers

Browse files
README.md CHANGED
@@ -3,6 +3,7 @@ library_name: transformers
3
  license: apache-2.0
4
  base_model: google/vit-base-patch16-224-in21k
5
  tags:
 
6
  - generated_from_trainer
7
  datasets:
8
  - imagefolder
@@ -23,7 +24,7 @@ model-index:
23
  metrics:
24
  - name: Accuracy
25
  type: accuracy
26
- value: 0.9280575539568345
27
  ---
28
 
29
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -33,8 +34,8 @@ should probably proofread and complete it, then remove this comment. -->
33
 
34
  This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the imagefolder dataset.
35
  It achieves the following results on the evaluation set:
36
- - Loss: 0.4136
37
- - Accuracy: 0.9281
38
 
39
  ## Model description
40
 
 
3
  license: apache-2.0
4
  base_model: google/vit-base-patch16-224-in21k
5
  tags:
6
+ - image-classification
7
  - generated_from_trainer
8
  datasets:
9
  - imagefolder
 
24
  metrics:
25
  - name: Accuracy
26
  type: accuracy
27
+ value: 0.9244604316546763
28
  ---
29
 
30
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
34
 
35
  This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the imagefolder dataset.
36
  It achieves the following results on the evaluation set:
37
+ - Loss: 0.3162
38
+ - Accuracy: 0.9245
39
 
40
  ## Model description
41
 
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 13.0,
3
- "eval_accuracy": 0.6745283018867925,
4
- "eval_loss": 0.7452366948127747,
5
- "eval_runtime": 2.5059,
6
- "eval_samples_per_second": 84.599,
7
- "eval_steps_per_second": 10.774,
8
- "total_flos": 1.9140864535683072e+18,
9
- "train_loss": 0.35688263059153663,
10
- "train_runtime": 955.328,
11
- "train_samples_per_second": 25.855,
12
- "train_steps_per_second": 1.619
13
  }
 
1
  {
2
+ "epoch": 15.0,
3
+ "eval_accuracy": 0.9244604316546763,
4
+ "eval_loss": 0.3162367641925812,
5
+ "eval_runtime": 2.9069,
6
+ "eval_samples_per_second": 95.635,
7
+ "eval_steps_per_second": 12.04,
8
+ "total_flos": 2.900189697360077e+18,
9
+ "train_loss": 0.14551133991808146,
10
+ "train_runtime": 927.2479,
11
+ "train_samples_per_second": 40.361,
12
+ "train_steps_per_second": 2.524
13
  }
data/events.out.tfevents.1730620135.cf96de2eb818.233.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1fc0570e9865f3a2fd0a3427472d0e425fbe34f012b012daf49ebe5ecf21398
3
+ size 411
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 13.0,
3
- "eval_accuracy": 0.6745283018867925,
4
- "eval_loss": 0.7452366948127747,
5
- "eval_runtime": 2.5059,
6
- "eval_samples_per_second": 84.599,
7
- "eval_steps_per_second": 10.774
8
  }
 
1
  {
2
+ "epoch": 15.0,
3
+ "eval_accuracy": 0.9244604316546763,
4
+ "eval_loss": 0.3162367641925812,
5
+ "eval_runtime": 2.9069,
6
+ "eval_samples_per_second": 95.635,
7
+ "eval_steps_per_second": 12.04
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 13.0,
3
- "total_flos": 1.9140864535683072e+18,
4
- "train_loss": 0.35688263059153663,
5
- "train_runtime": 955.328,
6
- "train_samples_per_second": 25.855,
7
- "train_steps_per_second": 1.619
8
  }
 
1
  {
2
+ "epoch": 15.0,
3
+ "total_flos": 2.900189697360077e+18,
4
+ "train_loss": 0.14551133991808146,
5
+ "train_runtime": 927.2479,
6
+ "train_samples_per_second": 40.361,
7
+ "train_steps_per_second": 2.524
8
  }
trainer_state.json CHANGED
@@ -1,1240 +1,1872 @@
1
  {
2
- "best_metric": 0.7452366948127747,
3
- "best_model_checkpoint": "vit-weldclassifyv4/checkpoint-400",
4
- "epoch": 13.0,
5
  "eval_steps": 100,
6
- "global_step": 1547,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.08403361344537816,
13
- "grad_norm": 1.7778676748275757,
14
- "learning_rate": 0.00019870717517776342,
15
- "loss": 1.2807,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.16806722689075632,
20
- "grad_norm": 1.3721851110458374,
21
- "learning_rate": 0.00019741435035552685,
22
- "loss": 1.187,
23
  "step": 20
24
  },
25
  {
26
- "epoch": 0.25210084033613445,
27
- "grad_norm": 0.4043492376804352,
28
- "learning_rate": 0.00019612152553329023,
29
- "loss": 1.2471,
30
  "step": 30
31
  },
32
  {
33
- "epoch": 0.33613445378151263,
34
- "grad_norm": 1.9244325160980225,
35
- "learning_rate": 0.00019482870071105366,
36
- "loss": 1.2329,
37
  "step": 40
38
  },
39
  {
40
- "epoch": 0.42016806722689076,
41
- "grad_norm": 0.6638385653495789,
42
- "learning_rate": 0.00019353587588881707,
43
- "loss": 1.1524,
44
  "step": 50
45
  },
46
  {
47
- "epoch": 0.5042016806722689,
48
- "grad_norm": 0.3663930594921112,
49
- "learning_rate": 0.0001922430510665805,
50
- "loss": 1.1548,
51
  "step": 60
52
  },
53
  {
54
- "epoch": 0.5882352941176471,
55
- "grad_norm": 0.6382243633270264,
56
- "learning_rate": 0.0001909502262443439,
57
- "loss": 1.1993,
58
  "step": 70
59
  },
60
  {
61
- "epoch": 0.6722689075630253,
62
- "grad_norm": 1.2848349809646606,
63
- "learning_rate": 0.0001896574014221073,
64
- "loss": 1.1822,
65
  "step": 80
66
  },
67
  {
68
- "epoch": 0.7563025210084033,
69
- "grad_norm": 1.2714462280273438,
70
- "learning_rate": 0.00018836457659987072,
71
- "loss": 1.1902,
72
  "step": 90
73
  },
74
  {
75
- "epoch": 0.8403361344537815,
76
- "grad_norm": 1.4769024848937988,
77
- "learning_rate": 0.00018707175177763415,
78
- "loss": 1.2099,
79
  "step": 100
80
  },
81
  {
82
- "epoch": 0.8403361344537815,
83
- "eval_accuracy": 0.5047169811320755,
84
- "eval_loss": 1.1625308990478516,
85
- "eval_runtime": 2.2575,
86
- "eval_samples_per_second": 93.909,
87
- "eval_steps_per_second": 11.96,
88
  "step": 100
89
  },
90
  {
91
- "epoch": 0.9243697478991597,
92
- "grad_norm": 1.2741708755493164,
93
- "learning_rate": 0.00018577892695539755,
94
- "loss": 1.2203,
95
  "step": 110
96
  },
97
  {
98
- "epoch": 1.0084033613445378,
99
- "grad_norm": 1.2036206722259521,
100
- "learning_rate": 0.000184486102133161,
101
- "loss": 1.1936,
102
  "step": 120
103
  },
104
  {
105
- "epoch": 1.092436974789916,
106
- "grad_norm": 1.2514188289642334,
107
- "learning_rate": 0.00018319327731092437,
108
- "loss": 1.1515,
109
  "step": 130
110
  },
111
  {
112
- "epoch": 1.1764705882352942,
113
- "grad_norm": 0.7073956727981567,
114
- "learning_rate": 0.0001819004524886878,
115
- "loss": 1.1483,
116
  "step": 140
117
  },
118
  {
119
- "epoch": 1.2605042016806722,
120
- "grad_norm": 0.7465972900390625,
121
- "learning_rate": 0.0001806076276664512,
122
- "loss": 1.1962,
123
  "step": 150
124
  },
125
  {
126
- "epoch": 1.3445378151260505,
127
- "grad_norm": 0.6339373588562012,
128
- "learning_rate": 0.00017931480284421464,
129
- "loss": 1.1156,
130
  "step": 160
131
  },
132
  {
133
- "epoch": 1.4285714285714286,
134
- "grad_norm": 1.036371111869812,
135
- "learning_rate": 0.00017802197802197802,
136
- "loss": 1.1304,
137
  "step": 170
138
  },
139
  {
140
- "epoch": 1.5126050420168067,
141
- "grad_norm": 1.3491630554199219,
142
- "learning_rate": 0.00017672915319974145,
143
- "loss": 1.1783,
144
  "step": 180
145
  },
146
  {
147
- "epoch": 1.596638655462185,
148
- "grad_norm": 1.3167691230773926,
149
- "learning_rate": 0.00017543632837750485,
150
- "loss": 1.1617,
151
  "step": 190
152
  },
153
  {
154
- "epoch": 1.680672268907563,
155
- "grad_norm": 1.0561383962631226,
156
- "learning_rate": 0.00017414350355526826,
157
- "loss": 1.1066,
158
  "step": 200
159
  },
160
  {
161
- "epoch": 1.680672268907563,
162
- "eval_accuracy": 0.49528301886792453,
163
- "eval_loss": 1.0892218351364136,
164
- "eval_runtime": 2.1715,
165
- "eval_samples_per_second": 97.628,
166
- "eval_steps_per_second": 12.434,
167
  "step": 200
168
  },
169
  {
170
- "epoch": 1.7647058823529411,
171
- "grad_norm": 1.2338758707046509,
172
- "learning_rate": 0.0001728506787330317,
173
- "loss": 1.0634,
174
  "step": 210
175
  },
176
  {
177
- "epoch": 1.8487394957983194,
178
- "grad_norm": 1.416668176651001,
179
- "learning_rate": 0.0001715578539107951,
180
- "loss": 1.1408,
181
  "step": 220
182
  },
183
  {
184
- "epoch": 1.9327731092436975,
185
- "grad_norm": 1.9194142818450928,
186
- "learning_rate": 0.0001702650290885585,
187
- "loss": 1.2019,
188
  "step": 230
189
  },
190
  {
191
- "epoch": 2.0168067226890756,
192
- "grad_norm": 1.1238566637039185,
193
- "learning_rate": 0.0001689722042663219,
194
- "loss": 1.0287,
195
  "step": 240
196
  },
197
  {
198
- "epoch": 2.100840336134454,
199
- "grad_norm": 3.9827613830566406,
200
- "learning_rate": 0.00016767937944408534,
201
- "loss": 0.9904,
202
  "step": 250
203
  },
204
  {
205
- "epoch": 2.184873949579832,
206
- "grad_norm": 1.7081505060195923,
207
- "learning_rate": 0.00016638655462184875,
208
- "loss": 1.0905,
209
  "step": 260
210
  },
211
  {
212
- "epoch": 2.26890756302521,
213
- "grad_norm": 2.686239719390869,
214
- "learning_rate": 0.00016509372979961215,
215
- "loss": 0.9177,
216
  "step": 270
217
  },
218
  {
219
- "epoch": 2.3529411764705883,
220
- "grad_norm": 1.3638893365859985,
221
- "learning_rate": 0.00016380090497737556,
222
- "loss": 0.9348,
223
  "step": 280
224
  },
225
  {
226
- "epoch": 2.4369747899159666,
227
- "grad_norm": 2.050823450088501,
228
- "learning_rate": 0.000162508080155139,
229
- "loss": 0.8364,
230
  "step": 290
231
  },
232
  {
233
- "epoch": 2.5210084033613445,
234
- "grad_norm": 1.718785047531128,
235
- "learning_rate": 0.0001612152553329024,
236
- "loss": 1.0298,
237
  "step": 300
238
  },
239
  {
240
- "epoch": 2.5210084033613445,
241
- "eval_accuracy": 0.589622641509434,
242
- "eval_loss": 0.893924355506897,
243
- "eval_runtime": 2.9715,
244
- "eval_samples_per_second": 71.345,
245
- "eval_steps_per_second": 9.086,
246
  "step": 300
247
  },
248
  {
249
- "epoch": 2.6050420168067228,
250
- "grad_norm": 2.593571424484253,
251
- "learning_rate": 0.00015992243051066583,
252
- "loss": 0.9351,
253
  "step": 310
254
  },
255
  {
256
- "epoch": 2.689075630252101,
257
- "grad_norm": 1.2314530611038208,
258
- "learning_rate": 0.0001586296056884292,
259
- "loss": 0.841,
260
  "step": 320
261
  },
262
  {
263
- "epoch": 2.773109243697479,
264
- "grad_norm": 2.589643716812134,
265
- "learning_rate": 0.00015733678086619264,
266
- "loss": 0.7806,
267
  "step": 330
268
  },
269
  {
270
- "epoch": 2.857142857142857,
271
- "grad_norm": 1.8523632287979126,
272
- "learning_rate": 0.00015604395604395605,
273
- "loss": 0.8915,
274
  "step": 340
275
  },
276
  {
277
- "epoch": 2.9411764705882355,
278
- "grad_norm": 2.1867382526397705,
279
- "learning_rate": 0.00015475113122171948,
280
- "loss": 0.8422,
281
  "step": 350
282
  },
283
  {
284
- "epoch": 3.0252100840336134,
285
- "grad_norm": 2.2038803100585938,
286
- "learning_rate": 0.0001534583063994829,
287
- "loss": 0.7271,
288
  "step": 360
289
  },
290
  {
291
- "epoch": 3.1092436974789917,
292
- "grad_norm": 2.8037712574005127,
293
- "learning_rate": 0.0001521654815772463,
294
- "loss": 0.6003,
295
  "step": 370
296
  },
297
  {
298
- "epoch": 3.19327731092437,
299
- "grad_norm": 1.8391917943954468,
300
- "learning_rate": 0.0001508726567550097,
301
- "loss": 0.6642,
302
  "step": 380
303
  },
304
  {
305
- "epoch": 3.277310924369748,
306
- "grad_norm": 4.166950225830078,
307
- "learning_rate": 0.00014957983193277313,
308
- "loss": 0.6483,
309
  "step": 390
310
  },
311
  {
312
- "epoch": 3.361344537815126,
313
- "grad_norm": 2.6970036029815674,
314
- "learning_rate": 0.00014828700711053654,
315
- "loss": 0.5798,
316
  "step": 400
317
  },
318
  {
319
- "epoch": 3.361344537815126,
320
- "eval_accuracy": 0.6745283018867925,
321
- "eval_loss": 0.7452366948127747,
322
- "eval_runtime": 2.3359,
323
- "eval_samples_per_second": 90.759,
324
- "eval_steps_per_second": 11.559,
325
  "step": 400
326
  },
327
  {
328
- "epoch": 3.4453781512605044,
329
- "grad_norm": 2.7985074520111084,
330
- "learning_rate": 0.00014699418228829994,
331
- "loss": 0.5447,
332
  "step": 410
333
  },
334
  {
335
- "epoch": 3.5294117647058822,
336
- "grad_norm": 5.049683094024658,
337
- "learning_rate": 0.00014570135746606335,
338
- "loss": 0.6424,
339
  "step": 420
340
  },
341
  {
342
- "epoch": 3.6134453781512605,
343
- "grad_norm": 2.288046360015869,
344
- "learning_rate": 0.00014440853264382675,
345
- "loss": 0.5786,
346
  "step": 430
347
  },
348
  {
349
- "epoch": 3.697478991596639,
350
- "grad_norm": 2.6284878253936768,
351
- "learning_rate": 0.0001431157078215902,
352
- "loss": 0.4704,
353
  "step": 440
354
  },
355
  {
356
- "epoch": 3.7815126050420167,
357
- "grad_norm": 3.1169135570526123,
358
- "learning_rate": 0.0001418228829993536,
359
- "loss": 0.5602,
360
  "step": 450
361
  },
362
  {
363
- "epoch": 3.865546218487395,
364
- "grad_norm": 4.109696388244629,
365
- "learning_rate": 0.000140530058177117,
366
- "loss": 0.5028,
367
  "step": 460
368
  },
369
  {
370
- "epoch": 3.9495798319327733,
371
- "grad_norm": 2.4809916019439697,
372
- "learning_rate": 0.0001392372333548804,
373
- "loss": 0.5331,
374
  "step": 470
375
  },
376
  {
377
- "epoch": 4.033613445378151,
378
- "grad_norm": 5.215726375579834,
379
- "learning_rate": 0.00013794440853264384,
380
- "loss": 0.6104,
381
  "step": 480
382
  },
383
  {
384
- "epoch": 4.117647058823529,
385
- "grad_norm": 2.3470144271850586,
386
- "learning_rate": 0.00013665158371040724,
387
- "loss": 0.3547,
388
  "step": 490
389
  },
390
  {
391
- "epoch": 4.201680672268908,
392
- "grad_norm": 2.578737258911133,
393
- "learning_rate": 0.00013535875888817068,
394
- "loss": 0.4879,
395
  "step": 500
396
  },
397
  {
398
- "epoch": 4.201680672268908,
399
- "eval_accuracy": 0.6556603773584906,
400
- "eval_loss": 0.8673213720321655,
401
- "eval_runtime": 2.9615,
402
- "eval_samples_per_second": 71.585,
403
- "eval_steps_per_second": 9.117,
404
  "step": 500
405
  },
406
  {
407
- "epoch": 4.285714285714286,
408
- "grad_norm": 8.1809663772583,
409
- "learning_rate": 0.00013406593406593405,
410
- "loss": 0.3164,
411
  "step": 510
412
  },
413
  {
414
- "epoch": 4.369747899159664,
415
- "grad_norm": 5.916128158569336,
416
- "learning_rate": 0.0001327731092436975,
417
- "loss": 0.3425,
418
  "step": 520
419
  },
420
  {
421
- "epoch": 4.453781512605042,
422
- "grad_norm": 1.5418981313705444,
423
- "learning_rate": 0.0001314802844214609,
424
- "loss": 0.3219,
425
  "step": 530
426
  },
427
  {
428
- "epoch": 4.53781512605042,
429
- "grad_norm": 0.39303484559059143,
430
- "learning_rate": 0.00013018745959922433,
431
- "loss": 0.372,
432
  "step": 540
433
  },
434
  {
435
- "epoch": 4.621848739495798,
436
- "grad_norm": 2.0171704292297363,
437
- "learning_rate": 0.00012889463477698773,
438
- "loss": 0.3941,
439
  "step": 550
440
  },
441
  {
442
- "epoch": 4.705882352941177,
443
- "grad_norm": 5.667063236236572,
444
- "learning_rate": 0.00012760180995475114,
445
- "loss": 0.3784,
446
  "step": 560
447
  },
448
  {
449
- "epoch": 4.7899159663865545,
450
- "grad_norm": 2.0401604175567627,
451
- "learning_rate": 0.00012630898513251454,
452
- "loss": 0.3231,
453
  "step": 570
454
  },
455
  {
456
- "epoch": 4.873949579831933,
457
- "grad_norm": 2.5163936614990234,
458
- "learning_rate": 0.00012501616031027798,
459
- "loss": 0.3287,
460
  "step": 580
461
  },
462
  {
463
- "epoch": 4.957983193277311,
464
- "grad_norm": 2.357574939727783,
465
- "learning_rate": 0.00012372333548804138,
466
- "loss": 0.2309,
467
  "step": 590
468
  },
469
  {
470
- "epoch": 5.042016806722689,
471
- "grad_norm": 2.399186611175537,
472
- "learning_rate": 0.0001224305106658048,
473
- "loss": 0.197,
474
  "step": 600
475
  },
476
  {
477
- "epoch": 5.042016806722689,
478
- "eval_accuracy": 0.6556603773584906,
479
- "eval_loss": 1.0144904851913452,
480
- "eval_runtime": 2.2097,
481
- "eval_samples_per_second": 95.94,
482
- "eval_steps_per_second": 12.219,
483
  "step": 600
484
  },
485
  {
486
- "epoch": 5.126050420168067,
487
- "grad_norm": 4.648927688598633,
488
- "learning_rate": 0.00012113768584356819,
489
- "loss": 0.2243,
490
  "step": 610
491
  },
492
  {
493
- "epoch": 5.2100840336134455,
494
- "grad_norm": 5.755702972412109,
495
- "learning_rate": 0.00011984486102133161,
496
- "loss": 0.1775,
497
  "step": 620
498
  },
499
  {
500
- "epoch": 5.294117647058823,
501
- "grad_norm": 5.102352619171143,
502
- "learning_rate": 0.00011855203619909503,
503
- "loss": 0.1982,
504
  "step": 630
505
  },
506
  {
507
- "epoch": 5.378151260504202,
508
- "grad_norm": 2.311920404434204,
509
- "learning_rate": 0.00011725921137685845,
510
- "loss": 0.2125,
511
  "step": 640
512
  },
513
  {
514
- "epoch": 5.46218487394958,
515
- "grad_norm": 5.563356876373291,
516
- "learning_rate": 0.00011596638655462187,
517
- "loss": 0.2259,
518
  "step": 650
519
  },
520
  {
521
- "epoch": 5.546218487394958,
522
- "grad_norm": 5.233443260192871,
523
- "learning_rate": 0.00011467356173238526,
524
- "loss": 0.263,
525
  "step": 660
526
  },
527
  {
528
- "epoch": 5.630252100840336,
529
- "grad_norm": 2.19209361076355,
530
- "learning_rate": 0.00011338073691014868,
531
- "loss": 0.2627,
532
  "step": 670
533
  },
534
  {
535
- "epoch": 5.714285714285714,
536
- "grad_norm": 5.696531772613525,
537
- "learning_rate": 0.0001120879120879121,
538
- "loss": 0.2588,
539
  "step": 680
540
  },
541
  {
542
- "epoch": 5.798319327731092,
543
- "grad_norm": 4.516761302947998,
544
- "learning_rate": 0.0001107950872656755,
545
- "loss": 0.2148,
546
  "step": 690
547
  },
548
  {
549
- "epoch": 5.882352941176471,
550
- "grad_norm": 0.21505996584892273,
551
- "learning_rate": 0.00010950226244343893,
552
- "loss": 0.1368,
553
  "step": 700
554
  },
555
  {
556
- "epoch": 5.882352941176471,
557
- "eval_accuracy": 0.7311320754716981,
558
- "eval_loss": 0.8305109739303589,
559
- "eval_runtime": 2.2559,
560
- "eval_samples_per_second": 93.974,
561
- "eval_steps_per_second": 11.968,
562
  "step": 700
563
  },
564
  {
565
- "epoch": 5.966386554621849,
566
- "grad_norm": 4.3951263427734375,
567
- "learning_rate": 0.00010820943762120233,
568
- "loss": 0.2699,
569
  "step": 710
570
  },
571
  {
572
- "epoch": 6.050420168067227,
573
- "grad_norm": 0.8779445290565491,
574
- "learning_rate": 0.00010691661279896574,
575
- "loss": 0.1172,
576
  "step": 720
577
  },
578
  {
579
- "epoch": 6.1344537815126055,
580
- "grad_norm": 4.695611476898193,
581
- "learning_rate": 0.00010562378797672916,
582
- "loss": 0.13,
583
  "step": 730
584
  },
585
  {
586
- "epoch": 6.218487394957983,
587
- "grad_norm": 7.564522743225098,
588
- "learning_rate": 0.00010433096315449258,
589
- "loss": 0.1392,
590
  "step": 740
591
  },
592
  {
593
- "epoch": 6.302521008403361,
594
- "grad_norm": 0.17681638896465302,
595
- "learning_rate": 0.00010303813833225597,
596
- "loss": 0.0828,
597
  "step": 750
598
  },
599
  {
600
- "epoch": 6.38655462184874,
601
- "grad_norm": 2.515813112258911,
602
- "learning_rate": 0.00010174531351001939,
603
- "loss": 0.1119,
604
  "step": 760
605
  },
606
  {
607
- "epoch": 6.470588235294118,
608
- "grad_norm": 0.3115313649177551,
609
- "learning_rate": 0.0001004524886877828,
610
- "loss": 0.0667,
611
  "step": 770
612
  },
613
  {
614
- "epoch": 6.554621848739496,
615
- "grad_norm": 0.9338003396987915,
616
- "learning_rate": 9.915966386554623e-05,
617
- "loss": 0.0779,
618
  "step": 780
619
  },
620
  {
621
- "epoch": 6.6386554621848735,
622
- "grad_norm": 5.663729190826416,
623
- "learning_rate": 9.786683904330963e-05,
624
- "loss": 0.0949,
625
  "step": 790
626
  },
627
  {
628
- "epoch": 6.722689075630252,
629
- "grad_norm": 1.159752368927002,
630
- "learning_rate": 9.657401422107305e-05,
631
- "loss": 0.0841,
632
  "step": 800
633
  },
634
  {
635
- "epoch": 6.722689075630252,
636
- "eval_accuracy": 0.7735849056603774,
637
- "eval_loss": 0.8974043130874634,
638
- "eval_runtime": 2.2126,
639
- "eval_samples_per_second": 95.816,
640
- "eval_steps_per_second": 12.203,
641
  "step": 800
642
  },
643
  {
644
- "epoch": 6.80672268907563,
645
- "grad_norm": 0.8134496808052063,
646
- "learning_rate": 9.528118939883646e-05,
647
- "loss": 0.1272,
648
  "step": 810
649
  },
650
  {
651
- "epoch": 6.890756302521009,
652
- "grad_norm": 0.09464468061923981,
653
- "learning_rate": 9.398836457659988e-05,
654
- "loss": 0.1339,
655
  "step": 820
656
  },
657
  {
658
- "epoch": 6.974789915966387,
659
- "grad_norm": 0.08403979986906052,
660
- "learning_rate": 9.26955397543633e-05,
661
- "loss": 0.0779,
662
  "step": 830
663
  },
664
  {
665
- "epoch": 7.0588235294117645,
666
- "grad_norm": 0.11395015567541122,
667
- "learning_rate": 9.14027149321267e-05,
668
- "loss": 0.0495,
669
  "step": 840
670
  },
671
  {
672
- "epoch": 7.142857142857143,
673
- "grad_norm": 5.00321102142334,
674
- "learning_rate": 9.010989010989012e-05,
675
- "loss": 0.2217,
676
  "step": 850
677
  },
678
  {
679
- "epoch": 7.226890756302521,
680
- "grad_norm": 5.354154109954834,
681
- "learning_rate": 8.881706528765353e-05,
682
- "loss": 0.0713,
683
  "step": 860
684
  },
685
  {
686
- "epoch": 7.310924369747899,
687
- "grad_norm": 0.07731425017118454,
688
- "learning_rate": 8.752424046541694e-05,
689
- "loss": 0.0482,
690
  "step": 870
691
  },
692
  {
693
- "epoch": 7.394957983193278,
694
- "grad_norm": 1.70600163936615,
695
- "learning_rate": 8.623141564318036e-05,
696
- "loss": 0.0368,
697
  "step": 880
698
  },
699
  {
700
- "epoch": 7.4789915966386555,
701
- "grad_norm": 0.09904234856367111,
702
- "learning_rate": 8.493859082094377e-05,
703
- "loss": 0.0389,
704
  "step": 890
705
  },
706
  {
707
- "epoch": 7.563025210084033,
708
- "grad_norm": 5.335230350494385,
709
- "learning_rate": 8.364576599870718e-05,
710
- "loss": 0.0942,
711
  "step": 900
712
  },
713
  {
714
- "epoch": 7.563025210084033,
715
- "eval_accuracy": 0.7216981132075472,
716
- "eval_loss": 1.1261749267578125,
717
- "eval_runtime": 2.2006,
718
- "eval_samples_per_second": 96.335,
719
- "eval_steps_per_second": 12.269,
720
  "step": 900
721
  },
722
  {
723
- "epoch": 7.647058823529412,
724
- "grad_norm": 5.030584812164307,
725
- "learning_rate": 8.23529411764706e-05,
726
- "loss": 0.0278,
727
  "step": 910
728
  },
729
  {
730
- "epoch": 7.73109243697479,
731
- "grad_norm": 0.12369989603757858,
732
- "learning_rate": 8.1060116354234e-05,
733
- "loss": 0.1137,
734
  "step": 920
735
  },
736
  {
737
- "epoch": 7.815126050420168,
738
- "grad_norm": 7.5863189697265625,
739
- "learning_rate": 7.976729153199742e-05,
740
- "loss": 0.0904,
741
  "step": 930
742
  },
743
  {
744
- "epoch": 7.899159663865547,
745
- "grad_norm": 0.2067825049161911,
746
- "learning_rate": 7.847446670976083e-05,
747
- "loss": 0.0397,
748
  "step": 940
749
  },
750
  {
751
- "epoch": 7.983193277310924,
752
- "grad_norm": 0.056721098721027374,
753
- "learning_rate": 7.718164188752424e-05,
754
- "loss": 0.0679,
755
  "step": 950
756
  },
757
  {
758
- "epoch": 8.067226890756302,
759
- "grad_norm": 0.05310463905334473,
760
- "learning_rate": 7.588881706528765e-05,
761
- "loss": 0.0329,
762
  "step": 960
763
  },
764
  {
765
- "epoch": 8.15126050420168,
766
- "grad_norm": 7.898382663726807,
767
- "learning_rate": 7.459599224305107e-05,
768
- "loss": 0.0183,
769
  "step": 970
770
  },
771
  {
772
- "epoch": 8.235294117647058,
773
- "grad_norm": 2.061277151107788,
774
- "learning_rate": 7.330316742081448e-05,
775
- "loss": 0.0311,
776
  "step": 980
777
  },
778
  {
779
- "epoch": 8.319327731092438,
780
- "grad_norm": 0.06646686792373657,
781
- "learning_rate": 7.20103425985779e-05,
782
- "loss": 0.0334,
783
  "step": 990
784
  },
785
  {
786
- "epoch": 8.403361344537815,
787
- "grad_norm": 0.07112545520067215,
788
- "learning_rate": 7.071751777634131e-05,
789
- "loss": 0.0296,
790
  "step": 1000
791
  },
792
  {
793
- "epoch": 8.403361344537815,
794
- "eval_accuracy": 0.7122641509433962,
795
- "eval_loss": 1.2889635562896729,
796
- "eval_runtime": 2.2011,
797
- "eval_samples_per_second": 96.314,
798
- "eval_steps_per_second": 12.266,
799
  "step": 1000
800
  },
801
  {
802
- "epoch": 8.487394957983193,
803
- "grad_norm": 0.07936228811740875,
804
- "learning_rate": 6.942469295410472e-05,
805
- "loss": 0.0256,
806
  "step": 1010
807
  },
808
  {
809
- "epoch": 8.571428571428571,
810
- "grad_norm": 5.849864959716797,
811
- "learning_rate": 6.813186813186814e-05,
812
- "loss": 0.0346,
813
  "step": 1020
814
  },
815
  {
816
- "epoch": 8.655462184873949,
817
- "grad_norm": 0.05158023163676262,
818
- "learning_rate": 6.683904330963154e-05,
819
- "loss": 0.0109,
820
  "step": 1030
821
  },
822
  {
823
- "epoch": 8.739495798319329,
824
- "grad_norm": 0.05596969276666641,
825
- "learning_rate": 6.554621848739496e-05,
826
- "loss": 0.0129,
827
  "step": 1040
828
  },
829
  {
830
- "epoch": 8.823529411764707,
831
- "grad_norm": 0.05292417109012604,
832
- "learning_rate": 6.425339366515838e-05,
833
- "loss": 0.033,
834
  "step": 1050
835
  },
836
  {
837
- "epoch": 8.907563025210084,
838
- "grad_norm": 0.8892333507537842,
839
- "learning_rate": 6.296056884292179e-05,
840
- "loss": 0.0199,
841
  "step": 1060
842
  },
843
  {
844
- "epoch": 8.991596638655462,
845
- "grad_norm": 1.8524911403656006,
846
- "learning_rate": 6.166774402068521e-05,
847
- "loss": 0.0324,
848
  "step": 1070
849
  },
850
  {
851
- "epoch": 9.07563025210084,
852
- "grad_norm": 0.040928326547145844,
853
- "learning_rate": 6.037491919844861e-05,
854
- "loss": 0.0352,
855
  "step": 1080
856
  },
857
  {
858
- "epoch": 9.159663865546218,
859
- "grad_norm": 0.043698906898498535,
860
- "learning_rate": 5.9082094376212026e-05,
861
- "loss": 0.009,
862
  "step": 1090
863
  },
864
  {
865
- "epoch": 9.243697478991596,
866
- "grad_norm": 0.04034803435206413,
867
- "learning_rate": 5.778926955397543e-05,
868
- "loss": 0.0432,
869
  "step": 1100
870
  },
871
  {
872
- "epoch": 9.243697478991596,
873
- "eval_accuracy": 0.7405660377358491,
874
- "eval_loss": 1.2427575588226318,
875
- "eval_runtime": 2.1879,
876
- "eval_samples_per_second": 96.895,
877
- "eval_steps_per_second": 12.34,
878
  "step": 1100
879
  },
880
  {
881
- "epoch": 9.327731092436975,
882
- "grad_norm": 0.042758312076330185,
883
- "learning_rate": 5.649644473173885e-05,
884
- "loss": 0.0086,
885
  "step": 1110
886
  },
887
  {
888
- "epoch": 9.411764705882353,
889
- "grad_norm": 0.05348571389913559,
890
- "learning_rate": 5.520361990950227e-05,
891
- "loss": 0.0113,
892
  "step": 1120
893
  },
894
  {
895
- "epoch": 9.495798319327731,
896
- "grad_norm": 0.04173032566905022,
897
- "learning_rate": 5.3910795087265676e-05,
898
- "loss": 0.0083,
899
  "step": 1130
900
  },
901
  {
902
- "epoch": 9.579831932773109,
903
- "grad_norm": 0.03784575313329697,
904
- "learning_rate": 5.2617970265029096e-05,
905
- "loss": 0.0086,
906
  "step": 1140
907
  },
908
  {
909
- "epoch": 9.663865546218487,
910
- "grad_norm": 0.05332985520362854,
911
- "learning_rate": 5.13251454427925e-05,
912
- "loss": 0.0086,
913
  "step": 1150
914
  },
915
  {
916
- "epoch": 9.747899159663866,
917
- "grad_norm": 0.03503885120153427,
918
- "learning_rate": 5.0032320620555914e-05,
919
- "loss": 0.0078,
920
  "step": 1160
921
  },
922
  {
923
- "epoch": 9.831932773109244,
924
- "grad_norm": 0.033440928906202316,
925
- "learning_rate": 4.8739495798319326e-05,
926
- "loss": 0.0095,
927
  "step": 1170
928
  },
929
  {
930
- "epoch": 9.915966386554622,
931
- "grad_norm": 0.03903155028820038,
932
- "learning_rate": 4.744667097608274e-05,
933
- "loss": 0.0071,
934
  "step": 1180
935
  },
936
  {
937
- "epoch": 10.0,
938
- "grad_norm": 0.034581057727336884,
939
- "learning_rate": 4.615384615384616e-05,
940
- "loss": 0.0347,
941
  "step": 1190
942
  },
943
  {
944
- "epoch": 10.084033613445378,
945
- "grad_norm": 4.804828643798828,
946
- "learning_rate": 4.486102133160957e-05,
947
- "loss": 0.0353,
948
  "step": 1200
949
  },
950
  {
951
- "epoch": 10.084033613445378,
952
- "eval_accuracy": 0.7452830188679245,
953
- "eval_loss": 1.250637173652649,
954
- "eval_runtime": 2.1411,
955
- "eval_samples_per_second": 99.016,
956
- "eval_steps_per_second": 12.61,
957
  "step": 1200
958
  },
959
  {
960
- "epoch": 10.168067226890756,
961
- "grad_norm": 0.03247935697436333,
962
- "learning_rate": 4.356819650937298e-05,
963
- "loss": 0.0071,
964
  "step": 1210
965
  },
966
  {
967
- "epoch": 10.252100840336134,
968
- "grad_norm": 0.03735749423503876,
969
- "learning_rate": 4.2275371687136396e-05,
970
- "loss": 0.007,
971
  "step": 1220
972
  },
973
  {
974
- "epoch": 10.336134453781513,
975
- "grad_norm": 0.03190077841281891,
976
- "learning_rate": 4.098254686489981e-05,
977
- "loss": 0.0068,
978
  "step": 1230
979
  },
980
  {
981
- "epoch": 10.420168067226891,
982
- "grad_norm": 0.03304820880293846,
983
- "learning_rate": 3.968972204266322e-05,
984
- "loss": 0.0063,
985
  "step": 1240
986
  },
987
  {
988
- "epoch": 10.504201680672269,
989
- "grad_norm": 0.038498662412166595,
990
- "learning_rate": 3.839689722042663e-05,
991
- "loss": 0.0069,
992
  "step": 1250
993
  },
994
  {
995
- "epoch": 10.588235294117647,
996
- "grad_norm": 0.03530021384358406,
997
- "learning_rate": 3.7104072398190046e-05,
998
- "loss": 0.0067,
999
  "step": 1260
1000
  },
1001
  {
1002
- "epoch": 10.672268907563025,
1003
- "grad_norm": 0.041745755821466446,
1004
- "learning_rate": 3.581124757595346e-05,
1005
- "loss": 0.0063,
1006
  "step": 1270
1007
  },
1008
  {
1009
- "epoch": 10.756302521008404,
1010
- "grad_norm": 0.03443057835102081,
1011
- "learning_rate": 3.451842275371687e-05,
1012
- "loss": 0.0062,
1013
  "step": 1280
1014
  },
1015
  {
1016
- "epoch": 10.840336134453782,
1017
- "grad_norm": 0.029045993462204933,
1018
- "learning_rate": 3.322559793148028e-05,
1019
- "loss": 0.0063,
1020
  "step": 1290
1021
  },
1022
  {
1023
- "epoch": 10.92436974789916,
1024
- "grad_norm": 0.04033966362476349,
1025
- "learning_rate": 3.1932773109243696e-05,
1026
- "loss": 0.0065,
1027
  "step": 1300
1028
  },
1029
  {
1030
- "epoch": 10.92436974789916,
1031
- "eval_accuracy": 0.7783018867924528,
1032
- "eval_loss": 1.1232017278671265,
1033
- "eval_runtime": 2.9539,
1034
- "eval_samples_per_second": 71.77,
1035
- "eval_steps_per_second": 9.141,
1036
  "step": 1300
1037
  },
1038
  {
1039
- "epoch": 11.008403361344538,
1040
- "grad_norm": 0.029126280918717384,
1041
- "learning_rate": 3.0639948287007115e-05,
1042
- "loss": 0.0063,
1043
  "step": 1310
1044
  },
1045
  {
1046
- "epoch": 11.092436974789916,
1047
- "grad_norm": 0.02833595871925354,
1048
- "learning_rate": 2.9347123464770527e-05,
1049
- "loss": 0.0063,
1050
  "step": 1320
1051
  },
1052
  {
1053
- "epoch": 11.176470588235293,
1054
- "grad_norm": 0.032052479684352875,
1055
- "learning_rate": 2.805429864253394e-05,
1056
- "loss": 0.0063,
1057
  "step": 1330
1058
  },
1059
  {
1060
- "epoch": 11.260504201680673,
1061
- "grad_norm": 0.030251996591687202,
1062
- "learning_rate": 2.676147382029735e-05,
1063
- "loss": 0.006,
1064
  "step": 1340
1065
  },
1066
  {
1067
- "epoch": 11.344537815126051,
1068
- "grad_norm": 0.030112557113170624,
1069
- "learning_rate": 2.546864899806076e-05,
1070
- "loss": 0.0059,
1071
  "step": 1350
1072
  },
1073
  {
1074
- "epoch": 11.428571428571429,
1075
- "grad_norm": 0.027209602296352386,
1076
- "learning_rate": 2.4175824175824177e-05,
1077
- "loss": 0.0059,
1078
  "step": 1360
1079
  },
1080
  {
1081
- "epoch": 11.512605042016807,
1082
- "grad_norm": 0.027164172381162643,
1083
- "learning_rate": 2.288299935358759e-05,
1084
- "loss": 0.0057,
1085
  "step": 1370
1086
  },
1087
  {
1088
- "epoch": 11.596638655462185,
1089
- "grad_norm": 0.02858646586537361,
1090
- "learning_rate": 2.1590174531351002e-05,
1091
- "loss": 0.0058,
1092
  "step": 1380
1093
  },
1094
  {
1095
- "epoch": 11.680672268907562,
1096
- "grad_norm": 0.02894781529903412,
1097
- "learning_rate": 2.0297349709114415e-05,
1098
- "loss": 0.0055,
1099
  "step": 1390
1100
  },
1101
  {
1102
- "epoch": 11.764705882352942,
1103
- "grad_norm": 0.025563258677721024,
1104
- "learning_rate": 1.9004524886877827e-05,
1105
- "loss": 0.0056,
1106
  "step": 1400
1107
  },
1108
  {
1109
- "epoch": 11.764705882352942,
1110
- "eval_accuracy": 0.7830188679245284,
1111
- "eval_loss": 1.1348851919174194,
1112
- "eval_runtime": 2.1553,
1113
- "eval_samples_per_second": 98.362,
1114
- "eval_steps_per_second": 12.527,
1115
  "step": 1400
1116
  },
1117
  {
1118
- "epoch": 11.84873949579832,
1119
- "grad_norm": 0.027119316160678864,
1120
- "learning_rate": 1.7711700064641243e-05,
1121
- "loss": 0.0056,
1122
  "step": 1410
1123
  },
1124
  {
1125
- "epoch": 11.932773109243698,
1126
- "grad_norm": 0.02663271874189377,
1127
- "learning_rate": 1.6418875242404656e-05,
1128
- "loss": 0.0055,
1129
  "step": 1420
1130
  },
1131
  {
1132
- "epoch": 12.016806722689076,
1133
- "grad_norm": 0.027364488691091537,
1134
- "learning_rate": 1.5126050420168067e-05,
1135
- "loss": 0.0054,
1136
  "step": 1430
1137
  },
1138
  {
1139
- "epoch": 12.100840336134453,
1140
- "grad_norm": 0.02702498808503151,
1141
- "learning_rate": 1.3833225597931483e-05,
1142
- "loss": 0.0057,
1143
  "step": 1440
1144
  },
1145
  {
1146
- "epoch": 12.184873949579831,
1147
- "grad_norm": 0.02570091001689434,
1148
- "learning_rate": 1.2540400775694893e-05,
1149
- "loss": 0.0054,
1150
  "step": 1450
1151
  },
1152
  {
1153
- "epoch": 12.268907563025211,
1154
- "grad_norm": 0.02761007659137249,
1155
- "learning_rate": 1.1247575953458308e-05,
1156
- "loss": 0.0055,
1157
  "step": 1460
1158
  },
1159
  {
1160
- "epoch": 12.352941176470589,
1161
- "grad_norm": 0.02617548778653145,
1162
- "learning_rate": 9.95475113122172e-06,
1163
- "loss": 0.0055,
1164
  "step": 1470
1165
  },
1166
  {
1167
- "epoch": 12.436974789915967,
1168
- "grad_norm": 0.02675885520875454,
1169
- "learning_rate": 8.661926308985133e-06,
1170
- "loss": 0.0056,
1171
  "step": 1480
1172
  },
1173
  {
1174
- "epoch": 12.521008403361344,
1175
- "grad_norm": 0.029071761295199394,
1176
- "learning_rate": 7.369101486748546e-06,
1177
- "loss": 0.0052,
1178
  "step": 1490
1179
  },
1180
  {
1181
- "epoch": 12.605042016806722,
1182
- "grad_norm": 0.02562028169631958,
1183
- "learning_rate": 6.076276664511959e-06,
1184
- "loss": 0.0054,
1185
  "step": 1500
1186
  },
1187
  {
1188
- "epoch": 12.605042016806722,
1189
- "eval_accuracy": 0.7830188679245284,
1190
- "eval_loss": 1.1463406085968018,
1191
- "eval_runtime": 2.157,
1192
- "eval_samples_per_second": 98.284,
1193
- "eval_steps_per_second": 12.517,
1194
  "step": 1500
1195
  },
1196
  {
1197
- "epoch": 12.6890756302521,
1198
- "grad_norm": 0.024869520217180252,
1199
- "learning_rate": 4.783451842275372e-06,
1200
- "loss": 0.0055,
1201
  "step": 1510
1202
  },
1203
  {
1204
- "epoch": 12.77310924369748,
1205
- "grad_norm": 0.02748894691467285,
1206
- "learning_rate": 3.490627020038785e-06,
1207
- "loss": 0.0056,
1208
  "step": 1520
1209
  },
1210
  {
1211
- "epoch": 12.857142857142858,
1212
- "grad_norm": 0.026005534455180168,
1213
- "learning_rate": 2.197802197802198e-06,
1214
- "loss": 0.0056,
1215
  "step": 1530
1216
  },
1217
  {
1218
- "epoch": 12.941176470588236,
1219
- "grad_norm": 0.028039414435625076,
1220
- "learning_rate": 9.04977375565611e-07,
1221
- "loss": 0.0054,
1222
  "step": 1540
1223
  },
1224
  {
1225
- "epoch": 13.0,
1226
- "step": 1547,
1227
- "total_flos": 1.9140864535683072e+18,
1228
- "train_loss": 0.35688263059153663,
1229
- "train_runtime": 955.328,
1230
- "train_samples_per_second": 25.855,
1231
- "train_steps_per_second": 1.619
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1232
  }
1233
  ],
1234
  "logging_steps": 10,
1235
- "max_steps": 1547,
1236
  "num_input_tokens_seen": 0,
1237
- "num_train_epochs": 13,
1238
  "save_steps": 100,
1239
  "stateful_callbacks": {
1240
  "TrainerControl": {
@@ -1248,7 +1880,7 @@
1248
  "attributes": {}
1249
  }
1250
  },
1251
- "total_flos": 1.9140864535683072e+18,
1252
  "train_batch_size": 16,
1253
  "trial_name": null,
1254
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.3162367641925812,
3
+ "best_model_checkpoint": "vit-weldclassifyv4/checkpoint-1000",
4
+ "epoch": 15.0,
5
  "eval_steps": 100,
6
+ "global_step": 2340,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.0641025641025641,
13
+ "grad_norm": 1.9132781028747559,
14
+ "learning_rate": 0.00019914529914529915,
15
+ "loss": 1.2054,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 0.1282051282051282,
20
+ "grad_norm": 1.633124589920044,
21
+ "learning_rate": 0.0001982905982905983,
22
+ "loss": 1.0748,
23
  "step": 20
24
  },
25
  {
26
+ "epoch": 0.19230769230769232,
27
+ "grad_norm": 2.4126267433166504,
28
+ "learning_rate": 0.00019743589743589744,
29
+ "loss": 1.0973,
30
  "step": 30
31
  },
32
  {
33
+ "epoch": 0.2564102564102564,
34
+ "grad_norm": 3.881457567214966,
35
+ "learning_rate": 0.00019658119658119659,
36
+ "loss": 1.0609,
37
  "step": 40
38
  },
39
  {
40
+ "epoch": 0.32051282051282054,
41
+ "grad_norm": 3.1995434761047363,
42
+ "learning_rate": 0.00019572649572649573,
43
+ "loss": 1.0024,
44
  "step": 50
45
  },
46
  {
47
+ "epoch": 0.38461538461538464,
48
+ "grad_norm": 2.410505533218384,
49
+ "learning_rate": 0.00019487179487179487,
50
+ "loss": 0.8658,
51
  "step": 60
52
  },
53
  {
54
+ "epoch": 0.44871794871794873,
55
+ "grad_norm": 2.05910325050354,
56
+ "learning_rate": 0.00019401709401709402,
57
+ "loss": 0.9616,
58
  "step": 70
59
  },
60
  {
61
+ "epoch": 0.5128205128205128,
62
+ "grad_norm": 4.032101154327393,
63
+ "learning_rate": 0.00019316239316239316,
64
+ "loss": 0.9391,
65
  "step": 80
66
  },
67
  {
68
+ "epoch": 0.5769230769230769,
69
+ "grad_norm": 2.779008150100708,
70
+ "learning_rate": 0.00019230769230769233,
71
+ "loss": 0.9206,
72
  "step": 90
73
  },
74
  {
75
+ "epoch": 0.6410256410256411,
76
+ "grad_norm": 2.771672010421753,
77
+ "learning_rate": 0.00019145299145299148,
78
+ "loss": 0.8146,
79
  "step": 100
80
  },
81
  {
82
+ "epoch": 0.6410256410256411,
83
+ "eval_accuracy": 0.6834532374100719,
84
+ "eval_loss": 0.7348725199699402,
85
+ "eval_runtime": 2.6163,
86
+ "eval_samples_per_second": 106.259,
87
+ "eval_steps_per_second": 13.378,
88
  "step": 100
89
  },
90
  {
91
+ "epoch": 0.7051282051282052,
92
+ "grad_norm": 2.4781739711761475,
93
+ "learning_rate": 0.0001905982905982906,
94
+ "loss": 0.7542,
95
  "step": 110
96
  },
97
  {
98
+ "epoch": 0.7692307692307693,
99
+ "grad_norm": 2.8731400966644287,
100
+ "learning_rate": 0.00018974358974358974,
101
+ "loss": 0.7726,
102
  "step": 120
103
  },
104
  {
105
+ "epoch": 0.8333333333333334,
106
+ "grad_norm": 2.480015277862549,
107
+ "learning_rate": 0.00018888888888888888,
108
+ "loss": 0.6705,
109
  "step": 130
110
  },
111
  {
112
+ "epoch": 0.8974358974358975,
113
+ "grad_norm": 3.7568867206573486,
114
+ "learning_rate": 0.00018803418803418803,
115
+ "loss": 0.6004,
116
  "step": 140
117
  },
118
  {
119
+ "epoch": 0.9615384615384616,
120
+ "grad_norm": 2.720820903778076,
121
+ "learning_rate": 0.0001871794871794872,
122
+ "loss": 0.8144,
123
  "step": 150
124
  },
125
  {
126
+ "epoch": 1.0256410256410255,
127
+ "grad_norm": 1.9295154809951782,
128
+ "learning_rate": 0.00018632478632478634,
129
+ "loss": 0.6479,
130
  "step": 160
131
  },
132
  {
133
+ "epoch": 1.0897435897435896,
134
+ "grad_norm": 3.0400049686431885,
135
+ "learning_rate": 0.0001854700854700855,
136
+ "loss": 0.522,
137
  "step": 170
138
  },
139
  {
140
+ "epoch": 1.1538461538461537,
141
+ "grad_norm": 3.7371773719787598,
142
+ "learning_rate": 0.00018461538461538463,
143
+ "loss": 0.5998,
144
  "step": 180
145
  },
146
  {
147
+ "epoch": 1.217948717948718,
148
+ "grad_norm": 2.992065668106079,
149
+ "learning_rate": 0.00018376068376068375,
150
+ "loss": 0.6268,
151
  "step": 190
152
  },
153
  {
154
+ "epoch": 1.282051282051282,
155
+ "grad_norm": 2.213074207305908,
156
+ "learning_rate": 0.00018290598290598292,
157
+ "loss": 0.6048,
158
  "step": 200
159
  },
160
  {
161
+ "epoch": 1.282051282051282,
162
+ "eval_accuracy": 0.697841726618705,
163
+ "eval_loss": 0.6820898056030273,
164
+ "eval_runtime": 2.7768,
165
+ "eval_samples_per_second": 100.114,
166
+ "eval_steps_per_second": 12.604,
167
  "step": 200
168
  },
169
  {
170
+ "epoch": 1.3461538461538463,
171
+ "grad_norm": 3.2302353382110596,
172
+ "learning_rate": 0.00018205128205128207,
173
+ "loss": 0.6054,
174
  "step": 210
175
  },
176
  {
177
+ "epoch": 1.4102564102564101,
178
+ "grad_norm": 3.9419608116149902,
179
+ "learning_rate": 0.0001811965811965812,
180
+ "loss": 0.5863,
181
  "step": 220
182
  },
183
  {
184
+ "epoch": 1.4743589743589745,
185
+ "grad_norm": 2.5351428985595703,
186
+ "learning_rate": 0.00018034188034188035,
187
+ "loss": 0.4328,
188
  "step": 230
189
  },
190
  {
191
+ "epoch": 1.5384615384615383,
192
+ "grad_norm": 2.677548885345459,
193
+ "learning_rate": 0.0001794871794871795,
194
+ "loss": 0.4744,
195
  "step": 240
196
  },
197
  {
198
+ "epoch": 1.6025641025641026,
199
+ "grad_norm": 2.3627212047576904,
200
+ "learning_rate": 0.00017863247863247864,
201
+ "loss": 0.3523,
202
  "step": 250
203
  },
204
  {
205
+ "epoch": 1.6666666666666665,
206
+ "grad_norm": 6.175805568695068,
207
+ "learning_rate": 0.00017777777777777779,
208
+ "loss": 0.5934,
209
  "step": 260
210
  },
211
  {
212
+ "epoch": 1.7307692307692308,
213
+ "grad_norm": 2.920872211456299,
214
+ "learning_rate": 0.00017692307692307693,
215
+ "loss": 0.5758,
216
  "step": 270
217
  },
218
  {
219
+ "epoch": 1.7948717948717947,
220
+ "grad_norm": 2.1116819381713867,
221
+ "learning_rate": 0.00017606837606837607,
222
+ "loss": 0.5086,
223
  "step": 280
224
  },
225
  {
226
+ "epoch": 1.858974358974359,
227
+ "grad_norm": 3.4553894996643066,
228
+ "learning_rate": 0.00017521367521367522,
229
+ "loss": 0.4474,
230
  "step": 290
231
  },
232
  {
233
+ "epoch": 1.9230769230769231,
234
+ "grad_norm": 2.4064671993255615,
235
+ "learning_rate": 0.00017435897435897436,
236
+ "loss": 0.4796,
237
  "step": 300
238
  },
239
  {
240
+ "epoch": 1.9230769230769231,
241
+ "eval_accuracy": 0.8129496402877698,
242
+ "eval_loss": 0.48327746987342834,
243
+ "eval_runtime": 2.565,
244
+ "eval_samples_per_second": 108.383,
245
+ "eval_steps_per_second": 13.645,
246
  "step": 300
247
  },
248
  {
249
+ "epoch": 1.9871794871794872,
250
+ "grad_norm": 3.8495571613311768,
251
+ "learning_rate": 0.0001735042735042735,
252
+ "loss": 0.4925,
253
  "step": 310
254
  },
255
  {
256
+ "epoch": 2.051282051282051,
257
+ "grad_norm": 2.036381244659424,
258
+ "learning_rate": 0.00017264957264957268,
259
+ "loss": 0.3936,
260
  "step": 320
261
  },
262
  {
263
+ "epoch": 2.1153846153846154,
264
+ "grad_norm": 3.381953001022339,
265
+ "learning_rate": 0.0001717948717948718,
266
+ "loss": 0.3416,
267
  "step": 330
268
  },
269
  {
270
+ "epoch": 2.1794871794871793,
271
+ "grad_norm": 5.715399265289307,
272
+ "learning_rate": 0.00017094017094017094,
273
+ "loss": 0.4081,
274
  "step": 340
275
  },
276
  {
277
+ "epoch": 2.2435897435897436,
278
+ "grad_norm": 2.237466335296631,
279
+ "learning_rate": 0.00017008547008547008,
280
+ "loss": 0.3313,
281
  "step": 350
282
  },
283
  {
284
+ "epoch": 2.3076923076923075,
285
+ "grad_norm": 5.658877372741699,
286
+ "learning_rate": 0.00016923076923076923,
287
+ "loss": 0.3926,
288
  "step": 360
289
  },
290
  {
291
+ "epoch": 2.371794871794872,
292
+ "grad_norm": 3.633448362350464,
293
+ "learning_rate": 0.00016837606837606837,
294
+ "loss": 0.4664,
295
  "step": 370
296
  },
297
  {
298
+ "epoch": 2.435897435897436,
299
+ "grad_norm": 4.460226058959961,
300
+ "learning_rate": 0.00016752136752136754,
301
+ "loss": 0.4206,
302
  "step": 380
303
  },
304
  {
305
+ "epoch": 2.5,
306
+ "grad_norm": 1.1033204793930054,
307
+ "learning_rate": 0.0001666666666666667,
308
+ "loss": 0.2678,
309
  "step": 390
310
  },
311
  {
312
+ "epoch": 2.564102564102564,
313
+ "grad_norm": 6.69362735748291,
314
+ "learning_rate": 0.00016581196581196583,
315
+ "loss": 0.4532,
316
  "step": 400
317
  },
318
  {
319
+ "epoch": 2.564102564102564,
320
+ "eval_accuracy": 0.802158273381295,
321
+ "eval_loss": 0.5380275249481201,
322
+ "eval_runtime": 3.354,
323
+ "eval_samples_per_second": 82.885,
324
+ "eval_steps_per_second": 10.435,
325
  "step": 400
326
  },
327
  {
328
+ "epoch": 2.628205128205128,
329
+ "grad_norm": 2.1671810150146484,
330
+ "learning_rate": 0.00016495726495726495,
331
+ "loss": 0.2966,
332
  "step": 410
333
  },
334
  {
335
+ "epoch": 2.6923076923076925,
336
+ "grad_norm": 4.671816825866699,
337
+ "learning_rate": 0.0001641025641025641,
338
+ "loss": 0.3559,
339
  "step": 420
340
  },
341
  {
342
+ "epoch": 2.7564102564102564,
343
+ "grad_norm": 2.6795239448547363,
344
+ "learning_rate": 0.00016324786324786327,
345
+ "loss": 0.2111,
346
  "step": 430
347
  },
348
  {
349
+ "epoch": 2.8205128205128203,
350
+ "grad_norm": 1.8368570804595947,
351
+ "learning_rate": 0.0001623931623931624,
352
+ "loss": 0.2227,
353
  "step": 440
354
  },
355
  {
356
+ "epoch": 2.8846153846153846,
357
+ "grad_norm": 0.7519993185997009,
358
+ "learning_rate": 0.00016153846153846155,
359
+ "loss": 0.2185,
360
  "step": 450
361
  },
362
  {
363
+ "epoch": 2.948717948717949,
364
+ "grad_norm": 4.014621734619141,
365
+ "learning_rate": 0.0001606837606837607,
366
+ "loss": 0.2038,
367
  "step": 460
368
  },
369
  {
370
+ "epoch": 3.0128205128205128,
371
+ "grad_norm": 2.412414073944092,
372
+ "learning_rate": 0.00015982905982905984,
373
+ "loss": 0.1874,
374
  "step": 470
375
  },
376
  {
377
+ "epoch": 3.076923076923077,
378
+ "grad_norm": 3.7715134620666504,
379
+ "learning_rate": 0.00015897435897435896,
380
+ "loss": 0.1546,
381
  "step": 480
382
  },
383
  {
384
+ "epoch": 3.141025641025641,
385
+ "grad_norm": 1.5307694673538208,
386
+ "learning_rate": 0.00015811965811965813,
387
+ "loss": 0.1115,
388
  "step": 490
389
  },
390
  {
391
+ "epoch": 3.2051282051282053,
392
+ "grad_norm": 2.7572405338287354,
393
+ "learning_rate": 0.00015726495726495727,
394
+ "loss": 0.1242,
395
  "step": 500
396
  },
397
  {
398
+ "epoch": 3.2051282051282053,
399
+ "eval_accuracy": 0.8741007194244604,
400
+ "eval_loss": 0.3899326026439667,
401
+ "eval_runtime": 2.8072,
402
+ "eval_samples_per_second": 99.033,
403
+ "eval_steps_per_second": 12.468,
404
  "step": 500
405
  },
406
  {
407
+ "epoch": 3.269230769230769,
408
+ "grad_norm": 4.4104390144348145,
409
+ "learning_rate": 0.00015641025641025642,
410
+ "loss": 0.1986,
411
  "step": 510
412
  },
413
  {
414
+ "epoch": 3.3333333333333335,
415
+ "grad_norm": 0.8930767774581909,
416
+ "learning_rate": 0.00015555555555555556,
417
+ "loss": 0.0582,
418
  "step": 520
419
  },
420
  {
421
+ "epoch": 3.3974358974358974,
422
+ "grad_norm": 8.353619575500488,
423
+ "learning_rate": 0.0001547008547008547,
424
+ "loss": 0.2485,
425
  "step": 530
426
  },
427
  {
428
+ "epoch": 3.4615384615384617,
429
+ "grad_norm": 0.09837932884693146,
430
+ "learning_rate": 0.00015384615384615385,
431
+ "loss": 0.1007,
432
  "step": 540
433
  },
434
  {
435
+ "epoch": 3.5256410256410255,
436
+ "grad_norm": 3.90265154838562,
437
+ "learning_rate": 0.000152991452991453,
438
+ "loss": 0.2279,
439
  "step": 550
440
  },
441
  {
442
+ "epoch": 3.58974358974359,
443
+ "grad_norm": 6.65275764465332,
444
+ "learning_rate": 0.00015213675213675214,
445
+ "loss": 0.1781,
446
  "step": 560
447
  },
448
  {
449
+ "epoch": 3.6538461538461537,
450
+ "grad_norm": 3.493739604949951,
451
+ "learning_rate": 0.00015128205128205128,
452
+ "loss": 0.2098,
453
  "step": 570
454
  },
455
  {
456
+ "epoch": 3.717948717948718,
457
+ "grad_norm": 4.4887614250183105,
458
+ "learning_rate": 0.00015042735042735043,
459
+ "loss": 0.1899,
460
  "step": 580
461
  },
462
  {
463
+ "epoch": 3.782051282051282,
464
+ "grad_norm": 1.8387681245803833,
465
+ "learning_rate": 0.00014957264957264957,
466
+ "loss": 0.1831,
467
  "step": 590
468
  },
469
  {
470
+ "epoch": 3.8461538461538463,
471
+ "grad_norm": 5.91892671585083,
472
+ "learning_rate": 0.00014871794871794872,
473
+ "loss": 0.124,
474
  "step": 600
475
  },
476
  {
477
+ "epoch": 3.8461538461538463,
478
+ "eval_accuracy": 0.8273381294964028,
479
+ "eval_loss": 0.523663341999054,
480
+ "eval_runtime": 2.6143,
481
+ "eval_samples_per_second": 106.339,
482
+ "eval_steps_per_second": 13.388,
483
  "step": 600
484
  },
485
  {
486
+ "epoch": 3.91025641025641,
487
+ "grad_norm": 0.5099517703056335,
488
+ "learning_rate": 0.0001478632478632479,
489
+ "loss": 0.1566,
490
  "step": 610
491
  },
492
  {
493
+ "epoch": 3.9743589743589745,
494
+ "grad_norm": 0.4991530179977417,
495
+ "learning_rate": 0.00014700854700854703,
496
+ "loss": 0.1168,
497
  "step": 620
498
  },
499
  {
500
+ "epoch": 4.038461538461538,
501
+ "grad_norm": 4.448193550109863,
502
+ "learning_rate": 0.00014615384615384615,
503
+ "loss": 0.1044,
504
  "step": 630
505
  },
506
  {
507
+ "epoch": 4.102564102564102,
508
+ "grad_norm": 0.5183725357055664,
509
+ "learning_rate": 0.0001452991452991453,
510
+ "loss": 0.1078,
511
  "step": 640
512
  },
513
  {
514
+ "epoch": 4.166666666666667,
515
+ "grad_norm": 0.9525802135467529,
516
+ "learning_rate": 0.00014444444444444444,
517
+ "loss": 0.0967,
518
  "step": 650
519
  },
520
  {
521
+ "epoch": 4.230769230769231,
522
+ "grad_norm": 0.5207259058952332,
523
+ "learning_rate": 0.0001435897435897436,
524
+ "loss": 0.0973,
525
  "step": 660
526
  },
527
  {
528
+ "epoch": 4.294871794871795,
529
+ "grad_norm": 1.3160842657089233,
530
+ "learning_rate": 0.00014273504273504275,
531
+ "loss": 0.1256,
532
  "step": 670
533
  },
534
  {
535
+ "epoch": 4.358974358974359,
536
+ "grad_norm": 2.892195463180542,
537
+ "learning_rate": 0.0001418803418803419,
538
+ "loss": 0.1178,
539
  "step": 680
540
  },
541
  {
542
+ "epoch": 4.423076923076923,
543
+ "grad_norm": 3.8142576217651367,
544
+ "learning_rate": 0.00014102564102564104,
545
+ "loss": 0.0594,
546
  "step": 690
547
  },
548
  {
549
+ "epoch": 4.487179487179487,
550
+ "grad_norm": 0.11079952865839005,
551
+ "learning_rate": 0.00014017094017094016,
552
+ "loss": 0.1239,
553
  "step": 700
554
  },
555
  {
556
+ "epoch": 4.487179487179487,
557
+ "eval_accuracy": 0.8848920863309353,
558
+ "eval_loss": 0.4221162796020508,
559
+ "eval_runtime": 2.7273,
560
+ "eval_samples_per_second": 101.932,
561
+ "eval_steps_per_second": 12.833,
562
  "step": 700
563
  },
564
  {
565
+ "epoch": 4.551282051282051,
566
+ "grad_norm": 9.544878959655762,
567
+ "learning_rate": 0.0001393162393162393,
568
+ "loss": 0.1185,
569
  "step": 710
570
  },
571
  {
572
+ "epoch": 4.615384615384615,
573
+ "grad_norm": 0.06085001304745674,
574
+ "learning_rate": 0.00013846153846153847,
575
+ "loss": 0.0499,
576
  "step": 720
577
  },
578
  {
579
+ "epoch": 4.67948717948718,
580
+ "grad_norm": 12.285767555236816,
581
+ "learning_rate": 0.00013760683760683762,
582
+ "loss": 0.1623,
583
  "step": 730
584
  },
585
  {
586
+ "epoch": 4.743589743589744,
587
+ "grad_norm": 1.4333381652832031,
588
+ "learning_rate": 0.00013675213675213676,
589
+ "loss": 0.0903,
590
  "step": 740
591
  },
592
  {
593
+ "epoch": 4.8076923076923075,
594
+ "grad_norm": 0.37026920914649963,
595
+ "learning_rate": 0.0001358974358974359,
596
+ "loss": 0.082,
597
  "step": 750
598
  },
599
  {
600
+ "epoch": 4.871794871794872,
601
+ "grad_norm": 7.013845443725586,
602
+ "learning_rate": 0.00013504273504273505,
603
+ "loss": 0.0443,
604
  "step": 760
605
  },
606
  {
607
+ "epoch": 4.935897435897436,
608
+ "grad_norm": 0.3148520588874817,
609
+ "learning_rate": 0.0001341880341880342,
610
+ "loss": 0.1237,
611
  "step": 770
612
  },
613
  {
614
+ "epoch": 5.0,
615
+ "grad_norm": 0.5136359930038452,
616
+ "learning_rate": 0.00013333333333333334,
617
+ "loss": 0.0592,
618
  "step": 780
619
  },
620
  {
621
+ "epoch": 5.064102564102564,
622
+ "grad_norm": 7.698183536529541,
623
+ "learning_rate": 0.00013247863247863248,
624
+ "loss": 0.0742,
625
  "step": 790
626
  },
627
  {
628
+ "epoch": 5.128205128205128,
629
+ "grad_norm": 0.05358889326453209,
630
+ "learning_rate": 0.00013162393162393163,
631
+ "loss": 0.0785,
632
  "step": 800
633
  },
634
  {
635
+ "epoch": 5.128205128205128,
636
+ "eval_accuracy": 0.9136690647482014,
637
+ "eval_loss": 0.36830753087997437,
638
+ "eval_runtime": 3.1493,
639
+ "eval_samples_per_second": 88.273,
640
+ "eval_steps_per_second": 11.114,
641
  "step": 800
642
  },
643
  {
644
+ "epoch": 5.1923076923076925,
645
+ "grad_norm": 0.045300308614969254,
646
+ "learning_rate": 0.00013076923076923077,
647
+ "loss": 0.1987,
648
  "step": 810
649
  },
650
  {
651
+ "epoch": 5.256410256410256,
652
+ "grad_norm": 6.118052959442139,
653
+ "learning_rate": 0.00012991452991452992,
654
+ "loss": 0.0708,
655
  "step": 820
656
  },
657
  {
658
+ "epoch": 5.32051282051282,
659
+ "grad_norm": 0.36830875277519226,
660
+ "learning_rate": 0.00012905982905982906,
661
+ "loss": 0.0329,
662
  "step": 830
663
  },
664
  {
665
+ "epoch": 5.384615384615385,
666
+ "grad_norm": 0.5043929219245911,
667
+ "learning_rate": 0.00012820512820512823,
668
+ "loss": 0.0546,
669
  "step": 840
670
  },
671
  {
672
+ "epoch": 5.448717948717949,
673
+ "grad_norm": 5.8541035652160645,
674
+ "learning_rate": 0.00012735042735042735,
675
+ "loss": 0.0589,
676
  "step": 850
677
  },
678
  {
679
+ "epoch": 5.512820512820513,
680
+ "grad_norm": 0.09965494275093079,
681
+ "learning_rate": 0.0001264957264957265,
682
+ "loss": 0.0257,
683
  "step": 860
684
  },
685
  {
686
+ "epoch": 5.576923076923077,
687
+ "grad_norm": 0.03202090039849281,
688
+ "learning_rate": 0.00012564102564102564,
689
+ "loss": 0.0349,
690
  "step": 870
691
  },
692
  {
693
+ "epoch": 5.641025641025641,
694
+ "grad_norm": 7.21024751663208,
695
+ "learning_rate": 0.00012478632478632478,
696
+ "loss": 0.081,
697
  "step": 880
698
  },
699
  {
700
+ "epoch": 5.705128205128205,
701
+ "grad_norm": 0.03198171406984329,
702
+ "learning_rate": 0.00012393162393162395,
703
+ "loss": 0.037,
704
  "step": 890
705
  },
706
  {
707
+ "epoch": 5.769230769230769,
708
+ "grad_norm": 1.1413763761520386,
709
+ "learning_rate": 0.0001230769230769231,
710
+ "loss": 0.093,
711
  "step": 900
712
  },
713
  {
714
+ "epoch": 5.769230769230769,
715
+ "eval_accuracy": 0.8597122302158273,
716
+ "eval_loss": 0.6375630497932434,
717
+ "eval_runtime": 2.5134,
718
+ "eval_samples_per_second": 110.607,
719
+ "eval_steps_per_second": 13.925,
720
  "step": 900
721
  },
722
  {
723
+ "epoch": 5.833333333333333,
724
+ "grad_norm": 0.030478307977318764,
725
+ "learning_rate": 0.00012222222222222224,
726
+ "loss": 0.1015,
727
  "step": 910
728
  },
729
  {
730
+ "epoch": 5.897435897435898,
731
+ "grad_norm": 7.971870422363281,
732
+ "learning_rate": 0.00012136752136752136,
733
+ "loss": 0.0421,
734
  "step": 920
735
  },
736
  {
737
+ "epoch": 5.961538461538462,
738
+ "grad_norm": 0.7655214667320251,
739
+ "learning_rate": 0.00012051282051282052,
740
+ "loss": 0.0315,
741
  "step": 930
742
  },
743
  {
744
+ "epoch": 6.0256410256410255,
745
+ "grad_norm": 0.10178599506616592,
746
+ "learning_rate": 0.00011965811965811966,
747
+ "loss": 0.008,
748
  "step": 940
749
  },
750
  {
751
+ "epoch": 6.089743589743589,
752
+ "grad_norm": 0.024569841101765633,
753
+ "learning_rate": 0.0001188034188034188,
754
+ "loss": 0.0054,
755
  "step": 950
756
  },
757
  {
758
+ "epoch": 6.153846153846154,
759
+ "grad_norm": 0.36783352494239807,
760
+ "learning_rate": 0.00011794871794871796,
761
+ "loss": 0.0752,
762
  "step": 960
763
  },
764
  {
765
+ "epoch": 6.217948717948718,
766
+ "grad_norm": 0.04280726611614227,
767
+ "learning_rate": 0.00011709401709401711,
768
+ "loss": 0.0286,
769
  "step": 970
770
  },
771
  {
772
+ "epoch": 6.282051282051282,
773
+ "grad_norm": 14.323381423950195,
774
+ "learning_rate": 0.00011623931623931625,
775
+ "loss": 0.0559,
776
  "step": 980
777
  },
778
  {
779
+ "epoch": 6.346153846153846,
780
+ "grad_norm": 0.025405921041965485,
781
+ "learning_rate": 0.00011538461538461538,
782
+ "loss": 0.0238,
783
  "step": 990
784
  },
785
  {
786
+ "epoch": 6.410256410256411,
787
+ "grad_norm": 0.02457793429493904,
788
+ "learning_rate": 0.00011452991452991453,
789
+ "loss": 0.0056,
790
  "step": 1000
791
  },
792
  {
793
+ "epoch": 6.410256410256411,
794
+ "eval_accuracy": 0.9244604316546763,
795
+ "eval_loss": 0.3162367641925812,
796
+ "eval_runtime": 3.2283,
797
+ "eval_samples_per_second": 86.114,
798
+ "eval_steps_per_second": 10.842,
799
  "step": 1000
800
  },
801
  {
802
+ "epoch": 6.4743589743589745,
803
+ "grad_norm": 0.020618008449673653,
804
+ "learning_rate": 0.00011367521367521367,
805
+ "loss": 0.0113,
806
  "step": 1010
807
  },
808
  {
809
+ "epoch": 6.538461538461538,
810
+ "grad_norm": 0.021217485889792442,
811
+ "learning_rate": 0.00011282051282051283,
812
+ "loss": 0.0115,
813
  "step": 1020
814
  },
815
  {
816
+ "epoch": 6.602564102564102,
817
+ "grad_norm": 0.028808822855353355,
818
+ "learning_rate": 0.00011196581196581197,
819
+ "loss": 0.022,
820
  "step": 1030
821
  },
822
  {
823
+ "epoch": 6.666666666666667,
824
+ "grad_norm": 3.5672314167022705,
825
+ "learning_rate": 0.00011111111111111112,
826
+ "loss": 0.0706,
827
  "step": 1040
828
  },
829
  {
830
+ "epoch": 6.730769230769231,
831
+ "grad_norm": 0.25913771986961365,
832
+ "learning_rate": 0.00011025641025641027,
833
+ "loss": 0.021,
834
  "step": 1050
835
  },
836
  {
837
+ "epoch": 6.794871794871795,
838
+ "grad_norm": 12.088153839111328,
839
+ "learning_rate": 0.00010940170940170942,
840
+ "loss": 0.0914,
841
  "step": 1060
842
  },
843
  {
844
+ "epoch": 6.858974358974359,
845
+ "grad_norm": 0.7027952671051025,
846
+ "learning_rate": 0.00010854700854700855,
847
+ "loss": 0.0766,
848
  "step": 1070
849
  },
850
  {
851
+ "epoch": 6.923076923076923,
852
+ "grad_norm": 6.911967754364014,
853
+ "learning_rate": 0.0001076923076923077,
854
+ "loss": 0.0881,
855
  "step": 1080
856
  },
857
  {
858
+ "epoch": 6.987179487179487,
859
+ "grad_norm": 0.03289846330881119,
860
+ "learning_rate": 0.00010683760683760684,
861
+ "loss": 0.0125,
862
  "step": 1090
863
  },
864
  {
865
+ "epoch": 7.051282051282051,
866
+ "grad_norm": 0.025492513552308083,
867
+ "learning_rate": 0.000105982905982906,
868
+ "loss": 0.0472,
869
  "step": 1100
870
  },
871
  {
872
+ "epoch": 7.051282051282051,
873
+ "eval_accuracy": 0.8884892086330936,
874
+ "eval_loss": 0.5225415825843811,
875
+ "eval_runtime": 3.7495,
876
+ "eval_samples_per_second": 74.144,
877
+ "eval_steps_per_second": 9.335,
878
  "step": 1100
879
  },
880
  {
881
+ "epoch": 7.115384615384615,
882
+ "grad_norm": 0.022507918998599052,
883
+ "learning_rate": 0.00010512820512820514,
884
+ "loss": 0.0386,
885
  "step": 1110
886
  },
887
  {
888
+ "epoch": 7.17948717948718,
889
+ "grad_norm": 0.020967524498701096,
890
+ "learning_rate": 0.00010427350427350428,
891
+ "loss": 0.0289,
892
  "step": 1120
893
  },
894
  {
895
+ "epoch": 7.243589743589744,
896
+ "grad_norm": 0.5489076972007751,
897
+ "learning_rate": 0.00010341880341880343,
898
+ "loss": 0.0041,
899
  "step": 1130
900
  },
901
  {
902
+ "epoch": 7.3076923076923075,
903
+ "grad_norm": 0.12584710121154785,
904
+ "learning_rate": 0.00010256410256410256,
905
+ "loss": 0.0697,
906
  "step": 1140
907
  },
908
  {
909
+ "epoch": 7.371794871794872,
910
+ "grad_norm": 0.022198162972927094,
911
+ "learning_rate": 0.0001017094017094017,
912
+ "loss": 0.0039,
913
  "step": 1150
914
  },
915
  {
916
+ "epoch": 7.435897435897436,
917
+ "grad_norm": 11.968843460083008,
918
+ "learning_rate": 0.00010085470085470086,
919
+ "loss": 0.0855,
920
  "step": 1160
921
  },
922
  {
923
+ "epoch": 7.5,
924
+ "grad_norm": 0.779564380645752,
925
+ "learning_rate": 0.0001,
926
+ "loss": 0.0585,
927
  "step": 1170
928
  },
929
  {
930
+ "epoch": 7.564102564102564,
931
+ "grad_norm": 0.23576153814792633,
932
+ "learning_rate": 9.914529914529915e-05,
933
+ "loss": 0.0559,
934
  "step": 1180
935
  },
936
  {
937
+ "epoch": 7.628205128205128,
938
+ "grad_norm": 0.020965000614523888,
939
+ "learning_rate": 9.829059829059829e-05,
940
+ "loss": 0.0785,
941
  "step": 1190
942
  },
943
  {
944
+ "epoch": 7.6923076923076925,
945
+ "grad_norm": 6.914454936981201,
946
+ "learning_rate": 9.743589743589744e-05,
947
+ "loss": 0.0234,
948
  "step": 1200
949
  },
950
  {
951
+ "epoch": 7.6923076923076925,
952
+ "eval_accuracy": 0.8597122302158273,
953
+ "eval_loss": 0.6095559000968933,
954
+ "eval_runtime": 2.5288,
955
+ "eval_samples_per_second": 109.933,
956
+ "eval_steps_per_second": 13.84,
957
  "step": 1200
958
  },
959
  {
960
+ "epoch": 7.756410256410256,
961
+ "grad_norm": 1.3177701234817505,
962
+ "learning_rate": 9.658119658119658e-05,
963
+ "loss": 0.0768,
964
  "step": 1210
965
  },
966
  {
967
+ "epoch": 7.82051282051282,
968
+ "grad_norm": 4.212278842926025,
969
+ "learning_rate": 9.572649572649574e-05,
970
+ "loss": 0.0914,
971
  "step": 1220
972
  },
973
  {
974
+ "epoch": 7.884615384615385,
975
+ "grad_norm": 0.02418905310332775,
976
+ "learning_rate": 9.487179487179487e-05,
977
+ "loss": 0.0348,
978
  "step": 1230
979
  },
980
  {
981
+ "epoch": 7.948717948717949,
982
+ "grad_norm": 3.008629322052002,
983
+ "learning_rate": 9.401709401709401e-05,
984
+ "loss": 0.0624,
985
  "step": 1240
986
  },
987
  {
988
+ "epoch": 8.012820512820513,
989
+ "grad_norm": 0.052931949496269226,
990
+ "learning_rate": 9.316239316239317e-05,
991
+ "loss": 0.0076,
992
  "step": 1250
993
  },
994
  {
995
+ "epoch": 8.076923076923077,
996
+ "grad_norm": 7.994688034057617,
997
+ "learning_rate": 9.230769230769232e-05,
998
+ "loss": 0.0326,
999
  "step": 1260
1000
  },
1001
  {
1002
+ "epoch": 8.14102564102564,
1003
+ "grad_norm": 0.026721293106675148,
1004
+ "learning_rate": 9.145299145299146e-05,
1005
+ "loss": 0.0324,
1006
  "step": 1270
1007
  },
1008
  {
1009
+ "epoch": 8.205128205128204,
1010
+ "grad_norm": 0.049855004996061325,
1011
+ "learning_rate": 9.05982905982906e-05,
1012
+ "loss": 0.0057,
1013
  "step": 1280
1014
  },
1015
  {
1016
+ "epoch": 8.26923076923077,
1017
+ "grad_norm": 0.014473488554358482,
1018
+ "learning_rate": 8.974358974358975e-05,
1019
+ "loss": 0.0237,
1020
  "step": 1290
1021
  },
1022
  {
1023
+ "epoch": 8.333333333333334,
1024
+ "grad_norm": 0.03150290250778198,
1025
+ "learning_rate": 8.888888888888889e-05,
1026
+ "loss": 0.0354,
1027
  "step": 1300
1028
  },
1029
  {
1030
+ "epoch": 8.333333333333334,
1031
+ "eval_accuracy": 0.8776978417266187,
1032
+ "eval_loss": 0.5520122647285461,
1033
+ "eval_runtime": 3.3151,
1034
+ "eval_samples_per_second": 83.858,
1035
+ "eval_steps_per_second": 10.558,
1036
  "step": 1300
1037
  },
1038
  {
1039
+ "epoch": 8.397435897435898,
1040
+ "grad_norm": 0.013332781381905079,
1041
+ "learning_rate": 8.803418803418804e-05,
1042
+ "loss": 0.0049,
1043
  "step": 1310
1044
  },
1045
  {
1046
+ "epoch": 8.461538461538462,
1047
+ "grad_norm": 0.013341937214136124,
1048
+ "learning_rate": 8.717948717948718e-05,
1049
+ "loss": 0.0026,
1050
  "step": 1320
1051
  },
1052
  {
1053
+ "epoch": 8.525641025641026,
1054
+ "grad_norm": 0.012689488008618355,
1055
+ "learning_rate": 8.632478632478634e-05,
1056
+ "loss": 0.0051,
1057
  "step": 1330
1058
  },
1059
  {
1060
+ "epoch": 8.58974358974359,
1061
+ "grad_norm": 0.014231017790734768,
1062
+ "learning_rate": 8.547008547008547e-05,
1063
+ "loss": 0.0027,
1064
  "step": 1340
1065
  },
1066
  {
1067
+ "epoch": 8.653846153846153,
1068
+ "grad_norm": 0.052165694534778595,
1069
+ "learning_rate": 8.461538461538461e-05,
1070
+ "loss": 0.0036,
1071
  "step": 1350
1072
  },
1073
  {
1074
+ "epoch": 8.717948717948717,
1075
+ "grad_norm": 0.11598876118659973,
1076
+ "learning_rate": 8.376068376068377e-05,
1077
+ "loss": 0.0281,
1078
  "step": 1360
1079
  },
1080
  {
1081
+ "epoch": 8.782051282051283,
1082
+ "grad_norm": 0.01339508593082428,
1083
+ "learning_rate": 8.290598290598292e-05,
1084
+ "loss": 0.0026,
1085
  "step": 1370
1086
  },
1087
  {
1088
+ "epoch": 8.846153846153847,
1089
+ "grad_norm": 0.011919701471924782,
1090
+ "learning_rate": 8.205128205128205e-05,
1091
+ "loss": 0.0025,
1092
  "step": 1380
1093
  },
1094
  {
1095
+ "epoch": 8.91025641025641,
1096
+ "grad_norm": 3.217728614807129,
1097
+ "learning_rate": 8.11965811965812e-05,
1098
+ "loss": 0.0379,
1099
  "step": 1390
1100
  },
1101
  {
1102
+ "epoch": 8.974358974358974,
1103
+ "grad_norm": 0.01331857219338417,
1104
+ "learning_rate": 8.034188034188035e-05,
1105
+ "loss": 0.026,
1106
  "step": 1400
1107
  },
1108
  {
1109
+ "epoch": 8.974358974358974,
1110
+ "eval_accuracy": 0.8992805755395683,
1111
+ "eval_loss": 0.49377354979515076,
1112
+ "eval_runtime": 2.8453,
1113
+ "eval_samples_per_second": 97.704,
1114
+ "eval_steps_per_second": 12.301,
1115
  "step": 1400
1116
  },
1117
  {
1118
+ "epoch": 9.038461538461538,
1119
+ "grad_norm": 0.1426580250263214,
1120
+ "learning_rate": 7.948717948717948e-05,
1121
+ "loss": 0.0072,
1122
  "step": 1410
1123
  },
1124
  {
1125
+ "epoch": 9.102564102564102,
1126
+ "grad_norm": 0.021561838686466217,
1127
+ "learning_rate": 7.863247863247864e-05,
1128
+ "loss": 0.0025,
1129
  "step": 1420
1130
  },
1131
  {
1132
+ "epoch": 9.166666666666666,
1133
+ "grad_norm": 0.010494213551282883,
1134
+ "learning_rate": 7.777777777777778e-05,
1135
+ "loss": 0.0025,
1136
  "step": 1430
1137
  },
1138
  {
1139
+ "epoch": 9.23076923076923,
1140
+ "grad_norm": 0.015301249921321869,
1141
+ "learning_rate": 7.692307692307693e-05,
1142
+ "loss": 0.0051,
1143
  "step": 1440
1144
  },
1145
  {
1146
+ "epoch": 9.294871794871796,
1147
+ "grad_norm": 0.013643044047057629,
1148
+ "learning_rate": 7.606837606837607e-05,
1149
+ "loss": 0.0136,
1150
  "step": 1450
1151
  },
1152
  {
1153
+ "epoch": 9.35897435897436,
1154
+ "grad_norm": 0.02054368518292904,
1155
+ "learning_rate": 7.521367521367521e-05,
1156
+ "loss": 0.0025,
1157
  "step": 1460
1158
  },
1159
  {
1160
+ "epoch": 9.423076923076923,
1161
+ "grad_norm": 0.011097296141088009,
1162
+ "learning_rate": 7.435897435897436e-05,
1163
+ "loss": 0.0025,
1164
  "step": 1470
1165
  },
1166
  {
1167
+ "epoch": 9.487179487179487,
1168
+ "grad_norm": 0.5705698132514954,
1169
+ "learning_rate": 7.350427350427352e-05,
1170
+ "loss": 0.0048,
1171
  "step": 1480
1172
  },
1173
  {
1174
+ "epoch": 9.551282051282051,
1175
+ "grad_norm": 0.009772556833922863,
1176
+ "learning_rate": 7.264957264957265e-05,
1177
+ "loss": 0.002,
1178
  "step": 1490
1179
  },
1180
  {
1181
+ "epoch": 9.615384615384615,
1182
+ "grad_norm": 0.011127009056508541,
1183
+ "learning_rate": 7.17948717948718e-05,
1184
+ "loss": 0.002,
1185
  "step": 1500
1186
  },
1187
  {
1188
+ "epoch": 9.615384615384615,
1189
+ "eval_accuracy": 0.9172661870503597,
1190
+ "eval_loss": 0.43497270345687866,
1191
+ "eval_runtime": 2.5545,
1192
+ "eval_samples_per_second": 108.826,
1193
+ "eval_steps_per_second": 13.701,
1194
  "step": 1500
1195
  },
1196
  {
1197
+ "epoch": 9.679487179487179,
1198
+ "grad_norm": 0.013090673834085464,
1199
+ "learning_rate": 7.094017094017095e-05,
1200
+ "loss": 0.002,
1201
  "step": 1510
1202
  },
1203
  {
1204
+ "epoch": 9.743589743589745,
1205
+ "grad_norm": 0.00843009538948536,
1206
+ "learning_rate": 7.008547008547008e-05,
1207
+ "loss": 0.0021,
1208
  "step": 1520
1209
  },
1210
  {
1211
+ "epoch": 9.807692307692308,
1212
+ "grad_norm": 0.00981289241462946,
1213
+ "learning_rate": 6.923076923076924e-05,
1214
+ "loss": 0.0018,
1215
  "step": 1530
1216
  },
1217
  {
1218
+ "epoch": 9.871794871794872,
1219
+ "grad_norm": 0.01045091450214386,
1220
+ "learning_rate": 6.837606837606838e-05,
1221
+ "loss": 0.0047,
1222
  "step": 1540
1223
  },
1224
  {
1225
+ "epoch": 9.935897435897436,
1226
+ "grad_norm": 0.07456047832965851,
1227
+ "learning_rate": 6.752136752136753e-05,
1228
+ "loss": 0.0021,
1229
+ "step": 1550
1230
+ },
1231
+ {
1232
+ "epoch": 10.0,
1233
+ "grad_norm": 0.010238745249807835,
1234
+ "learning_rate": 6.666666666666667e-05,
1235
+ "loss": 0.0268,
1236
+ "step": 1560
1237
+ },
1238
+ {
1239
+ "epoch": 10.064102564102564,
1240
+ "grad_norm": 0.009372313506901264,
1241
+ "learning_rate": 6.581196581196581e-05,
1242
+ "loss": 0.0018,
1243
+ "step": 1570
1244
+ },
1245
+ {
1246
+ "epoch": 10.128205128205128,
1247
+ "grad_norm": 0.009544081054627895,
1248
+ "learning_rate": 6.495726495726496e-05,
1249
+ "loss": 0.0017,
1250
+ "step": 1580
1251
+ },
1252
+ {
1253
+ "epoch": 10.192307692307692,
1254
+ "grad_norm": 0.014180944301187992,
1255
+ "learning_rate": 6.410256410256412e-05,
1256
+ "loss": 0.0055,
1257
+ "step": 1590
1258
+ },
1259
+ {
1260
+ "epoch": 10.256410256410255,
1261
+ "grad_norm": 0.19402286410331726,
1262
+ "learning_rate": 6.324786324786325e-05,
1263
+ "loss": 0.0021,
1264
+ "step": 1600
1265
+ },
1266
+ {
1267
+ "epoch": 10.256410256410255,
1268
+ "eval_accuracy": 0.9172661870503597,
1269
+ "eval_loss": 0.4223933219909668,
1270
+ "eval_runtime": 3.538,
1271
+ "eval_samples_per_second": 78.575,
1272
+ "eval_steps_per_second": 9.893,
1273
+ "step": 1600
1274
+ },
1275
+ {
1276
+ "epoch": 10.320512820512821,
1277
+ "grad_norm": 0.008879870176315308,
1278
+ "learning_rate": 6.239316239316239e-05,
1279
+ "loss": 0.0018,
1280
+ "step": 1610
1281
+ },
1282
+ {
1283
+ "epoch": 10.384615384615385,
1284
+ "grad_norm": 0.008063827641308308,
1285
+ "learning_rate": 6.153846153846155e-05,
1286
+ "loss": 0.002,
1287
+ "step": 1620
1288
+ },
1289
+ {
1290
+ "epoch": 10.448717948717949,
1291
+ "grad_norm": 0.008835590444505215,
1292
+ "learning_rate": 6.068376068376068e-05,
1293
+ "loss": 0.0018,
1294
+ "step": 1630
1295
+ },
1296
+ {
1297
+ "epoch": 10.512820512820513,
1298
+ "grad_norm": 0.008632234297692776,
1299
+ "learning_rate": 5.982905982905983e-05,
1300
+ "loss": 0.0017,
1301
+ "step": 1640
1302
+ },
1303
+ {
1304
+ "epoch": 10.576923076923077,
1305
+ "grad_norm": 0.00828844029456377,
1306
+ "learning_rate": 5.897435897435898e-05,
1307
+ "loss": 0.0016,
1308
+ "step": 1650
1309
+ },
1310
+ {
1311
+ "epoch": 10.64102564102564,
1312
+ "grad_norm": 0.0323554202914238,
1313
+ "learning_rate": 5.8119658119658126e-05,
1314
+ "loss": 0.0016,
1315
+ "step": 1660
1316
+ },
1317
+ {
1318
+ "epoch": 10.705128205128204,
1319
+ "grad_norm": 0.008372778072953224,
1320
+ "learning_rate": 5.726495726495726e-05,
1321
+ "loss": 0.0016,
1322
+ "step": 1670
1323
+ },
1324
+ {
1325
+ "epoch": 10.76923076923077,
1326
+ "grad_norm": 0.007286165375262499,
1327
+ "learning_rate": 5.6410256410256414e-05,
1328
+ "loss": 0.0016,
1329
+ "step": 1680
1330
+ },
1331
+ {
1332
+ "epoch": 10.833333333333334,
1333
+ "grad_norm": 0.012557004578411579,
1334
+ "learning_rate": 5.555555555555556e-05,
1335
+ "loss": 0.0017,
1336
+ "step": 1690
1337
+ },
1338
+ {
1339
+ "epoch": 10.897435897435898,
1340
+ "grad_norm": 0.007122470065951347,
1341
+ "learning_rate": 5.470085470085471e-05,
1342
+ "loss": 0.0016,
1343
+ "step": 1700
1344
+ },
1345
+ {
1346
+ "epoch": 10.897435897435898,
1347
+ "eval_accuracy": 0.9280575539568345,
1348
+ "eval_loss": 0.38381046056747437,
1349
+ "eval_runtime": 2.8652,
1350
+ "eval_samples_per_second": 97.028,
1351
+ "eval_steps_per_second": 12.216,
1352
+ "step": 1700
1353
+ },
1354
+ {
1355
+ "epoch": 10.961538461538462,
1356
+ "grad_norm": 0.013124003075063229,
1357
+ "learning_rate": 5.384615384615385e-05,
1358
+ "loss": 0.0015,
1359
+ "step": 1710
1360
+ },
1361
+ {
1362
+ "epoch": 11.025641025641026,
1363
+ "grad_norm": 0.007307114545255899,
1364
+ "learning_rate": 5.2991452991453e-05,
1365
+ "loss": 0.0015,
1366
+ "step": 1720
1367
+ },
1368
+ {
1369
+ "epoch": 11.08974358974359,
1370
+ "grad_norm": 0.007222812157124281,
1371
+ "learning_rate": 5.213675213675214e-05,
1372
+ "loss": 0.0014,
1373
+ "step": 1730
1374
+ },
1375
+ {
1376
+ "epoch": 11.153846153846153,
1377
+ "grad_norm": 0.007087068632245064,
1378
+ "learning_rate": 5.128205128205128e-05,
1379
+ "loss": 0.0015,
1380
+ "step": 1740
1381
+ },
1382
+ {
1383
+ "epoch": 11.217948717948717,
1384
+ "grad_norm": 0.010747412219643593,
1385
+ "learning_rate": 5.042735042735043e-05,
1386
+ "loss": 0.0016,
1387
+ "step": 1750
1388
+ },
1389
+ {
1390
+ "epoch": 11.282051282051283,
1391
+ "grad_norm": 0.007549288682639599,
1392
+ "learning_rate": 4.9572649572649575e-05,
1393
+ "loss": 0.0014,
1394
+ "step": 1760
1395
+ },
1396
+ {
1397
+ "epoch": 11.346153846153847,
1398
+ "grad_norm": 0.007861124351620674,
1399
+ "learning_rate": 4.871794871794872e-05,
1400
+ "loss": 0.0015,
1401
+ "step": 1770
1402
+ },
1403
+ {
1404
+ "epoch": 11.41025641025641,
1405
+ "grad_norm": 0.008711726404726505,
1406
+ "learning_rate": 4.786324786324787e-05,
1407
+ "loss": 0.0014,
1408
+ "step": 1780
1409
+ },
1410
+ {
1411
+ "epoch": 11.474358974358974,
1412
+ "grad_norm": 0.006650915369391441,
1413
+ "learning_rate": 4.700854700854701e-05,
1414
+ "loss": 0.0014,
1415
+ "step": 1790
1416
+ },
1417
+ {
1418
+ "epoch": 11.538461538461538,
1419
+ "grad_norm": 0.009336930699646473,
1420
+ "learning_rate": 4.615384615384616e-05,
1421
+ "loss": 0.0014,
1422
+ "step": 1800
1423
+ },
1424
+ {
1425
+ "epoch": 11.538461538461538,
1426
+ "eval_accuracy": 0.9280575539568345,
1427
+ "eval_loss": 0.3943016529083252,
1428
+ "eval_runtime": 2.5492,
1429
+ "eval_samples_per_second": 109.052,
1430
+ "eval_steps_per_second": 13.73,
1431
+ "step": 1800
1432
+ },
1433
+ {
1434
+ "epoch": 11.602564102564102,
1435
+ "grad_norm": 0.0071232253685593605,
1436
+ "learning_rate": 4.52991452991453e-05,
1437
+ "loss": 0.0015,
1438
+ "step": 1810
1439
+ },
1440
+ {
1441
+ "epoch": 11.666666666666666,
1442
+ "grad_norm": 0.0070044491440057755,
1443
+ "learning_rate": 4.4444444444444447e-05,
1444
+ "loss": 0.0014,
1445
+ "step": 1820
1446
+ },
1447
+ {
1448
+ "epoch": 11.73076923076923,
1449
+ "grad_norm": 0.00735941668972373,
1450
+ "learning_rate": 4.358974358974359e-05,
1451
+ "loss": 0.0014,
1452
+ "step": 1830
1453
+ },
1454
+ {
1455
+ "epoch": 11.794871794871796,
1456
+ "grad_norm": 0.006876462604850531,
1457
+ "learning_rate": 4.2735042735042735e-05,
1458
+ "loss": 0.0013,
1459
+ "step": 1840
1460
+ },
1461
+ {
1462
+ "epoch": 11.85897435897436,
1463
+ "grad_norm": 0.008532642386853695,
1464
+ "learning_rate": 4.1880341880341886e-05,
1465
+ "loss": 0.0014,
1466
+ "step": 1850
1467
+ },
1468
+ {
1469
+ "epoch": 11.923076923076923,
1470
+ "grad_norm": 0.007169618736952543,
1471
+ "learning_rate": 4.1025641025641023e-05,
1472
+ "loss": 0.0013,
1473
+ "step": 1860
1474
+ },
1475
+ {
1476
+ "epoch": 11.987179487179487,
1477
+ "grad_norm": 0.006206741090863943,
1478
+ "learning_rate": 4.0170940170940174e-05,
1479
+ "loss": 0.0012,
1480
+ "step": 1870
1481
+ },
1482
+ {
1483
+ "epoch": 12.051282051282051,
1484
+ "grad_norm": 0.006877180654555559,
1485
+ "learning_rate": 3.931623931623932e-05,
1486
+ "loss": 0.0013,
1487
+ "step": 1880
1488
+ },
1489
+ {
1490
+ "epoch": 12.115384615384615,
1491
+ "grad_norm": 0.006645900197327137,
1492
+ "learning_rate": 3.846153846153846e-05,
1493
+ "loss": 0.0013,
1494
+ "step": 1890
1495
+ },
1496
+ {
1497
+ "epoch": 12.179487179487179,
1498
+ "grad_norm": 0.007376631256192923,
1499
+ "learning_rate": 3.760683760683761e-05,
1500
+ "loss": 0.0013,
1501
+ "step": 1900
1502
+ },
1503
+ {
1504
+ "epoch": 12.179487179487179,
1505
+ "eval_accuracy": 0.9280575539568345,
1506
+ "eval_loss": 0.40119558572769165,
1507
+ "eval_runtime": 2.73,
1508
+ "eval_samples_per_second": 101.833,
1509
+ "eval_steps_per_second": 12.821,
1510
+ "step": 1900
1511
+ },
1512
+ {
1513
+ "epoch": 12.243589743589743,
1514
+ "grad_norm": 0.007013231050223112,
1515
+ "learning_rate": 3.675213675213676e-05,
1516
+ "loss": 0.0013,
1517
+ "step": 1910
1518
+ },
1519
+ {
1520
+ "epoch": 12.307692307692308,
1521
+ "grad_norm": 0.006970913149416447,
1522
+ "learning_rate": 3.58974358974359e-05,
1523
+ "loss": 0.0013,
1524
+ "step": 1920
1525
+ },
1526
+ {
1527
+ "epoch": 12.371794871794872,
1528
+ "grad_norm": 0.006338655948638916,
1529
+ "learning_rate": 3.504273504273504e-05,
1530
+ "loss": 0.0013,
1531
+ "step": 1930
1532
+ },
1533
+ {
1534
+ "epoch": 12.435897435897436,
1535
+ "grad_norm": 0.007881653495132923,
1536
+ "learning_rate": 3.418803418803419e-05,
1537
+ "loss": 0.0013,
1538
+ "step": 1940
1539
+ },
1540
+ {
1541
+ "epoch": 12.5,
1542
+ "grad_norm": 0.005947918631136417,
1543
+ "learning_rate": 3.3333333333333335e-05,
1544
+ "loss": 0.0012,
1545
+ "step": 1950
1546
+ },
1547
+ {
1548
+ "epoch": 12.564102564102564,
1549
+ "grad_norm": 0.005899305455386639,
1550
+ "learning_rate": 3.247863247863248e-05,
1551
+ "loss": 0.0012,
1552
+ "step": 1960
1553
+ },
1554
+ {
1555
+ "epoch": 12.628205128205128,
1556
+ "grad_norm": 0.0061206454411149025,
1557
+ "learning_rate": 3.162393162393162e-05,
1558
+ "loss": 0.0013,
1559
+ "step": 1970
1560
+ },
1561
+ {
1562
+ "epoch": 12.692307692307692,
1563
+ "grad_norm": 0.00656491843983531,
1564
+ "learning_rate": 3.0769230769230774e-05,
1565
+ "loss": 0.0012,
1566
+ "step": 1980
1567
+ },
1568
+ {
1569
+ "epoch": 12.756410256410255,
1570
+ "grad_norm": 0.006318471394479275,
1571
+ "learning_rate": 2.9914529914529915e-05,
1572
+ "loss": 0.0012,
1573
+ "step": 1990
1574
+ },
1575
+ {
1576
+ "epoch": 12.820512820512821,
1577
+ "grad_norm": 0.00670122355222702,
1578
+ "learning_rate": 2.9059829059829063e-05,
1579
+ "loss": 0.0012,
1580
+ "step": 2000
1581
+ },
1582
+ {
1583
+ "epoch": 12.820512820512821,
1584
+ "eval_accuracy": 0.9280575539568345,
1585
+ "eval_loss": 0.4066712558269501,
1586
+ "eval_runtime": 3.6428,
1587
+ "eval_samples_per_second": 76.315,
1588
+ "eval_steps_per_second": 9.608,
1589
+ "step": 2000
1590
+ },
1591
+ {
1592
+ "epoch": 12.884615384615385,
1593
+ "grad_norm": 0.005691882688552141,
1594
+ "learning_rate": 2.8205128205128207e-05,
1595
+ "loss": 0.0012,
1596
+ "step": 2010
1597
+ },
1598
+ {
1599
+ "epoch": 12.948717948717949,
1600
+ "grad_norm": 0.005753946490585804,
1601
+ "learning_rate": 2.7350427350427355e-05,
1602
+ "loss": 0.0012,
1603
+ "step": 2020
1604
+ },
1605
+ {
1606
+ "epoch": 13.012820512820513,
1607
+ "grad_norm": 0.006568002514541149,
1608
+ "learning_rate": 2.64957264957265e-05,
1609
+ "loss": 0.0013,
1610
+ "step": 2030
1611
+ },
1612
+ {
1613
+ "epoch": 13.076923076923077,
1614
+ "grad_norm": 0.005731898359954357,
1615
+ "learning_rate": 2.564102564102564e-05,
1616
+ "loss": 0.0012,
1617
+ "step": 2040
1618
+ },
1619
+ {
1620
+ "epoch": 13.14102564102564,
1621
+ "grad_norm": 0.005868157371878624,
1622
+ "learning_rate": 2.4786324786324787e-05,
1623
+ "loss": 0.0011,
1624
+ "step": 2050
1625
+ },
1626
+ {
1627
+ "epoch": 13.205128205128204,
1628
+ "grad_norm": 0.006337730213999748,
1629
+ "learning_rate": 2.3931623931623935e-05,
1630
+ "loss": 0.0012,
1631
+ "step": 2060
1632
+ },
1633
+ {
1634
+ "epoch": 13.26923076923077,
1635
+ "grad_norm": 0.006973997224122286,
1636
+ "learning_rate": 2.307692307692308e-05,
1637
+ "loss": 0.0011,
1638
+ "step": 2070
1639
+ },
1640
+ {
1641
+ "epoch": 13.333333333333334,
1642
+ "grad_norm": 0.00554188247770071,
1643
+ "learning_rate": 2.2222222222222223e-05,
1644
+ "loss": 0.0011,
1645
+ "step": 2080
1646
+ },
1647
+ {
1648
+ "epoch": 13.397435897435898,
1649
+ "grad_norm": 0.006806936115026474,
1650
+ "learning_rate": 2.1367521367521368e-05,
1651
+ "loss": 0.0011,
1652
+ "step": 2090
1653
+ },
1654
+ {
1655
+ "epoch": 13.461538461538462,
1656
+ "grad_norm": 0.005874712951481342,
1657
+ "learning_rate": 2.0512820512820512e-05,
1658
+ "loss": 0.0011,
1659
+ "step": 2100
1660
+ },
1661
+ {
1662
+ "epoch": 13.461538461538462,
1663
+ "eval_accuracy": 0.9280575539568345,
1664
+ "eval_loss": 0.41012829542160034,
1665
+ "eval_runtime": 2.5393,
1666
+ "eval_samples_per_second": 109.477,
1667
+ "eval_steps_per_second": 13.783,
1668
+ "step": 2100
1669
+ },
1670
+ {
1671
+ "epoch": 13.525641025641026,
1672
+ "grad_norm": 0.006118403282016516,
1673
+ "learning_rate": 1.965811965811966e-05,
1674
+ "loss": 0.0012,
1675
+ "step": 2110
1676
+ },
1677
+ {
1678
+ "epoch": 13.58974358974359,
1679
+ "grad_norm": 0.005724759306758642,
1680
+ "learning_rate": 1.8803418803418804e-05,
1681
+ "loss": 0.0012,
1682
+ "step": 2120
1683
+ },
1684
+ {
1685
+ "epoch": 13.653846153846153,
1686
+ "grad_norm": 0.005641784518957138,
1687
+ "learning_rate": 1.794871794871795e-05,
1688
+ "loss": 0.0011,
1689
+ "step": 2130
1690
+ },
1691
+ {
1692
+ "epoch": 13.717948717948717,
1693
+ "grad_norm": 0.006412914022803307,
1694
+ "learning_rate": 1.7094017094017095e-05,
1695
+ "loss": 0.0012,
1696
+ "step": 2140
1697
+ },
1698
+ {
1699
+ "epoch": 13.782051282051283,
1700
+ "grad_norm": 0.0061592236161231995,
1701
+ "learning_rate": 1.623931623931624e-05,
1702
+ "loss": 0.0012,
1703
+ "step": 2150
1704
+ },
1705
+ {
1706
+ "epoch": 13.846153846153847,
1707
+ "grad_norm": 0.006390335038304329,
1708
+ "learning_rate": 1.5384615384615387e-05,
1709
+ "loss": 0.0012,
1710
+ "step": 2160
1711
+ },
1712
+ {
1713
+ "epoch": 13.91025641025641,
1714
+ "grad_norm": 0.006186114624142647,
1715
+ "learning_rate": 1.4529914529914531e-05,
1716
+ "loss": 0.0012,
1717
+ "step": 2170
1718
+ },
1719
+ {
1720
+ "epoch": 13.974358974358974,
1721
+ "grad_norm": 0.006987506989389658,
1722
+ "learning_rate": 1.3675213675213677e-05,
1723
+ "loss": 0.0013,
1724
+ "step": 2180
1725
+ },
1726
+ {
1727
+ "epoch": 14.038461538461538,
1728
+ "grad_norm": 0.0060087586753070354,
1729
+ "learning_rate": 1.282051282051282e-05,
1730
+ "loss": 0.0011,
1731
+ "step": 2190
1732
+ },
1733
+ {
1734
+ "epoch": 14.102564102564102,
1735
+ "grad_norm": 0.005536227021366358,
1736
+ "learning_rate": 1.1965811965811967e-05,
1737
+ "loss": 0.0011,
1738
+ "step": 2200
1739
+ },
1740
+ {
1741
+ "epoch": 14.102564102564102,
1742
+ "eval_accuracy": 0.9280575539568345,
1743
+ "eval_loss": 0.41235998272895813,
1744
+ "eval_runtime": 3.6965,
1745
+ "eval_samples_per_second": 75.206,
1746
+ "eval_steps_per_second": 9.468,
1747
+ "step": 2200
1748
+ },
1749
+ {
1750
+ "epoch": 14.166666666666666,
1751
+ "grad_norm": 0.00747127179056406,
1752
+ "learning_rate": 1.1111111111111112e-05,
1753
+ "loss": 0.0011,
1754
+ "step": 2210
1755
+ },
1756
+ {
1757
+ "epoch": 14.23076923076923,
1758
+ "grad_norm": 0.006075258832424879,
1759
+ "learning_rate": 1.0256410256410256e-05,
1760
+ "loss": 0.0012,
1761
+ "step": 2220
1762
+ },
1763
+ {
1764
+ "epoch": 14.294871794871796,
1765
+ "grad_norm": 0.005355818197131157,
1766
+ "learning_rate": 9.401709401709402e-06,
1767
+ "loss": 0.0011,
1768
+ "step": 2230
1769
+ },
1770
+ {
1771
+ "epoch": 14.35897435897436,
1772
+ "grad_norm": 0.006171481683850288,
1773
+ "learning_rate": 8.547008547008548e-06,
1774
+ "loss": 0.0012,
1775
+ "step": 2240
1776
+ },
1777
+ {
1778
+ "epoch": 14.423076923076923,
1779
+ "grad_norm": 0.006203506141901016,
1780
+ "learning_rate": 7.692307692307694e-06,
1781
+ "loss": 0.0011,
1782
+ "step": 2250
1783
+ },
1784
+ {
1785
+ "epoch": 14.487179487179487,
1786
+ "grad_norm": 0.0053332289680838585,
1787
+ "learning_rate": 6.837606837606839e-06,
1788
+ "loss": 0.0011,
1789
+ "step": 2260
1790
+ },
1791
+ {
1792
+ "epoch": 14.551282051282051,
1793
+ "grad_norm": 0.006036951672285795,
1794
+ "learning_rate": 5.982905982905984e-06,
1795
+ "loss": 0.0012,
1796
+ "step": 2270
1797
+ },
1798
+ {
1799
+ "epoch": 14.615384615384615,
1800
+ "grad_norm": 0.006114748306572437,
1801
+ "learning_rate": 5.128205128205128e-06,
1802
+ "loss": 0.0012,
1803
+ "step": 2280
1804
+ },
1805
+ {
1806
+ "epoch": 14.679487179487179,
1807
+ "grad_norm": 0.0059860167093575,
1808
+ "learning_rate": 4.273504273504274e-06,
1809
+ "loss": 0.0011,
1810
+ "step": 2290
1811
+ },
1812
+ {
1813
+ "epoch": 14.743589743589745,
1814
+ "grad_norm": 0.005834794137626886,
1815
+ "learning_rate": 3.4188034188034193e-06,
1816
+ "loss": 0.0012,
1817
+ "step": 2300
1818
+ },
1819
+ {
1820
+ "epoch": 14.743589743589745,
1821
+ "eval_accuracy": 0.9280575539568345,
1822
+ "eval_loss": 0.4135644733905792,
1823
+ "eval_runtime": 2.5193,
1824
+ "eval_samples_per_second": 110.348,
1825
+ "eval_steps_per_second": 13.893,
1826
+ "step": 2300
1827
+ },
1828
+ {
1829
+ "epoch": 14.807692307692308,
1830
+ "grad_norm": 0.005964505951851606,
1831
+ "learning_rate": 2.564102564102564e-06,
1832
+ "loss": 0.0012,
1833
+ "step": 2310
1834
+ },
1835
+ {
1836
+ "epoch": 14.871794871794872,
1837
+ "grad_norm": 0.005720064975321293,
1838
+ "learning_rate": 1.7094017094017097e-06,
1839
+ "loss": 0.0011,
1840
+ "step": 2320
1841
+ },
1842
+ {
1843
+ "epoch": 14.935897435897436,
1844
+ "grad_norm": 0.005913382861763239,
1845
+ "learning_rate": 8.547008547008548e-07,
1846
+ "loss": 0.0011,
1847
+ "step": 2330
1848
+ },
1849
+ {
1850
+ "epoch": 15.0,
1851
+ "grad_norm": 0.005278407130390406,
1852
+ "learning_rate": 0.0,
1853
+ "loss": 0.0011,
1854
+ "step": 2340
1855
+ },
1856
+ {
1857
+ "epoch": 15.0,
1858
+ "step": 2340,
1859
+ "total_flos": 2.900189697360077e+18,
1860
+ "train_loss": 0.14551133991808146,
1861
+ "train_runtime": 927.2479,
1862
+ "train_samples_per_second": 40.361,
1863
+ "train_steps_per_second": 2.524
1864
  }
1865
  ],
1866
  "logging_steps": 10,
1867
+ "max_steps": 2340,
1868
  "num_input_tokens_seen": 0,
1869
+ "num_train_epochs": 15,
1870
  "save_steps": 100,
1871
  "stateful_callbacks": {
1872
  "TrainerControl": {
 
1880
  "attributes": {}
1881
  }
1882
  },
1883
+ "total_flos": 2.900189697360077e+18,
1884
  "train_batch_size": 16,
1885
  "trial_name": null,
1886
  "trial_params": null