pkr7098 commited on
Commit
e190377
1 Parent(s): f37f905

End of training

Browse files
Files changed (5) hide show
  1. README.md +4 -2
  2. all_results.json +9 -7
  3. eval_results.json +5 -3
  4. train_results.json +4 -4
  5. trainer_state.json +316 -296
README.md CHANGED
@@ -5,6 +5,8 @@ license: apache-2.0
5
  metrics:
6
  - accuracy
7
  tags:
 
 
8
  - generated_from_trainer
9
  model-index:
10
  - name: only-lora-beans-vit-base-patch16-224-in21k
@@ -16,9 +18,9 @@ should probably proofread and complete it, then remove this comment. -->
16
 
17
  # only-lora-beans-vit-base-patch16-224-in21k
18
 
19
- This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
- - Loss: 1.0983
22
  - Accuracy: 0.3308
23
 
24
  ## Model description
 
5
  metrics:
6
  - accuracy
7
  tags:
8
+ - image-classification
9
+ - vision
10
  - generated_from_trainer
11
  model-index:
12
  - name: only-lora-beans-vit-base-patch16-224-in21k
 
18
 
19
  # only-lora-beans-vit-base-patch16-224-in21k
20
 
21
+ This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the beans dataset.
22
  It achieves the following results on the evaluation set:
23
+ - Loss: 1.2664
24
  - Accuracy: 0.3308
25
 
26
  ## Model description
all_results.json CHANGED
@@ -1,11 +1,13 @@
1
  {
2
  "epoch": 10.0,
3
- "eval_runtime": 0.7922,
4
- "eval_samples_per_second": 167.879,
5
- "eval_steps_per_second": 21.458,
 
 
6
  "total_flos": 8.400578669044531e+17,
7
- "train_loss": 0.9780362775692573,
8
- "train_runtime": 137.5902,
9
- "train_samples_per_second": 75.151,
10
- "train_steps_per_second": 9.448
11
  }
 
1
  {
2
  "epoch": 10.0,
3
+ "eval_accuracy": 0.3308270676691729,
4
+ "eval_loss": 1.2663869857788086,
5
+ "eval_runtime": 0.9217,
6
+ "eval_samples_per_second": 144.296,
7
+ "eval_steps_per_second": 18.444,
8
  "total_flos": 8.400578669044531e+17,
9
+ "train_loss": 0.9618942783429072,
10
+ "train_runtime": 164.295,
11
+ "train_samples_per_second": 62.936,
12
+ "train_steps_per_second": 7.913
13
  }
eval_results.json CHANGED
@@ -1,6 +1,8 @@
1
  {
2
  "epoch": 10.0,
3
- "eval_runtime": 0.7922,
4
- "eval_samples_per_second": 167.879,
5
- "eval_steps_per_second": 21.458
 
 
6
  }
 
1
  {
2
  "epoch": 10.0,
3
+ "eval_accuracy": 0.3308270676691729,
4
+ "eval_loss": 1.2663869857788086,
5
+ "eval_runtime": 0.9217,
6
+ "eval_samples_per_second": 144.296,
7
+ "eval_steps_per_second": 18.444
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 10.0,
3
  "total_flos": 8.400578669044531e+17,
4
- "train_loss": 0.9780362775692573,
5
- "train_runtime": 137.5902,
6
- "train_samples_per_second": 75.151,
7
- "train_steps_per_second": 9.448
8
  }
 
1
  {
2
  "epoch": 10.0,
3
  "total_flos": 8.400578669044531e+17,
4
+ "train_loss": 0.9618942783429072,
5
+ "train_runtime": 164.295,
6
+ "train_samples_per_second": 62.936,
7
+ "train_steps_per_second": 7.913
8
  }
trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
- "best_metric": null,
3
- "best_model_checkpoint": null,
4
  "epoch": 10.0,
5
  "eval_steps": 500,
6
  "global_step": 1300,
@@ -10,992 +10,1012 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.07692307692307693,
13
- "grad_norm": 2.0707900524139404,
14
  "learning_rate": 0.004961538461538462,
15
- "loss": 0.8561,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.15384615384615385,
20
- "grad_norm": 0.9913710951805115,
21
  "learning_rate": 0.004923076923076923,
22
- "loss": 0.6755,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.23076923076923078,
27
- "grad_norm": 1.5771081447601318,
28
  "learning_rate": 0.004884615384615385,
29
- "loss": 0.4032,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.3076923076923077,
34
- "grad_norm": 1.9519121646881104,
35
  "learning_rate": 0.004846153846153846,
36
- "loss": 0.3903,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.38461538461538464,
41
- "grad_norm": 2.557398796081543,
42
  "learning_rate": 0.004807692307692308,
43
- "loss": 0.5472,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.46153846153846156,
48
- "grad_norm": 2.4644172191619873,
49
  "learning_rate": 0.0047692307692307695,
50
- "loss": 0.3894,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.5384615384615384,
55
- "grad_norm": 1.2416399717330933,
56
  "learning_rate": 0.004730769230769231,
57
- "loss": 0.1827,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.6153846153846154,
62
- "grad_norm": 0.8515920639038086,
63
  "learning_rate": 0.004692307692307693,
64
- "loss": 0.3148,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.6923076923076923,
69
- "grad_norm": 0.227506622672081,
70
  "learning_rate": 0.004653846153846154,
71
- "loss": 0.237,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.7692307692307693,
76
- "grad_norm": 2.7295713424682617,
77
  "learning_rate": 0.004615384615384616,
78
- "loss": 0.4501,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.8461538461538461,
83
- "grad_norm": 1.2349276542663574,
84
  "learning_rate": 0.0045769230769230765,
85
- "loss": 0.6825,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.9230769230769231,
90
- "grad_norm": 3.311713933944702,
91
  "learning_rate": 0.004538461538461539,
92
- "loss": 0.4085,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 1.0,
97
- "grad_norm": 0.20723728835582733,
98
  "learning_rate": 0.0045000000000000005,
99
- "loss": 0.1292,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 1.0,
104
- "eval_runtime": 0.749,
105
- "eval_samples_per_second": 177.568,
106
- "eval_steps_per_second": 22.697,
 
 
107
  "step": 130
108
  },
109
  {
110
  "epoch": 1.0769230769230769,
111
- "grad_norm": 1.5258536338806152,
112
  "learning_rate": 0.004461538461538462,
113
- "loss": 0.3389,
114
  "step": 140
115
  },
116
  {
117
  "epoch": 1.1538461538461537,
118
- "grad_norm": 2.0288751125335693,
119
  "learning_rate": 0.004423076923076923,
120
- "loss": 0.2397,
121
  "step": 150
122
  },
123
  {
124
  "epoch": 1.2307692307692308,
125
- "grad_norm": 0.03801692649722099,
126
  "learning_rate": 0.004384615384615384,
127
- "loss": 0.162,
128
  "step": 160
129
  },
130
  {
131
  "epoch": 1.3076923076923077,
132
- "grad_norm": 0.1472761183977127,
133
  "learning_rate": 0.004346153846153846,
134
- "loss": 0.2687,
135
  "step": 170
136
  },
137
  {
138
  "epoch": 1.3846153846153846,
139
- "grad_norm": 4.054047107696533,
140
  "learning_rate": 0.004307692307692308,
141
- "loss": 0.6044,
142
  "step": 180
143
  },
144
  {
145
  "epoch": 1.4615384615384617,
146
- "grad_norm": 1.6007931232452393,
147
  "learning_rate": 0.004269230769230769,
148
- "loss": 0.6124,
149
  "step": 190
150
  },
151
  {
152
  "epoch": 1.5384615384615383,
153
- "grad_norm": 0.15122832357883453,
154
  "learning_rate": 0.004230769230769231,
155
- "loss": 0.4298,
156
  "step": 200
157
  },
158
  {
159
  "epoch": 1.6153846153846154,
160
- "grad_norm": 1.2161375284194946,
161
  "learning_rate": 0.004192307692307692,
162
- "loss": 0.5844,
163
  "step": 210
164
  },
165
  {
166
  "epoch": 1.6923076923076923,
167
- "grad_norm": 7.478526592254639,
168
  "learning_rate": 0.004153846153846154,
169
- "loss": 1.2806,
170
  "step": 220
171
  },
172
  {
173
  "epoch": 1.7692307692307692,
174
- "grad_norm": 1.2232049703598022,
175
  "learning_rate": 0.004115384615384615,
176
- "loss": 0.9687,
177
  "step": 230
178
  },
179
  {
180
  "epoch": 1.8461538461538463,
181
- "grad_norm": 1.4465281963348389,
182
  "learning_rate": 0.004076923076923077,
183
- "loss": 0.4883,
184
  "step": 240
185
  },
186
  {
187
  "epoch": 1.9230769230769231,
188
- "grad_norm": 3.471082925796509,
189
  "learning_rate": 0.0040384615384615385,
190
- "loss": 0.6709,
191
  "step": 250
192
  },
193
  {
194
  "epoch": 2.0,
195
- "grad_norm": 2.699213981628418,
196
  "learning_rate": 0.004,
197
- "loss": 0.6404,
198
  "step": 260
199
  },
200
  {
201
  "epoch": 2.0,
202
- "eval_runtime": 0.747,
203
- "eval_samples_per_second": 178.056,
204
- "eval_steps_per_second": 22.759,
 
 
205
  "step": 260
206
  },
207
  {
208
  "epoch": 2.076923076923077,
209
- "grad_norm": 1.1869909763336182,
210
  "learning_rate": 0.003961538461538462,
211
- "loss": 0.6565,
212
  "step": 270
213
  },
214
  {
215
  "epoch": 2.1538461538461537,
216
- "grad_norm": 2.2662851810455322,
217
  "learning_rate": 0.003923076923076923,
218
- "loss": 0.3753,
219
  "step": 280
220
  },
221
  {
222
  "epoch": 2.230769230769231,
223
- "grad_norm": 3.587059736251831,
224
  "learning_rate": 0.003884615384615385,
225
- "loss": 0.9706,
226
  "step": 290
227
  },
228
  {
229
  "epoch": 2.3076923076923075,
230
- "grad_norm": 1.530928611755371,
231
  "learning_rate": 0.0038461538461538464,
232
- "loss": 1.2593,
233
  "step": 300
234
  },
235
  {
236
  "epoch": 2.3846153846153846,
237
- "grad_norm": 1.0865129232406616,
238
  "learning_rate": 0.0038076923076923075,
239
- "loss": 0.9166,
240
  "step": 310
241
  },
242
  {
243
  "epoch": 2.4615384615384617,
244
- "grad_norm": 2.3256101608276367,
245
  "learning_rate": 0.003769230769230769,
246
- "loss": 1.1882,
247
  "step": 320
248
  },
249
  {
250
  "epoch": 2.5384615384615383,
251
- "grad_norm": 0.5831055045127869,
252
  "learning_rate": 0.003730769230769231,
253
- "loss": 1.2764,
254
  "step": 330
255
  },
256
  {
257
  "epoch": 2.6153846153846154,
258
- "grad_norm": 0.9471529126167297,
259
  "learning_rate": 0.0036923076923076927,
260
- "loss": 1.349,
261
  "step": 340
262
  },
263
  {
264
  "epoch": 2.6923076923076925,
265
- "grad_norm": 0.6033434867858887,
266
  "learning_rate": 0.003653846153846154,
267
- "loss": 1.1626,
268
  "step": 350
269
  },
270
  {
271
  "epoch": 2.769230769230769,
272
- "grad_norm": 0.322643518447876,
273
  "learning_rate": 0.0036153846153846154,
274
- "loss": 1.1705,
275
  "step": 360
276
  },
277
  {
278
  "epoch": 2.8461538461538463,
279
- "grad_norm": 0.6473853588104248,
280
  "learning_rate": 0.003576923076923077,
281
- "loss": 1.1492,
282
  "step": 370
283
  },
284
  {
285
  "epoch": 2.9230769230769234,
286
- "grad_norm": 0.4563555121421814,
287
  "learning_rate": 0.003538461538461539,
288
- "loss": 1.1009,
289
  "step": 380
290
  },
291
  {
292
  "epoch": 3.0,
293
- "grad_norm": 0.6773648858070374,
294
  "learning_rate": 0.0034999999999999996,
295
- "loss": 1.1322,
296
  "step": 390
297
  },
298
  {
299
  "epoch": 3.0,
300
- "eval_runtime": 0.7759,
301
- "eval_samples_per_second": 171.41,
302
- "eval_steps_per_second": 21.91,
 
 
303
  "step": 390
304
  },
305
  {
306
  "epoch": 3.076923076923077,
307
- "grad_norm": 0.499421089887619,
308
  "learning_rate": 0.0034615384615384616,
309
- "loss": 1.0926,
310
  "step": 400
311
  },
312
  {
313
  "epoch": 3.1538461538461537,
314
- "grad_norm": 0.20487019419670105,
315
  "learning_rate": 0.003423076923076923,
316
- "loss": 1.129,
317
  "step": 410
318
  },
319
  {
320
  "epoch": 3.230769230769231,
321
- "grad_norm": 0.5652111172676086,
322
  "learning_rate": 0.003384615384615385,
323
- "loss": 1.0965,
324
  "step": 420
325
  },
326
  {
327
  "epoch": 3.3076923076923075,
328
- "grad_norm": 0.5159160494804382,
329
  "learning_rate": 0.003346153846153846,
330
- "loss": 1.0865,
331
  "step": 430
332
  },
333
  {
334
  "epoch": 3.3846153846153846,
335
- "grad_norm": 0.5972450971603394,
336
  "learning_rate": 0.0033076923076923075,
337
- "loss": 1.1468,
338
  "step": 440
339
  },
340
  {
341
  "epoch": 3.4615384615384617,
342
- "grad_norm": 0.4497612714767456,
343
  "learning_rate": 0.0032692307692307695,
344
- "loss": 1.0972,
345
  "step": 450
346
  },
347
  {
348
  "epoch": 3.5384615384615383,
349
- "grad_norm": 0.8494599461555481,
350
  "learning_rate": 0.003230769230769231,
351
- "loss": 1.1196,
352
  "step": 460
353
  },
354
  {
355
  "epoch": 3.6153846153846154,
356
- "grad_norm": 0.21025516092777252,
357
  "learning_rate": 0.003192307692307692,
358
- "loss": 1.1336,
359
  "step": 470
360
  },
361
  {
362
  "epoch": 3.6923076923076925,
363
- "grad_norm": 0.5746878385543823,
364
  "learning_rate": 0.0031538461538461538,
365
- "loss": 1.0988,
366
  "step": 480
367
  },
368
  {
369
  "epoch": 3.769230769230769,
370
- "grad_norm": 0.20434798300266266,
371
  "learning_rate": 0.0031153846153846153,
372
- "loss": 1.1101,
373
  "step": 490
374
  },
375
  {
376
  "epoch": 3.8461538461538463,
377
- "grad_norm": 0.15725131332874298,
378
  "learning_rate": 0.0030769230769230774,
379
- "loss": 1.1133,
380
  "step": 500
381
  },
382
  {
383
  "epoch": 3.9230769230769234,
384
- "grad_norm": 0.12141856551170349,
385
  "learning_rate": 0.0030384615384615385,
386
- "loss": 1.0987,
387
  "step": 510
388
  },
389
  {
390
  "epoch": 4.0,
391
- "grad_norm": 1.1281518936157227,
392
  "learning_rate": 0.003,
393
- "loss": 1.1021,
394
  "step": 520
395
  },
396
  {
397
  "epoch": 4.0,
398
- "eval_runtime": 0.7689,
399
- "eval_samples_per_second": 172.973,
400
- "eval_steps_per_second": 22.109,
 
 
401
  "step": 520
402
  },
403
  {
404
  "epoch": 4.076923076923077,
405
- "grad_norm": 0.5692263841629028,
406
  "learning_rate": 0.0029615384615384616,
407
- "loss": 1.1032,
408
  "step": 530
409
  },
410
  {
411
  "epoch": 4.153846153846154,
412
- "grad_norm": 0.21811030805110931,
413
  "learning_rate": 0.002923076923076923,
414
- "loss": 1.096,
415
  "step": 540
416
  },
417
  {
418
  "epoch": 4.230769230769231,
419
- "grad_norm": 0.44653287529945374,
420
  "learning_rate": 0.0028846153846153843,
421
- "loss": 1.0986,
422
  "step": 550
423
  },
424
  {
425
  "epoch": 4.3076923076923075,
426
- "grad_norm": 0.4496428966522217,
427
  "learning_rate": 0.002846153846153846,
428
- "loss": 1.1244,
429
  "step": 560
430
  },
431
  {
432
  "epoch": 4.384615384615385,
433
- "grad_norm": 0.34072381258010864,
434
  "learning_rate": 0.002807692307692308,
435
- "loss": 1.0982,
436
  "step": 570
437
  },
438
  {
439
  "epoch": 4.461538461538462,
440
- "grad_norm": 0.617884635925293,
441
  "learning_rate": 0.0027692307692307695,
442
- "loss": 1.1137,
443
  "step": 580
444
  },
445
  {
446
  "epoch": 4.538461538461538,
447
- "grad_norm": 0.3251841068267822,
448
  "learning_rate": 0.0027307692307692306,
449
- "loss": 1.1054,
450
  "step": 590
451
  },
452
  {
453
  "epoch": 4.615384615384615,
454
- "grad_norm": 0.11284901946783066,
455
  "learning_rate": 0.002692307692307692,
456
- "loss": 1.0981,
457
  "step": 600
458
  },
459
  {
460
  "epoch": 4.6923076923076925,
461
- "grad_norm": 0.802207887172699,
462
  "learning_rate": 0.0026538461538461538,
463
- "loss": 1.1134,
464
  "step": 610
465
  },
466
  {
467
  "epoch": 4.769230769230769,
468
- "grad_norm": 0.22614385187625885,
469
  "learning_rate": 0.0026153846153846158,
470
- "loss": 1.1155,
471
  "step": 620
472
  },
473
  {
474
  "epoch": 4.846153846153846,
475
- "grad_norm": 0.3756657838821411,
476
  "learning_rate": 0.002576923076923077,
477
- "loss": 1.1113,
478
  "step": 630
479
  },
480
  {
481
  "epoch": 4.923076923076923,
482
- "grad_norm": 0.6828984618186951,
483
  "learning_rate": 0.0025384615384615385,
484
- "loss": 1.1249,
485
  "step": 640
486
  },
487
  {
488
  "epoch": 5.0,
489
- "grad_norm": 1.2701197862625122,
490
  "learning_rate": 0.0025,
491
- "loss": 1.102,
492
  "step": 650
493
  },
494
  {
495
  "epoch": 5.0,
496
- "eval_runtime": 0.8101,
497
- "eval_samples_per_second": 164.178,
498
- "eval_steps_per_second": 20.985,
 
 
499
  "step": 650
500
  },
501
  {
502
  "epoch": 5.076923076923077,
503
- "grad_norm": 0.30358830094337463,
504
  "learning_rate": 0.0024615384615384616,
505
- "loss": 1.1044,
506
  "step": 660
507
  },
508
  {
509
  "epoch": 5.153846153846154,
510
- "grad_norm": 0.42961689829826355,
511
  "learning_rate": 0.002423076923076923,
512
- "loss": 1.0977,
513
  "step": 670
514
  },
515
  {
516
  "epoch": 5.230769230769231,
517
- "grad_norm": 0.24340471625328064,
518
  "learning_rate": 0.0023846153846153848,
519
- "loss": 1.1379,
520
  "step": 680
521
  },
522
  {
523
  "epoch": 5.3076923076923075,
524
- "grad_norm": 0.9648124575614929,
525
  "learning_rate": 0.0023461538461538463,
526
- "loss": 1.1314,
527
  "step": 690
528
  },
529
  {
530
  "epoch": 5.384615384615385,
531
- "grad_norm": 0.6379570960998535,
532
  "learning_rate": 0.002307692307692308,
533
- "loss": 1.1104,
534
  "step": 700
535
  },
536
  {
537
  "epoch": 5.461538461538462,
538
- "grad_norm": 0.32133665680885315,
539
  "learning_rate": 0.0022692307692307695,
540
- "loss": 1.1061,
541
  "step": 710
542
  },
543
  {
544
  "epoch": 5.538461538461538,
545
- "grad_norm": 0.314644455909729,
546
  "learning_rate": 0.002230769230769231,
547
- "loss": 1.1035,
548
  "step": 720
549
  },
550
  {
551
  "epoch": 5.615384615384615,
552
- "grad_norm": 0.4097970128059387,
553
  "learning_rate": 0.002192307692307692,
554
- "loss": 1.1035,
555
  "step": 730
556
  },
557
  {
558
  "epoch": 5.6923076923076925,
559
- "grad_norm": 0.3651571273803711,
560
  "learning_rate": 0.002153846153846154,
561
- "loss": 1.1071,
562
  "step": 740
563
  },
564
  {
565
  "epoch": 5.769230769230769,
566
- "grad_norm": 0.30447086691856384,
567
  "learning_rate": 0.0021153846153846153,
568
- "loss": 1.1048,
569
  "step": 750
570
  },
571
  {
572
  "epoch": 5.846153846153846,
573
- "grad_norm": 0.35882461071014404,
574
  "learning_rate": 0.002076923076923077,
575
- "loss": 1.106,
576
  "step": 760
577
  },
578
  {
579
  "epoch": 5.923076923076923,
580
- "grad_norm": 0.6777219772338867,
581
  "learning_rate": 0.0020384615384615385,
582
- "loss": 1.101,
583
  "step": 770
584
  },
585
  {
586
  "epoch": 6.0,
587
- "grad_norm": 1.031847357749939,
588
  "learning_rate": 0.002,
589
- "loss": 1.1027,
590
  "step": 780
591
  },
592
  {
593
  "epoch": 6.0,
594
- "eval_runtime": 0.7636,
595
- "eval_samples_per_second": 174.172,
596
- "eval_steps_per_second": 22.263,
 
 
597
  "step": 780
598
  },
599
  {
600
  "epoch": 6.076923076923077,
601
- "grad_norm": 0.43265023827552795,
602
  "learning_rate": 0.0019615384615384616,
603
- "loss": 1.104,
604
  "step": 790
605
  },
606
  {
607
  "epoch": 6.153846153846154,
608
- "grad_norm": 0.4353933334350586,
609
  "learning_rate": 0.0019230769230769232,
610
- "loss": 1.099,
611
  "step": 800
612
  },
613
  {
614
  "epoch": 6.230769230769231,
615
- "grad_norm": 0.11455998569726944,
616
  "learning_rate": 0.0018846153846153845,
617
- "loss": 1.0985,
618
  "step": 810
619
  },
620
  {
621
  "epoch": 6.3076923076923075,
622
- "grad_norm": 0.1155104711651802,
623
  "learning_rate": 0.0018461538461538463,
624
- "loss": 1.1026,
625
  "step": 820
626
  },
627
  {
628
  "epoch": 6.384615384615385,
629
- "grad_norm": 0.44076037406921387,
630
  "learning_rate": 0.0018076923076923077,
631
- "loss": 1.0985,
632
  "step": 830
633
  },
634
  {
635
  "epoch": 6.461538461538462,
636
- "grad_norm": 0.5386171936988831,
637
  "learning_rate": 0.0017692307692307695,
638
- "loss": 1.1025,
639
  "step": 840
640
  },
641
  {
642
  "epoch": 6.538461538461538,
643
- "grad_norm": 0.611710786819458,
644
  "learning_rate": 0.0017307692307692308,
645
- "loss": 1.0992,
646
  "step": 850
647
  },
648
  {
649
  "epoch": 6.615384615384615,
650
- "grad_norm": 0.2715957760810852,
651
  "learning_rate": 0.0016923076923076924,
652
- "loss": 1.1008,
653
  "step": 860
654
  },
655
  {
656
  "epoch": 6.6923076923076925,
657
- "grad_norm": 0.4223721921443939,
658
  "learning_rate": 0.0016538461538461537,
659
- "loss": 1.089,
660
  "step": 870
661
  },
662
  {
663
  "epoch": 6.769230769230769,
664
- "grad_norm": 0.14638401567935944,
665
  "learning_rate": 0.0016153846153846155,
666
- "loss": 1.1237,
667
  "step": 880
668
  },
669
  {
670
  "epoch": 6.846153846153846,
671
- "grad_norm": 0.32549309730529785,
672
  "learning_rate": 0.0015769230769230769,
673
- "loss": 1.1031,
674
  "step": 890
675
  },
676
  {
677
  "epoch": 6.923076923076923,
678
- "grad_norm": 0.4604528844356537,
679
  "learning_rate": 0.0015384615384615387,
680
- "loss": 1.1001,
681
  "step": 900
682
  },
683
  {
684
  "epoch": 7.0,
685
- "grad_norm": 0.5566179156303406,
686
  "learning_rate": 0.0015,
687
- "loss": 1.0954,
688
  "step": 910
689
  },
690
  {
691
  "epoch": 7.0,
692
- "eval_runtime": 0.7734,
693
- "eval_samples_per_second": 171.968,
694
- "eval_steps_per_second": 21.981,
 
 
695
  "step": 910
696
  },
697
  {
698
  "epoch": 7.076923076923077,
699
- "grad_norm": 0.5109624266624451,
700
  "learning_rate": 0.0014615384615384616,
701
- "loss": 1.1043,
702
  "step": 920
703
  },
704
  {
705
  "epoch": 7.153846153846154,
706
- "grad_norm": 0.5437686443328857,
707
  "learning_rate": 0.001423076923076923,
708
- "loss": 1.1055,
709
  "step": 930
710
  },
711
  {
712
  "epoch": 7.230769230769231,
713
- "grad_norm": 0.6647012233734131,
714
  "learning_rate": 0.0013846153846153847,
715
- "loss": 1.1021,
716
  "step": 940
717
  },
718
  {
719
  "epoch": 7.3076923076923075,
720
- "grad_norm": 0.4888548254966736,
721
  "learning_rate": 0.001346153846153846,
722
- "loss": 1.0968,
723
  "step": 950
724
  },
725
  {
726
  "epoch": 7.384615384615385,
727
- "grad_norm": 0.26103150844573975,
728
  "learning_rate": 0.0013076923076923079,
729
- "loss": 1.104,
730
  "step": 960
731
  },
732
  {
733
  "epoch": 7.461538461538462,
734
- "grad_norm": 0.2501744031906128,
735
  "learning_rate": 0.0012692307692307692,
736
- "loss": 1.1008,
737
  "step": 970
738
  },
739
  {
740
  "epoch": 7.538461538461538,
741
- "grad_norm": 0.37639835476875305,
742
  "learning_rate": 0.0012307692307692308,
743
- "loss": 1.0998,
744
  "step": 980
745
  },
746
  {
747
  "epoch": 7.615384615384615,
748
- "grad_norm": 0.23087725043296814,
749
  "learning_rate": 0.0011923076923076924,
750
- "loss": 1.0979,
751
  "step": 990
752
  },
753
  {
754
  "epoch": 7.6923076923076925,
755
- "grad_norm": 0.3334718644618988,
756
  "learning_rate": 0.001153846153846154,
757
- "loss": 1.1068,
758
  "step": 1000
759
  },
760
  {
761
  "epoch": 7.769230769230769,
762
- "grad_norm": 0.5154008865356445,
763
  "learning_rate": 0.0011153846153846155,
764
- "loss": 1.099,
765
  "step": 1010
766
  },
767
  {
768
  "epoch": 7.846153846153846,
769
- "grad_norm": 0.2505805492401123,
770
  "learning_rate": 0.001076923076923077,
771
- "loss": 1.0985,
772
  "step": 1020
773
  },
774
  {
775
  "epoch": 7.923076923076923,
776
- "grad_norm": 0.47592583298683167,
777
  "learning_rate": 0.0010384615384615384,
778
- "loss": 1.0968,
779
  "step": 1030
780
  },
781
  {
782
  "epoch": 8.0,
783
- "grad_norm": 0.4868021309375763,
784
  "learning_rate": 0.001,
785
- "loss": 1.1047,
786
  "step": 1040
787
  },
788
  {
789
  "epoch": 8.0,
790
- "eval_runtime": 0.7758,
791
- "eval_samples_per_second": 171.434,
792
- "eval_steps_per_second": 21.913,
 
 
793
  "step": 1040
794
  },
795
  {
796
  "epoch": 8.076923076923077,
797
- "grad_norm": 0.12491131573915482,
798
  "learning_rate": 0.0009615384615384616,
799
- "loss": 1.0981,
800
  "step": 1050
801
  },
802
  {
803
  "epoch": 8.153846153846153,
804
- "grad_norm": 0.23577113449573517,
805
  "learning_rate": 0.0009230769230769232,
806
- "loss": 1.0998,
807
  "step": 1060
808
  },
809
  {
810
  "epoch": 8.23076923076923,
811
- "grad_norm": 0.12919014692306519,
812
  "learning_rate": 0.0008846153846153847,
813
- "loss": 1.098,
814
  "step": 1070
815
  },
816
  {
817
  "epoch": 8.307692307692308,
818
- "grad_norm": 0.11554103344678879,
819
  "learning_rate": 0.0008461538461538462,
820
- "loss": 1.099,
821
  "step": 1080
822
  },
823
  {
824
  "epoch": 8.384615384615385,
825
- "grad_norm": 0.3568715751171112,
826
  "learning_rate": 0.0008076923076923078,
827
- "loss": 1.0935,
828
  "step": 1090
829
  },
830
  {
831
  "epoch": 8.461538461538462,
832
- "grad_norm": 0.4273003041744232,
833
  "learning_rate": 0.0007692307692307693,
834
- "loss": 1.1025,
835
  "step": 1100
836
  },
837
  {
838
  "epoch": 8.538461538461538,
839
- "grad_norm": 0.11346741020679474,
840
  "learning_rate": 0.0007307692307692308,
841
- "loss": 1.1079,
842
  "step": 1110
843
  },
844
  {
845
  "epoch": 8.615384615384615,
846
- "grad_norm": 0.5924614667892456,
847
  "learning_rate": 0.0006923076923076924,
848
- "loss": 1.095,
849
  "step": 1120
850
  },
851
  {
852
  "epoch": 8.692307692307692,
853
- "grad_norm": 0.8924316763877869,
854
  "learning_rate": 0.0006538461538461539,
855
- "loss": 1.0969,
856
  "step": 1130
857
  },
858
  {
859
  "epoch": 8.76923076923077,
860
- "grad_norm": 0.14277611672878265,
861
  "learning_rate": 0.0006153846153846154,
862
- "loss": 1.112,
863
  "step": 1140
864
  },
865
  {
866
  "epoch": 8.846153846153847,
867
- "grad_norm": 0.29049235582351685,
868
  "learning_rate": 0.000576923076923077,
869
- "loss": 1.0914,
870
  "step": 1150
871
  },
872
  {
873
  "epoch": 8.923076923076923,
874
- "grad_norm": 0.5114134550094604,
875
  "learning_rate": 0.0005384615384615385,
876
- "loss": 1.1061,
877
  "step": 1160
878
  },
879
  {
880
  "epoch": 9.0,
881
- "grad_norm": 1.142321228981018,
882
  "learning_rate": 0.0005,
883
- "loss": 1.1105,
884
  "step": 1170
885
  },
886
  {
887
  "epoch": 9.0,
888
- "eval_runtime": 0.7827,
889
- "eval_samples_per_second": 169.933,
890
- "eval_steps_per_second": 21.721,
 
 
891
  "step": 1170
892
  },
893
  {
894
  "epoch": 9.076923076923077,
895
- "grad_norm": 0.12881995737552643,
896
  "learning_rate": 0.0004615384615384616,
897
- "loss": 1.0994,
898
  "step": 1180
899
  },
900
  {
901
  "epoch": 9.153846153846153,
902
- "grad_norm": 0.12638165056705475,
903
  "learning_rate": 0.0004230769230769231,
904
- "loss": 1.0982,
905
  "step": 1190
906
  },
907
  {
908
  "epoch": 9.23076923076923,
909
- "grad_norm": 0.13503584265708923,
910
  "learning_rate": 0.00038461538461538467,
911
- "loss": 1.0996,
912
  "step": 1200
913
  },
914
  {
915
  "epoch": 9.307692307692308,
916
- "grad_norm": 0.23928503692150116,
917
  "learning_rate": 0.0003461538461538462,
918
- "loss": 1.1,
919
  "step": 1210
920
  },
921
  {
922
  "epoch": 9.384615384615385,
923
- "grad_norm": 0.14712156355381012,
924
  "learning_rate": 0.0003076923076923077,
925
- "loss": 1.1033,
926
  "step": 1220
927
  },
928
  {
929
  "epoch": 9.461538461538462,
930
- "grad_norm": 0.603887140750885,
931
  "learning_rate": 0.0002692307692307693,
932
- "loss": 1.0979,
933
  "step": 1230
934
  },
935
  {
936
  "epoch": 9.538461538461538,
937
- "grad_norm": 0.34505996108055115,
938
  "learning_rate": 0.0002307692307692308,
939
- "loss": 1.0977,
940
  "step": 1240
941
  },
942
  {
943
  "epoch": 9.615384615384615,
944
- "grad_norm": 0.11722344160079956,
945
  "learning_rate": 0.00019230769230769233,
946
- "loss": 1.1001,
947
  "step": 1250
948
  },
949
  {
950
  "epoch": 9.692307692307692,
951
- "grad_norm": 0.33938324451446533,
952
  "learning_rate": 0.00015384615384615385,
953
- "loss": 1.0988,
954
  "step": 1260
955
  },
956
  {
957
  "epoch": 9.76923076923077,
958
- "grad_norm": 0.1380094438791275,
959
  "learning_rate": 0.0001153846153846154,
960
- "loss": 1.1004,
961
  "step": 1270
962
  },
963
  {
964
  "epoch": 9.846153846153847,
965
- "grad_norm": 0.2527919411659241,
966
  "learning_rate": 7.692307692307693e-05,
967
- "loss": 1.0999,
968
  "step": 1280
969
  },
970
  {
971
  "epoch": 9.923076923076923,
972
- "grad_norm": 0.316256582736969,
973
  "learning_rate": 3.846153846153846e-05,
974
- "loss": 1.0984,
975
  "step": 1290
976
  },
977
  {
978
  "epoch": 10.0,
979
- "grad_norm": 0.4816994071006775,
980
  "learning_rate": 0.0,
981
- "loss": 1.0979,
982
  "step": 1300
983
  },
984
  {
985
  "epoch": 10.0,
986
- "eval_runtime": 0.7803,
987
- "eval_samples_per_second": 170.446,
988
- "eval_steps_per_second": 21.786,
 
 
989
  "step": 1300
990
  },
991
  {
992
  "epoch": 10.0,
993
  "step": 1300,
994
  "total_flos": 8.400578669044531e+17,
995
- "train_loss": 0.9780362775692573,
996
- "train_runtime": 137.5902,
997
- "train_samples_per_second": 75.151,
998
- "train_steps_per_second": 9.448
999
  }
1000
  ],
1001
  "logging_steps": 10,
 
1
  {
2
+ "best_metric": 0.3851391077041626,
3
+ "best_model_checkpoint": "./beans_outputs/checkpoint-130",
4
  "epoch": 10.0,
5
  "eval_steps": 500,
6
  "global_step": 1300,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.07692307692307693,
13
+ "grad_norm": 1.2493035793304443,
14
  "learning_rate": 0.004961538461538462,
15
+ "loss": 0.8568,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.15384615384615385,
20
+ "grad_norm": 1.4747363328933716,
21
  "learning_rate": 0.004923076923076923,
22
+ "loss": 0.6606,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.23076923076923078,
27
+ "grad_norm": 0.8188544511795044,
28
  "learning_rate": 0.004884615384615385,
29
+ "loss": 0.3787,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.3076923076923077,
34
+ "grad_norm": 1.718248724937439,
35
  "learning_rate": 0.004846153846153846,
36
+ "loss": 0.3154,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.38461538461538464,
41
+ "grad_norm": 2.2010860443115234,
42
  "learning_rate": 0.004807692307692308,
43
+ "loss": 0.4932,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.46153846153846156,
48
+ "grad_norm": 2.927755832672119,
49
  "learning_rate": 0.0047692307692307695,
50
+ "loss": 0.561,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.5384615384615384,
55
+ "grad_norm": 1.2203412055969238,
56
  "learning_rate": 0.004730769230769231,
57
+ "loss": 0.3309,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.6153846153846154,
62
+ "grad_norm": 0.08795254677534103,
63
  "learning_rate": 0.004692307692307693,
64
+ "loss": 0.3261,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.6923076923076923,
69
+ "grad_norm": 0.4609662890434265,
70
  "learning_rate": 0.004653846153846154,
71
+ "loss": 0.195,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.7692307692307693,
76
+ "grad_norm": 0.9046596884727478,
77
  "learning_rate": 0.004615384615384616,
78
+ "loss": 0.3352,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.8461538461538461,
83
+ "grad_norm": 2.5637238025665283,
84
  "learning_rate": 0.0045769230769230765,
85
+ "loss": 0.5961,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.9230769230769231,
90
+ "grad_norm": 0.32293158769607544,
91
  "learning_rate": 0.004538461538461539,
92
+ "loss": 0.236,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 1.0,
97
+ "grad_norm": 7.484982967376709,
98
  "learning_rate": 0.0045000000000000005,
99
+ "loss": 0.6323,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 1.0,
104
+ "eval_accuracy": 0.8721804511278195,
105
+ "eval_loss": 0.3851391077041626,
106
+ "eval_runtime": 0.8932,
107
+ "eval_samples_per_second": 148.91,
108
+ "eval_steps_per_second": 19.034,
109
  "step": 130
110
  },
111
  {
112
  "epoch": 1.0769230769230769,
113
+ "grad_norm": 0.12631376087665558,
114
  "learning_rate": 0.004461538461538462,
115
+ "loss": 0.747,
116
  "step": 140
117
  },
118
  {
119
  "epoch": 1.1538461538461537,
120
+ "grad_norm": 1.5336790084838867,
121
  "learning_rate": 0.004423076923076923,
122
+ "loss": 0.3062,
123
  "step": 150
124
  },
125
  {
126
  "epoch": 1.2307692307692308,
127
+ "grad_norm": 0.7460920214653015,
128
  "learning_rate": 0.004384615384615384,
129
+ "loss": 0.2088,
130
  "step": 160
131
  },
132
  {
133
  "epoch": 1.3076923076923077,
134
+ "grad_norm": 0.6954104900360107,
135
  "learning_rate": 0.004346153846153846,
136
+ "loss": 0.2942,
137
  "step": 170
138
  },
139
  {
140
  "epoch": 1.3846153846153846,
141
+ "grad_norm": 2.2199032306671143,
142
  "learning_rate": 0.004307692307692308,
143
+ "loss": 0.315,
144
  "step": 180
145
  },
146
  {
147
  "epoch": 1.4615384615384617,
148
+ "grad_norm": 2.654019832611084,
149
  "learning_rate": 0.004269230769230769,
150
+ "loss": 0.4203,
151
  "step": 190
152
  },
153
  {
154
  "epoch": 1.5384615384615383,
155
+ "grad_norm": 1.460809350013733,
156
  "learning_rate": 0.004230769230769231,
157
+ "loss": 0.4812,
158
  "step": 200
159
  },
160
  {
161
  "epoch": 1.6153846153846154,
162
+ "grad_norm": 0.9371921420097351,
163
  "learning_rate": 0.004192307692307692,
164
+ "loss": 0.3436,
165
  "step": 210
166
  },
167
  {
168
  "epoch": 1.6923076923076923,
169
+ "grad_norm": 0.573295533657074,
170
  "learning_rate": 0.004153846153846154,
171
+ "loss": 0.7394,
172
  "step": 220
173
  },
174
  {
175
  "epoch": 1.7692307692307692,
176
+ "grad_norm": 3.2005550861358643,
177
  "learning_rate": 0.004115384615384615,
178
+ "loss": 0.5694,
179
  "step": 230
180
  },
181
  {
182
  "epoch": 1.8461538461538463,
183
+ "grad_norm": 1.0854452848434448,
184
  "learning_rate": 0.004076923076923077,
185
+ "loss": 0.6043,
186
  "step": 240
187
  },
188
  {
189
  "epoch": 1.9230769230769231,
190
+ "grad_norm": 1.3743146657943726,
191
  "learning_rate": 0.0040384615384615385,
192
+ "loss": 0.4406,
193
  "step": 250
194
  },
195
  {
196
  "epoch": 2.0,
197
+ "grad_norm": 0.11435157805681229,
198
  "learning_rate": 0.004,
199
+ "loss": 0.4102,
200
  "step": 260
201
  },
202
  {
203
  "epoch": 2.0,
204
+ "eval_accuracy": 0.6240601503759399,
205
+ "eval_loss": 1.1071540117263794,
206
+ "eval_runtime": 0.9165,
207
+ "eval_samples_per_second": 145.121,
208
+ "eval_steps_per_second": 18.549,
209
  "step": 260
210
  },
211
  {
212
  "epoch": 2.076923076923077,
213
+ "grad_norm": 0.9636590480804443,
214
  "learning_rate": 0.003961538461538462,
215
+ "loss": 0.4669,
216
  "step": 270
217
  },
218
  {
219
  "epoch": 2.1538461538461537,
220
+ "grad_norm": 1.4855515956878662,
221
  "learning_rate": 0.003923076923076923,
222
+ "loss": 0.4618,
223
  "step": 280
224
  },
225
  {
226
  "epoch": 2.230769230769231,
227
+ "grad_norm": 1.6462602615356445,
228
  "learning_rate": 0.003884615384615385,
229
+ "loss": 0.5266,
230
  "step": 290
231
  },
232
  {
233
  "epoch": 2.3076923076923075,
234
+ "grad_norm": 2.7035958766937256,
235
  "learning_rate": 0.0038461538461538464,
236
+ "loss": 0.6045,
237
  "step": 300
238
  },
239
  {
240
  "epoch": 2.3846153846153846,
241
+ "grad_norm": 1.7253210544586182,
242
  "learning_rate": 0.0038076923076923075,
243
+ "loss": 0.4711,
244
  "step": 310
245
  },
246
  {
247
  "epoch": 2.4615384615384617,
248
+ "grad_norm": 0.6617446541786194,
249
  "learning_rate": 0.003769230769230769,
250
+ "loss": 0.5785,
251
  "step": 320
252
  },
253
  {
254
  "epoch": 2.5384615384615383,
255
+ "grad_norm": 0.6448426842689514,
256
  "learning_rate": 0.003730769230769231,
257
+ "loss": 0.7907,
258
  "step": 330
259
  },
260
  {
261
  "epoch": 2.6153846153846154,
262
+ "grad_norm": 3.4137613773345947,
263
  "learning_rate": 0.0036923076923076927,
264
+ "loss": 0.9229,
265
  "step": 340
266
  },
267
  {
268
  "epoch": 2.6923076923076925,
269
+ "grad_norm": 5.124917507171631,
270
  "learning_rate": 0.003653846153846154,
271
+ "loss": 1.0287,
272
  "step": 350
273
  },
274
  {
275
  "epoch": 2.769230769230769,
276
+ "grad_norm": 1.2190972566604614,
277
  "learning_rate": 0.0036153846153846154,
278
+ "loss": 0.7791,
279
  "step": 360
280
  },
281
  {
282
  "epoch": 2.8461538461538463,
283
+ "grad_norm": 0.6898893713951111,
284
  "learning_rate": 0.003576923076923077,
285
+ "loss": 1.4277,
286
  "step": 370
287
  },
288
  {
289
  "epoch": 2.9230769230769234,
290
+ "grad_norm": 2.563831090927124,
291
  "learning_rate": 0.003538461538461539,
292
+ "loss": 1.3955,
293
  "step": 380
294
  },
295
  {
296
  "epoch": 3.0,
297
+ "grad_norm": 2.519380569458008,
298
  "learning_rate": 0.0034999999999999996,
299
+ "loss": 0.8964,
300
  "step": 390
301
  },
302
  {
303
  "epoch": 3.0,
304
+ "eval_accuracy": 0.6616541353383458,
305
+ "eval_loss": 0.8284351825714111,
306
+ "eval_runtime": 0.9347,
307
+ "eval_samples_per_second": 142.291,
308
+ "eval_steps_per_second": 18.188,
309
  "step": 390
310
  },
311
  {
312
  "epoch": 3.076923076923077,
313
+ "grad_norm": 4.3924150466918945,
314
  "learning_rate": 0.0034615384615384616,
315
+ "loss": 1.2627,
316
  "step": 400
317
  },
318
  {
319
  "epoch": 3.1538461538461537,
320
+ "grad_norm": 1.773996114730835,
321
  "learning_rate": 0.003423076923076923,
322
+ "loss": 1.3465,
323
  "step": 410
324
  },
325
  {
326
  "epoch": 3.230769230769231,
327
+ "grad_norm": 3.1790945529937744,
328
  "learning_rate": 0.003384615384615385,
329
+ "loss": 1.2822,
330
  "step": 420
331
  },
332
  {
333
  "epoch": 3.3076923076923075,
334
+ "grad_norm": 5.48622465133667,
335
  "learning_rate": 0.003346153846153846,
336
+ "loss": 1.0807,
337
  "step": 430
338
  },
339
  {
340
  "epoch": 3.3846153846153846,
341
+ "grad_norm": 3.6509623527526855,
342
  "learning_rate": 0.0033076923076923075,
343
+ "loss": 1.1806,
344
  "step": 440
345
  },
346
  {
347
  "epoch": 3.4615384615384617,
348
+ "grad_norm": 2.3183326721191406,
349
  "learning_rate": 0.0032692307692307695,
350
+ "loss": 1.2542,
351
  "step": 450
352
  },
353
  {
354
  "epoch": 3.5384615384615383,
355
+ "grad_norm": 3.2686471939086914,
356
  "learning_rate": 0.003230769230769231,
357
+ "loss": 1.2217,
358
  "step": 460
359
  },
360
  {
361
  "epoch": 3.6153846153846154,
362
+ "grad_norm": 0.33250316977500916,
363
  "learning_rate": 0.003192307692307692,
364
+ "loss": 1.2107,
365
  "step": 470
366
  },
367
  {
368
  "epoch": 3.6923076923076925,
369
+ "grad_norm": 2.4644064903259277,
370
  "learning_rate": 0.0031538461538461538,
371
+ "loss": 1.2287,
372
  "step": 480
373
  },
374
  {
375
  "epoch": 3.769230769230769,
376
+ "grad_norm": 0.9661989808082581,
377
  "learning_rate": 0.0031153846153846153,
378
+ "loss": 1.1392,
379
  "step": 490
380
  },
381
  {
382
  "epoch": 3.8461538461538463,
383
+ "grad_norm": 1.0765918493270874,
384
  "learning_rate": 0.0030769230769230774,
385
+ "loss": 1.1576,
386
  "step": 500
387
  },
388
  {
389
  "epoch": 3.9230769230769234,
390
+ "grad_norm": 0.5606147646903992,
391
  "learning_rate": 0.0030384615384615385,
392
+ "loss": 1.1148,
393
  "step": 510
394
  },
395
  {
396
  "epoch": 4.0,
397
+ "grad_norm": 3.787473678588867,
398
  "learning_rate": 0.003,
399
+ "loss": 1.155,
400
  "step": 520
401
  },
402
  {
403
  "epoch": 4.0,
404
+ "eval_accuracy": 0.3383458646616541,
405
+ "eval_loss": 1.1070177555084229,
406
+ "eval_runtime": 0.9062,
407
+ "eval_samples_per_second": 146.767,
408
+ "eval_steps_per_second": 18.76,
409
  "step": 520
410
  },
411
  {
412
  "epoch": 4.076923076923077,
413
+ "grad_norm": 1.8376567363739014,
414
  "learning_rate": 0.0029615384615384616,
415
+ "loss": 1.16,
416
  "step": 530
417
  },
418
  {
419
  "epoch": 4.153846153846154,
420
+ "grad_norm": 0.1573391705751419,
421
  "learning_rate": 0.002923076923076923,
422
+ "loss": 1.1133,
423
  "step": 540
424
  },
425
  {
426
  "epoch": 4.230769230769231,
427
+ "grad_norm": 1.7338229417800903,
428
  "learning_rate": 0.0028846153846153843,
429
+ "loss": 1.1604,
430
  "step": 550
431
  },
432
  {
433
  "epoch": 4.3076923076923075,
434
+ "grad_norm": 1.532076358795166,
435
  "learning_rate": 0.002846153846153846,
436
+ "loss": 1.171,
437
  "step": 560
438
  },
439
  {
440
  "epoch": 4.384615384615385,
441
+ "grad_norm": 1.2591376304626465,
442
  "learning_rate": 0.002807692307692308,
443
+ "loss": 1.116,
444
  "step": 570
445
  },
446
  {
447
  "epoch": 4.461538461538462,
448
+ "grad_norm": 1.7697715759277344,
449
  "learning_rate": 0.0027692307692307695,
450
+ "loss": 1.1409,
451
  "step": 580
452
  },
453
  {
454
  "epoch": 4.538461538461538,
455
+ "grad_norm": 1.0740739107131958,
456
  "learning_rate": 0.0027307692307692306,
457
+ "loss": 1.1458,
458
  "step": 590
459
  },
460
  {
461
  "epoch": 4.615384615384615,
462
+ "grad_norm": 0.46824851632118225,
463
  "learning_rate": 0.002692307692307692,
464
+ "loss": 1.1056,
465
  "step": 600
466
  },
467
  {
468
  "epoch": 4.6923076923076925,
469
+ "grad_norm": 1.8973933458328247,
470
  "learning_rate": 0.0026538461538461538,
471
+ "loss": 1.1024,
472
  "step": 610
473
  },
474
  {
475
  "epoch": 4.769230769230769,
476
+ "grad_norm": 0.06765652447938919,
477
  "learning_rate": 0.0026153846153846158,
478
+ "loss": 1.1514,
479
  "step": 620
480
  },
481
  {
482
  "epoch": 4.846153846153846,
483
+ "grad_norm": 0.8284403085708618,
484
  "learning_rate": 0.002576923076923077,
485
+ "loss": 1.1799,
486
  "step": 630
487
  },
488
  {
489
  "epoch": 4.923076923076923,
490
+ "grad_norm": 1.611846923828125,
491
  "learning_rate": 0.0025384615384615385,
492
+ "loss": 1.169,
493
  "step": 640
494
  },
495
  {
496
  "epoch": 5.0,
497
+ "grad_norm": 3.048553705215454,
498
  "learning_rate": 0.0025,
499
+ "loss": 1.1069,
500
  "step": 650
501
  },
502
  {
503
  "epoch": 5.0,
504
+ "eval_accuracy": 0.3308270676691729,
505
+ "eval_loss": 1.1081749200820923,
506
+ "eval_runtime": 0.9355,
507
+ "eval_samples_per_second": 142.176,
508
+ "eval_steps_per_second": 18.173,
509
  "step": 650
510
  },
511
  {
512
  "epoch": 5.076923076923077,
513
+ "grad_norm": 0.6719330549240112,
514
  "learning_rate": 0.0024615384615384616,
515
+ "loss": 1.1054,
516
  "step": 660
517
  },
518
  {
519
  "epoch": 5.153846153846154,
520
+ "grad_norm": 0.8374503254890442,
521
  "learning_rate": 0.002423076923076923,
522
+ "loss": 1.1134,
523
  "step": 670
524
  },
525
  {
526
  "epoch": 5.230769230769231,
527
+ "grad_norm": 0.5713789463043213,
528
  "learning_rate": 0.0023846153846153848,
529
+ "loss": 1.1534,
530
  "step": 680
531
  },
532
  {
533
  "epoch": 5.3076923076923075,
534
+ "grad_norm": 1.0185028314590454,
535
  "learning_rate": 0.0023461538461538463,
536
+ "loss": 1.1368,
537
  "step": 690
538
  },
539
  {
540
  "epoch": 5.384615384615385,
541
+ "grad_norm": 1.021331787109375,
542
  "learning_rate": 0.002307692307692308,
543
+ "loss": 1.094,
544
  "step": 700
545
  },
546
  {
547
  "epoch": 5.461538461538462,
548
+ "grad_norm": 0.9150602221488953,
549
  "learning_rate": 0.0022692307692307695,
550
+ "loss": 1.1187,
551
  "step": 710
552
  },
553
  {
554
  "epoch": 5.538461538461538,
555
+ "grad_norm": 0.741632878780365,
556
  "learning_rate": 0.002230769230769231,
557
+ "loss": 1.1129,
558
  "step": 720
559
  },
560
  {
561
  "epoch": 5.615384615384615,
562
+ "grad_norm": 1.5859191417694092,
563
  "learning_rate": 0.002192307692307692,
564
+ "loss": 1.135,
565
  "step": 730
566
  },
567
  {
568
  "epoch": 5.6923076923076925,
569
+ "grad_norm": 2.2667315006256104,
570
  "learning_rate": 0.002153846153846154,
571
+ "loss": 1.1474,
572
  "step": 740
573
  },
574
  {
575
  "epoch": 5.769230769230769,
576
+ "grad_norm": 1.3794482946395874,
577
  "learning_rate": 0.0021153846153846153,
578
+ "loss": 1.1095,
579
  "step": 750
580
  },
581
  {
582
  "epoch": 5.846153846153846,
583
+ "grad_norm": 1.2423471212387085,
584
  "learning_rate": 0.002076923076923077,
585
+ "loss": 1.1057,
586
  "step": 760
587
  },
588
  {
589
  "epoch": 5.923076923076923,
590
+ "grad_norm": 4.28483247756958,
591
  "learning_rate": 0.0020384615384615385,
592
+ "loss": 1.1093,
593
  "step": 770
594
  },
595
  {
596
  "epoch": 6.0,
597
+ "grad_norm": 4.998449802398682,
598
  "learning_rate": 0.002,
599
+ "loss": 1.1748,
600
  "step": 780
601
  },
602
  {
603
  "epoch": 6.0,
604
+ "eval_accuracy": 0.3308270676691729,
605
+ "eval_loss": 1.1041723489761353,
606
+ "eval_runtime": 0.9442,
607
+ "eval_samples_per_second": 140.864,
608
+ "eval_steps_per_second": 18.005,
609
  "step": 780
610
  },
611
  {
612
  "epoch": 6.076923076923077,
613
+ "grad_norm": 2.81730318069458,
614
  "learning_rate": 0.0019615384615384616,
615
+ "loss": 1.1381,
616
  "step": 790
617
  },
618
  {
619
  "epoch": 6.153846153846154,
620
+ "grad_norm": 2.5270557403564453,
621
  "learning_rate": 0.0019230769230769232,
622
+ "loss": 1.1104,
623
  "step": 800
624
  },
625
  {
626
  "epoch": 6.230769230769231,
627
+ "grad_norm": 0.8537469506263733,
628
  "learning_rate": 0.0018846153846153845,
629
+ "loss": 1.1327,
630
  "step": 810
631
  },
632
  {
633
  "epoch": 6.3076923076923075,
634
+ "grad_norm": 0.8818097710609436,
635
  "learning_rate": 0.0018461538461538463,
636
+ "loss": 1.1437,
637
  "step": 820
638
  },
639
  {
640
  "epoch": 6.384615384615385,
641
+ "grad_norm": 2.496903896331787,
642
  "learning_rate": 0.0018076923076923077,
643
+ "loss": 1.1251,
644
  "step": 830
645
  },
646
  {
647
  "epoch": 6.461538461538462,
648
+ "grad_norm": 3.2999167442321777,
649
  "learning_rate": 0.0017692307692307695,
650
+ "loss": 1.1373,
651
  "step": 840
652
  },
653
  {
654
  "epoch": 6.538461538461538,
655
+ "grad_norm": 2.9537723064422607,
656
  "learning_rate": 0.0017307692307692308,
657
+ "loss": 1.176,
658
  "step": 850
659
  },
660
  {
661
  "epoch": 6.615384615384615,
662
+ "grad_norm": 1.5148764848709106,
663
  "learning_rate": 0.0016923076923076924,
664
+ "loss": 1.1146,
665
  "step": 860
666
  },
667
  {
668
  "epoch": 6.6923076923076925,
669
+ "grad_norm": 1.3614485263824463,
670
  "learning_rate": 0.0016538461538461537,
671
+ "loss": 1.0895,
672
  "step": 870
673
  },
674
  {
675
  "epoch": 6.769230769230769,
676
+ "grad_norm": 0.6686407327651978,
677
  "learning_rate": 0.0016153846153846155,
678
+ "loss": 1.1245,
679
  "step": 880
680
  },
681
  {
682
  "epoch": 6.846153846153846,
683
+ "grad_norm": 1.6276365518569946,
684
  "learning_rate": 0.0015769230769230769,
685
+ "loss": 1.1412,
686
  "step": 890
687
  },
688
  {
689
  "epoch": 6.923076923076923,
690
+ "grad_norm": 1.4945542812347412,
691
  "learning_rate": 0.0015384615384615387,
692
+ "loss": 1.1247,
693
  "step": 900
694
  },
695
  {
696
  "epoch": 7.0,
697
+ "grad_norm": 2.74285888671875,
698
  "learning_rate": 0.0015,
699
+ "loss": 1.1514,
700
  "step": 910
701
  },
702
  {
703
  "epoch": 7.0,
704
+ "eval_accuracy": 0.3308270676691729,
705
+ "eval_loss": 1.1007708311080933,
706
+ "eval_runtime": 0.9264,
707
+ "eval_samples_per_second": 143.562,
708
+ "eval_steps_per_second": 18.35,
709
  "step": 910
710
  },
711
  {
712
  "epoch": 7.076923076923077,
713
+ "grad_norm": 2.248867988586426,
714
  "learning_rate": 0.0014615384615384616,
715
+ "loss": 1.0913,
716
  "step": 920
717
  },
718
  {
719
  "epoch": 7.153846153846154,
720
+ "grad_norm": 2.5912275314331055,
721
  "learning_rate": 0.001423076923076923,
722
+ "loss": 1.1193,
723
  "step": 930
724
  },
725
  {
726
  "epoch": 7.230769230769231,
727
+ "grad_norm": 3.573967695236206,
728
  "learning_rate": 0.0013846153846153847,
729
+ "loss": 1.1352,
730
  "step": 940
731
  },
732
  {
733
  "epoch": 7.3076923076923075,
734
+ "grad_norm": 2.077554225921631,
735
  "learning_rate": 0.001346153846153846,
736
+ "loss": 1.0947,
737
  "step": 950
738
  },
739
  {
740
  "epoch": 7.384615384615385,
741
+ "grad_norm": 1.6364086866378784,
742
  "learning_rate": 0.0013076923076923079,
743
+ "loss": 1.1296,
744
  "step": 960
745
  },
746
  {
747
  "epoch": 7.461538461538462,
748
+ "grad_norm": 1.1806069612503052,
749
  "learning_rate": 0.0012692307692307692,
750
+ "loss": 1.1033,
751
  "step": 970
752
  },
753
  {
754
  "epoch": 7.538461538461538,
755
+ "grad_norm": 2.4788713455200195,
756
  "learning_rate": 0.0012307692307692308,
757
+ "loss": 1.1148,
758
  "step": 980
759
  },
760
  {
761
  "epoch": 7.615384615384615,
762
+ "grad_norm": 0.8245519399642944,
763
  "learning_rate": 0.0011923076923076924,
764
+ "loss": 1.1121,
765
  "step": 990
766
  },
767
  {
768
  "epoch": 7.6923076923076925,
769
+ "grad_norm": 1.568903923034668,
770
  "learning_rate": 0.001153846153846154,
771
+ "loss": 1.1163,
772
  "step": 1000
773
  },
774
  {
775
  "epoch": 7.769230769230769,
776
+ "grad_norm": 2.2075676918029785,
777
  "learning_rate": 0.0011153846153846155,
778
+ "loss": 1.0956,
779
  "step": 1010
780
  },
781
  {
782
  "epoch": 7.846153846153846,
783
+ "grad_norm": 1.0866578817367554,
784
  "learning_rate": 0.001076923076923077,
785
+ "loss": 1.1065,
786
  "step": 1020
787
  },
788
  {
789
  "epoch": 7.923076923076923,
790
+ "grad_norm": 2.4340384006500244,
791
  "learning_rate": 0.0010384615384615384,
792
+ "loss": 1.1118,
793
  "step": 1030
794
  },
795
  {
796
  "epoch": 8.0,
797
+ "grad_norm": 2.1427950859069824,
798
  "learning_rate": 0.001,
799
+ "loss": 1.0956,
800
  "step": 1040
801
  },
802
  {
803
  "epoch": 8.0,
804
+ "eval_accuracy": 0.3383458646616541,
805
+ "eval_loss": 1.1004995107650757,
806
+ "eval_runtime": 0.9133,
807
+ "eval_samples_per_second": 145.621,
808
+ "eval_steps_per_second": 18.613,
809
  "step": 1040
810
  },
811
  {
812
  "epoch": 8.076923076923077,
813
+ "grad_norm": 0.689738392829895,
814
  "learning_rate": 0.0009615384615384616,
815
+ "loss": 1.1119,
816
  "step": 1050
817
  },
818
  {
819
  "epoch": 8.153846153846153,
820
+ "grad_norm": 0.7051054835319519,
821
  "learning_rate": 0.0009230769230769232,
822
+ "loss": 1.1076,
823
  "step": 1060
824
  },
825
  {
826
  "epoch": 8.23076923076923,
827
+ "grad_norm": 0.8233575820922852,
828
  "learning_rate": 0.0008846153846153847,
829
+ "loss": 1.1,
830
  "step": 1070
831
  },
832
  {
833
  "epoch": 8.307692307692308,
834
+ "grad_norm": 0.4401038885116577,
835
  "learning_rate": 0.0008461538461538462,
836
+ "loss": 1.1231,
837
  "step": 1080
838
  },
839
  {
840
  "epoch": 8.384615384615385,
841
+ "grad_norm": 2.0618515014648438,
842
  "learning_rate": 0.0008076923076923078,
843
+ "loss": 1.0877,
844
  "step": 1090
845
  },
846
  {
847
  "epoch": 8.461538461538462,
848
+ "grad_norm": 1.629764199256897,
849
  "learning_rate": 0.0007692307692307693,
850
+ "loss": 1.1165,
851
  "step": 1100
852
  },
853
  {
854
  "epoch": 8.538461538461538,
855
+ "grad_norm": 0.9111972451210022,
856
  "learning_rate": 0.0007307692307692308,
857
+ "loss": 1.1033,
858
  "step": 1110
859
  },
860
  {
861
  "epoch": 8.615384615384615,
862
+ "grad_norm": 2.03430438041687,
863
  "learning_rate": 0.0006923076923076924,
864
+ "loss": 1.0906,
865
  "step": 1120
866
  },
867
  {
868
  "epoch": 8.692307692307692,
869
+ "grad_norm": 3.2121763229370117,
870
  "learning_rate": 0.0006538461538461539,
871
+ "loss": 1.126,
872
  "step": 1130
873
  },
874
  {
875
  "epoch": 8.76923076923077,
876
+ "grad_norm": 0.34755992889404297,
877
  "learning_rate": 0.0006153846153846154,
878
+ "loss": 1.1051,
879
  "step": 1140
880
  },
881
  {
882
  "epoch": 8.846153846153847,
883
+ "grad_norm": 0.7230011224746704,
884
  "learning_rate": 0.000576923076923077,
885
+ "loss": 1.0721,
886
  "step": 1150
887
  },
888
  {
889
  "epoch": 8.923076923076923,
890
+ "grad_norm": 2.8672590255737305,
891
  "learning_rate": 0.0005384615384615385,
892
+ "loss": 1.0711,
893
  "step": 1160
894
  },
895
  {
896
  "epoch": 9.0,
897
+ "grad_norm": 2.920103073120117,
898
  "learning_rate": 0.0005,
899
+ "loss": 1.1448,
900
  "step": 1170
901
  },
902
  {
903
  "epoch": 9.0,
904
+ "eval_accuracy": 0.3308270676691729,
905
+ "eval_loss": 1.111855149269104,
906
+ "eval_runtime": 0.9175,
907
+ "eval_samples_per_second": 144.96,
908
+ "eval_steps_per_second": 18.529,
909
  "step": 1170
910
  },
911
  {
912
  "epoch": 9.076923076923077,
913
+ "grad_norm": 0.26905354857444763,
914
  "learning_rate": 0.0004615384615384616,
915
+ "loss": 1.1121,
916
  "step": 1180
917
  },
918
  {
919
  "epoch": 9.153846153846153,
920
+ "grad_norm": 0.22876305878162384,
921
  "learning_rate": 0.0004230769230769231,
922
+ "loss": 1.0967,
923
  "step": 1190
924
  },
925
  {
926
  "epoch": 9.23076923076923,
927
+ "grad_norm": 0.24429693818092346,
928
  "learning_rate": 0.00038461538461538467,
929
+ "loss": 1.1023,
930
  "step": 1200
931
  },
932
  {
933
  "epoch": 9.307692307692308,
934
+ "grad_norm": 0.6317471861839294,
935
  "learning_rate": 0.0003461538461538462,
936
+ "loss": 1.1062,
937
  "step": 1210
938
  },
939
  {
940
  "epoch": 9.384615384615385,
941
+ "grad_norm": 0.41996484994888306,
942
  "learning_rate": 0.0003076923076923077,
943
+ "loss": 1.1162,
944
  "step": 1220
945
  },
946
  {
947
  "epoch": 9.461538461538462,
948
+ "grad_norm": 1.6321758031845093,
949
  "learning_rate": 0.0002692307692307693,
950
+ "loss": 1.0976,
951
  "step": 1230
952
  },
953
  {
954
  "epoch": 9.538461538461538,
955
+ "grad_norm": 0.966174840927124,
956
  "learning_rate": 0.0002307692307692308,
957
+ "loss": 1.0979,
958
  "step": 1240
959
  },
960
  {
961
  "epoch": 9.615384615384615,
962
+ "grad_norm": 0.23766568303108215,
963
  "learning_rate": 0.00019230769230769233,
964
+ "loss": 1.1012,
965
  "step": 1250
966
  },
967
  {
968
  "epoch": 9.692307692307692,
969
+ "grad_norm": 0.930920422077179,
970
  "learning_rate": 0.00015384615384615385,
971
+ "loss": 1.0991,
972
  "step": 1260
973
  },
974
  {
975
  "epoch": 9.76923076923077,
976
+ "grad_norm": 0.34004735946655273,
977
  "learning_rate": 0.0001153846153846154,
978
+ "loss": 1.1009,
979
  "step": 1270
980
  },
981
  {
982
  "epoch": 9.846153846153847,
983
+ "grad_norm": 0.663216233253479,
984
  "learning_rate": 7.692307692307693e-05,
985
+ "loss": 1.0997,
986
  "step": 1280
987
  },
988
  {
989
  "epoch": 9.923076923076923,
990
+ "grad_norm": 0.9082524180412292,
991
  "learning_rate": 3.846153846153846e-05,
992
+ "loss": 1.0998,
993
  "step": 1290
994
  },
995
  {
996
  "epoch": 10.0,
997
+ "grad_norm": 1.356846570968628,
998
  "learning_rate": 0.0,
999
+ "loss": 1.0991,
1000
  "step": 1300
1001
  },
1002
  {
1003
  "epoch": 10.0,
1004
+ "eval_accuracy": 0.3308270676691729,
1005
+ "eval_loss": 1.0983037948608398,
1006
+ "eval_runtime": 0.9323,
1007
+ "eval_samples_per_second": 142.651,
1008
+ "eval_steps_per_second": 18.234,
1009
  "step": 1300
1010
  },
1011
  {
1012
  "epoch": 10.0,
1013
  "step": 1300,
1014
  "total_flos": 8.400578669044531e+17,
1015
+ "train_loss": 0.9618942783429072,
1016
+ "train_runtime": 164.295,
1017
+ "train_samples_per_second": 62.936,
1018
+ "train_steps_per_second": 7.913
1019
  }
1020
  ],
1021
  "logging_steps": 10,