heisenberg3376 committed on
Commit
37b2dbd
·
verified ·
1 Parent(s): 642b837

🍻 cheers

Browse files
README.md CHANGED
@@ -2,6 +2,7 @@
2
  license: apache-2.0
3
  base_model: google/vit-base-patch16-224-in21k
4
  tags:
 
5
  - generated_from_trainer
6
  datasets:
7
  - imagefolder
@@ -14,7 +15,7 @@ model-index:
14
  name: Image Classification
15
  type: image-classification
16
  dataset:
17
- name: imagefolder
18
  type: imagefolder
19
  config: default
20
  split: validation
@@ -22,7 +23,7 @@ model-index:
22
  metrics:
23
  - name: Accuracy
24
  type: accuracy
25
- value: 0.9018181818181819
26
  ---
27
 
28
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -30,10 +31,10 @@ should probably proofread and complete it, then remove this comment. -->
30
 
31
  # vit-base-food-items-v1
32
 
33
- This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the imagefolder dataset.
34
  It achieves the following results on the evaluation set:
35
- - Loss: 0.4941
36
- - Accuracy: 0.9018
37
 
38
  ## Model description
39
 
 
2
  license: apache-2.0
3
  base_model: google/vit-base-patch16-224-in21k
4
  tags:
5
+ - image-classification
6
  - generated_from_trainer
7
  datasets:
8
  - imagefolder
 
15
  name: Image Classification
16
  type: image-classification
17
  dataset:
18
+ name: beans
19
  type: imagefolder
20
  config: default
21
  split: validation
 
23
  metrics:
24
  - name: Accuracy
25
  type: accuracy
26
+ value: 0.9090909090909091
27
  ---
28
 
29
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
31
 
32
  # vit-base-food-items-v1
33
 
34
+ This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the beans dataset.
35
  It achieves the following results on the evaluation set:
36
+ - Loss: 0.4524
37
+ - Accuracy: 0.9091
38
 
39
  ## Model description
40
 
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 4.0,
3
- "eval_accuracy": 0.9236363636363636,
4
- "eval_loss": 0.33629149198532104,
5
- "eval_runtime": 7.1163,
6
- "eval_samples_per_second": 77.287,
7
- "eval_steps_per_second": 9.696,
8
  "total_flos": 7.501829674622976e+17,
9
- "train_loss": 0.22265003621578217,
10
- "train_runtime": 237.6059,
11
- "train_samples_per_second": 40.74,
12
- "train_steps_per_second": 2.559
13
  }
 
1
  {
2
  "epoch": 4.0,
3
+ "eval_accuracy": 0.9090909090909091,
4
+ "eval_loss": 0.45239612460136414,
5
+ "eval_runtime": 6.966,
6
+ "eval_samples_per_second": 78.955,
7
+ "eval_steps_per_second": 9.905,
8
  "total_flos": 7.501829674622976e+17,
9
+ "train_loss": 0.03790271527280933,
10
+ "train_runtime": 250.8529,
11
+ "train_samples_per_second": 38.588,
12
+ "train_steps_per_second": 2.424
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 4.0,
3
- "eval_accuracy": 0.9236363636363636,
4
- "eval_loss": 0.33629149198532104,
5
- "eval_runtime": 7.1163,
6
- "eval_samples_per_second": 77.287,
7
- "eval_steps_per_second": 9.696
8
  }
 
1
  {
2
  "epoch": 4.0,
3
+ "eval_accuracy": 0.9090909090909091,
4
+ "eval_loss": 0.45239612460136414,
5
+ "eval_runtime": 6.966,
6
+ "eval_samples_per_second": 78.955,
7
+ "eval_steps_per_second": 9.905
8
  }
runs/Jul17_09-32-17_405903fcfe02/events.out.tfevents.1721209043.405903fcfe02.739.4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cae91c84c280b26535650b9c88a0ab24ef8f9791cc8eb4c0a1eb3390e12b2e2b
3
+ size 411
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 4.0,
3
  "total_flos": 7.501829674622976e+17,
4
- "train_loss": 0.22265003621578217,
5
- "train_runtime": 237.6059,
6
- "train_samples_per_second": 40.74,
7
- "train_steps_per_second": 2.559
8
  }
 
1
  {
2
  "epoch": 4.0,
3
  "total_flos": 7.501829674622976e+17,
4
+ "train_loss": 0.03790271527280933,
5
+ "train_runtime": 250.8529,
6
+ "train_samples_per_second": 38.588,
7
+ "train_steps_per_second": 2.424
8
  }
trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
- "best_metric": 0.33629149198532104,
3
- "best_model_checkpoint": "vit-base-food-items-v1/checkpoint-400",
4
  "epoch": 4.0,
5
  "eval_steps": 100,
6
  "global_step": 608,
@@ -10,486 +10,486 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.06578947368421052,
13
- "grad_norm": 2.054168224334717,
14
  "learning_rate": 0.00019671052631578949,
15
- "loss": 2.2227,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.13157894736842105,
20
- "grad_norm": 2.418569326400757,
21
  "learning_rate": 0.00019342105263157894,
22
- "loss": 1.7988,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.19736842105263158,
27
- "grad_norm": 2.0799572467803955,
28
  "learning_rate": 0.00019013157894736844,
29
- "loss": 1.3952,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.2631578947368421,
34
- "grad_norm": 2.5012855529785156,
35
  "learning_rate": 0.00018684210526315792,
36
- "loss": 1.0071,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.32894736842105265,
41
- "grad_norm": 1.610549807548523,
42
  "learning_rate": 0.00018355263157894736,
43
- "loss": 0.8514,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.39473684210526316,
48
- "grad_norm": 2.7514488697052,
49
  "learning_rate": 0.00018026315789473684,
50
- "loss": 0.6752,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.4605263157894737,
55
- "grad_norm": 5.107870101928711,
56
  "learning_rate": 0.00017697368421052632,
57
- "loss": 0.617,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.5263157894736842,
62
- "grad_norm": 1.621307611465454,
63
  "learning_rate": 0.0001736842105263158,
64
- "loss": 0.4383,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.5921052631578947,
69
- "grad_norm": 2.050955057144165,
70
  "learning_rate": 0.00017039473684210527,
71
- "loss": 0.4703,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.6578947368421053,
76
- "grad_norm": 3.5689868927001953,
77
  "learning_rate": 0.00016710526315789475,
78
- "loss": 0.4195,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.6578947368421053,
83
- "eval_accuracy": 0.9054545454545454,
84
- "eval_loss": 0.5027927756309509,
85
- "eval_runtime": 6.6566,
86
- "eval_samples_per_second": 82.625,
87
- "eval_steps_per_second": 10.366,
88
  "step": 100
89
  },
90
  {
91
  "epoch": 0.7236842105263158,
92
- "grad_norm": 2.683819055557251,
93
  "learning_rate": 0.00016381578947368422,
94
- "loss": 0.3666,
95
  "step": 110
96
  },
97
  {
98
  "epoch": 0.7894736842105263,
99
- "grad_norm": 2.7733426094055176,
100
  "learning_rate": 0.0001605263157894737,
101
- "loss": 0.3876,
102
  "step": 120
103
  },
104
  {
105
  "epoch": 0.8552631578947368,
106
- "grad_norm": 3.341937303543091,
107
  "learning_rate": 0.00015723684210526318,
108
- "loss": 0.3778,
109
  "step": 130
110
  },
111
  {
112
  "epoch": 0.9210526315789473,
113
- "grad_norm": 1.0890475511550903,
114
  "learning_rate": 0.00015394736842105265,
115
- "loss": 0.3368,
116
  "step": 140
117
  },
118
  {
119
  "epoch": 0.9868421052631579,
120
- "grad_norm": 3.217635154724121,
121
  "learning_rate": 0.0001506578947368421,
122
- "loss": 0.2434,
123
  "step": 150
124
  },
125
  {
126
  "epoch": 1.0526315789473684,
127
- "grad_norm": 1.1362298727035522,
128
  "learning_rate": 0.00014736842105263158,
129
- "loss": 0.1537,
130
  "step": 160
131
  },
132
  {
133
  "epoch": 1.118421052631579,
134
- "grad_norm": 0.3043310344219208,
135
  "learning_rate": 0.00014407894736842106,
136
- "loss": 0.1786,
137
  "step": 170
138
  },
139
  {
140
  "epoch": 1.1842105263157894,
141
- "grad_norm": 0.36744824051856995,
142
  "learning_rate": 0.00014078947368421053,
143
- "loss": 0.1534,
144
  "step": 180
145
  },
146
  {
147
  "epoch": 1.25,
148
- "grad_norm": 0.4088458716869354,
149
  "learning_rate": 0.0001375,
150
- "loss": 0.1273,
151
  "step": 190
152
  },
153
  {
154
  "epoch": 1.3157894736842106,
155
- "grad_norm": 0.22641144692897797,
156
  "learning_rate": 0.00013421052631578948,
157
- "loss": 0.1072,
158
  "step": 200
159
  },
160
  {
161
  "epoch": 1.3157894736842106,
162
- "eval_accuracy": 0.8945454545454545,
163
- "eval_loss": 0.37944725155830383,
164
- "eval_runtime": 6.6833,
165
- "eval_samples_per_second": 82.295,
166
- "eval_steps_per_second": 10.324,
167
  "step": 200
168
  },
169
  {
170
  "epoch": 1.381578947368421,
171
- "grad_norm": 0.14886893332004547,
172
  "learning_rate": 0.00013092105263157893,
173
- "loss": 0.0846,
174
  "step": 210
175
  },
176
  {
177
  "epoch": 1.4473684210526316,
178
- "grad_norm": 0.17389647662639618,
179
  "learning_rate": 0.00012763157894736844,
180
- "loss": 0.0789,
181
  "step": 220
182
  },
183
  {
184
  "epoch": 1.513157894736842,
185
- "grad_norm": 0.12492559105157852,
186
  "learning_rate": 0.00012434210526315791,
187
- "loss": 0.0605,
188
  "step": 230
189
  },
190
  {
191
  "epoch": 1.5789473684210527,
192
- "grad_norm": 0.14732375741004944,
193
  "learning_rate": 0.00012105263157894738,
194
- "loss": 0.0867,
195
  "step": 240
196
  },
197
  {
198
  "epoch": 1.6447368421052633,
199
- "grad_norm": 0.1113506406545639,
200
  "learning_rate": 0.00011776315789473684,
201
- "loss": 0.0436,
202
  "step": 250
203
  },
204
  {
205
  "epoch": 1.7105263157894737,
206
- "grad_norm": 0.09813081473112106,
207
  "learning_rate": 0.00011447368421052632,
208
- "loss": 0.0416,
209
  "step": 260
210
  },
211
  {
212
  "epoch": 1.776315789473684,
213
- "grad_norm": 6.826725006103516,
214
  "learning_rate": 0.0001111842105263158,
215
- "loss": 0.0514,
216
  "step": 270
217
  },
218
  {
219
  "epoch": 1.8421052631578947,
220
- "grad_norm": 0.10619573295116425,
221
  "learning_rate": 0.00010789473684210527,
222
- "loss": 0.0601,
223
  "step": 280
224
  },
225
  {
226
  "epoch": 1.9078947368421053,
227
- "grad_norm": 0.13959018886089325,
228
  "learning_rate": 0.00010460526315789475,
229
- "loss": 0.0454,
230
  "step": 290
231
  },
232
  {
233
  "epoch": 1.973684210526316,
234
- "grad_norm": 0.08468258380889893,
235
  "learning_rate": 0.00010131578947368421,
236
- "loss": 0.0326,
237
  "step": 300
238
  },
239
  {
240
  "epoch": 1.973684210526316,
241
- "eval_accuracy": 0.9054545454545454,
242
- "eval_loss": 0.38323774933815,
243
- "eval_runtime": 6.0691,
244
- "eval_samples_per_second": 90.622,
245
- "eval_steps_per_second": 11.369,
246
  "step": 300
247
  },
248
  {
249
  "epoch": 2.039473684210526,
250
- "grad_norm": 0.07823757082223892,
251
  "learning_rate": 9.802631578947369e-05,
252
- "loss": 0.0392,
253
  "step": 310
254
  },
255
  {
256
  "epoch": 2.1052631578947367,
257
- "grad_norm": 0.07656868547201157,
258
  "learning_rate": 9.473684210526316e-05,
259
- "loss": 0.0288,
260
  "step": 320
261
  },
262
  {
263
  "epoch": 2.1710526315789473,
264
- "grad_norm": 0.07013211399316788,
265
  "learning_rate": 9.144736842105264e-05,
266
- "loss": 0.0313,
267
  "step": 330
268
  },
269
  {
270
  "epoch": 2.236842105263158,
271
- "grad_norm": 0.07913695275783539,
272
  "learning_rate": 8.81578947368421e-05,
273
- "loss": 0.0378,
274
  "step": 340
275
  },
276
  {
277
  "epoch": 2.3026315789473686,
278
- "grad_norm": 0.3869466483592987,
279
  "learning_rate": 8.486842105263159e-05,
280
- "loss": 0.0253,
281
  "step": 350
282
  },
283
  {
284
  "epoch": 2.3684210526315788,
285
- "grad_norm": 0.06490592658519745,
286
  "learning_rate": 8.157894736842105e-05,
287
- "loss": 0.0241,
288
  "step": 360
289
  },
290
  {
291
  "epoch": 2.4342105263157894,
292
- "grad_norm": 0.06631086021661758,
293
  "learning_rate": 7.828947368421053e-05,
294
- "loss": 0.0231,
295
  "step": 370
296
  },
297
  {
298
  "epoch": 2.5,
299
- "grad_norm": 0.05489266291260719,
300
  "learning_rate": 7.500000000000001e-05,
301
- "loss": 0.0218,
302
  "step": 380
303
  },
304
  {
305
  "epoch": 2.5657894736842106,
306
- "grad_norm": 0.07426982372999191,
307
  "learning_rate": 7.171052631578947e-05,
308
- "loss": 0.0215,
309
  "step": 390
310
  },
311
  {
312
  "epoch": 2.6315789473684212,
313
- "grad_norm": 0.063384510576725,
314
  "learning_rate": 6.842105263157895e-05,
315
- "loss": 0.0207,
316
  "step": 400
317
  },
318
  {
319
  "epoch": 2.6315789473684212,
320
- "eval_accuracy": 0.9236363636363636,
321
- "eval_loss": 0.33629149198532104,
322
- "eval_runtime": 6.0608,
323
- "eval_samples_per_second": 90.746,
324
- "eval_steps_per_second": 11.385,
325
  "step": 400
326
  },
327
  {
328
  "epoch": 2.6973684210526314,
329
- "grad_norm": 0.05782260745763779,
330
  "learning_rate": 6.513157894736842e-05,
331
- "loss": 0.0201,
332
  "step": 410
333
  },
334
  {
335
  "epoch": 2.763157894736842,
336
- "grad_norm": 0.05535552278161049,
337
  "learning_rate": 6.18421052631579e-05,
338
- "loss": 0.0194,
339
  "step": 420
340
  },
341
  {
342
  "epoch": 2.8289473684210527,
343
- "grad_norm": 0.05756945163011551,
344
  "learning_rate": 5.855263157894737e-05,
345
- "loss": 0.0191,
346
  "step": 430
347
  },
348
  {
349
  "epoch": 2.8947368421052633,
350
- "grad_norm": 0.05671467259526253,
351
  "learning_rate": 5.526315789473685e-05,
352
- "loss": 0.0188,
353
  "step": 440
354
  },
355
  {
356
  "epoch": 2.9605263157894735,
357
- "grad_norm": 0.05619660019874573,
358
  "learning_rate": 5.197368421052632e-05,
359
- "loss": 0.0183,
360
  "step": 450
361
  },
362
  {
363
  "epoch": 3.026315789473684,
364
- "grad_norm": 0.05277419090270996,
365
  "learning_rate": 4.868421052631579e-05,
366
- "loss": 0.0177,
367
  "step": 460
368
  },
369
  {
370
  "epoch": 3.0921052631578947,
371
- "grad_norm": 0.05281645059585571,
372
  "learning_rate": 4.539473684210527e-05,
373
- "loss": 0.0174,
374
  "step": 470
375
  },
376
  {
377
  "epoch": 3.1578947368421053,
378
- "grad_norm": 0.06867770105600357,
379
  "learning_rate": 4.210526315789474e-05,
380
- "loss": 0.017,
381
  "step": 480
382
  },
383
  {
384
  "epoch": 3.223684210526316,
385
- "grad_norm": 0.047292064875364304,
386
  "learning_rate": 3.8815789473684214e-05,
387
- "loss": 0.0168,
388
  "step": 490
389
  },
390
  {
391
  "epoch": 3.2894736842105265,
392
- "grad_norm": 0.043311525136232376,
393
  "learning_rate": 3.5526315789473684e-05,
394
- "loss": 0.0167,
395
  "step": 500
396
  },
397
  {
398
  "epoch": 3.2894736842105265,
399
- "eval_accuracy": 0.9236363636363636,
400
- "eval_loss": 0.33733832836151123,
401
- "eval_runtime": 5.7257,
402
- "eval_samples_per_second": 96.057,
403
- "eval_steps_per_second": 12.051,
404
  "step": 500
405
  },
406
  {
407
  "epoch": 3.3552631578947367,
408
- "grad_norm": 0.04796218127012253,
409
  "learning_rate": 3.223684210526316e-05,
410
- "loss": 0.0165,
411
  "step": 510
412
  },
413
  {
414
  "epoch": 3.4210526315789473,
415
- "grad_norm": 0.048424966633319855,
416
  "learning_rate": 2.8947368421052634e-05,
417
- "loss": 0.0163,
418
  "step": 520
419
  },
420
  {
421
  "epoch": 3.486842105263158,
422
- "grad_norm": 0.046178512275218964,
423
  "learning_rate": 2.565789473684211e-05,
424
- "loss": 0.0157,
425
  "step": 530
426
  },
427
  {
428
  "epoch": 3.5526315789473686,
429
- "grad_norm": 0.04182315245270729,
430
  "learning_rate": 2.236842105263158e-05,
431
- "loss": 0.0156,
432
  "step": 540
433
  },
434
  {
435
  "epoch": 3.6184210526315788,
436
- "grad_norm": 0.04811399057507515,
437
  "learning_rate": 1.9078947368421056e-05,
438
- "loss": 0.0157,
439
  "step": 550
440
  },
441
  {
442
  "epoch": 3.6842105263157894,
443
- "grad_norm": 0.04523231461644173,
444
  "learning_rate": 1.5789473684210526e-05,
445
- "loss": 0.0157,
446
  "step": 560
447
  },
448
  {
449
  "epoch": 3.75,
450
- "grad_norm": 0.04799880087375641,
451
  "learning_rate": 1.25e-05,
452
- "loss": 0.0155,
453
  "step": 570
454
  },
455
  {
456
  "epoch": 3.8157894736842106,
457
- "grad_norm": 0.04668057709932327,
458
  "learning_rate": 9.210526315789474e-06,
459
- "loss": 0.0154,
460
  "step": 580
461
  },
462
  {
463
  "epoch": 3.8815789473684212,
464
- "grad_norm": 0.044472016394138336,
465
  "learning_rate": 5.921052631578948e-06,
466
- "loss": 0.0154,
467
  "step": 590
468
  },
469
  {
470
  "epoch": 3.9473684210526314,
471
- "grad_norm": 0.05030672252178192,
472
  "learning_rate": 2.631578947368421e-06,
473
- "loss": 0.0153,
474
  "step": 600
475
  },
476
  {
477
  "epoch": 3.9473684210526314,
478
- "eval_accuracy": 0.9236363636363636,
479
- "eval_loss": 0.33738574385643005,
480
- "eval_runtime": 6.0053,
481
- "eval_samples_per_second": 91.586,
482
- "eval_steps_per_second": 11.49,
483
  "step": 600
484
  },
485
  {
486
  "epoch": 4.0,
487
  "step": 608,
488
  "total_flos": 7.501829674622976e+17,
489
- "train_loss": 0.22265003621578217,
490
- "train_runtime": 237.6059,
491
- "train_samples_per_second": 40.74,
492
- "train_steps_per_second": 2.559
493
  }
494
  ],
495
  "logging_steps": 10,
 
1
  {
2
+ "best_metric": 0.45239612460136414,
3
+ "best_model_checkpoint": "vit-base-food-items-v1/checkpoint-300",
4
  "epoch": 4.0,
5
  "eval_steps": 100,
6
  "global_step": 608,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.06578947368421052,
13
+ "grad_norm": 0.04839174449443817,
14
  "learning_rate": 0.00019671052631578949,
15
+ "loss": 0.0259,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.13157894736842105,
20
+ "grad_norm": 5.086187362670898,
21
  "learning_rate": 0.00019342105263157894,
22
+ "loss": 0.0743,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.19736842105263158,
27
+ "grad_norm": 8.687716484069824,
28
  "learning_rate": 0.00019013157894736844,
29
+ "loss": 0.0621,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.2631578947368421,
34
+ "grad_norm": 0.0554538369178772,
35
  "learning_rate": 0.00018684210526315792,
36
+ "loss": 0.1584,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.32894736842105265,
41
+ "grad_norm": 7.25691556930542,
42
  "learning_rate": 0.00018355263157894736,
43
+ "loss": 0.0284,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.39473684210526316,
48
+ "grad_norm": 0.0355791412293911,
49
  "learning_rate": 0.00018026315789473684,
50
+ "loss": 0.1607,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.4605263157894737,
55
+ "grad_norm": 6.474045276641846,
56
  "learning_rate": 0.00017697368421052632,
57
+ "loss": 0.2034,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.5263157894736842,
62
+ "grad_norm": 0.047177255153656006,
63
  "learning_rate": 0.0001736842105263158,
64
+ "loss": 0.1755,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.5921052631578947,
69
+ "grad_norm": 7.999953269958496,
70
  "learning_rate": 0.00017039473684210527,
71
+ "loss": 0.078,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.6578947368421053,
76
+ "grad_norm": 0.2906012237071991,
77
  "learning_rate": 0.00016710526315789475,
78
+ "loss": 0.1773,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.6578947368421053,
83
+ "eval_accuracy": 0.8472727272727273,
84
+ "eval_loss": 0.7279737591743469,
85
+ "eval_runtime": 6.7097,
86
+ "eval_samples_per_second": 81.971,
87
+ "eval_steps_per_second": 10.284,
88
  "step": 100
89
  },
90
  {
91
  "epoch": 0.7236842105263158,
92
+ "grad_norm": 0.038031741976737976,
93
  "learning_rate": 0.00016381578947368422,
94
+ "loss": 0.1011,
95
  "step": 110
96
  },
97
  {
98
  "epoch": 0.7894736842105263,
99
+ "grad_norm": 0.8751915097236633,
100
  "learning_rate": 0.0001605263157894737,
101
+ "loss": 0.1059,
102
  "step": 120
103
  },
104
  {
105
  "epoch": 0.8552631578947368,
106
+ "grad_norm": 0.08943302929401398,
107
  "learning_rate": 0.00015723684210526318,
108
+ "loss": 0.0334,
109
  "step": 130
110
  },
111
  {
112
  "epoch": 0.9210526315789473,
113
+ "grad_norm": 0.17175784707069397,
114
  "learning_rate": 0.00015394736842105265,
115
+ "loss": 0.1515,
116
  "step": 140
117
  },
118
  {
119
  "epoch": 0.9868421052631579,
120
+ "grad_norm": 0.053591687232255936,
121
  "learning_rate": 0.0001506578947368421,
122
+ "loss": 0.1301,
123
  "step": 150
124
  },
125
  {
126
  "epoch": 1.0526315789473684,
127
+ "grad_norm": 0.026137366890907288,
128
  "learning_rate": 0.00014736842105263158,
129
+ "loss": 0.0102,
130
  "step": 160
131
  },
132
  {
133
  "epoch": 1.118421052631579,
134
+ "grad_norm": 0.09105370193719864,
135
  "learning_rate": 0.00014407894736842106,
136
+ "loss": 0.0066,
137
  "step": 170
138
  },
139
  {
140
  "epoch": 1.1842105263157894,
141
+ "grad_norm": 0.050408605486154556,
142
  "learning_rate": 0.00014078947368421053,
143
+ "loss": 0.0679,
144
  "step": 180
145
  },
146
  {
147
  "epoch": 1.25,
148
+ "grad_norm": 0.051493316888809204,
149
  "learning_rate": 0.0001375,
150
+ "loss": 0.007,
151
  "step": 190
152
  },
153
  {
154
  "epoch": 1.3157894736842106,
155
+ "grad_norm": 0.023582015186548233,
156
  "learning_rate": 0.00013421052631578948,
157
+ "loss": 0.0589,
158
  "step": 200
159
  },
160
  {
161
  "epoch": 1.3157894736842106,
162
+ "eval_accuracy": 0.8872727272727273,
163
+ "eval_loss": 0.5529205203056335,
164
+ "eval_runtime": 5.9487,
165
+ "eval_samples_per_second": 92.458,
166
+ "eval_steps_per_second": 11.599,
167
  "step": 200
168
  },
169
  {
170
  "epoch": 1.381578947368421,
171
+ "grad_norm": 0.0221235528588295,
172
  "learning_rate": 0.00013092105263157893,
173
+ "loss": 0.0046,
174
  "step": 210
175
  },
176
  {
177
  "epoch": 1.4473684210526316,
178
+ "grad_norm": 6.497156620025635,
179
  "learning_rate": 0.00012763157894736844,
180
+ "loss": 0.0086,
181
  "step": 220
182
  },
183
  {
184
  "epoch": 1.513157894736842,
185
+ "grad_norm": 0.013416736386716366,
186
  "learning_rate": 0.00012434210526315791,
187
+ "loss": 0.0042,
188
  "step": 230
189
  },
190
  {
191
  "epoch": 1.5789473684210527,
192
+ "grad_norm": 0.012088390998542309,
193
  "learning_rate": 0.00012105263157894738,
194
+ "loss": 0.1094,
195
  "step": 240
196
  },
197
  {
198
  "epoch": 1.6447368421052633,
199
+ "grad_norm": 7.198599338531494,
200
  "learning_rate": 0.00011776315789473684,
201
+ "loss": 0.045,
202
  "step": 250
203
  },
204
  {
205
  "epoch": 1.7105263157894737,
206
+ "grad_norm": 0.031135905534029007,
207
  "learning_rate": 0.00011447368421052632,
208
+ "loss": 0.0331,
209
  "step": 260
210
  },
211
  {
212
  "epoch": 1.776315789473684,
213
+ "grad_norm": 0.07299932837486267,
214
  "learning_rate": 0.0001111842105263158,
215
+ "loss": 0.0535,
216
  "step": 270
217
  },
218
  {
219
  "epoch": 1.8421052631578947,
220
+ "grad_norm": 0.02018345519900322,
221
  "learning_rate": 0.00010789473684210527,
222
+ "loss": 0.0241,
223
  "step": 280
224
  },
225
  {
226
  "epoch": 1.9078947368421053,
227
+ "grad_norm": 0.012926718220114708,
228
  "learning_rate": 0.00010460526315789475,
229
+ "loss": 0.033,
230
  "step": 290
231
  },
232
  {
233
  "epoch": 1.973684210526316,
234
+ "grad_norm": 8.804197311401367,
235
  "learning_rate": 0.00010131578947368421,
236
+ "loss": 0.043,
237
  "step": 300
238
  },
239
  {
240
  "epoch": 1.973684210526316,
241
+ "eval_accuracy": 0.9090909090909091,
242
+ "eval_loss": 0.45239612460136414,
243
+ "eval_runtime": 5.7174,
244
+ "eval_samples_per_second": 96.197,
245
+ "eval_steps_per_second": 12.068,
246
  "step": 300
247
  },
248
  {
249
  "epoch": 2.039473684210526,
250
+ "grad_norm": 21.062307357788086,
251
  "learning_rate": 9.802631578947369e-05,
252
+ "loss": 0.0327,
253
  "step": 310
254
  },
255
  {
256
  "epoch": 2.1052631578947367,
257
+ "grad_norm": 0.009257642552256584,
258
  "learning_rate": 9.473684210526316e-05,
259
+ "loss": 0.0151,
260
  "step": 320
261
  },
262
  {
263
  "epoch": 2.1710526315789473,
264
+ "grad_norm": 0.014151917770504951,
265
  "learning_rate": 9.144736842105264e-05,
266
+ "loss": 0.0248,
267
  "step": 330
268
  },
269
  {
270
  "epoch": 2.236842105263158,
271
+ "grad_norm": 0.013802828267216682,
272
  "learning_rate": 8.81578947368421e-05,
273
+ "loss": 0.003,
274
  "step": 340
275
  },
276
  {
277
  "epoch": 2.3026315789473686,
278
+ "grad_norm": 0.014456182718276978,
279
  "learning_rate": 8.486842105263159e-05,
280
+ "loss": 0.0035,
281
  "step": 350
282
  },
283
  {
284
  "epoch": 2.3684210526315788,
285
+ "grad_norm": 0.006758903618901968,
286
  "learning_rate": 8.157894736842105e-05,
287
+ "loss": 0.0024,
288
  "step": 360
289
  },
290
  {
291
  "epoch": 2.4342105263157894,
292
+ "grad_norm": 0.009314753115177155,
293
  "learning_rate": 7.828947368421053e-05,
294
+ "loss": 0.0024,
295
  "step": 370
296
  },
297
  {
298
  "epoch": 2.5,
299
+ "grad_norm": 0.006471664644777775,
300
  "learning_rate": 7.500000000000001e-05,
301
+ "loss": 0.0022,
302
  "step": 380
303
  },
304
  {
305
  "epoch": 2.5657894736842106,
306
+ "grad_norm": 0.013896413147449493,
307
  "learning_rate": 7.171052631578947e-05,
308
+ "loss": 0.0023,
309
  "step": 390
310
  },
311
  {
312
  "epoch": 2.6315789473684212,
313
+ "grad_norm": 0.009549788199365139,
314
  "learning_rate": 6.842105263157895e-05,
315
+ "loss": 0.0022,
316
  "step": 400
317
  },
318
  {
319
  "epoch": 2.6315789473684212,
320
+ "eval_accuracy": 0.8909090909090909,
321
+ "eval_loss": 0.5150398015975952,
322
+ "eval_runtime": 6.2356,
323
+ "eval_samples_per_second": 88.203,
324
+ "eval_steps_per_second": 11.065,
325
  "step": 400
326
  },
327
  {
328
  "epoch": 2.6973684210526314,
329
+ "grad_norm": 0.00833881739526987,
330
  "learning_rate": 6.513157894736842e-05,
331
+ "loss": 0.0024,
332
  "step": 410
333
  },
334
  {
335
  "epoch": 2.763157894736842,
336
+ "grad_norm": 0.006957135163247585,
337
  "learning_rate": 6.18421052631579e-05,
338
+ "loss": 0.0021,
339
  "step": 420
340
  },
341
  {
342
  "epoch": 2.8289473684210527,
343
+ "grad_norm": 0.006556599400937557,
344
  "learning_rate": 5.855263157894737e-05,
345
+ "loss": 0.0021,
346
  "step": 430
347
  },
348
  {
349
  "epoch": 2.8947368421052633,
350
+ "grad_norm": 0.007122657261788845,
351
  "learning_rate": 5.526315789473685e-05,
352
+ "loss": 0.0021,
353
  "step": 440
354
  },
355
  {
356
  "epoch": 2.9605263157894735,
357
+ "grad_norm": 0.0069893728941679,
358
  "learning_rate": 5.197368421052632e-05,
359
+ "loss": 0.002,
360
  "step": 450
361
  },
362
  {
363
  "epoch": 3.026315789473684,
364
+ "grad_norm": 0.006159682292491198,
365
  "learning_rate": 4.868421052631579e-05,
366
+ "loss": 0.002,
367
  "step": 460
368
  },
369
  {
370
  "epoch": 3.0921052631578947,
371
+ "grad_norm": 0.0069947754964232445,
372
  "learning_rate": 4.539473684210527e-05,
373
+ "loss": 0.0019,
374
  "step": 470
375
  },
376
  {
377
  "epoch": 3.1578947368421053,
378
+ "grad_norm": 0.007419601548463106,
379
  "learning_rate": 4.210526315789474e-05,
380
+ "loss": 0.0018,
381
  "step": 480
382
  },
383
  {
384
  "epoch": 3.223684210526316,
385
+ "grad_norm": 0.006330096162855625,
386
  "learning_rate": 3.8815789473684214e-05,
387
+ "loss": 0.0018,
388
  "step": 490
389
  },
390
  {
391
  "epoch": 3.2894736842105265,
392
+ "grad_norm": 0.006105512380599976,
393
  "learning_rate": 3.5526315789473684e-05,
394
+ "loss": 0.0018,
395
  "step": 500
396
  },
397
  {
398
  "epoch": 3.2894736842105265,
399
+ "eval_accuracy": 0.9018181818181819,
400
+ "eval_loss": 0.49247637391090393,
401
+ "eval_runtime": 6.5136,
402
+ "eval_samples_per_second": 84.439,
403
+ "eval_steps_per_second": 10.593,
404
  "step": 500
405
  },
406
  {
407
  "epoch": 3.3552631578947367,
408
+ "grad_norm": 0.006337973289191723,
409
  "learning_rate": 3.223684210526316e-05,
410
+ "loss": 0.0018,
411
  "step": 510
412
  },
413
  {
414
  "epoch": 3.4210526315789473,
415
+ "grad_norm": 0.005863433238118887,
416
  "learning_rate": 2.8947368421052634e-05,
417
+ "loss": 0.0018,
418
  "step": 520
419
  },
420
  {
421
  "epoch": 3.486842105263158,
422
+ "grad_norm": 0.0057103936560451984,
423
  "learning_rate": 2.565789473684211e-05,
424
+ "loss": 0.0017,
425
  "step": 530
426
  },
427
  {
428
  "epoch": 3.5526315789473686,
429
+ "grad_norm": 0.004713858477771282,
430
  "learning_rate": 2.236842105263158e-05,
431
+ "loss": 0.0018,
432
  "step": 540
433
  },
434
  {
435
  "epoch": 3.6184210526315788,
436
+ "grad_norm": 0.007430619560182095,
437
  "learning_rate": 1.9078947368421056e-05,
438
+ "loss": 0.0017,
439
  "step": 550
440
  },
441
  {
442
  "epoch": 3.6842105263157894,
443
+ "grad_norm": 0.0051925876177847385,
444
  "learning_rate": 1.5789473684210526e-05,
445
+ "loss": 0.0018,
446
  "step": 560
447
  },
448
  {
449
  "epoch": 3.75,
450
+ "grad_norm": 0.0064788335002958775,
451
  "learning_rate": 1.25e-05,
452
+ "loss": 0.0017,
453
  "step": 570
454
  },
455
  {
456
  "epoch": 3.8157894736842106,
457
+ "grad_norm": 0.006365407258272171,
458
  "learning_rate": 9.210526315789474e-06,
459
+ "loss": 0.0017,
460
  "step": 580
461
  },
462
  {
463
  "epoch": 3.8815789473684212,
464
+ "grad_norm": 0.005164624657481909,
465
  "learning_rate": 5.921052631578948e-06,
466
+ "loss": 0.0018,
467
  "step": 590
468
  },
469
  {
470
  "epoch": 3.9473684210526314,
471
+ "grad_norm": 0.006292811129242182,
472
  "learning_rate": 2.631578947368421e-06,
473
+ "loss": 0.0017,
474
  "step": 600
475
  },
476
  {
477
  "epoch": 3.9473684210526314,
478
+ "eval_accuracy": 0.9018181818181819,
479
+ "eval_loss": 0.4941176474094391,
480
+ "eval_runtime": 6.4553,
481
+ "eval_samples_per_second": 85.201,
482
+ "eval_steps_per_second": 10.689,
483
  "step": 600
484
  },
485
  {
486
  "epoch": 4.0,
487
  "step": 608,
488
  "total_flos": 7.501829674622976e+17,
489
+ "train_loss": 0.03790271527280933,
490
+ "train_runtime": 250.8529,
491
+ "train_samples_per_second": 38.588,
492
+ "train_steps_per_second": 2.424
493
  }
494
  ],
495
  "logging_steps": 10,