rhlc commited on
Commit
6c9684b
1 Parent(s): 29fabf5

End of training

Browse files
README.md CHANGED
@@ -22,7 +22,7 @@ model-index:
22
  metrics:
23
  - name: Accuracy
24
  type: accuracy
25
- value: 0.9953125
26
  ---
27
 
28
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -32,8 +32,8 @@ should probably proofread and complete it, then remove this comment. -->
32
 
33
  This model is a fine-tuned version of [facebook/vit-msn-small](https://huggingface.co/facebook/vit-msn-small) on the imagefolder dataset.
34
  It achieves the following results on the evaluation set:
35
- - Loss: 0.0131
36
- - Accuracy: 0.9953
37
 
38
  ## Model description
39
 
 
22
  metrics:
23
  - name: Accuracy
24
  type: accuracy
25
+ value: 0.996875
26
  ---
27
 
28
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
32
 
33
  This model is a fine-tuned version of [facebook/vit-msn-small](https://huggingface.co/facebook/vit-msn-small) on the imagefolder dataset.
34
  It achieves the following results on the evaluation set:
35
+ - Loss: 0.0160
36
+ - Accuracy: 0.9969
37
 
38
  ## Model description
39
 
all_results.json CHANGED
@@ -1,8 +1,13 @@
1
  {
2
- "epoch": 9.777777777777779,
3
- "total_flos": 1.102133137023959e+18,
4
- "train_loss": 0.670017595724626,
5
- "train_runtime": 696.1068,
6
- "train_samples_per_second": 82.746,
7
- "train_steps_per_second": 0.316
 
 
 
 
 
8
  }
 
1
  {
2
+ "epoch": 48.888888888888886,
3
+ "eval_accuracy": 0.996875,
4
+ "eval_loss": 0.015964530408382416,
5
+ "eval_runtime": 3.6348,
6
+ "eval_samples_per_second": 176.076,
7
+ "eval_steps_per_second": 2.751,
8
+ "total_flos": 5.510665685119795e+18,
9
+ "train_loss": 0.17931611462072894,
10
+ "train_runtime": 3528.3547,
11
+ "train_samples_per_second": 81.624,
12
+ "train_steps_per_second": 0.312
13
  }
eval_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 48.888888888888886,
3
+ "eval_accuracy": 0.996875,
4
+ "eval_loss": 0.015964530408382416,
5
+ "eval_runtime": 3.6348,
6
+ "eval_samples_per_second": 176.076,
7
+ "eval_steps_per_second": 2.751
8
+ }
runs/Apr26_13-20-29_1c59b5307e97/events.out.tfevents.1714141191.1c59b5307e97.453.6 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14bc85df55fb09d6cf9f5c72d186133246ae0a3d0b6f858ad7af7a1e16c68889
3
+ size 411
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 9.777777777777779,
3
- "total_flos": 1.102133137023959e+18,
4
- "train_loss": 0.670017595724626,
5
- "train_runtime": 696.1068,
6
- "train_samples_per_second": 82.746,
7
- "train_steps_per_second": 0.316
8
  }
 
1
  {
2
+ "epoch": 48.888888888888886,
3
+ "total_flos": 5.510665685119795e+18,
4
+ "train_loss": 0.17931611462072894,
5
+ "train_runtime": 3528.3547,
6
+ "train_samples_per_second": 81.624,
7
+ "train_steps_per_second": 0.312
8
  }
trainer_state.json CHANGED
@@ -1,273 +1,1240 @@
1
  {
2
- "best_metric": 0.8625,
3
- "best_model_checkpoint": "vit-msn-small-finetuned-alzheimers/checkpoint-220",
4
- "epoch": 9.777777777777779,
5
  "eval_steps": 500,
6
- "global_step": 220,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.4444444444444444,
13
- "grad_norm": 4.161161422729492,
14
- "learning_rate": 2.272727272727273e-05,
15
- "loss": 1.0688,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.8888888888888888,
20
- "grad_norm": 19.670530319213867,
21
- "learning_rate": 4.545454545454546e-05,
22
- "loss": 0.9297,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.9777777777777777,
27
- "eval_accuracy": 0.615625,
28
- "eval_loss": 0.8769256472587585,
29
- "eval_runtime": 3.9355,
30
- "eval_samples_per_second": 162.623,
31
- "eval_steps_per_second": 2.541,
32
  "step": 22
33
  },
34
  {
35
  "epoch": 1.3333333333333333,
36
- "grad_norm": 7.473578929901123,
37
- "learning_rate": 4.797979797979798e-05,
38
- "loss": 0.8854,
39
  "step": 30
40
  },
41
  {
42
  "epoch": 1.7777777777777777,
43
- "grad_norm": 7.05064058303833,
44
- "learning_rate": 4.545454545454546e-05,
45
- "loss": 0.8601,
46
  "step": 40
47
  },
48
  {
49
  "epoch": 2.0,
50
- "eval_accuracy": 0.634375,
51
- "eval_loss": 0.7798857688903809,
52
- "eval_runtime": 3.5552,
53
- "eval_samples_per_second": 180.019,
54
- "eval_steps_per_second": 2.813,
55
  "step": 45
56
  },
57
  {
58
  "epoch": 2.2222222222222223,
59
- "grad_norm": 12.152384757995605,
60
- "learning_rate": 4.292929292929293e-05,
61
- "loss": 0.8571,
62
  "step": 50
63
  },
64
  {
65
  "epoch": 2.6666666666666665,
66
- "grad_norm": 11.653566360473633,
67
- "learning_rate": 4.0404040404040405e-05,
68
- "loss": 0.7954,
69
  "step": 60
70
  },
71
  {
72
  "epoch": 2.977777777777778,
73
- "eval_accuracy": 0.6828125,
74
- "eval_loss": 0.7196512222290039,
75
- "eval_runtime": 3.5735,
76
- "eval_samples_per_second": 179.098,
77
- "eval_steps_per_second": 2.798,
78
  "step": 67
79
  },
80
  {
81
  "epoch": 3.111111111111111,
82
- "grad_norm": 10.308629989624023,
83
- "learning_rate": 3.787878787878788e-05,
84
- "loss": 0.7808,
85
  "step": 70
86
  },
87
  {
88
  "epoch": 3.5555555555555554,
89
- "grad_norm": 15.87501335144043,
90
- "learning_rate": 3.535353535353535e-05,
91
- "loss": 0.7552,
92
  "step": 80
93
  },
94
  {
95
  "epoch": 4.0,
96
- "grad_norm": 9.567431449890137,
97
- "learning_rate": 3.282828282828283e-05,
98
- "loss": 0.7468,
99
  "step": 90
100
  },
101
  {
102
  "epoch": 4.0,
103
- "eval_accuracy": 0.6734375,
104
- "eval_loss": 0.7003158330917358,
105
- "eval_runtime": 3.5895,
106
- "eval_samples_per_second": 178.3,
107
- "eval_steps_per_second": 2.786,
108
  "step": 90
109
  },
110
  {
111
  "epoch": 4.444444444444445,
112
- "grad_norm": 6.920608043670654,
113
- "learning_rate": 3.0303030303030306e-05,
114
- "loss": 0.683,
115
  "step": 100
116
  },
117
  {
118
  "epoch": 4.888888888888889,
119
- "grad_norm": 8.078107833862305,
120
- "learning_rate": 2.777777777777778e-05,
121
- "loss": 0.6935,
122
  "step": 110
123
  },
124
  {
125
  "epoch": 4.977777777777778,
126
- "eval_accuracy": 0.7546875,
127
- "eval_loss": 0.6063631772994995,
128
- "eval_runtime": 3.8885,
129
- "eval_samples_per_second": 164.589,
130
- "eval_steps_per_second": 2.572,
131
  "step": 112
132
  },
133
  {
134
  "epoch": 5.333333333333333,
135
- "grad_norm": 11.370922088623047,
136
- "learning_rate": 2.5252525252525256e-05,
137
- "loss": 0.6469,
138
  "step": 120
139
  },
140
  {
141
  "epoch": 5.777777777777778,
142
- "grad_norm": 12.74963665008545,
143
- "learning_rate": 2.272727272727273e-05,
144
- "loss": 0.6271,
145
  "step": 130
146
  },
147
  {
148
  "epoch": 6.0,
149
- "eval_accuracy": 0.76875,
150
- "eval_loss": 0.5647965669631958,
151
- "eval_runtime": 3.9273,
152
- "eval_samples_per_second": 162.963,
153
- "eval_steps_per_second": 2.546,
154
  "step": 135
155
  },
156
  {
157
  "epoch": 6.222222222222222,
158
- "grad_norm": 11.86828327178955,
159
- "learning_rate": 2.0202020202020203e-05,
160
- "loss": 0.5635,
161
  "step": 140
162
  },
163
  {
164
  "epoch": 6.666666666666667,
165
- "grad_norm": 13.892380714416504,
166
- "learning_rate": 1.7676767676767676e-05,
167
- "loss": 0.5622,
168
  "step": 150
169
  },
170
  {
171
  "epoch": 6.977777777777778,
172
- "eval_accuracy": 0.809375,
173
- "eval_loss": 0.48242831230163574,
174
- "eval_runtime": 3.5816,
175
- "eval_samples_per_second": 178.692,
176
- "eval_steps_per_second": 2.792,
177
  "step": 157
178
  },
179
  {
180
  "epoch": 7.111111111111111,
181
- "grad_norm": 8.864046096801758,
182
- "learning_rate": 1.5151515151515153e-05,
183
- "loss": 0.525,
184
  "step": 160
185
  },
186
  {
187
  "epoch": 7.555555555555555,
188
- "grad_norm": 8.477625846862793,
189
- "learning_rate": 1.2626262626262628e-05,
190
- "loss": 0.4967,
191
  "step": 170
192
  },
193
  {
194
  "epoch": 8.0,
195
- "grad_norm": 12.289462089538574,
196
- "learning_rate": 1.0101010101010101e-05,
197
- "loss": 0.4815,
198
  "step": 180
199
  },
200
  {
201
  "epoch": 8.0,
202
- "eval_accuracy": 0.8609375,
203
- "eval_loss": 0.4012059271335602,
204
- "eval_runtime": 3.613,
205
- "eval_samples_per_second": 177.138,
206
- "eval_steps_per_second": 2.768,
207
  "step": 180
208
  },
209
  {
210
  "epoch": 8.444444444444445,
211
- "grad_norm": 8.446834564208984,
212
- "learning_rate": 7.5757575757575764e-06,
213
- "loss": 0.45,
214
  "step": 190
215
  },
216
  {
217
  "epoch": 8.88888888888889,
218
- "grad_norm": 15.683026313781738,
219
- "learning_rate": 5.050505050505051e-06,
220
- "loss": 0.4771,
221
  "step": 200
222
  },
223
  {
224
  "epoch": 8.977777777777778,
225
- "eval_accuracy": 0.85625,
226
- "eval_loss": 0.3798871636390686,
227
- "eval_runtime": 3.5689,
228
- "eval_samples_per_second": 179.325,
229
- "eval_steps_per_second": 2.802,
230
  "step": 202
231
  },
232
  {
233
  "epoch": 9.333333333333334,
234
- "grad_norm": 9.576162338256836,
235
- "learning_rate": 2.5252525252525253e-06,
236
- "loss": 0.4376,
237
  "step": 210
238
  },
239
  {
240
  "epoch": 9.777777777777779,
241
- "grad_norm": 9.685094833374023,
242
- "learning_rate": 0.0,
243
- "loss": 0.4171,
244
  "step": 220
245
  },
246
  {
247
- "epoch": 9.777777777777779,
248
- "eval_accuracy": 0.8625,
249
- "eval_loss": 0.3611668050289154,
250
- "eval_runtime": 3.9731,
251
- "eval_samples_per_second": 161.083,
252
- "eval_steps_per_second": 2.517,
253
- "step": 220
254
  },
255
  {
256
- "epoch": 9.777777777777779,
257
- "step": 220,
258
- "total_flos": 1.102133137023959e+18,
259
- "train_loss": 0.670017595724626,
260
- "train_runtime": 696.1068,
261
- "train_samples_per_second": 82.746,
262
- "train_steps_per_second": 0.316
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
263
  }
264
  ],
265
  "logging_steps": 10,
266
- "max_steps": 220,
267
  "num_input_tokens_seen": 0,
268
- "num_train_epochs": 10,
269
  "save_steps": 500,
270
- "total_flos": 1.102133137023959e+18,
271
  "train_batch_size": 64,
272
  "trial_name": null,
273
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.996875,
3
+ "best_model_checkpoint": "vit-msn-small-finetuned-alzheimers/checkpoint-765",
4
+ "epoch": 48.888888888888886,
5
  "eval_steps": 500,
6
+ "global_step": 1100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.4444444444444444,
13
+ "grad_norm": 8.449820518493652,
14
+ "learning_rate": 4.5454545454545455e-06,
15
+ "loss": 0.2587,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.8888888888888888,
20
+ "grad_norm": 13.680850982666016,
21
+ "learning_rate": 9.090909090909091e-06,
22
+ "loss": 0.2996,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.9777777777777777,
27
+ "eval_accuracy": 0.84375,
28
+ "eval_loss": 0.38971763849258423,
29
+ "eval_runtime": 3.5179,
30
+ "eval_samples_per_second": 181.926,
31
+ "eval_steps_per_second": 2.843,
32
  "step": 22
33
  },
34
  {
35
  "epoch": 1.3333333333333333,
36
+ "grad_norm": 9.488574981689453,
37
+ "learning_rate": 1.3636363636363637e-05,
38
+ "loss": 0.4023,
39
  "step": 30
40
  },
41
  {
42
  "epoch": 1.7777777777777777,
43
+ "grad_norm": 18.977561950683594,
44
+ "learning_rate": 1.8181818181818182e-05,
45
+ "loss": 0.3703,
46
  "step": 40
47
  },
48
  {
49
  "epoch": 2.0,
50
+ "eval_accuracy": 0.859375,
51
+ "eval_loss": 0.3594878911972046,
52
+ "eval_runtime": 3.9024,
53
+ "eval_samples_per_second": 164.001,
54
+ "eval_steps_per_second": 2.563,
55
  "step": 45
56
  },
57
  {
58
  "epoch": 2.2222222222222223,
59
+ "grad_norm": 11.33133602142334,
60
+ "learning_rate": 2.272727272727273e-05,
61
+ "loss": 0.3541,
62
  "step": 50
63
  },
64
  {
65
  "epoch": 2.6666666666666665,
66
+ "grad_norm": 16.366662979125977,
67
+ "learning_rate": 2.7272727272727273e-05,
68
+ "loss": 0.3087,
69
  "step": 60
70
  },
71
  {
72
  "epoch": 2.977777777777778,
73
+ "eval_accuracy": 0.8625,
74
+ "eval_loss": 0.3777163326740265,
75
+ "eval_runtime": 3.8599,
76
+ "eval_samples_per_second": 165.808,
77
+ "eval_steps_per_second": 2.591,
78
  "step": 67
79
  },
80
  {
81
  "epoch": 3.111111111111111,
82
+ "grad_norm": 18.307331085205078,
83
+ "learning_rate": 3.181818181818182e-05,
84
+ "loss": 0.3195,
85
  "step": 70
86
  },
87
  {
88
  "epoch": 3.5555555555555554,
89
+ "grad_norm": 42.80950164794922,
90
+ "learning_rate": 3.6363636363636364e-05,
91
+ "loss": 0.3483,
92
  "step": 80
93
  },
94
  {
95
  "epoch": 4.0,
96
+ "grad_norm": 18.051124572753906,
97
+ "learning_rate": 4.0909090909090915e-05,
98
+ "loss": 0.486,
99
  "step": 90
100
  },
101
  {
102
  "epoch": 4.0,
103
+ "eval_accuracy": 0.81875,
104
+ "eval_loss": 0.4530211389064789,
105
+ "eval_runtime": 3.6057,
106
+ "eval_samples_per_second": 177.495,
107
+ "eval_steps_per_second": 2.773,
108
  "step": 90
109
  },
110
  {
111
  "epoch": 4.444444444444445,
112
+ "grad_norm": 15.844127655029297,
113
+ "learning_rate": 4.545454545454546e-05,
114
+ "loss": 0.3521,
115
  "step": 100
116
  },
117
  {
118
  "epoch": 4.888888888888889,
119
+ "grad_norm": 11.87112808227539,
120
+ "learning_rate": 5e-05,
121
+ "loss": 0.3307,
122
  "step": 110
123
  },
124
  {
125
  "epoch": 4.977777777777778,
126
+ "eval_accuracy": 0.8234375,
127
+ "eval_loss": 0.45600825548171997,
128
+ "eval_runtime": 3.572,
129
+ "eval_samples_per_second": 179.171,
130
+ "eval_steps_per_second": 2.8,
131
  "step": 112
132
  },
133
  {
134
  "epoch": 5.333333333333333,
135
+ "grad_norm": 17.418668746948242,
136
+ "learning_rate": 4.94949494949495e-05,
137
+ "loss": 0.3404,
138
  "step": 120
139
  },
140
  {
141
  "epoch": 5.777777777777778,
142
+ "grad_norm": 20.148906707763672,
143
+ "learning_rate": 4.898989898989899e-05,
144
+ "loss": 0.306,
145
  "step": 130
146
  },
147
  {
148
  "epoch": 6.0,
149
+ "eval_accuracy": 0.8671875,
150
+ "eval_loss": 0.3470742106437683,
151
+ "eval_runtime": 3.5697,
152
+ "eval_samples_per_second": 179.289,
153
+ "eval_steps_per_second": 2.801,
154
  "step": 135
155
  },
156
  {
157
  "epoch": 6.222222222222222,
158
+ "grad_norm": 11.451733589172363,
159
+ "learning_rate": 4.848484848484849e-05,
160
+ "loss": 0.2873,
161
  "step": 140
162
  },
163
  {
164
  "epoch": 6.666666666666667,
165
+ "grad_norm": 15.43708610534668,
166
+ "learning_rate": 4.797979797979798e-05,
167
+ "loss": 0.3005,
168
  "step": 150
169
  },
170
  {
171
  "epoch": 6.977777777777778,
172
+ "eval_accuracy": 0.8859375,
173
+ "eval_loss": 0.3024638891220093,
174
+ "eval_runtime": 3.8788,
175
+ "eval_samples_per_second": 164.998,
176
+ "eval_steps_per_second": 2.578,
177
  "step": 157
178
  },
179
  {
180
  "epoch": 7.111111111111111,
181
+ "grad_norm": 20.696516036987305,
182
+ "learning_rate": 4.7474747474747476e-05,
183
+ "loss": 0.3206,
184
  "step": 160
185
  },
186
  {
187
  "epoch": 7.555555555555555,
188
+ "grad_norm": 12.01241397857666,
189
+ "learning_rate": 4.696969696969697e-05,
190
+ "loss": 0.2851,
191
  "step": 170
192
  },
193
  {
194
  "epoch": 8.0,
195
+ "grad_norm": 17.638214111328125,
196
+ "learning_rate": 4.6464646464646464e-05,
197
+ "loss": 0.319,
198
  "step": 180
199
  },
200
  {
201
  "epoch": 8.0,
202
+ "eval_accuracy": 0.8984375,
203
+ "eval_loss": 0.24509796500205994,
204
+ "eval_runtime": 3.9737,
205
+ "eval_samples_per_second": 161.061,
206
+ "eval_steps_per_second": 2.517,
207
  "step": 180
208
  },
209
  {
210
  "epoch": 8.444444444444445,
211
+ "grad_norm": 21.502132415771484,
212
+ "learning_rate": 4.595959595959596e-05,
213
+ "loss": 0.2968,
214
  "step": 190
215
  },
216
  {
217
  "epoch": 8.88888888888889,
218
+ "grad_norm": 20.09746742248535,
219
+ "learning_rate": 4.545454545454546e-05,
220
+ "loss": 0.3489,
221
  "step": 200
222
  },
223
  {
224
  "epoch": 8.977777777777778,
225
+ "eval_accuracy": 0.928125,
226
+ "eval_loss": 0.18142804503440857,
227
+ "eval_runtime": 3.7455,
228
+ "eval_samples_per_second": 170.872,
229
+ "eval_steps_per_second": 2.67,
230
  "step": 202
231
  },
232
  {
233
  "epoch": 9.333333333333334,
234
+ "grad_norm": 16.205760955810547,
235
+ "learning_rate": 4.494949494949495e-05,
236
+ "loss": 0.2915,
237
  "step": 210
238
  },
239
  {
240
  "epoch": 9.777777777777779,
241
+ "grad_norm": 27.511030197143555,
242
+ "learning_rate": 4.4444444444444447e-05,
243
+ "loss": 0.3251,
244
  "step": 220
245
  },
246
  {
247
+ "epoch": 10.0,
248
+ "eval_accuracy": 0.915625,
249
+ "eval_loss": 0.24511559307575226,
250
+ "eval_runtime": 3.6361,
251
+ "eval_samples_per_second": 176.011,
252
+ "eval_steps_per_second": 2.75,
253
+ "step": 225
254
  },
255
  {
256
+ "epoch": 10.222222222222221,
257
+ "grad_norm": 11.171629905700684,
258
+ "learning_rate": 4.3939393939393944e-05,
259
+ "loss": 0.308,
260
+ "step": 230
261
+ },
262
+ {
263
+ "epoch": 10.666666666666666,
264
+ "grad_norm": 12.315302848815918,
265
+ "learning_rate": 4.343434343434344e-05,
266
+ "loss": 0.3034,
267
+ "step": 240
268
+ },
269
+ {
270
+ "epoch": 10.977777777777778,
271
+ "eval_accuracy": 0.940625,
272
+ "eval_loss": 0.15658709406852722,
273
+ "eval_runtime": 3.5876,
274
+ "eval_samples_per_second": 178.392,
275
+ "eval_steps_per_second": 2.787,
276
+ "step": 247
277
+ },
278
+ {
279
+ "epoch": 11.11111111111111,
280
+ "grad_norm": 14.539319038391113,
281
+ "learning_rate": 4.292929292929293e-05,
282
+ "loss": 0.2847,
283
+ "step": 250
284
+ },
285
+ {
286
+ "epoch": 11.555555555555555,
287
+ "grad_norm": 17.26177215576172,
288
+ "learning_rate": 4.242424242424243e-05,
289
+ "loss": 0.2754,
290
+ "step": 260
291
+ },
292
+ {
293
+ "epoch": 12.0,
294
+ "grad_norm": 12.163039207458496,
295
+ "learning_rate": 4.191919191919192e-05,
296
+ "loss": 0.2746,
297
+ "step": 270
298
+ },
299
+ {
300
+ "epoch": 12.0,
301
+ "eval_accuracy": 0.8921875,
302
+ "eval_loss": 0.24925951659679413,
303
+ "eval_runtime": 3.5808,
304
+ "eval_samples_per_second": 178.73,
305
+ "eval_steps_per_second": 2.793,
306
+ "step": 270
307
+ },
308
+ {
309
+ "epoch": 12.444444444444445,
310
+ "grad_norm": 11.92519474029541,
311
+ "learning_rate": 4.141414141414142e-05,
312
+ "loss": 0.2889,
313
+ "step": 280
314
+ },
315
+ {
316
+ "epoch": 12.88888888888889,
317
+ "grad_norm": 13.212408065795898,
318
+ "learning_rate": 4.0909090909090915e-05,
319
+ "loss": 0.2369,
320
+ "step": 290
321
+ },
322
+ {
323
+ "epoch": 12.977777777777778,
324
+ "eval_accuracy": 0.9375,
325
+ "eval_loss": 0.1622403860092163,
326
+ "eval_runtime": 3.8343,
327
+ "eval_samples_per_second": 166.914,
328
+ "eval_steps_per_second": 2.608,
329
+ "step": 292
330
+ },
331
+ {
332
+ "epoch": 13.333333333333334,
333
+ "grad_norm": 11.68896770477295,
334
+ "learning_rate": 4.0404040404040405e-05,
335
+ "loss": 0.2465,
336
+ "step": 300
337
+ },
338
+ {
339
+ "epoch": 13.777777777777779,
340
+ "grad_norm": 14.610076904296875,
341
+ "learning_rate": 3.98989898989899e-05,
342
+ "loss": 0.2231,
343
+ "step": 310
344
+ },
345
+ {
346
+ "epoch": 14.0,
347
+ "eval_accuracy": 0.9359375,
348
+ "eval_loss": 0.17805945873260498,
349
+ "eval_runtime": 3.9695,
350
+ "eval_samples_per_second": 161.231,
351
+ "eval_steps_per_second": 2.519,
352
+ "step": 315
353
+ },
354
+ {
355
+ "epoch": 14.222222222222221,
356
+ "grad_norm": 12.407272338867188,
357
+ "learning_rate": 3.939393939393939e-05,
358
+ "loss": 0.2177,
359
+ "step": 320
360
+ },
361
+ {
362
+ "epoch": 14.666666666666666,
363
+ "grad_norm": 7.3430256843566895,
364
+ "learning_rate": 3.888888888888889e-05,
365
+ "loss": 0.2281,
366
+ "step": 330
367
+ },
368
+ {
369
+ "epoch": 14.977777777777778,
370
+ "eval_accuracy": 0.953125,
371
+ "eval_loss": 0.12681424617767334,
372
+ "eval_runtime": 3.6554,
373
+ "eval_samples_per_second": 175.082,
374
+ "eval_steps_per_second": 2.736,
375
+ "step": 337
376
+ },
377
+ {
378
+ "epoch": 15.11111111111111,
379
+ "grad_norm": 10.262022018432617,
380
+ "learning_rate": 3.838383838383838e-05,
381
+ "loss": 0.209,
382
+ "step": 340
383
+ },
384
+ {
385
+ "epoch": 15.555555555555555,
386
+ "grad_norm": 9.078124046325684,
387
+ "learning_rate": 3.787878787878788e-05,
388
+ "loss": 0.2134,
389
+ "step": 350
390
+ },
391
+ {
392
+ "epoch": 16.0,
393
+ "grad_norm": 14.094355583190918,
394
+ "learning_rate": 3.7373737373737376e-05,
395
+ "loss": 0.2001,
396
+ "step": 360
397
+ },
398
+ {
399
+ "epoch": 16.0,
400
+ "eval_accuracy": 0.9140625,
401
+ "eval_loss": 0.24309130012989044,
402
+ "eval_runtime": 3.5892,
403
+ "eval_samples_per_second": 178.311,
404
+ "eval_steps_per_second": 2.786,
405
+ "step": 360
406
+ },
407
+ {
408
+ "epoch": 16.444444444444443,
409
+ "grad_norm": 12.868298530578613,
410
+ "learning_rate": 3.686868686868687e-05,
411
+ "loss": 0.2312,
412
+ "step": 370
413
+ },
414
+ {
415
+ "epoch": 16.88888888888889,
416
+ "grad_norm": 7.863047122955322,
417
+ "learning_rate": 3.6363636363636364e-05,
418
+ "loss": 0.183,
419
+ "step": 380
420
+ },
421
+ {
422
+ "epoch": 16.977777777777778,
423
+ "eval_accuracy": 0.9625,
424
+ "eval_loss": 0.10167054831981659,
425
+ "eval_runtime": 3.6007,
426
+ "eval_samples_per_second": 177.743,
427
+ "eval_steps_per_second": 2.777,
428
+ "step": 382
429
+ },
430
+ {
431
+ "epoch": 17.333333333333332,
432
+ "grad_norm": 8.919840812683105,
433
+ "learning_rate": 3.5858585858585855e-05,
434
+ "loss": 0.1997,
435
+ "step": 390
436
+ },
437
+ {
438
+ "epoch": 17.77777777777778,
439
+ "grad_norm": 11.91215705871582,
440
+ "learning_rate": 3.535353535353535e-05,
441
+ "loss": 0.1891,
442
+ "step": 400
443
+ },
444
+ {
445
+ "epoch": 18.0,
446
+ "eval_accuracy": 0.9390625,
447
+ "eval_loss": 0.180230051279068,
448
+ "eval_runtime": 3.5491,
449
+ "eval_samples_per_second": 180.325,
450
+ "eval_steps_per_second": 2.818,
451
+ "step": 405
452
+ },
453
+ {
454
+ "epoch": 18.22222222222222,
455
+ "grad_norm": 9.109786033630371,
456
+ "learning_rate": 3.484848484848485e-05,
457
+ "loss": 0.213,
458
+ "step": 410
459
+ },
460
+ {
461
+ "epoch": 18.666666666666668,
462
+ "grad_norm": 5.232081413269043,
463
+ "learning_rate": 3.434343434343435e-05,
464
+ "loss": 0.1862,
465
+ "step": 420
466
+ },
467
+ {
468
+ "epoch": 18.977777777777778,
469
+ "eval_accuracy": 0.9765625,
470
+ "eval_loss": 0.08689282089471817,
471
+ "eval_runtime": 3.9268,
472
+ "eval_samples_per_second": 162.982,
473
+ "eval_steps_per_second": 2.547,
474
+ "step": 427
475
+ },
476
+ {
477
+ "epoch": 19.11111111111111,
478
+ "grad_norm": 13.340733528137207,
479
+ "learning_rate": 3.3838383838383844e-05,
480
+ "loss": 0.1664,
481
+ "step": 430
482
+ },
483
+ {
484
+ "epoch": 19.555555555555557,
485
+ "grad_norm": 7.66475772857666,
486
+ "learning_rate": 3.3333333333333335e-05,
487
+ "loss": 0.1727,
488
+ "step": 440
489
+ },
490
+ {
491
+ "epoch": 20.0,
492
+ "grad_norm": 13.612215995788574,
493
+ "learning_rate": 3.282828282828283e-05,
494
+ "loss": 0.1935,
495
+ "step": 450
496
+ },
497
+ {
498
+ "epoch": 20.0,
499
+ "eval_accuracy": 0.96875,
500
+ "eval_loss": 0.10791148245334625,
501
+ "eval_runtime": 3.9472,
502
+ "eval_samples_per_second": 162.141,
503
+ "eval_steps_per_second": 2.533,
504
+ "step": 450
505
+ },
506
+ {
507
+ "epoch": 20.444444444444443,
508
+ "grad_norm": 9.189305305480957,
509
+ "learning_rate": 3.232323232323233e-05,
510
+ "loss": 0.1945,
511
+ "step": 460
512
+ },
513
+ {
514
+ "epoch": 20.88888888888889,
515
+ "grad_norm": 9.650483131408691,
516
+ "learning_rate": 3.181818181818182e-05,
517
+ "loss": 0.1797,
518
+ "step": 470
519
+ },
520
+ {
521
+ "epoch": 20.977777777777778,
522
+ "eval_accuracy": 0.95625,
523
+ "eval_loss": 0.12502644956111908,
524
+ "eval_runtime": 3.6171,
525
+ "eval_samples_per_second": 176.939,
526
+ "eval_steps_per_second": 2.765,
527
+ "step": 472
528
+ },
529
+ {
530
+ "epoch": 21.333333333333332,
531
+ "grad_norm": 7.25011682510376,
532
+ "learning_rate": 3.131313131313132e-05,
533
+ "loss": 0.1767,
534
+ "step": 480
535
+ },
536
+ {
537
+ "epoch": 21.77777777777778,
538
+ "grad_norm": 9.036290168762207,
539
+ "learning_rate": 3.080808080808081e-05,
540
+ "loss": 0.1605,
541
+ "step": 490
542
+ },
543
+ {
544
+ "epoch": 22.0,
545
+ "eval_accuracy": 0.971875,
546
+ "eval_loss": 0.06545940786600113,
547
+ "eval_runtime": 3.5923,
548
+ "eval_samples_per_second": 178.16,
549
+ "eval_steps_per_second": 2.784,
550
+ "step": 495
551
+ },
552
+ {
553
+ "epoch": 22.22222222222222,
554
+ "grad_norm": 5.982744216918945,
555
+ "learning_rate": 3.0303030303030306e-05,
556
+ "loss": 0.1493,
557
+ "step": 500
558
+ },
559
+ {
560
+ "epoch": 22.666666666666668,
561
+ "grad_norm": 13.987672805786133,
562
+ "learning_rate": 2.9797979797979796e-05,
563
+ "loss": 0.1848,
564
+ "step": 510
565
+ },
566
+ {
567
+ "epoch": 22.977777777777778,
568
+ "eval_accuracy": 0.9765625,
569
+ "eval_loss": 0.08063917607069016,
570
+ "eval_runtime": 3.5801,
571
+ "eval_samples_per_second": 178.764,
572
+ "eval_steps_per_second": 2.793,
573
+ "step": 517
574
+ },
575
+ {
576
+ "epoch": 23.11111111111111,
577
+ "grad_norm": 15.488668441772461,
578
+ "learning_rate": 2.9292929292929294e-05,
579
+ "loss": 0.1426,
580
+ "step": 520
581
+ },
582
+ {
583
+ "epoch": 23.555555555555557,
584
+ "grad_norm": 11.646829605102539,
585
+ "learning_rate": 2.878787878787879e-05,
586
+ "loss": 0.1667,
587
+ "step": 530
588
+ },
589
+ {
590
+ "epoch": 24.0,
591
+ "grad_norm": 6.619264602661133,
592
+ "learning_rate": 2.8282828282828282e-05,
593
+ "loss": 0.1498,
594
+ "step": 540
595
+ },
596
+ {
597
+ "epoch": 24.0,
598
+ "eval_accuracy": 0.9578125,
599
+ "eval_loss": 0.11159060150384903,
600
+ "eval_runtime": 3.6016,
601
+ "eval_samples_per_second": 177.701,
602
+ "eval_steps_per_second": 2.777,
603
+ "step": 540
604
+ },
605
+ {
606
+ "epoch": 24.444444444444443,
607
+ "grad_norm": 7.8661627769470215,
608
+ "learning_rate": 2.777777777777778e-05,
609
+ "loss": 0.1287,
610
+ "step": 550
611
+ },
612
+ {
613
+ "epoch": 24.88888888888889,
614
+ "grad_norm": 7.934934616088867,
615
+ "learning_rate": 2.7272727272727273e-05,
616
+ "loss": 0.1394,
617
+ "step": 560
618
+ },
619
+ {
620
+ "epoch": 24.977777777777778,
621
+ "eval_accuracy": 0.9671875,
622
+ "eval_loss": 0.0806862860918045,
623
+ "eval_runtime": 3.8749,
624
+ "eval_samples_per_second": 165.167,
625
+ "eval_steps_per_second": 2.581,
626
+ "step": 562
627
+ },
628
+ {
629
+ "epoch": 25.333333333333332,
630
+ "grad_norm": 10.52723217010498,
631
+ "learning_rate": 2.676767676767677e-05,
632
+ "loss": 0.1524,
633
+ "step": 570
634
+ },
635
+ {
636
+ "epoch": 25.77777777777778,
637
+ "grad_norm": 7.490493297576904,
638
+ "learning_rate": 2.6262626262626268e-05,
639
+ "loss": 0.1584,
640
+ "step": 580
641
+ },
642
+ {
643
+ "epoch": 26.0,
644
+ "eval_accuracy": 0.9796875,
645
+ "eval_loss": 0.05252554267644882,
646
+ "eval_runtime": 3.9703,
647
+ "eval_samples_per_second": 161.197,
648
+ "eval_steps_per_second": 2.519,
649
+ "step": 585
650
+ },
651
+ {
652
+ "epoch": 26.22222222222222,
653
+ "grad_norm": 7.178821563720703,
654
+ "learning_rate": 2.575757575757576e-05,
655
+ "loss": 0.153,
656
+ "step": 590
657
+ },
658
+ {
659
+ "epoch": 26.666666666666668,
660
+ "grad_norm": 7.702730178833008,
661
+ "learning_rate": 2.5252525252525256e-05,
662
+ "loss": 0.1302,
663
+ "step": 600
664
+ },
665
+ {
666
+ "epoch": 26.977777777777778,
667
+ "eval_accuracy": 0.9828125,
668
+ "eval_loss": 0.05131406709551811,
669
+ "eval_runtime": 3.7014,
670
+ "eval_samples_per_second": 172.905,
671
+ "eval_steps_per_second": 2.702,
672
+ "step": 607
673
+ },
674
+ {
675
+ "epoch": 27.11111111111111,
676
+ "grad_norm": 9.482915878295898,
677
+ "learning_rate": 2.474747474747475e-05,
678
+ "loss": 0.1577,
679
+ "step": 610
680
+ },
681
+ {
682
+ "epoch": 27.555555555555557,
683
+ "grad_norm": 10.196369171142578,
684
+ "learning_rate": 2.4242424242424244e-05,
685
+ "loss": 0.1543,
686
+ "step": 620
687
+ },
688
+ {
689
+ "epoch": 28.0,
690
+ "grad_norm": 10.799006462097168,
691
+ "learning_rate": 2.3737373737373738e-05,
692
+ "loss": 0.1356,
693
+ "step": 630
694
+ },
695
+ {
696
+ "epoch": 28.0,
697
+ "eval_accuracy": 0.9875,
698
+ "eval_loss": 0.04204293340444565,
699
+ "eval_runtime": 3.9774,
700
+ "eval_samples_per_second": 160.91,
701
+ "eval_steps_per_second": 2.514,
702
+ "step": 630
703
+ },
704
+ {
705
+ "epoch": 28.444444444444443,
706
+ "grad_norm": 7.129752159118652,
707
+ "learning_rate": 2.3232323232323232e-05,
708
+ "loss": 0.1291,
709
+ "step": 640
710
+ },
711
+ {
712
+ "epoch": 28.88888888888889,
713
+ "grad_norm": 6.642085552215576,
714
+ "learning_rate": 2.272727272727273e-05,
715
+ "loss": 0.1101,
716
+ "step": 650
717
+ },
718
+ {
719
+ "epoch": 28.977777777777778,
720
+ "eval_accuracy": 0.9875,
721
+ "eval_loss": 0.03539272025227547,
722
+ "eval_runtime": 4.0166,
723
+ "eval_samples_per_second": 159.34,
724
+ "eval_steps_per_second": 2.49,
725
+ "step": 652
726
+ },
727
+ {
728
+ "epoch": 29.333333333333332,
729
+ "grad_norm": 8.5753755569458,
730
+ "learning_rate": 2.2222222222222223e-05,
731
+ "loss": 0.1445,
732
+ "step": 660
733
+ },
734
+ {
735
+ "epoch": 29.77777777777778,
736
+ "grad_norm": 13.183974266052246,
737
+ "learning_rate": 2.171717171717172e-05,
738
+ "loss": 0.1227,
739
+ "step": 670
740
+ },
741
+ {
742
+ "epoch": 30.0,
743
+ "eval_accuracy": 0.9765625,
744
+ "eval_loss": 0.0582769513130188,
745
+ "eval_runtime": 3.9913,
746
+ "eval_samples_per_second": 160.35,
747
+ "eval_steps_per_second": 2.505,
748
+ "step": 675
749
+ },
750
+ {
751
+ "epoch": 30.22222222222222,
752
+ "grad_norm": 9.026564598083496,
753
+ "learning_rate": 2.1212121212121215e-05,
754
+ "loss": 0.1209,
755
+ "step": 680
756
+ },
757
+ {
758
+ "epoch": 30.666666666666668,
759
+ "grad_norm": 12.504347801208496,
760
+ "learning_rate": 2.070707070707071e-05,
761
+ "loss": 0.1158,
762
+ "step": 690
763
+ },
764
+ {
765
+ "epoch": 30.977777777777778,
766
+ "eval_accuracy": 0.990625,
767
+ "eval_loss": 0.025325458496809006,
768
+ "eval_runtime": 3.9678,
769
+ "eval_samples_per_second": 161.299,
770
+ "eval_steps_per_second": 2.52,
771
+ "step": 697
772
+ },
773
+ {
774
+ "epoch": 31.11111111111111,
775
+ "grad_norm": 5.276214599609375,
776
+ "learning_rate": 2.0202020202020203e-05,
777
+ "loss": 0.119,
778
+ "step": 700
779
+ },
780
+ {
781
+ "epoch": 31.555555555555557,
782
+ "grad_norm": 8.732769012451172,
783
+ "learning_rate": 1.9696969696969697e-05,
784
+ "loss": 0.1156,
785
+ "step": 710
786
+ },
787
+ {
788
+ "epoch": 32.0,
789
+ "grad_norm": 5.604591369628906,
790
+ "learning_rate": 1.919191919191919e-05,
791
+ "loss": 0.117,
792
+ "step": 720
793
+ },
794
+ {
795
+ "epoch": 32.0,
796
+ "eval_accuracy": 0.990625,
797
+ "eval_loss": 0.023098567500710487,
798
+ "eval_runtime": 3.8505,
799
+ "eval_samples_per_second": 166.213,
800
+ "eval_steps_per_second": 2.597,
801
+ "step": 720
802
+ },
803
+ {
804
+ "epoch": 32.44444444444444,
805
+ "grad_norm": 7.332610607147217,
806
+ "learning_rate": 1.8686868686868688e-05,
807
+ "loss": 0.1213,
808
+ "step": 730
809
+ },
810
+ {
811
+ "epoch": 32.888888888888886,
812
+ "grad_norm": 12.890093803405762,
813
+ "learning_rate": 1.8181818181818182e-05,
814
+ "loss": 0.1022,
815
+ "step": 740
816
+ },
817
+ {
818
+ "epoch": 32.977777777777774,
819
+ "eval_accuracy": 0.9796875,
820
+ "eval_loss": 0.0725882276892662,
821
+ "eval_runtime": 3.8065,
822
+ "eval_samples_per_second": 168.135,
823
+ "eval_steps_per_second": 2.627,
824
+ "step": 742
825
+ },
826
+ {
827
+ "epoch": 33.333333333333336,
828
+ "grad_norm": 13.247682571411133,
829
+ "learning_rate": 1.7676767676767676e-05,
830
+ "loss": 0.1257,
831
+ "step": 750
832
+ },
833
+ {
834
+ "epoch": 33.77777777777778,
835
+ "grad_norm": 6.758236885070801,
836
+ "learning_rate": 1.7171717171717173e-05,
837
+ "loss": 0.1221,
838
+ "step": 760
839
+ },
840
+ {
841
+ "epoch": 34.0,
842
+ "eval_accuracy": 0.996875,
843
+ "eval_loss": 0.015964530408382416,
844
+ "eval_runtime": 3.7585,
845
+ "eval_samples_per_second": 170.283,
846
+ "eval_steps_per_second": 2.661,
847
+ "step": 765
848
+ },
849
+ {
850
+ "epoch": 34.22222222222222,
851
+ "grad_norm": 8.521262168884277,
852
+ "learning_rate": 1.6666666666666667e-05,
853
+ "loss": 0.1014,
854
+ "step": 770
855
+ },
856
+ {
857
+ "epoch": 34.666666666666664,
858
+ "grad_norm": 5.949100971221924,
859
+ "learning_rate": 1.6161616161616165e-05,
860
+ "loss": 0.0956,
861
+ "step": 780
862
+ },
863
+ {
864
+ "epoch": 34.977777777777774,
865
+ "eval_accuracy": 0.984375,
866
+ "eval_loss": 0.048214979469776154,
867
+ "eval_runtime": 3.6909,
868
+ "eval_samples_per_second": 173.399,
869
+ "eval_steps_per_second": 2.709,
870
+ "step": 787
871
+ },
872
+ {
873
+ "epoch": 35.111111111111114,
874
+ "grad_norm": 10.151766777038574,
875
+ "learning_rate": 1.565656565656566e-05,
876
+ "loss": 0.1135,
877
+ "step": 790
878
+ },
879
+ {
880
+ "epoch": 35.55555555555556,
881
+ "grad_norm": 9.514137268066406,
882
+ "learning_rate": 1.5151515151515153e-05,
883
+ "loss": 0.1109,
884
+ "step": 800
885
+ },
886
+ {
887
+ "epoch": 36.0,
888
+ "grad_norm": 4.4278717041015625,
889
+ "learning_rate": 1.4646464646464647e-05,
890
+ "loss": 0.0856,
891
+ "step": 810
892
+ },
893
+ {
894
+ "epoch": 36.0,
895
+ "eval_accuracy": 0.9875,
896
+ "eval_loss": 0.025567293167114258,
897
+ "eval_runtime": 3.6537,
898
+ "eval_samples_per_second": 175.165,
899
+ "eval_steps_per_second": 2.737,
900
+ "step": 810
901
+ },
902
+ {
903
+ "epoch": 36.44444444444444,
904
+ "grad_norm": 8.582184791564941,
905
+ "learning_rate": 1.4141414141414141e-05,
906
+ "loss": 0.0994,
907
+ "step": 820
908
+ },
909
+ {
910
+ "epoch": 36.888888888888886,
911
+ "grad_norm": 9.628859519958496,
912
+ "learning_rate": 1.3636363636363637e-05,
913
+ "loss": 0.0996,
914
+ "step": 830
915
+ },
916
+ {
917
+ "epoch": 36.977777777777774,
918
+ "eval_accuracy": 0.990625,
919
+ "eval_loss": 0.021057253703475,
920
+ "eval_runtime": 3.6772,
921
+ "eval_samples_per_second": 174.046,
922
+ "eval_steps_per_second": 2.719,
923
+ "step": 832
924
+ },
925
+ {
926
+ "epoch": 37.333333333333336,
927
+ "grad_norm": 5.165952682495117,
928
+ "learning_rate": 1.3131313131313134e-05,
929
+ "loss": 0.0915,
930
+ "step": 840
931
+ },
932
+ {
933
+ "epoch": 37.77777777777778,
934
+ "grad_norm": 5.831385135650635,
935
+ "learning_rate": 1.2626262626262628e-05,
936
+ "loss": 0.0848,
937
+ "step": 850
938
+ },
939
+ {
940
+ "epoch": 38.0,
941
+ "eval_accuracy": 0.9796875,
942
+ "eval_loss": 0.04457371309399605,
943
+ "eval_runtime": 3.6584,
944
+ "eval_samples_per_second": 174.938,
945
+ "eval_steps_per_second": 2.733,
946
+ "step": 855
947
+ },
948
+ {
949
+ "epoch": 38.22222222222222,
950
+ "grad_norm": 9.629181861877441,
951
+ "learning_rate": 1.2121212121212122e-05,
952
+ "loss": 0.0972,
953
+ "step": 860
954
+ },
955
+ {
956
+ "epoch": 38.666666666666664,
957
+ "grad_norm": 6.214244365692139,
958
+ "learning_rate": 1.1616161616161616e-05,
959
+ "loss": 0.1001,
960
+ "step": 870
961
+ },
962
+ {
963
+ "epoch": 38.977777777777774,
964
+ "eval_accuracy": 0.9875,
965
+ "eval_loss": 0.02742326818406582,
966
+ "eval_runtime": 3.668,
967
+ "eval_samples_per_second": 174.481,
968
+ "eval_steps_per_second": 2.726,
969
+ "step": 877
970
+ },
971
+ {
972
+ "epoch": 39.111111111111114,
973
+ "grad_norm": 11.1734619140625,
974
+ "learning_rate": 1.1111111111111112e-05,
975
+ "loss": 0.0919,
976
+ "step": 880
977
+ },
978
+ {
979
+ "epoch": 39.55555555555556,
980
+ "grad_norm": 6.438005447387695,
981
+ "learning_rate": 1.0606060606060607e-05,
982
+ "loss": 0.0988,
983
+ "step": 890
984
+ },
985
+ {
986
+ "epoch": 40.0,
987
+ "grad_norm": 5.9803643226623535,
988
+ "learning_rate": 1.0101010101010101e-05,
989
+ "loss": 0.0976,
990
+ "step": 900
991
+ },
992
+ {
993
+ "epoch": 40.0,
994
+ "eval_accuracy": 0.9921875,
995
+ "eval_loss": 0.022529248148202896,
996
+ "eval_runtime": 3.7092,
997
+ "eval_samples_per_second": 172.543,
998
+ "eval_steps_per_second": 2.696,
999
+ "step": 900
1000
+ },
1001
+ {
1002
+ "epoch": 40.44444444444444,
1003
+ "grad_norm": 7.562661647796631,
1004
+ "learning_rate": 9.595959595959595e-06,
1005
+ "loss": 0.085,
1006
+ "step": 910
1007
+ },
1008
+ {
1009
+ "epoch": 40.888888888888886,
1010
+ "grad_norm": 7.695030212402344,
1011
+ "learning_rate": 9.090909090909091e-06,
1012
+ "loss": 0.0864,
1013
+ "step": 920
1014
+ },
1015
+ {
1016
+ "epoch": 40.977777777777774,
1017
+ "eval_accuracy": 0.9921875,
1018
+ "eval_loss": 0.0207191314548254,
1019
+ "eval_runtime": 4.0024,
1020
+ "eval_samples_per_second": 159.904,
1021
+ "eval_steps_per_second": 2.499,
1022
+ "step": 922
1023
+ },
1024
+ {
1025
+ "epoch": 41.333333333333336,
1026
+ "grad_norm": 8.4052734375,
1027
+ "learning_rate": 8.585858585858587e-06,
1028
+ "loss": 0.088,
1029
+ "step": 930
1030
+ },
1031
+ {
1032
+ "epoch": 41.77777777777778,
1033
+ "grad_norm": 8.705794334411621,
1034
+ "learning_rate": 8.080808080808082e-06,
1035
+ "loss": 0.0865,
1036
+ "step": 940
1037
+ },
1038
+ {
1039
+ "epoch": 42.0,
1040
+ "eval_accuracy": 0.996875,
1041
+ "eval_loss": 0.01933131366968155,
1042
+ "eval_runtime": 3.9909,
1043
+ "eval_samples_per_second": 160.365,
1044
+ "eval_steps_per_second": 2.506,
1045
+ "step": 945
1046
+ },
1047
+ {
1048
+ "epoch": 42.22222222222222,
1049
+ "grad_norm": 7.478874683380127,
1050
+ "learning_rate": 7.5757575757575764e-06,
1051
+ "loss": 0.0815,
1052
+ "step": 950
1053
+ },
1054
+ {
1055
+ "epoch": 42.666666666666664,
1056
+ "grad_norm": 5.25657320022583,
1057
+ "learning_rate": 7.0707070707070704e-06,
1058
+ "loss": 0.0773,
1059
+ "step": 960
1060
+ },
1061
+ {
1062
+ "epoch": 42.977777777777774,
1063
+ "eval_accuracy": 0.9921875,
1064
+ "eval_loss": 0.020288193598389626,
1065
+ "eval_runtime": 3.6594,
1066
+ "eval_samples_per_second": 174.89,
1067
+ "eval_steps_per_second": 2.733,
1068
+ "step": 967
1069
+ },
1070
+ {
1071
+ "epoch": 43.111111111111114,
1072
+ "grad_norm": 4.1972246170043945,
1073
+ "learning_rate": 6.565656565656567e-06,
1074
+ "loss": 0.0799,
1075
+ "step": 970
1076
+ },
1077
+ {
1078
+ "epoch": 43.55555555555556,
1079
+ "grad_norm": 6.9554972648620605,
1080
+ "learning_rate": 6.060606060606061e-06,
1081
+ "loss": 0.0772,
1082
+ "step": 980
1083
+ },
1084
+ {
1085
+ "epoch": 44.0,
1086
+ "grad_norm": 6.343081951141357,
1087
+ "learning_rate": 5.555555555555556e-06,
1088
+ "loss": 0.075,
1089
+ "step": 990
1090
+ },
1091
+ {
1092
+ "epoch": 44.0,
1093
+ "eval_accuracy": 0.996875,
1094
+ "eval_loss": 0.013058523647487164,
1095
+ "eval_runtime": 3.5774,
1096
+ "eval_samples_per_second": 178.899,
1097
+ "eval_steps_per_second": 2.795,
1098
+ "step": 990
1099
+ },
1100
+ {
1101
+ "epoch": 44.44444444444444,
1102
+ "grad_norm": 4.900812149047852,
1103
+ "learning_rate": 5.050505050505051e-06,
1104
+ "loss": 0.0736,
1105
+ "step": 1000
1106
+ },
1107
+ {
1108
+ "epoch": 44.888888888888886,
1109
+ "grad_norm": 5.955135345458984,
1110
+ "learning_rate": 4.5454545454545455e-06,
1111
+ "loss": 0.0761,
1112
+ "step": 1010
1113
+ },
1114
+ {
1115
+ "epoch": 44.977777777777774,
1116
+ "eval_accuracy": 0.99375,
1117
+ "eval_loss": 0.012860281392931938,
1118
+ "eval_runtime": 3.5897,
1119
+ "eval_samples_per_second": 178.288,
1120
+ "eval_steps_per_second": 2.786,
1121
+ "step": 1012
1122
+ },
1123
+ {
1124
+ "epoch": 45.333333333333336,
1125
+ "grad_norm": 4.250102996826172,
1126
+ "learning_rate": 4.040404040404041e-06,
1127
+ "loss": 0.0707,
1128
+ "step": 1020
1129
+ },
1130
+ {
1131
+ "epoch": 45.77777777777778,
1132
+ "grad_norm": 6.8997931480407715,
1133
+ "learning_rate": 3.5353535353535352e-06,
1134
+ "loss": 0.0624,
1135
+ "step": 1030
1136
+ },
1137
+ {
1138
+ "epoch": 46.0,
1139
+ "eval_accuracy": 0.996875,
1140
+ "eval_loss": 0.011364495381712914,
1141
+ "eval_runtime": 3.6128,
1142
+ "eval_samples_per_second": 177.146,
1143
+ "eval_steps_per_second": 2.768,
1144
+ "step": 1035
1145
+ },
1146
+ {
1147
+ "epoch": 46.22222222222222,
1148
+ "grad_norm": 10.210082054138184,
1149
+ "learning_rate": 3.0303030303030305e-06,
1150
+ "loss": 0.0762,
1151
+ "step": 1040
1152
+ },
1153
+ {
1154
+ "epoch": 46.666666666666664,
1155
+ "grad_norm": 4.9201788902282715,
1156
+ "learning_rate": 2.5252525252525253e-06,
1157
+ "loss": 0.0557,
1158
+ "step": 1050
1159
+ },
1160
+ {
1161
+ "epoch": 46.977777777777774,
1162
+ "eval_accuracy": 0.9953125,
1163
+ "eval_loss": 0.010208332911133766,
1164
+ "eval_runtime": 3.8474,
1165
+ "eval_samples_per_second": 166.347,
1166
+ "eval_steps_per_second": 2.599,
1167
+ "step": 1057
1168
+ },
1169
+ {
1170
+ "epoch": 47.111111111111114,
1171
+ "grad_norm": 3.725327491760254,
1172
+ "learning_rate": 2.0202020202020206e-06,
1173
+ "loss": 0.0613,
1174
+ "step": 1060
1175
+ },
1176
+ {
1177
+ "epoch": 47.55555555555556,
1178
+ "grad_norm": 3.1549530029296875,
1179
+ "learning_rate": 1.5151515151515152e-06,
1180
+ "loss": 0.0601,
1181
+ "step": 1070
1182
+ },
1183
+ {
1184
+ "epoch": 48.0,
1185
+ "grad_norm": 6.280518054962158,
1186
+ "learning_rate": 1.0101010101010103e-06,
1187
+ "loss": 0.0708,
1188
+ "step": 1080
1189
+ },
1190
+ {
1191
+ "epoch": 48.0,
1192
+ "eval_accuracy": 0.9953125,
1193
+ "eval_loss": 0.01160599384456873,
1194
+ "eval_runtime": 3.9922,
1195
+ "eval_samples_per_second": 160.314,
1196
+ "eval_steps_per_second": 2.505,
1197
+ "step": 1080
1198
+ },
1199
+ {
1200
+ "epoch": 48.44444444444444,
1201
+ "grad_norm": 6.1849260330200195,
1202
+ "learning_rate": 5.050505050505052e-07,
1203
+ "loss": 0.0699,
1204
+ "step": 1090
1205
+ },
1206
+ {
1207
+ "epoch": 48.888888888888886,
1208
+ "grad_norm": 7.637501239776611,
1209
+ "learning_rate": 0.0,
1210
+ "loss": 0.0667,
1211
+ "step": 1100
1212
+ },
1213
+ {
1214
+ "epoch": 48.888888888888886,
1215
+ "eval_accuracy": 0.9953125,
1216
+ "eval_loss": 0.013088616542518139,
1217
+ "eval_runtime": 3.601,
1218
+ "eval_samples_per_second": 177.727,
1219
+ "eval_steps_per_second": 2.777,
1220
+ "step": 1100
1221
+ },
1222
+ {
1223
+ "epoch": 48.888888888888886,
1224
+ "step": 1100,
1225
+ "total_flos": 5.510665685119795e+18,
1226
+ "train_loss": 0.17931611462072894,
1227
+ "train_runtime": 3528.3547,
1228
+ "train_samples_per_second": 81.624,
1229
+ "train_steps_per_second": 0.312
1230
  }
1231
  ],
1232
  "logging_steps": 10,
1233
+ "max_steps": 1100,
1234
  "num_input_tokens_seen": 0,
1235
+ "num_train_epochs": 50,
1236
  "save_steps": 500,
1237
+ "total_flos": 5.510665685119795e+18,
1238
  "train_batch_size": 64,
1239
  "trial_name": null,
1240
  "trial_params": null