valdmocha committed on
Commit
aaa7978
·
verified ·
1 Parent(s): f2aafba

Model save

Browse files
Files changed (6) hide show
  1. README.md +16 -11
  2. all_results.json +7 -7
  3. model.safetensors +1 -1
  4. test_results.json +7 -7
  5. trainer_state.json +449 -231
  6. val_results.json +7 -7
README.md CHANGED
@@ -18,9 +18,9 @@ should probably proofread and complete it, then remove this comment. -->
18
 
19
  This model is a fine-tuned version of [facebook/timesformer-base-finetuned-k400](https://huggingface.co/facebook/timesformer-base-finetuned-k400) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
- - Loss: 0.4794
22
- - Accuracy: 0.8340
23
- - F1: 0.8347
24
 
25
  ## Model description
26
 
@@ -40,23 +40,28 @@ More information needed
40
 
41
  The following hyperparameters were used during training:
42
  - learning_rate: 5e-05
43
- - train_batch_size: 10
44
- - eval_batch_size: 10
45
  - seed: 42
46
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
47
  - lr_scheduler_type: linear
48
  - lr_scheduler_warmup_ratio: 0.1
49
- - training_steps: 370
50
 
51
  ### Training results
52
 
53
  | Training Loss | Epoch | Step | Validation Loss | Accuracy | F1 |
54
  |:-------------:|:------:|:----:|:---------------:|:--------:|:------:|
55
- | 0.6213 | 0.2027 | 75 | 0.7774 | 0.6971 | 0.6902 |
56
- | 0.3269 | 1.2027 | 150 | 0.6851 | 0.7593 | 0.7562 |
57
- | 0.2339 | 2.2027 | 225 | 0.5252 | 0.8008 | 0.7993 |
58
- | 0.1931 | 3.2027 | 300 | 0.4942 | 0.8340 | 0.8342 |
59
- | 0.1037 | 4.1892 | 370 | 0.4794 | 0.8340 | 0.8347 |
 
 
 
 
 
60
 
61
 
62
  ### Framework versions
 
18
 
19
  This model is a fine-tuned version of [facebook/timesformer-base-finetuned-k400](https://huggingface.co/facebook/timesformer-base-finetuned-k400) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 0.4027
22
+ - Accuracy: 0.8838
23
+ - F1: 0.8838
24
 
25
  ## Model description
26
 
 
40
 
41
  The following hyperparameters were used during training:
42
  - learning_rate: 5e-05
43
+ - train_batch_size: 12
44
+ - eval_batch_size: 12
45
  - seed: 42
46
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
47
  - lr_scheduler_type: linear
48
  - lr_scheduler_warmup_ratio: 0.1
49
+ - training_steps: 610
50
 
51
  ### Training results
52
 
53
  | Training Loss | Epoch | Step | Validation Loss | Accuracy | F1 |
54
  |:-------------:|:------:|:----:|:---------------:|:--------:|:------:|
55
+ | 0.6712 | 0.1016 | 62 | 0.8671 | 0.6680 | 0.6623 |
56
+ | 0.3119 | 1.1016 | 124 | 0.5911 | 0.7884 | 0.7887 |
57
+ | 0.2505 | 2.1016 | 186 | 0.5297 | 0.8008 | 0.8002 |
58
+ | 0.207 | 3.1016 | 248 | 0.5970 | 0.7801 | 0.7787 |
59
+ | 0.1743 | 4.1016 | 310 | 0.5612 | 0.8050 | 0.7984 |
60
+ | 0.1005 | 5.1016 | 372 | 0.4027 | 0.8838 | 0.8838 |
61
+ | 0.0147 | 6.1016 | 434 | 0.4360 | 0.8589 | 0.8573 |
62
+ | 0.0573 | 7.1016 | 496 | 0.4451 | 0.8714 | 0.8697 |
63
+ | 0.0143 | 8.1016 | 558 | 0.4099 | 0.8672 | 0.8666 |
64
+ | 0.1311 | 9.0852 | 610 | 0.4056 | 0.8755 | 0.8752 |
65
 
66
 
67
  ### Framework versions
all_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 4.1891891891891895,
3
- "eval_accuracy": 0.8340248962655602,
4
- "eval_f1": 0.8346934110853106,
5
- "eval_loss": 0.47942548990249634,
6
- "eval_runtime": 73.4732,
7
- "eval_samples_per_second": 3.28,
8
- "eval_steps_per_second": 0.34
9
  }
 
1
  {
2
+ "epoch": 9.085245901639345,
3
+ "eval_accuracy": 0.8838174273858921,
4
+ "eval_f1": 0.8838017754864972,
5
+ "eval_loss": 0.40273845195770264,
6
+ "eval_runtime": 76.8071,
7
+ "eval_samples_per_second": 3.138,
8
+ "eval_steps_per_second": 0.273
9
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:110e3f8d9108a5c99331db36f4a3ea54ec65f58f0032d594b9f5e9ffee13f4d3
3
  size 485078408
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6771e05fca54be28cdcd593bce1ae9495f8bb0e6f7177ed9683d50368ee125d1
3
  size 485078408
test_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 4.1891891891891895,
3
- "eval_accuracy": 0.8067415730337079,
4
- "eval_f1": 0.8059471321937128,
5
- "eval_loss": 0.6186416745185852,
6
- "eval_runtime": 137.3351,
7
- "eval_samples_per_second": 3.24,
8
- "eval_steps_per_second": 0.328
9
  }
 
1
  {
2
+ "epoch": 9.085245901639345,
3
+ "eval_accuracy": 0.8337078651685393,
4
+ "eval_f1": 0.8322580739791249,
5
+ "eval_loss": 0.6123429536819458,
6
+ "eval_runtime": 140.7823,
7
+ "eval_samples_per_second": 3.161,
8
+ "eval_steps_per_second": 0.27
9
  }
trainer_state.json CHANGED
@@ -1,364 +1,582 @@
1
  {
2
- "best_metric": 0.8346934110853106,
3
- "best_model_checkpoint": "videomae-surf-analytics-runpod/checkpoint-370",
4
- "epoch": 4.1891891891891895,
5
  "eval_steps": 500,
6
- "global_step": 370,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.02702702702702703,
13
- "grad_norm": 17.739418029785156,
14
- "learning_rate": 1.3513513513513515e-05,
15
- "loss": 1.4705,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.05405405405405406,
20
- "grad_norm": 6.248379707336426,
21
- "learning_rate": 2.702702702702703e-05,
22
- "loss": 1.1378,
23
  "step": 20
24
  },
25
  {
26
- "epoch": 0.08108108108108109,
27
- "grad_norm": 6.359808444976807,
28
- "learning_rate": 4.0540540540540545e-05,
29
- "loss": 1.0852,
30
  "step": 30
31
  },
32
  {
33
- "epoch": 0.10810810810810811,
34
- "grad_norm": 7.196521282196045,
35
- "learning_rate": 4.954954954954955e-05,
36
- "loss": 0.7478,
37
  "step": 40
38
  },
39
  {
40
- "epoch": 0.13513513513513514,
41
- "grad_norm": 7.588693141937256,
42
- "learning_rate": 4.804804804804805e-05,
43
- "loss": 0.653,
44
  "step": 50
45
  },
46
  {
47
- "epoch": 0.16216216216216217,
48
- "grad_norm": 9.380898475646973,
49
- "learning_rate": 4.654654654654655e-05,
50
- "loss": 0.7808,
51
  "step": 60
52
  },
53
  {
54
- "epoch": 0.1891891891891892,
55
- "grad_norm": 9.640559196472168,
56
- "learning_rate": 4.5045045045045046e-05,
57
- "loss": 0.6213,
58
- "step": 70
 
 
 
59
  },
60
  {
61
- "epoch": 0.20270270270270271,
62
- "eval_accuracy": 0.6970954356846473,
63
- "eval_f1": 0.6901963848053563,
64
- "eval_loss": 0.7773587703704834,
65
- "eval_runtime": 77.0947,
66
- "eval_samples_per_second": 3.126,
67
- "eval_steps_per_second": 0.324,
68
- "step": 75
69
  },
70
  {
71
- "epoch": 1.0135135135135136,
72
- "grad_norm": 4.199209690093994,
73
- "learning_rate": 4.354354354354355e-05,
74
- "loss": 0.4215,
75
  "step": 80
76
  },
77
  {
78
- "epoch": 1.0405405405405406,
79
- "grad_norm": 3.8096096515655518,
80
- "learning_rate": 4.204204204204204e-05,
81
- "loss": 0.391,
82
  "step": 90
83
  },
84
  {
85
- "epoch": 1.0675675675675675,
86
- "grad_norm": 13.743525505065918,
87
- "learning_rate": 4.0540540540540545e-05,
88
- "loss": 0.4713,
89
  "step": 100
90
  },
91
  {
92
- "epoch": 1.0945945945945945,
93
- "grad_norm": 3.954926013946533,
94
- "learning_rate": 3.903903903903904e-05,
95
- "loss": 0.4197,
96
  "step": 110
97
  },
98
  {
99
- "epoch": 1.1216216216216217,
100
- "grad_norm": 7.9359211921691895,
101
- "learning_rate": 3.7537537537537536e-05,
102
- "loss": 0.3719,
103
  "step": 120
104
  },
105
  {
106
- "epoch": 1.1486486486486487,
107
- "grad_norm": 10.24318790435791,
108
- "learning_rate": 3.603603603603604e-05,
109
- "loss": 0.3922,
110
- "step": 130
 
 
 
111
  },
112
  {
113
- "epoch": 1.1756756756756757,
114
- "grad_norm": 13.8519926071167,
115
- "learning_rate": 3.453453453453453e-05,
116
- "loss": 0.4027,
117
- "step": 140
118
  },
119
  {
120
- "epoch": 1.2027027027027026,
121
- "grad_norm": 1.3211474418640137,
122
- "learning_rate": 3.3033033033033035e-05,
123
- "loss": 0.3269,
124
- "step": 150
125
  },
126
  {
127
- "epoch": 1.2027027027027026,
128
- "eval_accuracy": 0.7593360995850622,
129
- "eval_f1": 0.7561505755143376,
130
- "eval_loss": 0.6851304173469543,
131
- "eval_runtime": 78.2702,
132
- "eval_samples_per_second": 3.079,
133
- "eval_steps_per_second": 0.319,
134
  "step": 150
135
  },
136
  {
137
- "epoch": 2.027027027027027,
138
- "grad_norm": 6.484254360198975,
139
- "learning_rate": 3.153153153153153e-05,
140
- "loss": 0.1699,
141
  "step": 160
142
  },
143
  {
144
- "epoch": 2.054054054054054,
145
- "grad_norm": 4.243548393249512,
146
- "learning_rate": 3.0030030030030033e-05,
147
- "loss": 0.1404,
148
  "step": 170
149
  },
150
  {
151
- "epoch": 2.081081081081081,
152
- "grad_norm": 4.49724006652832,
153
- "learning_rate": 2.852852852852853e-05,
154
- "loss": 0.2383,
155
  "step": 180
156
  },
157
  {
158
- "epoch": 2.108108108108108,
159
- "grad_norm": 5.5315632820129395,
160
- "learning_rate": 2.702702702702703e-05,
161
- "loss": 0.3597,
 
 
 
 
 
 
 
 
 
 
162
  "step": 190
163
  },
164
  {
165
- "epoch": 2.135135135135135,
166
- "grad_norm": 7.347559452056885,
167
- "learning_rate": 2.552552552552553e-05,
168
- "loss": 0.1501,
169
  "step": 200
170
  },
171
  {
172
- "epoch": 2.1621621621621623,
173
- "grad_norm": 0.20061562955379486,
174
- "learning_rate": 2.4024024024024024e-05,
175
- "loss": 0.1769,
176
  "step": 210
177
  },
178
  {
179
- "epoch": 2.189189189189189,
180
- "grad_norm": 0.2758616507053375,
181
- "learning_rate": 2.2522522522522523e-05,
182
- "loss": 0.2339,
183
  "step": 220
184
  },
185
  {
186
- "epoch": 2.2027027027027026,
187
- "eval_accuracy": 0.8008298755186722,
188
- "eval_f1": 0.7992975614249908,
189
- "eval_loss": 0.5251602530479431,
190
- "eval_runtime": 75.07,
191
- "eval_samples_per_second": 3.21,
192
- "eval_steps_per_second": 0.333,
193
- "step": 225
194
- },
195
- {
196
- "epoch": 3.0135135135135136,
197
- "grad_norm": 8.580607414245605,
198
- "learning_rate": 2.102102102102102e-05,
199
- "loss": 0.8979,
200
  "step": 230
201
  },
202
  {
203
- "epoch": 3.0405405405405403,
204
- "grad_norm": 0.5315948724746704,
205
- "learning_rate": 1.951951951951952e-05,
206
- "loss": 0.0559,
207
  "step": 240
208
  },
209
  {
210
- "epoch": 3.0675675675675675,
211
- "grad_norm": 0.13150528073310852,
212
- "learning_rate": 1.801801801801802e-05,
213
- "loss": 0.1062,
 
 
 
 
 
 
 
 
 
 
214
  "step": 250
215
  },
216
  {
217
- "epoch": 3.0945945945945947,
218
- "grad_norm": 19.857810974121094,
219
- "learning_rate": 1.6516516516516518e-05,
220
- "loss": 0.1066,
221
  "step": 260
222
  },
223
  {
224
- "epoch": 3.1216216216216215,
225
- "grad_norm": 1.9958362579345703,
226
- "learning_rate": 1.5015015015015016e-05,
227
- "loss": 0.1944,
228
  "step": 270
229
  },
230
  {
231
- "epoch": 3.1486486486486487,
232
- "grad_norm": 16.243772506713867,
233
- "learning_rate": 1.3513513513513515e-05,
234
- "loss": 0.137,
235
  "step": 280
236
  },
237
  {
238
- "epoch": 3.175675675675676,
239
- "grad_norm": 3.6971592903137207,
240
- "learning_rate": 1.2012012012012012e-05,
241
- "loss": 0.1812,
242
  "step": 290
243
  },
244
  {
245
- "epoch": 3.2027027027027026,
246
- "grad_norm": 1.1694248914718628,
247
- "learning_rate": 1.051051051051051e-05,
248
- "loss": 0.1931,
249
  "step": 300
250
  },
251
  {
252
- "epoch": 3.2027027027027026,
253
- "eval_accuracy": 0.8340248962655602,
254
- "eval_f1": 0.8341831246711504,
255
- "eval_loss": 0.4942285716533661,
256
- "eval_runtime": 73.8969,
257
- "eval_samples_per_second": 3.261,
258
- "eval_steps_per_second": 0.338,
259
- "step": 300
260
  },
261
  {
262
- "epoch": 4.027027027027027,
263
- "grad_norm": 2.825990915298462,
264
- "learning_rate": 9.00900900900901e-06,
265
- "loss": 0.0731,
 
 
 
266
  "step": 310
267
  },
268
  {
269
- "epoch": 4.054054054054054,
270
- "grad_norm": 0.2709617614746094,
271
- "learning_rate": 7.507507507507508e-06,
272
- "loss": 0.0751,
273
  "step": 320
274
  },
275
  {
276
- "epoch": 4.081081081081081,
277
- "grad_norm": 0.10882856696844101,
278
- "learning_rate": 6.006006006006006e-06,
279
- "loss": 0.0774,
280
  "step": 330
281
  },
282
  {
283
- "epoch": 4.108108108108108,
284
- "grad_norm": 0.09481658786535263,
285
- "learning_rate": 4.504504504504505e-06,
286
- "loss": 0.0567,
287
  "step": 340
288
  },
289
  {
290
- "epoch": 4.135135135135135,
291
- "grad_norm": 0.3658810257911682,
292
- "learning_rate": 3.003003003003003e-06,
293
- "loss": 0.0438,
294
  "step": 350
295
  },
296
  {
297
- "epoch": 4.162162162162162,
298
- "grad_norm": 0.9697806239128113,
299
- "learning_rate": 1.5015015015015015e-06,
300
- "loss": 0.098,
301
  "step": 360
302
  },
303
  {
304
- "epoch": 4.1891891891891895,
305
- "grad_norm": 0.357666552066803,
306
- "learning_rate": 0.0,
307
- "loss": 0.1037,
308
  "step": 370
309
  },
310
  {
311
- "epoch": 4.1891891891891895,
312
- "eval_accuracy": 0.8340248962655602,
313
- "eval_f1": 0.8346934110853106,
314
- "eval_loss": 0.47942548990249634,
315
- "eval_runtime": 73.9617,
316
- "eval_samples_per_second": 3.258,
317
- "eval_steps_per_second": 0.338,
318
- "step": 370
319
  },
320
  {
321
- "epoch": 4.1891891891891895,
322
- "step": 370,
323
- "total_flos": 3.2102444558954004e+18,
324
- "train_loss": 0.36656422937238536,
325
- "train_runtime": 1888.2958,
326
- "train_samples_per_second": 1.959,
327
- "train_steps_per_second": 0.196
328
  },
329
  {
330
- "epoch": 4.1891891891891895,
331
- "eval_accuracy": 0.9784075573549258,
332
- "eval_f1": 0.9783996585344938,
333
- "eval_loss": 0.06370694935321808,
334
- "eval_runtime": 252.958,
335
- "eval_samples_per_second": 2.929,
336
- "eval_steps_per_second": 0.296,
337
- "step": 370
338
  },
339
  {
340
- "epoch": 4.1891891891891895,
341
- "eval_accuracy": 0.8067415730337079,
342
- "eval_f1": 0.8059471321937128,
343
- "eval_loss": 0.6186416745185852,
344
- "eval_runtime": 137.3351,
345
- "eval_samples_per_second": 3.24,
346
- "eval_steps_per_second": 0.328,
347
- "step": 370
348
  },
349
  {
350
- "epoch": 4.1891891891891895,
351
- "eval_accuracy": 0.8340248962655602,
352
- "eval_f1": 0.8346934110853106,
353
- "eval_loss": 0.47942548990249634,
354
- "eval_runtime": 73.4732,
355
- "eval_samples_per_second": 3.28,
356
- "eval_steps_per_second": 0.34,
357
- "step": 370
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
358
  }
359
  ],
360
  "logging_steps": 10,
361
- "max_steps": 370,
362
  "num_input_tokens_seen": 0,
363
  "num_train_epochs": 9223372036854775807,
364
  "save_steps": 500,
@@ -374,8 +592,8 @@
374
  "attributes": {}
375
  }
376
  },
377
- "total_flos": 3.2102444558954004e+18,
378
- "train_batch_size": 10,
379
  "trial_name": null,
380
  "trial_params": null
381
  }
 
1
  {
2
+ "best_metric": 0.8838017754864972,
3
+ "best_model_checkpoint": "videomae-surf-analytics-runpod/checkpoint-372",
4
+ "epoch": 9.085245901639345,
5
  "eval_steps": 500,
6
+ "global_step": 610,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.01639344262295082,
13
+ "grad_norm": 14.228123664855957,
14
+ "learning_rate": 8.196721311475409e-06,
15
+ "loss": 1.5374,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 0.03278688524590164,
20
+ "grad_norm": 8.263284683227539,
21
+ "learning_rate": 1.6393442622950818e-05,
22
+ "loss": 1.33,
23
  "step": 20
24
  },
25
  {
26
+ "epoch": 0.04918032786885246,
27
+ "grad_norm": 10.129837989807129,
28
+ "learning_rate": 2.459016393442623e-05,
29
+ "loss": 1.1214,
30
  "step": 30
31
  },
32
  {
33
+ "epoch": 0.06557377049180328,
34
+ "grad_norm": 5.447906017303467,
35
+ "learning_rate": 3.2786885245901635e-05,
36
+ "loss": 1.023,
37
  "step": 40
38
  },
39
  {
40
+ "epoch": 0.08196721311475409,
41
+ "grad_norm": 5.764439105987549,
42
+ "learning_rate": 4.098360655737705e-05,
43
+ "loss": 0.7497,
44
  "step": 50
45
  },
46
  {
47
+ "epoch": 0.09836065573770492,
48
+ "grad_norm": 6.531442165374756,
49
+ "learning_rate": 4.918032786885246e-05,
50
+ "loss": 0.6712,
51
  "step": 60
52
  },
53
  {
54
+ "epoch": 0.10163934426229508,
55
+ "eval_accuracy": 0.6680497925311203,
56
+ "eval_f1": 0.6623298693885865,
57
+ "eval_loss": 0.8671284317970276,
58
+ "eval_runtime": 81.9529,
59
+ "eval_samples_per_second": 2.941,
60
+ "eval_steps_per_second": 0.256,
61
+ "step": 62
62
  },
63
  {
64
+ "epoch": 1.0131147540983607,
65
+ "grad_norm": 6.3692240715026855,
66
+ "learning_rate": 4.918032786885246e-05,
67
+ "loss": 0.5055,
68
+ "step": 70
 
 
 
69
  },
70
  {
71
+ "epoch": 1.0295081967213116,
72
+ "grad_norm": 7.524365425109863,
73
+ "learning_rate": 4.8269581056466304e-05,
74
+ "loss": 0.4465,
75
  "step": 80
76
  },
77
  {
78
+ "epoch": 1.0459016393442624,
79
+ "grad_norm": 3.4210591316223145,
80
+ "learning_rate": 4.7358834244080144e-05,
81
+ "loss": 0.3804,
82
  "step": 90
83
  },
84
  {
85
+ "epoch": 1.0622950819672132,
86
+ "grad_norm": 7.561025619506836,
87
+ "learning_rate": 4.644808743169399e-05,
88
+ "loss": 0.4557,
89
  "step": 100
90
  },
91
  {
92
+ "epoch": 1.0786885245901638,
93
+ "grad_norm": 2.567615509033203,
94
+ "learning_rate": 4.553734061930783e-05,
95
+ "loss": 0.3558,
96
  "step": 110
97
  },
98
  {
99
+ "epoch": 1.0950819672131147,
100
+ "grad_norm": 5.366397380828857,
101
+ "learning_rate": 4.462659380692168e-05,
102
+ "loss": 0.3119,
103
  "step": 120
104
  },
105
  {
106
+ "epoch": 1.1016393442622952,
107
+ "eval_accuracy": 0.7883817427385892,
108
+ "eval_f1": 0.7887466325291007,
109
+ "eval_loss": 0.5910844802856445,
110
+ "eval_runtime": 82.6936,
111
+ "eval_samples_per_second": 2.914,
112
+ "eval_steps_per_second": 0.254,
113
+ "step": 124
114
  },
115
  {
116
+ "epoch": 2.0098360655737704,
117
+ "grad_norm": 2.0202689170837402,
118
+ "learning_rate": 4.371584699453552e-05,
119
+ "loss": 0.2248,
120
+ "step": 130
121
  },
122
  {
123
+ "epoch": 2.0262295081967214,
124
+ "grad_norm": 5.155245780944824,
125
+ "learning_rate": 4.280510018214937e-05,
126
+ "loss": 0.2797,
127
+ "step": 140
128
  },
129
  {
130
+ "epoch": 2.042622950819672,
131
+ "grad_norm": 8.79983139038086,
132
+ "learning_rate": 4.189435336976321e-05,
133
+ "loss": 0.2696,
 
 
 
134
  "step": 150
135
  },
136
  {
137
+ "epoch": 2.059016393442623,
138
+ "grad_norm": 2.6221516132354736,
139
+ "learning_rate": 4.098360655737705e-05,
140
+ "loss": 0.2733,
141
  "step": 160
142
  },
143
  {
144
+ "epoch": 2.0754098360655737,
145
+ "grad_norm": 1.32801353931427,
146
+ "learning_rate": 4.007285974499089e-05,
147
+ "loss": 0.159,
148
  "step": 170
149
  },
150
  {
151
+ "epoch": 2.091803278688525,
152
+ "grad_norm": 7.7792792320251465,
153
+ "learning_rate": 3.916211293260474e-05,
154
+ "loss": 0.2505,
155
  "step": 180
156
  },
157
  {
158
+ "epoch": 2.101639344262295,
159
+ "eval_accuracy": 0.8008298755186722,
160
+ "eval_f1": 0.8002014090167812,
161
+ "eval_loss": 0.5296825170516968,
162
+ "eval_runtime": 80.6025,
163
+ "eval_samples_per_second": 2.99,
164
+ "eval_steps_per_second": 0.261,
165
+ "step": 186
166
+ },
167
+ {
168
+ "epoch": 3.0065573770491802,
169
+ "grad_norm": 6.463658332824707,
170
+ "learning_rate": 3.825136612021858e-05,
171
+ "loss": 0.3667,
172
  "step": 190
173
  },
174
  {
175
+ "epoch": 3.0229508196721313,
176
+ "grad_norm": 7.710232257843018,
177
+ "learning_rate": 3.7340619307832425e-05,
178
+ "loss": 0.1461,
179
  "step": 200
180
  },
181
  {
182
+ "epoch": 3.039344262295082,
183
+ "grad_norm": 8.827251434326172,
184
+ "learning_rate": 3.6429872495446266e-05,
185
+ "loss": 0.1328,
186
  "step": 210
187
  },
188
  {
189
+ "epoch": 3.055737704918033,
190
+ "grad_norm": 14.754895210266113,
191
+ "learning_rate": 3.551912568306011e-05,
192
+ "loss": 0.4213,
193
  "step": 220
194
  },
195
  {
196
+ "epoch": 3.0721311475409836,
197
+ "grad_norm": 7.798481464385986,
198
+ "learning_rate": 3.4608378870673954e-05,
199
+ "loss": 0.2256,
 
 
 
 
 
 
 
 
 
 
200
  "step": 230
201
  },
202
  {
203
+ "epoch": 3.088524590163934,
204
+ "grad_norm": 1.4291377067565918,
205
+ "learning_rate": 3.36976320582878e-05,
206
+ "loss": 0.207,
207
  "step": 240
208
  },
209
  {
210
+ "epoch": 3.101639344262295,
211
+ "eval_accuracy": 0.7800829875518672,
212
+ "eval_f1": 0.778687519310884,
213
+ "eval_loss": 0.5970368981361389,
214
+ "eval_runtime": 76.1019,
215
+ "eval_samples_per_second": 3.167,
216
+ "eval_steps_per_second": 0.276,
217
+ "step": 248
218
+ },
219
+ {
220
+ "epoch": 4.00327868852459,
221
+ "grad_norm": 6.123374938964844,
222
+ "learning_rate": 3.2786885245901635e-05,
223
+ "loss": 0.1879,
224
  "step": 250
225
  },
226
  {
227
+ "epoch": 4.019672131147541,
228
+ "grad_norm": 2.9475815296173096,
229
+ "learning_rate": 3.187613843351548e-05,
230
+ "loss": 0.0801,
231
  "step": 260
232
  },
233
  {
234
+ "epoch": 4.036065573770492,
235
+ "grad_norm": 0.0647067278623581,
236
+ "learning_rate": 3.096539162112932e-05,
237
+ "loss": 0.0925,
238
  "step": 270
239
  },
240
  {
241
+ "epoch": 4.052459016393443,
242
+ "grad_norm": 2.0591495037078857,
243
+ "learning_rate": 3.005464480874317e-05,
244
+ "loss": 0.1952,
245
  "step": 280
246
  },
247
  {
248
+ "epoch": 4.0688524590163935,
249
+ "grad_norm": 9.379964828491211,
250
+ "learning_rate": 2.9143897996357018e-05,
251
+ "loss": 0.2213,
252
  "step": 290
253
  },
254
  {
255
+ "epoch": 4.085245901639344,
256
+ "grad_norm": 0.2886744737625122,
257
+ "learning_rate": 2.823315118397086e-05,
258
+ "loss": 0.1651,
259
  "step": 300
260
  },
261
  {
262
+ "epoch": 4.101639344262295,
263
+ "grad_norm": 10.280256271362305,
264
+ "learning_rate": 2.7322404371584703e-05,
265
+ "loss": 0.1743,
266
+ "step": 310
 
 
 
267
  },
268
  {
269
+ "epoch": 4.101639344262295,
270
+ "eval_accuracy": 0.8049792531120332,
271
+ "eval_f1": 0.7984134204702362,
272
+ "eval_loss": 0.5612084865570068,
273
+ "eval_runtime": 76.7892,
274
+ "eval_samples_per_second": 3.138,
275
+ "eval_steps_per_second": 0.273,
276
  "step": 310
277
  },
278
  {
279
+ "epoch": 5.016393442622951,
280
+ "grad_norm": 0.10767544060945511,
281
+ "learning_rate": 2.6411657559198543e-05,
282
+ "loss": 0.1745,
283
  "step": 320
284
  },
285
  {
286
+ "epoch": 5.032786885245901,
287
+ "grad_norm": 0.31735455989837646,
288
+ "learning_rate": 2.550091074681239e-05,
289
+ "loss": 0.0893,
290
  "step": 330
291
  },
292
  {
293
+ "epoch": 5.049180327868853,
294
+ "grad_norm": 0.657698392868042,
295
+ "learning_rate": 2.459016393442623e-05,
296
+ "loss": 0.0209,
297
  "step": 340
298
  },
299
  {
300
+ "epoch": 5.065573770491803,
301
+ "grad_norm": 6.128834247589111,
302
+ "learning_rate": 2.3679417122040072e-05,
303
+ "loss": 0.0931,
304
  "step": 350
305
  },
306
  {
307
+ "epoch": 5.081967213114754,
308
+ "grad_norm": 3.8232431411743164,
309
+ "learning_rate": 2.2768670309653916e-05,
310
+ "loss": 0.1862,
311
  "step": 360
312
  },
313
  {
314
+ "epoch": 5.098360655737705,
315
+ "grad_norm": 0.28946903347969055,
316
+ "learning_rate": 2.185792349726776e-05,
317
+ "loss": 0.1005,
318
  "step": 370
319
  },
320
  {
321
+ "epoch": 5.101639344262295,
322
+ "eval_accuracy": 0.8838174273858921,
323
+ "eval_f1": 0.8838017754864972,
324
+ "eval_loss": 0.40273839235305786,
325
+ "eval_runtime": 79.1005,
326
+ "eval_samples_per_second": 3.047,
327
+ "eval_steps_per_second": 0.265,
328
+ "step": 372
329
  },
330
  {
331
+ "epoch": 6.0131147540983605,
332
+ "grad_norm": 7.813482761383057,
333
+ "learning_rate": 2.0947176684881604e-05,
334
+ "loss": 0.1164,
335
+ "step": 380
 
 
336
  },
337
  {
338
+ "epoch": 6.029508196721311,
339
+ "grad_norm": 4.695272445678711,
340
+ "learning_rate": 2.0036429872495445e-05,
341
+ "loss": 0.128,
342
+ "step": 390
 
 
 
343
  },
344
  {
345
+ "epoch": 6.045901639344263,
346
+ "grad_norm": 0.22056636214256287,
347
+ "learning_rate": 1.912568306010929e-05,
348
+ "loss": 0.1201,
349
+ "step": 400
 
 
 
350
  },
351
  {
352
+ "epoch": 6.062295081967213,
353
+ "grad_norm": 0.24654638767242432,
354
+ "learning_rate": 1.8214936247723133e-05,
355
+ "loss": 0.0823,
356
+ "step": 410
357
+ },
358
+ {
359
+ "epoch": 6.078688524590164,
360
+ "grad_norm": 1.5276825428009033,
361
+ "learning_rate": 1.7304189435336977e-05,
362
+ "loss": 0.0994,
363
+ "step": 420
364
+ },
365
+ {
366
+ "epoch": 6.0950819672131145,
367
+ "grad_norm": 5.22976016998291,
368
+ "learning_rate": 1.6393442622950818e-05,
369
+ "loss": 0.0147,
370
+ "step": 430
371
+ },
372
+ {
373
+ "epoch": 6.101639344262295,
374
+ "eval_accuracy": 0.8589211618257261,
375
+ "eval_f1": 0.8572866763193951,
376
+ "eval_loss": 0.4360053241252899,
377
+ "eval_runtime": 76.601,
378
+ "eval_samples_per_second": 3.146,
379
+ "eval_steps_per_second": 0.274,
380
+ "step": 434
381
+ },
382
+ {
383
+ "epoch": 7.00983606557377,
384
+ "grad_norm": 0.01562822423875332,
385
+ "learning_rate": 1.548269581056466e-05,
386
+ "loss": 0.0178,
387
+ "step": 440
388
+ },
389
+ {
390
+ "epoch": 7.026229508196721,
391
+ "grad_norm": 0.15828734636306763,
392
+ "learning_rate": 1.4571948998178509e-05,
393
+ "loss": 0.1265,
394
+ "step": 450
395
+ },
396
+ {
397
+ "epoch": 7.0426229508196725,
398
+ "grad_norm": 0.9054508805274963,
399
+ "learning_rate": 1.3661202185792351e-05,
400
+ "loss": 0.0251,
401
+ "step": 460
402
+ },
403
+ {
404
+ "epoch": 7.059016393442623,
405
+ "grad_norm": 0.2960349917411804,
406
+ "learning_rate": 1.2750455373406195e-05,
407
+ "loss": 0.0241,
408
+ "step": 470
409
+ },
410
+ {
411
+ "epoch": 7.075409836065574,
412
+ "grad_norm": 0.08049295842647552,
413
+ "learning_rate": 1.1839708561020036e-05,
414
+ "loss": 0.0518,
415
+ "step": 480
416
+ },
417
+ {
418
+ "epoch": 7.091803278688524,
419
+ "grad_norm": 0.25148022174835205,
420
+ "learning_rate": 1.092896174863388e-05,
421
+ "loss": 0.0573,
422
+ "step": 490
423
+ },
424
+ {
425
+ "epoch": 7.101639344262295,
426
+ "eval_accuracy": 0.8713692946058091,
427
+ "eval_f1": 0.8697143392786835,
428
+ "eval_loss": 0.4450831115245819,
429
+ "eval_runtime": 75.7767,
430
+ "eval_samples_per_second": 3.18,
431
+ "eval_steps_per_second": 0.277,
432
+ "step": 496
433
+ },
434
+ {
435
+ "epoch": 8.00655737704918,
436
+ "grad_norm": 0.028504155576229095,
437
+ "learning_rate": 1.0018214936247722e-05,
438
+ "loss": 0.0705,
439
+ "step": 500
440
+ },
441
+ {
442
+ "epoch": 8.02295081967213,
443
+ "grad_norm": 0.7975661158561707,
444
+ "learning_rate": 9.107468123861566e-06,
445
+ "loss": 0.0547,
446
+ "step": 510
447
+ },
448
+ {
449
+ "epoch": 8.039344262295081,
450
+ "grad_norm": 0.035094812512397766,
451
+ "learning_rate": 8.196721311475409e-06,
452
+ "loss": 0.0484,
453
+ "step": 520
454
+ },
455
+ {
456
+ "epoch": 8.055737704918032,
457
+ "grad_norm": 4.1898651123046875,
458
+ "learning_rate": 7.2859744990892545e-06,
459
+ "loss": 0.0834,
460
+ "step": 530
461
+ },
462
+ {
463
+ "epoch": 8.072131147540984,
464
+ "grad_norm": 0.8504851460456848,
465
+ "learning_rate": 6.375227686703098e-06,
466
+ "loss": 0.1502,
467
+ "step": 540
468
+ },
469
+ {
470
+ "epoch": 8.088524590163935,
471
+ "grad_norm": 0.8113920092582703,
472
+ "learning_rate": 5.46448087431694e-06,
473
+ "loss": 0.0143,
474
+ "step": 550
475
+ },
476
+ {
477
+ "epoch": 8.101639344262296,
478
+ "eval_accuracy": 0.8672199170124482,
479
+ "eval_f1": 0.8666497215968316,
480
+ "eval_loss": 0.40985623002052307,
481
+ "eval_runtime": 79.8008,
482
+ "eval_samples_per_second": 3.02,
483
+ "eval_steps_per_second": 0.263,
484
+ "step": 558
485
+ },
486
+ {
487
+ "epoch": 9.00327868852459,
488
+ "grad_norm": 6.550829887390137,
489
+ "learning_rate": 4.553734061930783e-06,
490
+ "loss": 0.067,
491
+ "step": 560
492
+ },
493
+ {
494
+ "epoch": 9.01967213114754,
495
+ "grad_norm": 2.0722174644470215,
496
+ "learning_rate": 3.6429872495446273e-06,
497
+ "loss": 0.0158,
498
+ "step": 570
499
+ },
500
+ {
501
+ "epoch": 9.036065573770491,
502
+ "grad_norm": 0.09152109175920486,
503
+ "learning_rate": 2.73224043715847e-06,
504
+ "loss": 0.0203,
505
+ "step": 580
506
+ },
507
+ {
508
+ "epoch": 9.052459016393442,
509
+ "grad_norm": 0.12251006811857224,
510
+ "learning_rate": 1.8214936247723136e-06,
511
+ "loss": 0.0169,
512
+ "step": 590
513
+ },
514
+ {
515
+ "epoch": 9.068852459016393,
516
+ "grad_norm": 0.1367557793855667,
517
+ "learning_rate": 9.107468123861568e-07,
518
+ "loss": 0.0043,
519
+ "step": 600
520
+ },
521
+ {
522
+ "epoch": 9.085245901639345,
523
+ "grad_norm": 6.975312232971191,
524
+ "learning_rate": 0.0,
525
+ "loss": 0.1311,
526
+ "step": 610
527
+ },
528
+ {
529
+ "epoch": 9.085245901639345,
530
+ "eval_accuracy": 0.8755186721991701,
531
+ "eval_f1": 0.8751571381793353,
532
+ "eval_loss": 0.40564292669296265,
533
+ "eval_runtime": 76.4754,
534
+ "eval_samples_per_second": 3.151,
535
+ "eval_steps_per_second": 0.275,
536
+ "step": 610
537
+ },
538
+ {
539
+ "epoch": 9.085245901639345,
540
+ "step": 610,
541
+ "total_flos": 6.389823367042892e+18,
542
+ "train_loss": 0.24862837887933997,
543
+ "train_runtime": 3777.5727,
544
+ "train_samples_per_second": 1.938,
545
+ "train_steps_per_second": 0.161
546
+ },
547
+ {
548
+ "epoch": 9.085245901639345,
549
+ "eval_accuracy": 0.9824561403508771,
550
+ "eval_f1": 0.9824594539682324,
551
+ "eval_loss": 0.04523608461022377,
552
+ "eval_runtime": 250.7162,
553
+ "eval_samples_per_second": 2.956,
554
+ "eval_steps_per_second": 0.247,
555
+ "step": 610
556
+ },
557
+ {
558
+ "epoch": 9.085245901639345,
559
+ "eval_accuracy": 0.8337078651685393,
560
+ "eval_f1": 0.8322580739791249,
561
+ "eval_loss": 0.6123429536819458,
562
+ "eval_runtime": 140.7823,
563
+ "eval_samples_per_second": 3.161,
564
+ "eval_steps_per_second": 0.27,
565
+ "step": 610
566
+ },
567
+ {
568
+ "epoch": 9.085245901639345,
569
+ "eval_accuracy": 0.8838174273858921,
570
+ "eval_f1": 0.8838017754864972,
571
+ "eval_loss": 0.40273845195770264,
572
+ "eval_runtime": 76.8071,
573
+ "eval_samples_per_second": 3.138,
574
+ "eval_steps_per_second": 0.273,
575
+ "step": 610
576
  }
577
  ],
578
  "logging_steps": 10,
579
+ "max_steps": 610,
580
  "num_input_tokens_seen": 0,
581
  "num_train_epochs": 9223372036854775807,
582
  "save_steps": 500,
 
592
  "attributes": {}
593
  }
594
  },
595
+ "total_flos": 6.389823367042892e+18,
596
+ "train_batch_size": 12,
597
  "trial_name": null,
598
  "trial_params": null
599
  }
val_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 4.1891891891891895,
3
- "eval_accuracy": 0.8340248962655602,
4
- "eval_f1": 0.8346934110853106,
5
- "eval_loss": 0.47942548990249634,
6
- "eval_runtime": 73.4732,
7
- "eval_samples_per_second": 3.28,
8
- "eval_steps_per_second": 0.34
9
  }
 
1
  {
2
+ "epoch": 9.085245901639345,
3
+ "eval_accuracy": 0.8838174273858921,
4
+ "eval_f1": 0.8838017754864972,
5
+ "eval_loss": 0.40273845195770264,
6
+ "eval_runtime": 76.8071,
7
+ "eval_samples_per_second": 3.138,
8
+ "eval_steps_per_second": 0.273
9
  }