anh-dangminh committed on
Commit
dc94387
·
verified ·
1 Parent(s): 36e1704

End of training

Browse files
Files changed (5) hide show
  1. README.md +34 -25
  2. all_results.json +6 -11
  3. model.safetensors +1 -1
  4. train_results.json +6 -6
  5. trainer_state.json +310 -586
README.md CHANGED
@@ -8,6 +8,9 @@ datasets:
8
  - oxford102_flower_dataset
9
  metrics:
10
  - accuracy
 
 
 
11
  model-index:
12
  - name: resnet-50-finetuned-oxfordflowers
13
  results:
@@ -23,7 +26,16 @@ model-index:
23
  metrics:
24
  - name: Accuracy
25
  type: accuracy
26
- value: 0.85
 
 
 
 
 
 
 
 
 
27
  ---
28
 
29
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -33,8 +45,11 @@ should probably proofread and complete it, then remove this comment. -->
33
 
34
  This model is a fine-tuned version of [microsoft/resnet-50](https://huggingface.co/microsoft/resnet-50) on the oxford102_flower_dataset dataset.
35
  It achieves the following results on the evaluation set:
36
- - Loss: 0.5915
37
- - Accuracy: 0.85
 
 
 
38
 
39
  ## Model description
40
 
@@ -63,28 +78,22 @@ The following hyperparameters were used during training:
63
 
64
  ### Training results
65
 
66
- | Training Loss | Epoch | Step | Validation Loss | Accuracy |
67
- |:-------------:|:-----:|:----:|:---------------:|:--------:|
68
- | 4.5224 | 1.0 | 32 | 4.2939 | 0.25 |
69
- | 2.8139 | 2.0 | 64 | 2.1128 | 0.4892 |
70
- | 1.4505 | 3.0 | 96 | 1.2261 | 0.6843 |
71
- | 0.5751 | 4.0 | 128 | 1.0176 | 0.7441 |
72
- | 0.2265 | 5.0 | 160 | 0.8487 | 0.7559 |
73
- | 0.0531 | 6.0 | 192 | 0.7609 | 0.8 |
74
- | 0.0411 | 7.0 | 224 | 0.7191 | 0.8029 |
75
- | 0.0351 | 8.0 | 256 | 0.6987 | 0.8078 |
76
- | 0.0107 | 9.0 | 288 | 0.6843 | 0.8225 |
77
- | 0.0094 | 10.0 | 320 | 0.6314 | 0.8343 |
78
- | 0.0081 | 11.0 | 352 | 0.6320 | 0.8353 |
79
- | 0.0053 | 12.0 | 384 | 0.6049 | 0.8353 |
80
- | 0.0048 | 13.0 | 416 | 0.5961 | 0.8373 |
81
- | 0.0024 | 14.0 | 448 | 0.5880 | 0.8471 |
82
- | 0.0028 | 15.0 | 480 | 0.5927 | 0.8441 |
83
- | 0.0023 | 16.0 | 512 | 0.5878 | 0.8520 |
84
- | 0.0027 | 17.0 | 544 | 0.5872 | 0.8471 |
85
- | 0.0028 | 18.0 | 576 | 0.5892 | 0.8451 |
86
- | 0.002 | 19.0 | 608 | 0.5933 | 0.8412 |
87
- | 0.0017 | 20.0 | 640 | 0.5915 | 0.85 |
88
 
89
 
90
  ### Framework versions
 
8
  - oxford102_flower_dataset
9
  metrics:
10
  - accuracy
11
+ - precision
12
+ - recall
13
+ - f1
14
  model-index:
15
  - name: resnet-50-finetuned-oxfordflowers
16
  results:
 
26
  metrics:
27
  - name: Accuracy
28
  type: accuracy
29
+ value: 0.8284273865669215
30
+ - name: Precision
31
+ type: precision
32
+ value: 0.8492938596426545
33
+ - name: Recall
34
+ type: recall
35
+ value: 0.8284273865669215
36
+ - name: F1
37
+ type: f1
38
+ value: 0.8283468243702176
39
  ---
40
 
41
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
45
 
46
  This model is a fine-tuned version of [microsoft/resnet-50](https://huggingface.co/microsoft/resnet-50) on the oxford102_flower_dataset dataset.
47
  It achieves the following results on the evaluation set:
48
+ - Loss: 0.6286
49
+ - Accuracy: 0.8284
50
+ - Precision: 0.8493
51
+ - Recall: 0.8284
52
+ - F1: 0.8283
53
 
54
  ## Model description
55
 
 
78
 
79
  ### Training results
80
 
81
+ | Training Loss | Epoch | Step | Validation Loss | Accuracy | Precision | Recall | F1 |
82
+ |:-------------:|:-----:|:----:|:---------------:|:--------:|:---------:|:------:|:------:|
83
+ | 4.5237 | 1.0 | 32 | 4.3400 | 0.3451 | 0.4129 | 0.3451 | 0.2875 |
84
+ | 2.8508 | 2.0 | 64 | 1.9785 | 0.5206 | 0.5610 | 0.5206 | 0.4816 |
85
+ | 1.346 | 3.0 | 96 | 1.1449 | 0.7088 | 0.7738 | 0.7088 | 0.6957 |
86
+ | 0.5544 | 4.0 | 128 | 0.9265 | 0.7539 | 0.8162 | 0.7539 | 0.7480 |
87
+ | 0.1847 | 5.0 | 160 | 0.7754 | 0.8029 | 0.8324 | 0.8029 | 0.7997 |
88
+ | 0.0863 | 6.0 | 192 | 0.7393 | 0.8020 | 0.8450 | 0.8020 | 0.8000 |
89
+ | 0.0516 | 7.0 | 224 | 0.6631 | 0.8284 | 0.8569 | 0.8284 | 0.8259 |
90
+ | 0.023 | 8.0 | 256 | 0.5880 | 0.8471 | 0.8631 | 0.8471 | 0.8429 |
91
+ | 0.011 | 9.0 | 288 | 0.5422 | 0.8569 | 0.8686 | 0.8569 | 0.8520 |
92
+ | 0.0079 | 10.0 | 320 | 0.5335 | 0.8510 | 0.8637 | 0.8510 | 0.8470 |
93
+ | 0.0072 | 11.0 | 352 | 0.5107 | 0.8647 | 0.8735 | 0.8647 | 0.8605 |
94
+ | 0.0086 | 12.0 | 384 | 0.5290 | 0.8578 | 0.8720 | 0.8578 | 0.8548 |
95
+ | 0.0058 | 13.0 | 416 | 0.5161 | 0.8569 | 0.8658 | 0.8569 | 0.8523 |
96
+ | 0.0027 | 14.0 | 448 | 0.5139 | 0.8588 | 0.8702 | 0.8588 | 0.8538 |
 
 
 
 
 
 
97
 
98
 
99
  ### Framework versions
all_results.json CHANGED
@@ -1,13 +1,8 @@
1
  {
2
- "epoch": 20.0,
3
- "eval_accuracy": 0.7801268498942917,
4
- "eval_loss": 0.8647737503051758,
5
- "eval_runtime": 123.7654,
6
- "eval_samples_per_second": 49.683,
7
- "eval_steps_per_second": 0.396,
8
- "total_flos": 4.36977436041216e+17,
9
- "train_loss": 0.5368185924002319,
10
- "train_runtime": 902.6693,
11
- "train_samples_per_second": 22.6,
12
- "train_steps_per_second": 0.709
13
  }
 
1
  {
2
+ "epoch": 14.0,
3
+ "total_flos": 3.058842052288512e+17,
4
+ "train_loss": 0.7544229235707982,
5
+ "train_runtime": 674.4917,
6
+ "train_samples_per_second": 30.245,
7
+ "train_steps_per_second": 0.949
 
 
 
 
 
8
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e9f6515139de92486682b8ab4b26eaa75f616f163f3c2214015877cb8aa5c5dc
3
  size 95122680
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6fda6a9820f2992de0ff984ceec1a98f0df2dd245077d01a5db17936e7e095e2
3
  size 95122680
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 20.0,
3
- "total_flos": 4.36977436041216e+17,
4
- "train_loss": 0.5368185924002319,
5
- "train_runtime": 902.6693,
6
- "train_samples_per_second": 22.6,
7
- "train_steps_per_second": 0.709
8
  }
 
1
  {
2
+ "epoch": 14.0,
3
+ "total_flos": 3.058842052288512e+17,
4
+ "train_loss": 0.7544229235707982,
5
+ "train_runtime": 674.4917,
6
+ "train_samples_per_second": 30.245,
7
+ "train_steps_per_second": 0.949
8
  }
trainer_state.json CHANGED
@@ -1,1097 +1,812 @@
1
  {
2
- "best_metric": 0.8519607843137255,
3
- "best_model_checkpoint": "resnet-50-finetuned-oxfordflowers/checkpoint-512",
4
- "epoch": 20.0,
5
  "eval_steps": 500,
6
- "global_step": 640,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.15625,
13
- "grad_norm": 1.6800851821899414,
14
  "learning_rate": 0.0009921875,
15
- "loss": 4.6507,
16
  "step": 5
17
  },
18
  {
19
  "epoch": 0.3125,
20
- "grad_norm": 1.4043323993682861,
21
  "learning_rate": 0.000984375,
22
- "loss": 4.6146,
23
  "step": 10
24
  },
25
  {
26
  "epoch": 0.46875,
27
- "grad_norm": 1.4668281078338623,
28
  "learning_rate": 0.0009765625,
29
- "loss": 4.6484,
30
  "step": 15
31
  },
32
  {
33
  "epoch": 0.625,
34
- "grad_norm": 1.4262796640396118,
35
  "learning_rate": 0.00096875,
36
- "loss": 4.6292,
37
  "step": 20
38
  },
39
  {
40
  "epoch": 0.78125,
41
- "grad_norm": 1.1266566514968872,
42
  "learning_rate": 0.0009609375,
43
- "loss": 4.5702,
44
  "step": 25
45
  },
46
  {
47
  "epoch": 0.9375,
48
- "grad_norm": 1.0406345129013062,
49
  "learning_rate": 0.000953125,
50
- "loss": 4.5224,
51
  "step": 30
52
  },
53
  {
54
  "epoch": 1.0,
55
- "eval_accuracy": 0.25,
56
- "eval_loss": 4.293937683105469,
57
- "eval_runtime": 15.9684,
58
- "eval_samples_per_second": 63.876,
59
- "eval_steps_per_second": 2.004,
 
 
 
60
  "step": 32
61
  },
62
  {
63
  "epoch": 1.09375,
64
- "grad_norm": 2.009798765182495,
65
  "learning_rate": 0.0009453125,
66
- "loss": 4.2571,
67
  "step": 35
68
  },
69
  {
70
  "epoch": 1.25,
71
- "grad_norm": 1.697478175163269,
72
  "learning_rate": 0.0009375,
73
- "loss": 3.9421,
74
  "step": 40
75
  },
76
  {
77
  "epoch": 1.40625,
78
- "grad_norm": 2.285863161087036,
79
  "learning_rate": 0.0009296875000000001,
80
- "loss": 3.6971,
81
  "step": 45
82
  },
83
  {
84
  "epoch": 1.5625,
85
- "grad_norm": 2.211660623550415,
86
  "learning_rate": 0.0009218750000000001,
87
- "loss": 3.3611,
88
  "step": 50
89
  },
90
  {
91
  "epoch": 1.71875,
92
- "grad_norm": 2.331829786300659,
93
  "learning_rate": 0.0009140625,
94
- "loss": 3.1108,
95
  "step": 55
96
  },
97
  {
98
  "epoch": 1.875,
99
- "grad_norm": 2.799659013748169,
100
  "learning_rate": 0.00090625,
101
- "loss": 2.8139,
102
  "step": 60
103
  },
104
  {
105
  "epoch": 2.0,
106
- "eval_accuracy": 0.4892156862745098,
107
- "eval_loss": 2.112804651260376,
108
- "eval_runtime": 16.5487,
109
- "eval_samples_per_second": 61.636,
110
- "eval_steps_per_second": 1.934,
 
 
 
111
  "step": 64
112
  },
113
  {
114
  "epoch": 2.03125,
115
- "grad_norm": 2.7438573837280273,
116
  "learning_rate": 0.0008984375,
117
- "loss": 2.6103,
118
  "step": 65
119
  },
120
  {
121
  "epoch": 2.1875,
122
- "grad_norm": 2.3204867839813232,
123
  "learning_rate": 0.000890625,
124
- "loss": 2.0599,
125
  "step": 70
126
  },
127
  {
128
  "epoch": 2.34375,
129
- "grad_norm": 2.3990378379821777,
130
  "learning_rate": 0.0008828125,
131
- "loss": 1.7052,
132
  "step": 75
133
  },
134
  {
135
  "epoch": 2.5,
136
- "grad_norm": 3.4195637702941895,
137
  "learning_rate": 0.000875,
138
- "loss": 1.5619,
139
  "step": 80
140
  },
141
  {
142
  "epoch": 2.65625,
143
- "grad_norm": 2.6798551082611084,
144
  "learning_rate": 0.0008671875,
145
- "loss": 1.4689,
146
  "step": 85
147
  },
148
  {
149
  "epoch": 2.8125,
150
- "grad_norm": 3.0105719566345215,
151
  "learning_rate": 0.000859375,
152
- "loss": 1.4125,
153
  "step": 90
154
  },
155
  {
156
  "epoch": 2.96875,
157
- "grad_norm": 3.218193531036377,
158
  "learning_rate": 0.0008515625,
159
- "loss": 1.4505,
160
  "step": 95
161
  },
162
  {
163
  "epoch": 3.0,
164
- "eval_accuracy": 0.6843137254901961,
165
- "eval_loss": 1.226142406463623,
166
- "eval_runtime": 16.601,
167
- "eval_samples_per_second": 61.442,
168
- "eval_steps_per_second": 1.928,
 
 
 
169
  "step": 96
170
  },
171
  {
172
  "epoch": 3.125,
173
- "grad_norm": 2.6317319869995117,
174
  "learning_rate": 0.00084375,
175
- "loss": 0.7778,
176
  "step": 100
177
  },
178
  {
179
  "epoch": 3.28125,
180
- "grad_norm": 2.3407766819000244,
181
  "learning_rate": 0.0008359375,
182
- "loss": 0.8178,
183
  "step": 105
184
  },
185
  {
186
  "epoch": 3.4375,
187
- "grad_norm": 2.060016632080078,
188
  "learning_rate": 0.000828125,
189
- "loss": 0.7545,
190
  "step": 110
191
  },
192
  {
193
  "epoch": 3.59375,
194
- "grad_norm": 2.2562413215637207,
195
  "learning_rate": 0.0008203125,
196
- "loss": 0.6023,
197
  "step": 115
198
  },
199
  {
200
  "epoch": 3.75,
201
- "grad_norm": 2.7784945964813232,
202
  "learning_rate": 0.0008125000000000001,
203
- "loss": 0.5268,
204
  "step": 120
205
  },
206
  {
207
  "epoch": 3.90625,
208
- "grad_norm": 2.47145676612854,
209
  "learning_rate": 0.0008046875000000001,
210
- "loss": 0.5751,
211
  "step": 125
212
  },
213
  {
214
  "epoch": 4.0,
215
- "eval_accuracy": 0.7441176470588236,
216
- "eval_loss": 1.0175817012786865,
217
- "eval_runtime": 18.5424,
218
- "eval_samples_per_second": 55.009,
219
- "eval_steps_per_second": 1.726,
 
 
 
220
  "step": 128
221
  },
222
  {
223
  "epoch": 4.0625,
224
- "grad_norm": 1.2707927227020264,
225
  "learning_rate": 0.0007968750000000001,
226
- "loss": 0.41,
227
  "step": 130
228
  },
229
  {
230
  "epoch": 4.21875,
231
- "grad_norm": 2.2418272495269775,
232
  "learning_rate": 0.0007890625,
233
- "loss": 0.2771,
234
  "step": 135
235
  },
236
  {
237
  "epoch": 4.375,
238
- "grad_norm": 1.0117669105529785,
239
  "learning_rate": 0.00078125,
240
- "loss": 0.2848,
241
  "step": 140
242
  },
243
  {
244
  "epoch": 4.53125,
245
- "grad_norm": 1.5163785219192505,
246
  "learning_rate": 0.0007734375,
247
- "loss": 0.2322,
248
  "step": 145
249
  },
250
  {
251
  "epoch": 4.6875,
252
- "grad_norm": 1.693102478981018,
253
  "learning_rate": 0.000765625,
254
- "loss": 0.292,
255
  "step": 150
256
  },
257
  {
258
  "epoch": 4.84375,
259
- "grad_norm": 1.6366838216781616,
260
  "learning_rate": 0.0007578125,
261
- "loss": 0.2391,
262
  "step": 155
263
  },
264
  {
265
  "epoch": 5.0,
266
- "grad_norm": 1.1743065118789673,
267
  "learning_rate": 0.00075,
268
- "loss": 0.2265,
269
  "step": 160
270
  },
271
  {
272
  "epoch": 5.0,
273
- "eval_accuracy": 0.7558823529411764,
274
- "eval_loss": 0.8487027287483215,
275
- "eval_runtime": 20.2945,
276
- "eval_samples_per_second": 50.26,
277
- "eval_steps_per_second": 1.577,
 
 
 
278
  "step": 160
279
  },
280
  {
281
  "epoch": 5.15625,
282
- "grad_norm": 0.5249314308166504,
283
  "learning_rate": 0.0007421875,
284
- "loss": 0.1254,
285
  "step": 165
286
  },
287
  {
288
  "epoch": 5.3125,
289
- "grad_norm": 0.41112297773361206,
290
  "learning_rate": 0.000734375,
291
- "loss": 0.0917,
292
  "step": 170
293
  },
294
  {
295
  "epoch": 5.46875,
296
- "grad_norm": 1.9200881719589233,
297
  "learning_rate": 0.0007265625,
298
- "loss": 0.1139,
299
  "step": 175
300
  },
301
  {
302
  "epoch": 5.625,
303
- "grad_norm": 0.7506140470504761,
304
  "learning_rate": 0.00071875,
305
- "loss": 0.116,
306
  "step": 180
307
  },
308
  {
309
  "epoch": 5.78125,
310
- "grad_norm": 1.2240333557128906,
311
  "learning_rate": 0.0007109375,
312
- "loss": 0.1251,
313
  "step": 185
314
  },
315
  {
316
  "epoch": 5.9375,
317
- "grad_norm": 1.3143774271011353,
318
  "learning_rate": 0.000703125,
319
- "loss": 0.0531,
320
  "step": 190
321
  },
322
  {
323
  "epoch": 6.0,
324
- "eval_accuracy": 0.8,
325
- "eval_loss": 0.7608510255813599,
326
- "eval_runtime": 24.6873,
327
- "eval_samples_per_second": 41.317,
328
- "eval_steps_per_second": 1.296,
 
 
 
329
  "step": 192
330
  },
331
  {
332
  "epoch": 6.09375,
333
- "grad_norm": 1.9283502101898193,
334
  "learning_rate": 0.0006953125,
335
- "loss": 0.1274,
336
  "step": 195
337
  },
338
  {
339
  "epoch": 6.25,
340
- "grad_norm": 0.5062114000320435,
341
  "learning_rate": 0.0006875,
342
- "loss": 0.0358,
343
  "step": 200
344
  },
345
  {
346
  "epoch": 6.40625,
347
- "grad_norm": 1.057132601737976,
348
  "learning_rate": 0.0006796875000000001,
349
- "loss": 0.0426,
350
  "step": 205
351
  },
352
  {
353
  "epoch": 6.5625,
354
- "grad_norm": 0.2724122107028961,
355
  "learning_rate": 0.0006718750000000001,
356
- "loss": 0.0668,
357
  "step": 210
358
  },
359
  {
360
  "epoch": 6.71875,
361
- "grad_norm": 0.3335299789905548,
362
  "learning_rate": 0.0006640625,
363
- "loss": 0.0838,
364
  "step": 215
365
  },
366
  {
367
  "epoch": 6.875,
368
- "grad_norm": 0.5840352177619934,
369
  "learning_rate": 0.00065625,
370
- "loss": 0.0411,
371
  "step": 220
372
  },
373
  {
374
  "epoch": 7.0,
375
- "eval_accuracy": 0.8029411764705883,
376
- "eval_loss": 0.7190886735916138,
377
- "eval_runtime": 18.919,
378
- "eval_samples_per_second": 53.914,
379
- "eval_steps_per_second": 1.691,
 
 
 
380
  "step": 224
381
  },
382
  {
383
  "epoch": 7.03125,
384
- "grad_norm": 0.6974908709526062,
385
  "learning_rate": 0.0006484375,
386
- "loss": 0.0412,
387
  "step": 225
388
  },
389
  {
390
  "epoch": 7.1875,
391
- "grad_norm": 0.27331459522247314,
392
  "learning_rate": 0.000640625,
393
- "loss": 0.0238,
394
  "step": 230
395
  },
396
  {
397
  "epoch": 7.34375,
398
- "grad_norm": 0.26315683126449585,
399
  "learning_rate": 0.0006328125,
400
- "loss": 0.0181,
401
  "step": 235
402
  },
403
  {
404
  "epoch": 7.5,
405
- "grad_norm": 0.979246199131012,
406
  "learning_rate": 0.000625,
407
- "loss": 0.0368,
408
  "step": 240
409
  },
410
  {
411
  "epoch": 7.65625,
412
- "grad_norm": 0.18979792296886444,
413
  "learning_rate": 0.0006171875,
414
- "loss": 0.0293,
415
  "step": 245
416
  },
417
  {
418
  "epoch": 7.8125,
419
- "grad_norm": 2.098189115524292,
420
  "learning_rate": 0.000609375,
421
- "loss": 0.0263,
422
  "step": 250
423
  },
424
  {
425
  "epoch": 7.96875,
426
- "grad_norm": 0.20951713621616364,
427
  "learning_rate": 0.0006015625,
428
- "loss": 0.0351,
429
  "step": 255
430
  },
431
  {
432
  "epoch": 8.0,
433
- "eval_accuracy": 0.807843137254902,
434
- "eval_loss": 0.698701798915863,
435
- "eval_runtime": 20.5747,
436
- "eval_samples_per_second": 49.575,
437
- "eval_steps_per_second": 1.555,
 
 
 
438
  "step": 256
439
  },
440
  {
441
  "epoch": 8.125,
442
- "grad_norm": 0.08398638665676117,
443
  "learning_rate": 0.00059375,
444
- "loss": 0.0138,
445
  "step": 260
446
  },
447
  {
448
  "epoch": 8.28125,
449
- "grad_norm": 0.94996577501297,
450
  "learning_rate": 0.0005859375,
451
- "loss": 0.0176,
452
  "step": 265
453
  },
454
  {
455
  "epoch": 8.4375,
456
- "grad_norm": 0.14498768746852875,
457
  "learning_rate": 0.000578125,
458
- "loss": 0.0149,
459
  "step": 270
460
  },
461
  {
462
  "epoch": 8.59375,
463
- "grad_norm": 0.1302383691072464,
464
  "learning_rate": 0.0005703125,
465
- "loss": 0.0146,
466
  "step": 275
467
  },
468
  {
469
  "epoch": 8.75,
470
- "grad_norm": 0.3484581708908081,
471
  "learning_rate": 0.0005625000000000001,
472
- "loss": 0.0183,
473
  "step": 280
474
  },
475
  {
476
  "epoch": 8.90625,
477
- "grad_norm": 0.1543685644865036,
478
  "learning_rate": 0.0005546875000000001,
479
- "loss": 0.0107,
480
  "step": 285
481
  },
482
  {
483
  "epoch": 9.0,
484
- "eval_accuracy": 0.8225490196078431,
485
- "eval_loss": 0.6843494176864624,
486
- "eval_runtime": 16.0725,
487
- "eval_samples_per_second": 63.462,
488
- "eval_steps_per_second": 1.991,
 
 
 
489
  "step": 288
490
  },
491
  {
492
  "epoch": 9.0625,
493
- "grad_norm": 0.9732298851013184,
494
  "learning_rate": 0.000546875,
495
- "loss": 0.0156,
496
  "step": 290
497
  },
498
  {
499
  "epoch": 9.21875,
500
- "grad_norm": 0.09730440378189087,
501
  "learning_rate": 0.0005390625,
502
- "loss": 0.0114,
503
  "step": 295
504
  },
505
  {
506
  "epoch": 9.375,
507
- "grad_norm": 0.41419529914855957,
508
  "learning_rate": 0.00053125,
509
- "loss": 0.0101,
510
  "step": 300
511
  },
512
  {
513
  "epoch": 9.53125,
514
- "grad_norm": 0.055323634296655655,
515
  "learning_rate": 0.0005234375,
516
- "loss": 0.0074,
517
  "step": 305
518
  },
519
  {
520
  "epoch": 9.6875,
521
- "grad_norm": 0.07538346946239471,
522
  "learning_rate": 0.000515625,
523
- "loss": 0.0051,
524
  "step": 310
525
  },
526
  {
527
  "epoch": 9.84375,
528
- "grad_norm": 0.037017084658145905,
529
  "learning_rate": 0.0005078125,
530
- "loss": 0.0133,
531
  "step": 315
532
  },
533
  {
534
  "epoch": 10.0,
535
- "grad_norm": 0.05076463520526886,
536
  "learning_rate": 0.0005,
537
- "loss": 0.0094,
538
  "step": 320
539
  },
540
  {
541
  "epoch": 10.0,
542
- "eval_accuracy": 0.8343137254901961,
543
- "eval_loss": 0.6314178109169006,
544
- "eval_runtime": 16.9633,
545
- "eval_samples_per_second": 60.13,
546
- "eval_steps_per_second": 1.886,
 
 
 
547
  "step": 320
548
  },
549
  {
550
  "epoch": 10.15625,
551
- "grad_norm": 0.04146264120936394,
552
  "learning_rate": 0.0004921875,
553
  "loss": 0.0045,
554
  "step": 325
555
  },
556
  {
557
  "epoch": 10.3125,
558
- "grad_norm": 1.678152084350586,
559
  "learning_rate": 0.000484375,
560
- "loss": 0.0111,
561
  "step": 330
562
  },
563
  {
564
  "epoch": 10.46875,
565
- "grad_norm": 0.08414560556411743,
566
  "learning_rate": 0.0004765625,
567
- "loss": 0.004,
568
  "step": 335
569
  },
570
  {
571
  "epoch": 10.625,
572
- "grad_norm": 0.062152933329343796,
573
  "learning_rate": 0.00046875,
574
- "loss": 0.0058,
575
  "step": 340
576
  },
577
  {
578
  "epoch": 10.78125,
579
- "grad_norm": 0.18813878297805786,
580
  "learning_rate": 0.00046093750000000003,
581
- "loss": 0.0059,
582
  "step": 345
583
  },
584
  {
585
  "epoch": 10.9375,
586
- "grad_norm": 0.03264420107007027,
587
  "learning_rate": 0.000453125,
588
- "loss": 0.0081,
589
  "step": 350
590
  },
591
  {
592
  "epoch": 11.0,
593
- "eval_accuracy": 0.8352941176470589,
594
- "eval_loss": 0.6319591999053955,
595
- "eval_runtime": 16.0721,
596
- "eval_samples_per_second": 63.464,
597
- "eval_steps_per_second": 1.991,
 
 
 
598
  "step": 352
599
  },
600
  {
601
  "epoch": 11.09375,
602
- "grad_norm": 0.01650502346456051,
603
  "learning_rate": 0.0004453125,
604
- "loss": 0.0058,
605
  "step": 355
606
  },
607
  {
608
  "epoch": 11.25,
609
- "grad_norm": 0.03100210428237915,
610
  "learning_rate": 0.0004375,
611
- "loss": 0.0032,
612
  "step": 360
613
  },
614
  {
615
  "epoch": 11.40625,
616
- "grad_norm": 0.31530651450157166,
617
  "learning_rate": 0.0004296875,
618
- "loss": 0.0055,
619
  "step": 365
620
  },
621
  {
622
  "epoch": 11.5625,
623
- "grad_norm": 0.018279677256941795,
624
  "learning_rate": 0.000421875,
625
- "loss": 0.0042,
626
  "step": 370
627
  },
628
  {
629
  "epoch": 11.71875,
630
- "grad_norm": 0.039065517485141754,
631
  "learning_rate": 0.0004140625,
632
- "loss": 0.004,
633
  "step": 375
634
  },
635
  {
636
  "epoch": 11.875,
637
- "grad_norm": 0.17956194281578064,
638
  "learning_rate": 0.00040625000000000004,
639
- "loss": 0.0053,
640
  "step": 380
641
  },
642
  {
643
  "epoch": 12.0,
644
- "eval_accuracy": 0.8352941176470589,
645
- "eval_loss": 0.6048569679260254,
646
- "eval_runtime": 16.0302,
647
- "eval_samples_per_second": 63.63,
648
- "eval_steps_per_second": 1.996,
 
 
 
649
  "step": 384
650
  },
651
  {
652
  "epoch": 12.03125,
653
- "grad_norm": 0.0491081103682518,
654
  "learning_rate": 0.00039843750000000003,
655
- "loss": 0.004,
656
  "step": 385
657
  },
658
  {
659
  "epoch": 12.1875,
660
- "grad_norm": 0.06726662814617157,
661
  "learning_rate": 0.000390625,
662
- "loss": 0.0032,
663
  "step": 390
664
  },
665
  {
666
  "epoch": 12.34375,
667
- "grad_norm": 0.0226299911737442,
668
  "learning_rate": 0.0003828125,
669
- "loss": 0.0027,
670
  "step": 395
671
  },
672
  {
673
  "epoch": 12.5,
674
- "grad_norm": 0.021714534610509872,
675
  "learning_rate": 0.000375,
676
- "loss": 0.0029,
677
  "step": 400
678
  },
679
  {
680
  "epoch": 12.65625,
681
- "grad_norm": 0.07769683748483658,
682
  "learning_rate": 0.0003671875,
683
- "loss": 0.0034,
684
  "step": 405
685
  },
686
  {
687
  "epoch": 12.8125,
688
- "grad_norm": 0.017162494361400604,
689
  "learning_rate": 0.000359375,
690
- "loss": 0.0029,
691
  "step": 410
692
  },
693
  {
694
  "epoch": 12.96875,
695
- "grad_norm": 0.08164256066083908,
696
  "learning_rate": 0.0003515625,
697
- "loss": 0.0048,
698
  "step": 415
699
  },
700
  {
701
  "epoch": 13.0,
702
- "eval_accuracy": 0.8372549019607843,
703
- "eval_loss": 0.5961340665817261,
704
- "eval_runtime": 18.3515,
705
- "eval_samples_per_second": 55.581,
706
- "eval_steps_per_second": 1.744,
 
 
 
707
  "step": 416
708
  },
709
  {
710
  "epoch": 13.125,
711
- "grad_norm": 0.05423242226243019,
712
  "learning_rate": 0.00034375,
713
- "loss": 0.0024,
714
  "step": 420
715
  },
716
  {
717
  "epoch": 13.28125,
718
- "grad_norm": 0.19008223712444305,
719
  "learning_rate": 0.00033593750000000003,
720
- "loss": 0.0083,
721
  "step": 425
722
  },
723
  {
724
  "epoch": 13.4375,
725
- "grad_norm": 0.0373542457818985,
726
  "learning_rate": 0.000328125,
727
- "loss": 0.002,
728
  "step": 430
729
  },
730
  {
731
  "epoch": 13.59375,
732
- "grad_norm": 0.014899961650371552,
733
  "learning_rate": 0.0003203125,
734
- "loss": 0.0029,
735
  "step": 435
736
  },
737
  {
738
  "epoch": 13.75,
739
- "grad_norm": 0.03342936560511589,
740
  "learning_rate": 0.0003125,
741
- "loss": 0.0031,
742
  "step": 440
743
  },
744
  {
745
  "epoch": 13.90625,
746
- "grad_norm": 0.018663976341485977,
747
  "learning_rate": 0.0003046875,
748
- "loss": 0.0024,
749
  "step": 445
750
  },
751
  {
752
  "epoch": 14.0,
753
- "eval_accuracy": 0.8470588235294118,
754
- "eval_loss": 0.588026225566864,
755
- "eval_runtime": 16.6123,
756
- "eval_samples_per_second": 61.4,
757
- "eval_steps_per_second": 1.926,
 
 
 
758
  "step": 448
759
  },
760
  {
761
- "epoch": 14.0625,
762
- "grad_norm": 0.03626991808414459,
763
- "learning_rate": 0.000296875,
764
- "loss": 0.004,
765
- "step": 450
766
- },
767
- {
768
- "epoch": 14.21875,
769
- "grad_norm": 0.021257249638438225,
770
- "learning_rate": 0.0002890625,
771
- "loss": 0.0026,
772
- "step": 455
773
- },
774
- {
775
- "epoch": 14.375,
776
- "grad_norm": 0.032649360597133636,
777
- "learning_rate": 0.00028125000000000003,
778
- "loss": 0.002,
779
- "step": 460
780
- },
781
- {
782
- "epoch": 14.53125,
783
- "grad_norm": 0.022741030901670456,
784
- "learning_rate": 0.0002734375,
785
- "loss": 0.007,
786
- "step": 465
787
- },
788
- {
789
- "epoch": 14.6875,
790
- "grad_norm": 0.020442800596356392,
791
- "learning_rate": 0.000265625,
792
- "loss": 0.0023,
793
- "step": 470
794
- },
795
- {
796
- "epoch": 14.84375,
797
- "grad_norm": 0.022834857925772667,
798
- "learning_rate": 0.0002578125,
799
- "loss": 0.0031,
800
- "step": 475
801
- },
802
- {
803
- "epoch": 15.0,
804
- "grad_norm": 0.014007111079990864,
805
- "learning_rate": 0.00025,
806
- "loss": 0.0028,
807
- "step": 480
808
- },
809
- {
810
- "epoch": 15.0,
811
- "eval_accuracy": 0.8441176470588235,
812
- "eval_loss": 0.5926622748374939,
813
- "eval_runtime": 16.2356,
814
- "eval_samples_per_second": 62.825,
815
- "eval_steps_per_second": 1.971,
816
- "step": 480
817
- },
818
- {
819
- "epoch": 15.15625,
820
- "grad_norm": 0.00949984509497881,
821
- "learning_rate": 0.0002421875,
822
- "loss": 0.0023,
823
- "step": 485
824
- },
825
- {
826
- "epoch": 15.3125,
827
- "grad_norm": 0.04143200442194939,
828
- "learning_rate": 0.000234375,
829
- "loss": 0.0021,
830
- "step": 490
831
- },
832
- {
833
- "epoch": 15.46875,
834
- "grad_norm": 0.012401225045323372,
835
- "learning_rate": 0.0002265625,
836
- "loss": 0.0021,
837
- "step": 495
838
- },
839
- {
840
- "epoch": 15.625,
841
- "grad_norm": 0.040582917630672455,
842
- "learning_rate": 0.00021875,
843
- "loss": 0.0031,
844
- "step": 500
845
- },
846
- {
847
- "epoch": 15.78125,
848
- "grad_norm": 0.025907032191753387,
849
- "learning_rate": 0.0002109375,
850
- "loss": 0.0021,
851
- "step": 505
852
- },
853
- {
854
- "epoch": 15.9375,
855
- "grad_norm": 0.008175536058843136,
856
- "learning_rate": 0.00020312500000000002,
857
- "loss": 0.0023,
858
- "step": 510
859
- },
860
- {
861
- "epoch": 16.0,
862
- "eval_accuracy": 0.8519607843137255,
863
- "eval_loss": 0.5878445506095886,
864
- "eval_runtime": 16.1518,
865
- "eval_samples_per_second": 63.151,
866
- "eval_steps_per_second": 1.981,
867
- "step": 512
868
- },
869
- {
870
- "epoch": 16.09375,
871
- "grad_norm": 0.3129185140132904,
872
- "learning_rate": 0.0001953125,
873
- "loss": 0.0044,
874
- "step": 515
875
- },
876
- {
877
- "epoch": 16.25,
878
- "grad_norm": 0.030808325856924057,
879
- "learning_rate": 0.0001875,
880
- "loss": 0.0036,
881
- "step": 520
882
- },
883
- {
884
- "epoch": 16.40625,
885
- "grad_norm": 0.019886957481503487,
886
- "learning_rate": 0.0001796875,
887
- "loss": 0.0026,
888
- "step": 525
889
- },
890
- {
891
- "epoch": 16.5625,
892
- "grad_norm": 0.019268082454800606,
893
- "learning_rate": 0.000171875,
894
- "loss": 0.0034,
895
- "step": 530
896
- },
897
- {
898
- "epoch": 16.71875,
899
- "grad_norm": 0.025241246446967125,
900
- "learning_rate": 0.0001640625,
901
- "loss": 0.0019,
902
- "step": 535
903
- },
904
- {
905
- "epoch": 16.875,
906
- "grad_norm": 0.01479440089315176,
907
- "learning_rate": 0.00015625,
908
- "loss": 0.0027,
909
- "step": 540
910
- },
911
- {
912
- "epoch": 17.0,
913
- "eval_accuracy": 0.8470588235294118,
914
- "eval_loss": 0.5872153043746948,
915
- "eval_runtime": 16.056,
916
- "eval_samples_per_second": 63.528,
917
- "eval_steps_per_second": 1.993,
918
- "step": 544
919
- },
920
- {
921
- "epoch": 17.03125,
922
- "grad_norm": 0.01011387724429369,
923
- "learning_rate": 0.0001484375,
924
- "loss": 0.0019,
925
- "step": 545
926
- },
927
- {
928
- "epoch": 17.1875,
929
- "grad_norm": 0.020896941423416138,
930
- "learning_rate": 0.00014062500000000002,
931
- "loss": 0.0022,
932
- "step": 550
933
- },
934
- {
935
- "epoch": 17.34375,
936
- "grad_norm": 0.040105391293764114,
937
- "learning_rate": 0.0001328125,
938
- "loss": 0.002,
939
- "step": 555
940
- },
941
- {
942
- "epoch": 17.5,
943
- "grad_norm": 0.016236811876296997,
944
- "learning_rate": 0.000125,
945
- "loss": 0.0024,
946
- "step": 560
947
- },
948
- {
949
- "epoch": 17.65625,
950
- "grad_norm": 0.010203810408711433,
951
- "learning_rate": 0.0001171875,
952
- "loss": 0.002,
953
- "step": 565
954
- },
955
- {
956
- "epoch": 17.8125,
957
- "grad_norm": 0.01675267145037651,
958
- "learning_rate": 0.000109375,
959
- "loss": 0.0019,
960
- "step": 570
961
- },
962
- {
963
- "epoch": 17.96875,
964
- "grad_norm": 0.08755680918693542,
965
- "learning_rate": 0.00010156250000000001,
966
- "loss": 0.0028,
967
- "step": 575
968
- },
969
- {
970
- "epoch": 18.0,
971
- "eval_accuracy": 0.8450980392156863,
972
- "eval_loss": 0.5891793966293335,
973
- "eval_runtime": 15.883,
974
- "eval_samples_per_second": 64.22,
975
- "eval_steps_per_second": 2.015,
976
- "step": 576
977
- },
978
- {
979
- "epoch": 18.125,
980
- "grad_norm": 0.060470979660749435,
981
- "learning_rate": 9.375e-05,
982
- "loss": 0.003,
983
- "step": 580
984
- },
985
- {
986
- "epoch": 18.28125,
987
- "grad_norm": 0.02452988736331463,
988
- "learning_rate": 8.59375e-05,
989
- "loss": 0.0017,
990
- "step": 585
991
- },
992
- {
993
- "epoch": 18.4375,
994
- "grad_norm": 0.02058909274637699,
995
- "learning_rate": 7.8125e-05,
996
- "loss": 0.002,
997
- "step": 590
998
- },
999
- {
1000
- "epoch": 18.59375,
1001
- "grad_norm": 0.01303939614444971,
1002
- "learning_rate": 7.031250000000001e-05,
1003
- "loss": 0.0025,
1004
- "step": 595
1005
- },
1006
- {
1007
- "epoch": 18.75,
1008
- "grad_norm": 0.006279917433857918,
1009
- "learning_rate": 6.25e-05,
1010
- "loss": 0.0027,
1011
- "step": 600
1012
- },
1013
- {
1014
- "epoch": 18.90625,
1015
- "grad_norm": 0.022672630846500397,
1016
- "learning_rate": 5.46875e-05,
1017
- "loss": 0.002,
1018
- "step": 605
1019
- },
1020
- {
1021
- "epoch": 19.0,
1022
- "eval_accuracy": 0.8411764705882353,
1023
- "eval_loss": 0.5932831764221191,
1024
- "eval_runtime": 16.4628,
1025
- "eval_samples_per_second": 61.958,
1026
- "eval_steps_per_second": 1.944,
1027
- "step": 608
1028
- },
1029
- {
1030
- "epoch": 19.0625,
1031
- "grad_norm": 0.15350750088691711,
1032
- "learning_rate": 4.6875e-05,
1033
- "loss": 0.0034,
1034
- "step": 610
1035
- },
1036
- {
1037
- "epoch": 19.21875,
1038
- "grad_norm": 0.01092343870550394,
1039
- "learning_rate": 3.90625e-05,
1040
- "loss": 0.002,
1041
- "step": 615
1042
- },
1043
- {
1044
- "epoch": 19.375,
1045
- "grad_norm": 0.008441799320280552,
1046
- "learning_rate": 3.125e-05,
1047
- "loss": 0.0022,
1048
- "step": 620
1049
- },
1050
- {
1051
- "epoch": 19.53125,
1052
- "grad_norm": 0.012427592650055885,
1053
- "learning_rate": 2.34375e-05,
1054
- "loss": 0.0026,
1055
- "step": 625
1056
- },
1057
- {
1058
- "epoch": 19.6875,
1059
- "grad_norm": 0.019600288942456245,
1060
- "learning_rate": 1.5625e-05,
1061
- "loss": 0.0016,
1062
- "step": 630
1063
- },
1064
- {
1065
- "epoch": 19.84375,
1066
- "grad_norm": 0.0400865413248539,
1067
- "learning_rate": 7.8125e-06,
1068
- "loss": 0.0031,
1069
- "step": 635
1070
- },
1071
- {
1072
- "epoch": 20.0,
1073
- "grad_norm": 0.03250521048903465,
1074
- "learning_rate": 0.0,
1075
- "loss": 0.0017,
1076
- "step": 640
1077
- },
1078
- {
1079
- "epoch": 20.0,
1080
- "eval_accuracy": 0.85,
1081
- "eval_loss": 0.5915272235870361,
1082
- "eval_runtime": 17.7209,
1083
- "eval_samples_per_second": 57.559,
1084
- "eval_steps_per_second": 1.806,
1085
- "step": 640
1086
- },
1087
- {
1088
- "epoch": 20.0,
1089
- "step": 640,
1090
- "total_flos": 4.36977436041216e+17,
1091
- "train_loss": 0.5368185924002319,
1092
- "train_runtime": 902.6693,
1093
- "train_samples_per_second": 22.6,
1094
- "train_steps_per_second": 0.709
1095
  }
1096
  ],
1097
  "logging_steps": 5,
@@ -1100,6 +815,15 @@
1100
  "num_train_epochs": 20,
1101
  "save_steps": 500,
1102
  "stateful_callbacks": {
 
 
 
 
 
 
 
 
 
1103
  "TrainerControl": {
1104
  "args": {
1105
  "should_epoch_stop": false,
@@ -1111,7 +835,7 @@
1111
  "attributes": {}
1112
  }
1113
  },
1114
- "total_flos": 4.36977436041216e+17,
1115
  "train_batch_size": 32,
1116
  "trial_name": null,
1117
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.8647058823529412,
3
+ "best_model_checkpoint": "resnet-50-finetuned-oxfordflowers/checkpoint-352",
4
+ "epoch": 14.0,
5
  "eval_steps": 500,
6
+ "global_step": 448,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.15625,
13
+ "grad_norm": 1.7320846319198608,
14
  "learning_rate": 0.0009921875,
15
+ "loss": 4.6558,
16
  "step": 5
17
  },
18
  {
19
  "epoch": 0.3125,
20
+ "grad_norm": 1.359827995300293,
21
  "learning_rate": 0.000984375,
22
+ "loss": 4.6228,
23
  "step": 10
24
  },
25
  {
26
  "epoch": 0.46875,
27
+ "grad_norm": 1.5728328227996826,
28
  "learning_rate": 0.0009765625,
29
+ "loss": 4.6629,
30
  "step": 15
31
  },
32
  {
33
  "epoch": 0.625,
34
+ "grad_norm": 1.514758586883545,
35
  "learning_rate": 0.00096875,
36
+ "loss": 4.6269,
37
  "step": 20
38
  },
39
  {
40
  "epoch": 0.78125,
41
+ "grad_norm": 1.0490564107894897,
42
  "learning_rate": 0.0009609375,
43
+ "loss": 4.5619,
44
  "step": 25
45
  },
46
  {
47
  "epoch": 0.9375,
48
+ "grad_norm": 1.087933897972107,
49
  "learning_rate": 0.000953125,
50
+ "loss": 4.5237,
51
  "step": 30
52
  },
53
  {
54
  "epoch": 1.0,
55
+ "eval_accuracy": 0.34509803921568627,
56
+ "eval_f1": 0.2874778237337216,
57
+ "eval_loss": 4.340012073516846,
58
+ "eval_precision": 0.4128804980040239,
59
+ "eval_recall": 0.34509803921568627,
60
+ "eval_runtime": 16.4984,
61
+ "eval_samples_per_second": 61.824,
62
+ "eval_steps_per_second": 1.94,
63
  "step": 32
64
  },
65
  {
66
  "epoch": 1.09375,
67
+ "grad_norm": 1.3993674516677856,
68
  "learning_rate": 0.0009453125,
69
+ "loss": 4.3376,
70
  "step": 35
71
  },
72
  {
73
  "epoch": 1.25,
74
+ "grad_norm": 1.5965288877487183,
75
  "learning_rate": 0.0009375,
76
+ "loss": 4.0528,
77
  "step": 40
78
  },
79
  {
80
  "epoch": 1.40625,
81
+ "grad_norm": 2.107327938079834,
82
  "learning_rate": 0.0009296875000000001,
83
+ "loss": 3.7811,
84
  "step": 45
85
  },
86
  {
87
  "epoch": 1.5625,
88
+ "grad_norm": 2.1875929832458496,
89
  "learning_rate": 0.0009218750000000001,
90
+ "loss": 3.3976,
91
  "step": 50
92
  },
93
  {
94
  "epoch": 1.71875,
95
+ "grad_norm": 2.3979506492614746,
96
  "learning_rate": 0.0009140625,
97
+ "loss": 3.0782,
98
  "step": 55
99
  },
100
  {
101
  "epoch": 1.875,
102
+ "grad_norm": 3.099531888961792,
103
  "learning_rate": 0.00090625,
104
+ "loss": 2.8508,
105
  "step": 60
106
  },
107
  {
108
  "epoch": 2.0,
109
+ "eval_accuracy": 0.5205882352941177,
110
+ "eval_f1": 0.4815844148221795,
111
+ "eval_loss": 1.9784579277038574,
112
+ "eval_precision": 0.5609503268219397,
113
+ "eval_recall": 0.5205882352941177,
114
+ "eval_runtime": 14.6742,
115
+ "eval_samples_per_second": 69.51,
116
+ "eval_steps_per_second": 2.181,
117
  "step": 64
118
  },
119
  {
120
  "epoch": 2.03125,
121
+ "grad_norm": 3.019045829772949,
122
  "learning_rate": 0.0008984375,
123
+ "loss": 2.5847,
124
  "step": 65
125
  },
126
  {
127
  "epoch": 2.1875,
128
+ "grad_norm": 2.6212832927703857,
129
  "learning_rate": 0.000890625,
130
+ "loss": 2.0135,
131
  "step": 70
132
  },
133
  {
134
  "epoch": 2.34375,
135
+ "grad_norm": 2.5014185905456543,
136
  "learning_rate": 0.0008828125,
137
+ "loss": 1.6579,
138
  "step": 75
139
  },
140
  {
141
  "epoch": 2.5,
142
+ "grad_norm": 2.89278244972229,
143
  "learning_rate": 0.000875,
144
+ "loss": 1.5167,
145
  "step": 80
146
  },
147
  {
148
  "epoch": 2.65625,
149
+ "grad_norm": 2.6624577045440674,
150
  "learning_rate": 0.0008671875,
151
+ "loss": 1.3679,
152
  "step": 85
153
  },
154
  {
155
  "epoch": 2.8125,
156
+ "grad_norm": 2.6109840869903564,
157
  "learning_rate": 0.000859375,
158
+ "loss": 1.2521,
159
  "step": 90
160
  },
161
  {
162
  "epoch": 2.96875,
163
+ "grad_norm": 3.0059635639190674,
164
  "learning_rate": 0.0008515625,
165
+ "loss": 1.346,
166
  "step": 95
167
  },
168
  {
169
  "epoch": 3.0,
170
+ "eval_accuracy": 0.7088235294117647,
171
+ "eval_f1": 0.695694394140452,
172
+ "eval_loss": 1.1449469327926636,
173
+ "eval_precision": 0.7737617026219967,
174
+ "eval_recall": 0.7088235294117647,
175
+ "eval_runtime": 15.7476,
176
+ "eval_samples_per_second": 64.772,
177
+ "eval_steps_per_second": 2.032,
178
  "step": 96
179
  },
180
  {
181
  "epoch": 3.125,
182
+ "grad_norm": 2.714418411254883,
183
  "learning_rate": 0.00084375,
184
+ "loss": 0.6888,
185
  "step": 100
186
  },
187
  {
188
  "epoch": 3.28125,
189
+ "grad_norm": 2.7836403846740723,
190
  "learning_rate": 0.0008359375,
191
+ "loss": 0.7027,
192
  "step": 105
193
  },
194
  {
195
  "epoch": 3.4375,
196
+ "grad_norm": 2.305562973022461,
197
  "learning_rate": 0.000828125,
198
+ "loss": 0.5808,
199
  "step": 110
200
  },
201
  {
202
  "epoch": 3.59375,
203
+ "grad_norm": 2.350442409515381,
204
  "learning_rate": 0.0008203125,
205
+ "loss": 0.5346,
206
  "step": 115
207
  },
208
  {
209
  "epoch": 3.75,
210
+ "grad_norm": 1.3235565423965454,
211
  "learning_rate": 0.0008125000000000001,
212
+ "loss": 0.4357,
213
  "step": 120
214
  },
215
  {
216
  "epoch": 3.90625,
217
+ "grad_norm": 2.181887626647949,
218
  "learning_rate": 0.0008046875000000001,
219
+ "loss": 0.5544,
220
  "step": 125
221
  },
222
  {
223
  "epoch": 4.0,
224
+ "eval_accuracy": 0.753921568627451,
225
+ "eval_f1": 0.7480454731222893,
226
+ "eval_loss": 0.9264965653419495,
227
+ "eval_precision": 0.8162285484733582,
228
+ "eval_recall": 0.753921568627451,
229
+ "eval_runtime": 16.1579,
230
+ "eval_samples_per_second": 63.127,
231
+ "eval_steps_per_second": 1.98,
232
  "step": 128
233
  },
234
  {
235
  "epoch": 4.0625,
236
+ "grad_norm": 1.2976504564285278,
237
  "learning_rate": 0.0007968750000000001,
238
+ "loss": 0.4729,
239
  "step": 130
240
  },
241
  {
242
  "epoch": 4.21875,
243
+ "grad_norm": 1.4201076030731201,
244
  "learning_rate": 0.0007890625,
245
+ "loss": 0.2202,
246
  "step": 135
247
  },
248
  {
249
  "epoch": 4.375,
250
+ "grad_norm": 1.6992279291152954,
251
  "learning_rate": 0.00078125,
252
+ "loss": 0.2482,
253
  "step": 140
254
  },
255
  {
256
  "epoch": 4.53125,
257
+ "grad_norm": 1.4370797872543335,
258
  "learning_rate": 0.0007734375,
259
+ "loss": 0.2159,
260
  "step": 145
261
  },
262
  {
263
  "epoch": 4.6875,
264
+ "grad_norm": 1.7889351844787598,
265
  "learning_rate": 0.000765625,
266
+ "loss": 0.2127,
267
  "step": 150
268
  },
269
  {
270
  "epoch": 4.84375,
271
+ "grad_norm": 0.961966872215271,
272
  "learning_rate": 0.0007578125,
273
+ "loss": 0.2415,
274
  "step": 155
275
  },
276
  {
277
  "epoch": 5.0,
278
+ "grad_norm": 1.1151365041732788,
279
  "learning_rate": 0.00075,
280
+ "loss": 0.1847,
281
  "step": 160
282
  },
283
  {
284
  "epoch": 5.0,
285
+ "eval_accuracy": 0.8029411764705883,
286
+ "eval_f1": 0.7996802939138055,
287
+ "eval_loss": 0.775350034236908,
288
+ "eval_precision": 0.8323794596323556,
289
+ "eval_recall": 0.8029411764705883,
290
+ "eval_runtime": 16.7669,
291
+ "eval_samples_per_second": 60.834,
292
+ "eval_steps_per_second": 1.909,
293
  "step": 160
294
  },
295
  {
296
  "epoch": 5.15625,
297
+ "grad_norm": 0.6469184756278992,
298
  "learning_rate": 0.0007421875,
299
+ "loss": 0.1029,
300
  "step": 165
301
  },
302
  {
303
  "epoch": 5.3125,
304
+ "grad_norm": 0.43876633048057556,
305
  "learning_rate": 0.000734375,
306
+ "loss": 0.0573,
307
  "step": 170
308
  },
309
  {
310
  "epoch": 5.46875,
311
+ "grad_norm": 0.9715489149093628,
312
  "learning_rate": 0.0007265625,
313
+ "loss": 0.073,
314
  "step": 175
315
  },
316
  {
317
  "epoch": 5.625,
318
+ "grad_norm": 0.6452958583831787,
319
  "learning_rate": 0.00071875,
320
+ "loss": 0.1527,
321
  "step": 180
322
  },
323
  {
324
  "epoch": 5.78125,
325
+ "grad_norm": 1.4150739908218384,
326
  "learning_rate": 0.0007109375,
327
+ "loss": 0.0969,
328
  "step": 185
329
  },
330
  {
331
  "epoch": 5.9375,
332
+ "grad_norm": 0.9330568313598633,
333
  "learning_rate": 0.000703125,
334
+ "loss": 0.0863,
335
  "step": 190
336
  },
337
  {
338
  "epoch": 6.0,
339
+ "eval_accuracy": 0.8019607843137255,
340
+ "eval_f1": 0.80000704547531,
341
+ "eval_loss": 0.7392613887786865,
342
+ "eval_precision": 0.8450064476152622,
343
+ "eval_recall": 0.8019607843137255,
344
+ "eval_runtime": 14.7972,
345
+ "eval_samples_per_second": 68.932,
346
+ "eval_steps_per_second": 2.163,
347
  "step": 192
348
  },
349
  {
350
  "epoch": 6.09375,
351
+ "grad_norm": 0.7720392346382141,
352
  "learning_rate": 0.0006953125,
353
+ "loss": 0.071,
354
  "step": 195
355
  },
356
  {
357
  "epoch": 6.25,
358
+ "grad_norm": 1.0389519929885864,
359
  "learning_rate": 0.0006875,
360
+ "loss": 0.0748,
361
  "step": 200
362
  },
363
  {
364
  "epoch": 6.40625,
365
+ "grad_norm": 1.150804042816162,
366
  "learning_rate": 0.0006796875000000001,
367
+ "loss": 0.0419,
368
  "step": 205
369
  },
370
  {
371
  "epoch": 6.5625,
372
+ "grad_norm": 2.15278959274292,
373
  "learning_rate": 0.0006718750000000001,
374
+ "loss": 0.0953,
375
  "step": 210
376
  },
377
  {
378
  "epoch": 6.71875,
379
+ "grad_norm": 0.3684898912906647,
380
  "learning_rate": 0.0006640625,
381
+ "loss": 0.051,
382
  "step": 215
383
  },
384
  {
385
  "epoch": 6.875,
386
+ "grad_norm": 0.40336862206459045,
387
  "learning_rate": 0.00065625,
388
+ "loss": 0.0516,
389
  "step": 220
390
  },
391
  {
392
  "epoch": 7.0,
393
+ "eval_accuracy": 0.8284313725490197,
394
+ "eval_f1": 0.8258508358061349,
395
+ "eval_loss": 0.6630767583847046,
396
+ "eval_precision": 0.8569142400730454,
397
+ "eval_recall": 0.8284313725490197,
398
+ "eval_runtime": 14.5794,
399
+ "eval_samples_per_second": 69.962,
400
+ "eval_steps_per_second": 2.195,
401
  "step": 224
402
  },
403
  {
404
  "epoch": 7.03125,
405
+ "grad_norm": 0.3206132650375366,
406
  "learning_rate": 0.0006484375,
407
+ "loss": 0.0309,
408
  "step": 225
409
  },
410
  {
411
  "epoch": 7.1875,
412
+ "grad_norm": 0.5986069440841675,
413
  "learning_rate": 0.000640625,
414
+ "loss": 0.0247,
415
  "step": 230
416
  },
417
  {
418
  "epoch": 7.34375,
419
+ "grad_norm": 0.12066510319709778,
420
  "learning_rate": 0.0006328125,
421
+ "loss": 0.0395,
422
  "step": 235
423
  },
424
  {
425
  "epoch": 7.5,
426
+ "grad_norm": 0.16991862654685974,
427
  "learning_rate": 0.000625,
428
+ "loss": 0.0355,
429
  "step": 240
430
  },
431
  {
432
  "epoch": 7.65625,
433
+ "grad_norm": 0.12674580514431,
434
  "learning_rate": 0.0006171875,
435
+ "loss": 0.0223,
436
  "step": 245
437
  },
438
  {
439
  "epoch": 7.8125,
440
+ "grad_norm": 0.26299160718917847,
441
  "learning_rate": 0.000609375,
442
+ "loss": 0.0229,
443
  "step": 250
444
  },
445
  {
446
  "epoch": 7.96875,
447
+ "grad_norm": 1.2256274223327637,
448
  "learning_rate": 0.0006015625,
449
+ "loss": 0.023,
450
  "step": 255
451
  },
452
  {
453
  "epoch": 8.0,
454
+ "eval_accuracy": 0.8470588235294118,
455
+ "eval_f1": 0.8428671541662826,
456
+ "eval_loss": 0.587995171546936,
457
+ "eval_precision": 0.863074478385897,
458
+ "eval_recall": 0.8470588235294118,
459
+ "eval_runtime": 14.5991,
460
+ "eval_samples_per_second": 69.867,
461
+ "eval_steps_per_second": 2.192,
462
  "step": 256
463
  },
464
  {
465
  "epoch": 8.125,
466
+ "grad_norm": 0.1445535123348236,
467
  "learning_rate": 0.00059375,
468
+ "loss": 0.0148,
469
  "step": 260
470
  },
471
  {
472
  "epoch": 8.28125,
473
+ "grad_norm": 0.80363529920578,
474
  "learning_rate": 0.0005859375,
475
+ "loss": 0.0138,
476
  "step": 265
477
  },
478
  {
479
  "epoch": 8.4375,
480
+ "grad_norm": 0.08668403327465057,
481
  "learning_rate": 0.000578125,
482
+ "loss": 0.0133,
483
  "step": 270
484
  },
485
  {
486
  "epoch": 8.59375,
487
+ "grad_norm": 0.14351984858512878,
488
  "learning_rate": 0.0005703125,
489
+ "loss": 0.0091,
490
  "step": 275
491
  },
492
  {
493
  "epoch": 8.75,
494
+ "grad_norm": 0.21965286135673523,
495
  "learning_rate": 0.0005625000000000001,
496
+ "loss": 0.0096,
497
  "step": 280
498
  },
499
  {
500
  "epoch": 8.90625,
501
+ "grad_norm": 0.2289452701807022,
502
  "learning_rate": 0.0005546875000000001,
503
+ "loss": 0.011,
504
  "step": 285
505
  },
506
  {
507
  "epoch": 9.0,
508
+ "eval_accuracy": 0.8568627450980392,
509
+ "eval_f1": 0.8520284357156945,
510
+ "eval_loss": 0.5421529412269592,
511
+ "eval_precision": 0.8685615038556214,
512
+ "eval_recall": 0.8568627450980392,
513
+ "eval_runtime": 15.0954,
514
+ "eval_samples_per_second": 67.57,
515
+ "eval_steps_per_second": 2.12,
516
  "step": 288
517
  },
518
  {
519
  "epoch": 9.0625,
520
+ "grad_norm": 0.08616359531879425,
521
  "learning_rate": 0.000546875,
522
+ "loss": 0.028,
523
  "step": 290
524
  },
525
  {
526
  "epoch": 9.21875,
527
+ "grad_norm": 0.09607744216918945,
528
  "learning_rate": 0.0005390625,
529
+ "loss": 0.0066,
530
  "step": 295
531
  },
532
  {
533
  "epoch": 9.375,
534
+ "grad_norm": 0.08231505751609802,
535
  "learning_rate": 0.00053125,
536
+ "loss": 0.0059,
537
  "step": 300
538
  },
539
  {
540
  "epoch": 9.53125,
541
+ "grad_norm": 0.1094212606549263,
542
  "learning_rate": 0.0005234375,
543
+ "loss": 0.0071,
544
  "step": 305
545
  },
546
  {
547
  "epoch": 9.6875,
548
+ "grad_norm": 0.2680395841598511,
549
  "learning_rate": 0.000515625,
550
+ "loss": 0.0097,
551
  "step": 310
552
  },
553
  {
554
  "epoch": 9.84375,
555
+ "grad_norm": 0.059513527899980545,
556
  "learning_rate": 0.0005078125,
557
+ "loss": 0.0088,
558
  "step": 315
559
  },
560
  {
561
  "epoch": 10.0,
562
+ "grad_norm": 0.05216934159398079,
563
  "learning_rate": 0.0005,
564
+ "loss": 0.0079,
565
  "step": 320
566
  },
567
  {
568
  "epoch": 10.0,
569
+ "eval_accuracy": 0.8509803921568627,
570
+ "eval_f1": 0.8469890554498172,
571
+ "eval_loss": 0.5335255265235901,
572
+ "eval_precision": 0.8636809133132662,
573
+ "eval_recall": 0.8509803921568627,
574
+ "eval_runtime": 15.0591,
575
+ "eval_samples_per_second": 67.733,
576
+ "eval_steps_per_second": 2.125,
577
  "step": 320
578
  },
579
  {
580
  "epoch": 10.15625,
581
+ "grad_norm": 0.031231796368956566,
582
  "learning_rate": 0.0004921875,
583
  "loss": 0.0045,
584
  "step": 325
585
  },
586
  {
587
  "epoch": 10.3125,
588
+ "grad_norm": 0.03108547069132328,
589
  "learning_rate": 0.000484375,
590
+ "loss": 0.0041,
591
  "step": 330
592
  },
593
  {
594
  "epoch": 10.46875,
595
+ "grad_norm": 0.04625704139471054,
596
  "learning_rate": 0.0004765625,
597
+ "loss": 0.0033,
598
  "step": 335
599
  },
600
  {
601
  "epoch": 10.625,
602
+ "grad_norm": 0.0934106633067131,
603
  "learning_rate": 0.00046875,
604
+ "loss": 0.005,
605
  "step": 340
606
  },
607
  {
608
  "epoch": 10.78125,
609
+ "grad_norm": 0.031285736709833145,
610
  "learning_rate": 0.00046093750000000003,
611
+ "loss": 0.004,
612
  "step": 345
613
  },
614
  {
615
  "epoch": 10.9375,
616
+ "grad_norm": 0.052936799824237823,
617
  "learning_rate": 0.000453125,
618
+ "loss": 0.0072,
619
  "step": 350
620
  },
621
  {
622
  "epoch": 11.0,
623
+ "eval_accuracy": 0.8647058823529412,
624
+ "eval_f1": 0.8604589215066042,
625
+ "eval_loss": 0.5107002854347229,
626
+ "eval_precision": 0.8735360179996857,
627
+ "eval_recall": 0.8647058823529412,
628
+ "eval_runtime": 14.8358,
629
+ "eval_samples_per_second": 68.753,
630
+ "eval_steps_per_second": 2.157,
631
  "step": 352
632
  },
633
  {
634
  "epoch": 11.09375,
635
+ "grad_norm": 0.02143882028758526,
636
  "learning_rate": 0.0004453125,
637
+ "loss": 0.0039,
638
  "step": 355
639
  },
640
  {
641
  "epoch": 11.25,
642
+ "grad_norm": 0.06976446509361267,
643
  "learning_rate": 0.0004375,
644
+ "loss": 0.0033,
645
  "step": 360
646
  },
647
  {
648
  "epoch": 11.40625,
649
+ "grad_norm": 0.08110585063695908,
650
  "learning_rate": 0.0004296875,
651
+ "loss": 0.0039,
652
  "step": 365
653
  },
654
  {
655
  "epoch": 11.5625,
656
+ "grad_norm": 0.025856945663690567,
657
  "learning_rate": 0.000421875,
658
+ "loss": 0.0052,
659
  "step": 370
660
  },
661
  {
662
  "epoch": 11.71875,
663
+ "grad_norm": 0.059996578842401505,
664
  "learning_rate": 0.0004140625,
665
+ "loss": 0.0076,
666
  "step": 375
667
  },
668
  {
669
  "epoch": 11.875,
670
+ "grad_norm": 0.1558006852865219,
671
  "learning_rate": 0.00040625000000000004,
672
+ "loss": 0.0086,
673
  "step": 380
674
  },
675
  {
676
  "epoch": 12.0,
677
+ "eval_accuracy": 0.8578431372549019,
678
+ "eval_f1": 0.8548177831549634,
679
+ "eval_loss": 0.5290006995201111,
680
+ "eval_precision": 0.8719719142513259,
681
+ "eval_recall": 0.8578431372549019,
682
+ "eval_runtime": 14.675,
683
+ "eval_samples_per_second": 69.506,
684
+ "eval_steps_per_second": 2.181,
685
  "step": 384
686
  },
687
  {
688
  "epoch": 12.03125,
689
+ "grad_norm": 0.2643249034881592,
690
  "learning_rate": 0.00039843750000000003,
691
+ "loss": 0.0052,
692
  "step": 385
693
  },
694
  {
695
  "epoch": 12.1875,
696
+ "grad_norm": 0.03377879783511162,
697
  "learning_rate": 0.000390625,
698
+ "loss": 0.0042,
699
  "step": 390
700
  },
701
  {
702
  "epoch": 12.34375,
703
+ "grad_norm": 0.4935762286186218,
704
  "learning_rate": 0.0003828125,
705
+ "loss": 0.0044,
706
  "step": 395
707
  },
708
  {
709
  "epoch": 12.5,
710
+ "grad_norm": 0.011438349261879921,
711
  "learning_rate": 0.000375,
712
+ "loss": 0.0026,
713
  "step": 400
714
  },
715
  {
716
  "epoch": 12.65625,
717
+ "grad_norm": 0.0676066130399704,
718
  "learning_rate": 0.0003671875,
719
+ "loss": 0.004,
720
  "step": 405
721
  },
722
  {
723
  "epoch": 12.8125,
724
+ "grad_norm": 0.014333638362586498,
725
  "learning_rate": 0.000359375,
726
+ "loss": 0.0033,
727
  "step": 410
728
  },
729
  {
730
  "epoch": 12.96875,
731
+ "grad_norm": 0.11364184319972992,
732
  "learning_rate": 0.0003515625,
733
+ "loss": 0.0058,
734
  "step": 415
735
  },
736
  {
737
  "epoch": 13.0,
738
+ "eval_accuracy": 0.8568627450980392,
739
+ "eval_f1": 0.8523334959912007,
740
+ "eval_loss": 0.5160782933235168,
741
+ "eval_precision": 0.865809422059422,
742
+ "eval_recall": 0.8568627450980392,
743
+ "eval_runtime": 14.822,
744
+ "eval_samples_per_second": 68.817,
745
+ "eval_steps_per_second": 2.159,
746
  "step": 416
747
  },
748
  {
749
  "epoch": 13.125,
750
+ "grad_norm": 0.013512840494513512,
751
  "learning_rate": 0.00034375,
752
+ "loss": 0.0023,
753
  "step": 420
754
  },
755
  {
756
  "epoch": 13.28125,
757
+ "grad_norm": 0.16694830358028412,
758
  "learning_rate": 0.00033593750000000003,
759
+ "loss": 0.0076,
760
  "step": 425
761
  },
762
  {
763
  "epoch": 13.4375,
764
+ "grad_norm": 0.025320900604128838,
765
  "learning_rate": 0.000328125,
766
+ "loss": 0.0023,
767
  "step": 430
768
  },
769
  {
770
  "epoch": 13.59375,
771
+ "grad_norm": 0.015468744561076164,
772
  "learning_rate": 0.0003203125,
773
+ "loss": 0.0035,
774
  "step": 435
775
  },
776
  {
777
  "epoch": 13.75,
778
+ "grad_norm": 0.03578919544816017,
779
  "learning_rate": 0.0003125,
780
+ "loss": 0.0027,
781
  "step": 440
782
  },
783
  {
784
  "epoch": 13.90625,
785
+ "grad_norm": 0.023722629994153976,
786
  "learning_rate": 0.0003046875,
787
+ "loss": 0.0027,
788
  "step": 445
789
  },
790
  {
791
  "epoch": 14.0,
792
+ "eval_accuracy": 0.8588235294117647,
793
+ "eval_f1": 0.8538315193219463,
794
+ "eval_loss": 0.5138522982597351,
795
+ "eval_precision": 0.8702200713230125,
796
+ "eval_recall": 0.8588235294117647,
797
+ "eval_runtime": 15.3283,
798
+ "eval_samples_per_second": 66.544,
799
+ "eval_steps_per_second": 2.088,
800
  "step": 448
801
  },
802
  {
803
+ "epoch": 14.0,
804
+ "step": 448,
805
+ "total_flos": 3.058842052288512e+17,
806
+ "train_loss": 0.7544229235707982,
807
+ "train_runtime": 674.4917,
808
+ "train_samples_per_second": 30.245,
809
+ "train_steps_per_second": 0.949
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
810
  }
811
  ],
812
  "logging_steps": 5,
 
815
  "num_train_epochs": 20,
816
  "save_steps": 500,
817
  "stateful_callbacks": {
818
+ "EarlyStoppingCallback": {
819
+ "args": {
820
+ "early_stopping_patience": 3,
821
+ "early_stopping_threshold": 0.0
822
+ },
823
+ "attributes": {
824
+ "early_stopping_patience_counter": 3
825
+ }
826
+ },
827
  "TrainerControl": {
828
  "args": {
829
  "should_epoch_stop": false,
 
835
  "attributes": {}
836
  }
837
  },
838
+ "total_flos": 3.058842052288512e+17,
839
  "train_batch_size": 32,
840
  "trial_name": null,
841
  "trial_params": null