heisenberg3376 committed
Commit a29d4d3
Parent: 2b23533

Training in progress, step 100

README.md ADDED
@@ -0,0 +1,82 @@
+ ---
+ license: apache-2.0
+ base_model: google/vit-base-patch16-224-in21k
+ tags:
+ - image-classification
+ - generated_from_trainer
+ datasets:
+ - imagefolder
+ metrics:
+ - accuracy
+ model-index:
+ - name: vit-base-food-items-v1
+   results:
+   - task:
+       name: Image Classification
+       type: image-classification
+     dataset:
+       name: beans
+       type: imagefolder
+       config: default
+       split: validation
+       args: default
+     metrics:
+     - name: Accuracy
+       type: accuracy
+       value: 0.9236363636363636
+ ---
+ 
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+ 
+ # vit-base-food-items-v1
+ 
+ This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the beans dataset.
+ It achieves the following results on the evaluation set:
+ - Loss: 0.3363
+ - Accuracy: 0.9236
+ 
+ ## Model description
+ 
+ More information needed
+ 
+ ## Intended uses & limitations
+ 
+ More information needed
+ 
+ ## Training and evaluation data
+ 
+ More information needed
+ 
+ ## Training procedure
+ 
+ ### Training hyperparameters
+ 
+ The following hyperparameters were used during training:
+ - learning_rate: 0.0002
+ - train_batch_size: 16
+ - eval_batch_size: 8
+ - seed: 42
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+ - lr_scheduler_type: linear
+ - num_epochs: 4
+ - mixed_precision_training: Native AMP
+ 
+ ### Training results
+ 
+ | Training Loss | Epoch  | Step | Validation Loss | Accuracy |
+ |:-------------:|:------:|:----:|:---------------:|:--------:|
+ | 0.4195        | 0.6579 | 100  | 0.5028          | 0.9055   |
+ | 0.1072        | 1.3158 | 200  | 0.3794          | 0.8945   |
+ | 0.0326        | 1.9737 | 300  | 0.3832          | 0.9055   |
+ | 0.0207        | 2.6316 | 400  | 0.3363          | 0.9236   |
+ | 0.0167        | 3.2895 | 500  | 0.3373          | 0.9236   |
+ | 0.0153        | 3.9474 | 600  | 0.3374          | 0.9236   |
+ 
+ 
+ ### Framework versions
+ 
+ - Transformers 4.41.2
+ - Pytorch 2.3.0+cu121
+ - Datasets 2.20.0
+ - Tokenizers 0.19.1
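
The hyperparameters above map onto the standard `transformers` `TrainingArguments` roughly as follows. This is a minimal sketch, not the training script from this commit: `output_dir` and the eval/save/logging cadences are taken from `trainer_state.json` below, and Adam's betas and epsilon are the library defaults.

```python
# Sketch of TrainingArguments matching the README hyperparameters.
# Assumes transformers 4.41; step cadences come from trainer_state.json.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="vit-base-food-items-v1",
    learning_rate=2e-4,                # learning_rate: 0.0002
    per_device_train_batch_size=16,    # train_batch_size: 16
    per_device_eval_batch_size=8,      # eval_batch_size: 8
    seed=42,
    num_train_epochs=4,
    lr_scheduler_type="linear",
    fp16=True,                         # mixed_precision_training: Native AMP
    evaluation_strategy="steps",
    eval_steps=100,                    # eval_steps: 100 (trainer_state.json)
    save_steps=100,                    # save_steps: 100
    logging_steps=10,                  # logging_steps: 10
    load_best_model_at_end=True,       # best checkpoint recorded at step 400
)
```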
all_results.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "epoch": 4.0,
+   "eval_accuracy": 0.9236363636363636,
+   "eval_loss": 0.33629149198532104,
+   "eval_runtime": 7.1163,
+   "eval_samples_per_second": 77.287,
+   "eval_steps_per_second": 9.696,
+   "total_flos": 7.501829674622976e+17,
+   "train_loss": 0.22265003621578217,
+   "train_runtime": 237.6059,
+   "train_samples_per_second": 40.74,
+   "train_steps_per_second": 2.559
+ }
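
The throughput figures are internally consistent with the 608 optimizer steps recorded in `trainer_state.json`; a quick cross-check using only the numbers above:

```python
# Cross-check the reported training throughput (figures from all_results.json;
# the 608-step count comes from trainer_state.json).
train_runtime = 237.6059                 # seconds
total_steps = 608
epochs = 4

print(round(total_steps / train_runtime, 3))   # 2.559 steps/s, as reported
samples = 40.74 * train_runtime                # train_samples_per_second * time
print(round(samples / epochs))                 # ~2420 training images per epoch
```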
config.json ADDED
@@ -0,0 +1,50 @@
+ {
+   "_name_or_path": "google/vit-base-patch16-224-in21k",
+   "architectures": [
+     "ViTForImageClassification"
+   ],
+   "attention_probs_dropout_prob": 0.0,
+   "encoder_stride": 16,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.0,
+   "hidden_size": 768,
+   "id2label": {
+     "0": "Bread",
+     "1": "Dairy product",
+     "10": "Vegetable-Fruit",
+     "2": "Dessert",
+     "3": "Egg",
+     "4": "Fried food",
+     "5": "Meat",
+     "6": "Noodles-Pasta",
+     "7": "Rice",
+     "8": "Seafood",
+     "9": "Soup"
+   },
+   "image_size": 224,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "label2id": {
+     "Bread": "0",
+     "Dairy product": "1",
+     "Dessert": "2",
+     "Egg": "3",
+     "Fried food": "4",
+     "Meat": "5",
+     "Noodles-Pasta": "6",
+     "Rice": "7",
+     "Seafood": "8",
+     "Soup": "9",
+     "Vegetable-Fruit": "10"
+   },
+   "layer_norm_eps": 1e-12,
+   "model_type": "vit",
+   "num_attention_heads": 12,
+   "num_channels": 3,
+   "num_hidden_layers": 12,
+   "patch_size": 16,
+   "problem_type": "single_label_classification",
+   "qkv_bias": true,
+   "torch_dtype": "float32",
+   "transformers_version": "4.41.2"
+ }
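
Since `config.json` stores the 11 food-category labels in `id2label`, predictions can be decoded directly from the loaded config. A hedged inference sketch; the repo id is assumed from the committer and model name, and `food.jpg` is a placeholder:

```python
# Inference sketch for this checkpoint. The repo id and image path are
# illustrative assumptions; id2label comes from config.json above.
import torch
from PIL import Image
from transformers import AutoImageProcessor, AutoModelForImageClassification

repo = "heisenberg3376/vit-base-food-items-v1"   # assumed repo id
processor = AutoImageProcessor.from_pretrained(repo)
model = AutoModelForImageClassification.from_pretrained(repo)

image = Image.open("food.jpg").convert("RGB")    # placeholder input image
inputs = processor(images=image, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits              # shape (1, 11)

pred = logits.argmax(-1).item()
print(model.config.id2label[pred])               # e.g. "Soup"
```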
eval_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "epoch": 4.0,
+   "eval_accuracy": 0.9236363636363636,
+   "eval_loss": 0.33629149198532104,
+   "eval_runtime": 7.1163,
+   "eval_samples_per_second": 77.287,
+   "eval_steps_per_second": 9.696
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:535d09cd8b39917b646d5dd2b752011055935c5cadd188604fb079078c7ebfac
+ size 343251660
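
The weights are stored through Git LFS, so the repo tracks only this pointer (spec version, sha256 oid, and size, roughly 343 MB); the tensor file itself lives in LFS storage. A sketch for verifying a downloaded copy against the recorded oid:

```python
# Verify a locally downloaded model.safetensors against the LFS pointer's oid.
import hashlib

EXPECTED = "535d09cd8b39917b646d5dd2b752011055935c5cadd188604fb079078c7ebfac"

sha = hashlib.sha256()
with open("model.safetensors", "rb") as f:           # assumes a local copy
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha.update(chunk)

assert sha.hexdigest() == EXPECTED, "checksum mismatch"
print("checksum ok")
```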
preprocessor_config.json ADDED
@@ -0,0 +1,36 @@
+ {
+   "_valid_processor_keys": [
+     "images",
+     "do_resize",
+     "size",
+     "resample",
+     "do_rescale",
+     "rescale_factor",
+     "do_normalize",
+     "image_mean",
+     "image_std",
+     "return_tensors",
+     "data_format",
+     "input_data_format"
+   ],
+   "do_normalize": true,
+   "do_rescale": true,
+   "do_resize": true,
+   "image_mean": [
+     0.5,
+     0.5,
+     0.5
+   ],
+   "image_processor_type": "ViTFeatureExtractor",
+   "image_std": [
+     0.5,
+     0.5,
+     0.5
+   ],
+   "resample": 2,
+   "rescale_factor": 0.00392156862745098,
+   "size": {
+     "height": 224,
+     "width": 224
+   }
+ }
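
This config resizes to 224×224 with bilinear resampling (`resample: 2` is PIL's BILINEAR), rescales by 1/255 (`rescale_factor: 0.00392...`), and normalizes each channel with mean and std 0.5, so pixels end up in [-1, 1]. An equivalent manual transform, sketched with torchvision as an assumption (the checkpoint's own image processor does this for you):

```python
# Manual equivalent of preprocessor_config.json, assuming torchvision.
from torchvision import transforms

vit_transform = transforms.Compose([
    transforms.Resize(
        (224, 224),                                         # size: 224x224
        interpolation=transforms.InterpolationMode.BILINEAR # resample: 2
    ),
    transforms.ToTensor(),                     # rescale_factor: 1/255
    transforms.Normalize(mean=[0.5, 0.5, 0.5], # image_mean
                         std=[0.5, 0.5, 0.5]), # image_std -> pixels in [-1, 1]
])
```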
runs/Jul17_09-22-53_405903fcfe02/events.out.tfevents.1721208182.405903fcfe02.739.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:768ad43b38f92547fb617b93c5d0c7a0c72408da27a37423585966f76adce2e3
+ size 5199
runs/Jul17_09-26-03_405903fcfe02/events.out.tfevents.1721208370.405903fcfe02.739.1 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5d8c7db1cb1d871d01c41ce1fa5720d9c5fe736e177b473d84cc974aaeac0326
+ size 20097
runs/Jul17_09-26-03_405903fcfe02/events.out.tfevents.1721208623.405903fcfe02.739.2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:17832bd2c862108bbb3a63f77ed3caf302a7feb532bfea3e073df4931cc17cc3
+ size 411
runs/Jul17_09-32-17_405903fcfe02/events.out.tfevents.1721208742.405903fcfe02.739.3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5bc9ff6ab6be4a8fc3dceedf6dec83cb4b3c2bd8e9db315c267f3e2a5331f772
+ size 7679
train_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "epoch": 4.0,
+   "total_flos": 7.501829674622976e+17,
+   "train_loss": 0.22265003621578217,
+   "train_runtime": 237.6059,
+   "train_samples_per_second": 40.74,
+   "train_steps_per_second": 2.559
+ }
trainer_state.json ADDED
@@ -0,0 +1,516 @@
+ {
+   "best_metric": 0.33629149198532104,
+   "best_model_checkpoint": "vit-base-food-items-v1/checkpoint-400",
+   "epoch": 4.0,
+   "eval_steps": 100,
+   "global_step": 608,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 0.06578947368421052,
+       "grad_norm": 2.054168224334717,
+       "learning_rate": 0.00019671052631578949,
+       "loss": 2.2227,
+       "step": 10
+     },
+     {
+       "epoch": 0.13157894736842105,
+       "grad_norm": 2.418569326400757,
+       "learning_rate": 0.00019342105263157894,
+       "loss": 1.7988,
+       "step": 20
+     },
+     {
+       "epoch": 0.19736842105263158,
+       "grad_norm": 2.0799572467803955,
+       "learning_rate": 0.00019013157894736844,
+       "loss": 1.3952,
+       "step": 30
+     },
+     {
+       "epoch": 0.2631578947368421,
+       "grad_norm": 2.5012855529785156,
+       "learning_rate": 0.00018684210526315792,
+       "loss": 1.0071,
+       "step": 40
+     },
+     {
+       "epoch": 0.32894736842105265,
+       "grad_norm": 1.610549807548523,
+       "learning_rate": 0.00018355263157894736,
+       "loss": 0.8514,
+       "step": 50
+     },
+     {
+       "epoch": 0.39473684210526316,
+       "grad_norm": 2.7514488697052,
+       "learning_rate": 0.00018026315789473684,
+       "loss": 0.6752,
+       "step": 60
+     },
+     {
+       "epoch": 0.4605263157894737,
+       "grad_norm": 5.107870101928711,
+       "learning_rate": 0.00017697368421052632,
+       "loss": 0.617,
+       "step": 70
+     },
+     {
+       "epoch": 0.5263157894736842,
+       "grad_norm": 1.621307611465454,
+       "learning_rate": 0.0001736842105263158,
+       "loss": 0.4383,
+       "step": 80
+     },
+     {
+       "epoch": 0.5921052631578947,
+       "grad_norm": 2.050955057144165,
+       "learning_rate": 0.00017039473684210527,
+       "loss": 0.4703,
+       "step": 90
+     },
+     {
+       "epoch": 0.6578947368421053,
+       "grad_norm": 3.5689868927001953,
+       "learning_rate": 0.00016710526315789475,
+       "loss": 0.4195,
+       "step": 100
+     },
+     {
+       "epoch": 0.6578947368421053,
+       "eval_accuracy": 0.9054545454545454,
+       "eval_loss": 0.5027927756309509,
+       "eval_runtime": 6.6566,
+       "eval_samples_per_second": 82.625,
+       "eval_steps_per_second": 10.366,
+       "step": 100
+     },
+     {
+       "epoch": 0.7236842105263158,
+       "grad_norm": 2.683819055557251,
+       "learning_rate": 0.00016381578947368422,
+       "loss": 0.3666,
+       "step": 110
+     },
+     {
+       "epoch": 0.7894736842105263,
+       "grad_norm": 2.7733426094055176,
+       "learning_rate": 0.0001605263157894737,
+       "loss": 0.3876,
+       "step": 120
+     },
+     {
+       "epoch": 0.8552631578947368,
+       "grad_norm": 3.341937303543091,
+       "learning_rate": 0.00015723684210526318,
+       "loss": 0.3778,
+       "step": 130
+     },
+     {
+       "epoch": 0.9210526315789473,
+       "grad_norm": 1.0890475511550903,
+       "learning_rate": 0.00015394736842105265,
+       "loss": 0.3368,
+       "step": 140
+     },
+     {
+       "epoch": 0.9868421052631579,
+       "grad_norm": 3.217635154724121,
+       "learning_rate": 0.0001506578947368421,
+       "loss": 0.2434,
+       "step": 150
+     },
+     {
+       "epoch": 1.0526315789473684,
+       "grad_norm": 1.1362298727035522,
+       "learning_rate": 0.00014736842105263158,
+       "loss": 0.1537,
+       "step": 160
+     },
+     {
+       "epoch": 1.118421052631579,
+       "grad_norm": 0.3043310344219208,
+       "learning_rate": 0.00014407894736842106,
+       "loss": 0.1786,
+       "step": 170
+     },
+     {
+       "epoch": 1.1842105263157894,
+       "grad_norm": 0.36744824051856995,
+       "learning_rate": 0.00014078947368421053,
+       "loss": 0.1534,
+       "step": 180
+     },
+     {
+       "epoch": 1.25,
+       "grad_norm": 0.4088458716869354,
+       "learning_rate": 0.0001375,
+       "loss": 0.1273,
+       "step": 190
+     },
+     {
+       "epoch": 1.3157894736842106,
+       "grad_norm": 0.22641144692897797,
+       "learning_rate": 0.00013421052631578948,
+       "loss": 0.1072,
+       "step": 200
+     },
+     {
+       "epoch": 1.3157894736842106,
+       "eval_accuracy": 0.8945454545454545,
+       "eval_loss": 0.37944725155830383,
+       "eval_runtime": 6.6833,
+       "eval_samples_per_second": 82.295,
+       "eval_steps_per_second": 10.324,
+       "step": 200
+     },
+     {
+       "epoch": 1.381578947368421,
+       "grad_norm": 0.14886893332004547,
+       "learning_rate": 0.00013092105263157893,
+       "loss": 0.0846,
+       "step": 210
+     },
+     {
+       "epoch": 1.4473684210526316,
+       "grad_norm": 0.17389647662639618,
+       "learning_rate": 0.00012763157894736844,
+       "loss": 0.0789,
+       "step": 220
+     },
+     {
+       "epoch": 1.513157894736842,
+       "grad_norm": 0.12492559105157852,
+       "learning_rate": 0.00012434210526315791,
+       "loss": 0.0605,
+       "step": 230
+     },
+     {
+       "epoch": 1.5789473684210527,
+       "grad_norm": 0.14732375741004944,
+       "learning_rate": 0.00012105263157894738,
+       "loss": 0.0867,
+       "step": 240
+     },
+     {
+       "epoch": 1.6447368421052633,
+       "grad_norm": 0.1113506406545639,
+       "learning_rate": 0.00011776315789473684,
+       "loss": 0.0436,
+       "step": 250
+     },
+     {
+       "epoch": 1.7105263157894737,
+       "grad_norm": 0.09813081473112106,
+       "learning_rate": 0.00011447368421052632,
+       "loss": 0.0416,
+       "step": 260
+     },
+     {
+       "epoch": 1.776315789473684,
+       "grad_norm": 6.826725006103516,
+       "learning_rate": 0.0001111842105263158,
+       "loss": 0.0514,
+       "step": 270
+     },
+     {
+       "epoch": 1.8421052631578947,
+       "grad_norm": 0.10619573295116425,
+       "learning_rate": 0.00010789473684210527,
+       "loss": 0.0601,
+       "step": 280
+     },
+     {
+       "epoch": 1.9078947368421053,
+       "grad_norm": 0.13959018886089325,
+       "learning_rate": 0.00010460526315789475,
+       "loss": 0.0454,
+       "step": 290
+     },
+     {
+       "epoch": 1.973684210526316,
+       "grad_norm": 0.08468258380889893,
+       "learning_rate": 0.00010131578947368421,
+       "loss": 0.0326,
+       "step": 300
+     },
+     {
+       "epoch": 1.973684210526316,
+       "eval_accuracy": 0.9054545454545454,
+       "eval_loss": 0.38323774933815,
+       "eval_runtime": 6.0691,
+       "eval_samples_per_second": 90.622,
+       "eval_steps_per_second": 11.369,
+       "step": 300
+     },
+     {
+       "epoch": 2.039473684210526,
+       "grad_norm": 0.07823757082223892,
+       "learning_rate": 9.802631578947369e-05,
+       "loss": 0.0392,
+       "step": 310
+     },
+     {
+       "epoch": 2.1052631578947367,
+       "grad_norm": 0.07656868547201157,
+       "learning_rate": 9.473684210526316e-05,
+       "loss": 0.0288,
+       "step": 320
+     },
+     {
+       "epoch": 2.1710526315789473,
+       "grad_norm": 0.07013211399316788,
+       "learning_rate": 9.144736842105264e-05,
+       "loss": 0.0313,
+       "step": 330
+     },
+     {
+       "epoch": 2.236842105263158,
+       "grad_norm": 0.07913695275783539,
+       "learning_rate": 8.81578947368421e-05,
+       "loss": 0.0378,
+       "step": 340
+     },
+     {
+       "epoch": 2.3026315789473686,
+       "grad_norm": 0.3869466483592987,
+       "learning_rate": 8.486842105263159e-05,
+       "loss": 0.0253,
+       "step": 350
+     },
+     {
+       "epoch": 2.3684210526315788,
+       "grad_norm": 0.06490592658519745,
+       "learning_rate": 8.157894736842105e-05,
+       "loss": 0.0241,
+       "step": 360
+     },
+     {
+       "epoch": 2.4342105263157894,
+       "grad_norm": 0.06631086021661758,
+       "learning_rate": 7.828947368421053e-05,
+       "loss": 0.0231,
+       "step": 370
+     },
+     {
+       "epoch": 2.5,
+       "grad_norm": 0.05489266291260719,
+       "learning_rate": 7.500000000000001e-05,
+       "loss": 0.0218,
+       "step": 380
+     },
+     {
+       "epoch": 2.5657894736842106,
+       "grad_norm": 0.07426982372999191,
+       "learning_rate": 7.171052631578947e-05,
+       "loss": 0.0215,
+       "step": 390
+     },
+     {
+       "epoch": 2.6315789473684212,
+       "grad_norm": 0.063384510576725,
+       "learning_rate": 6.842105263157895e-05,
+       "loss": 0.0207,
+       "step": 400
+     },
+     {
+       "epoch": 2.6315789473684212,
+       "eval_accuracy": 0.9236363636363636,
+       "eval_loss": 0.33629149198532104,
+       "eval_runtime": 6.0608,
+       "eval_samples_per_second": 90.746,
+       "eval_steps_per_second": 11.385,
+       "step": 400
+     },
+     {
+       "epoch": 2.6973684210526314,
+       "grad_norm": 0.05782260745763779,
+       "learning_rate": 6.513157894736842e-05,
+       "loss": 0.0201,
+       "step": 410
+     },
+     {
+       "epoch": 2.763157894736842,
+       "grad_norm": 0.05535552278161049,
+       "learning_rate": 6.18421052631579e-05,
+       "loss": 0.0194,
+       "step": 420
+     },
+     {
+       "epoch": 2.8289473684210527,
+       "grad_norm": 0.05756945163011551,
+       "learning_rate": 5.855263157894737e-05,
+       "loss": 0.0191,
+       "step": 430
+     },
+     {
+       "epoch": 2.8947368421052633,
+       "grad_norm": 0.05671467259526253,
+       "learning_rate": 5.526315789473685e-05,
+       "loss": 0.0188,
+       "step": 440
+     },
+     {
+       "epoch": 2.9605263157894735,
+       "grad_norm": 0.05619660019874573,
+       "learning_rate": 5.197368421052632e-05,
+       "loss": 0.0183,
+       "step": 450
+     },
+     {
+       "epoch": 3.026315789473684,
+       "grad_norm": 0.05277419090270996,
+       "learning_rate": 4.868421052631579e-05,
+       "loss": 0.0177,
+       "step": 460
+     },
+     {
+       "epoch": 3.0921052631578947,
+       "grad_norm": 0.05281645059585571,
+       "learning_rate": 4.539473684210527e-05,
+       "loss": 0.0174,
+       "step": 470
+     },
+     {
+       "epoch": 3.1578947368421053,
+       "grad_norm": 0.06867770105600357,
+       "learning_rate": 4.210526315789474e-05,
+       "loss": 0.017,
+       "step": 480
+     },
+     {
+       "epoch": 3.223684210526316,
+       "grad_norm": 0.047292064875364304,
+       "learning_rate": 3.8815789473684214e-05,
+       "loss": 0.0168,
+       "step": 490
+     },
+     {
+       "epoch": 3.2894736842105265,
+       "grad_norm": 0.043311525136232376,
+       "learning_rate": 3.5526315789473684e-05,
+       "loss": 0.0167,
+       "step": 500
+     },
+     {
+       "epoch": 3.2894736842105265,
+       "eval_accuracy": 0.9236363636363636,
+       "eval_loss": 0.33733832836151123,
+       "eval_runtime": 5.7257,
+       "eval_samples_per_second": 96.057,
+       "eval_steps_per_second": 12.051,
+       "step": 500
+     },
+     {
+       "epoch": 3.3552631578947367,
+       "grad_norm": 0.04796218127012253,
+       "learning_rate": 3.223684210526316e-05,
+       "loss": 0.0165,
+       "step": 510
+     },
+     {
+       "epoch": 3.4210526315789473,
+       "grad_norm": 0.048424966633319855,
+       "learning_rate": 2.8947368421052634e-05,
+       "loss": 0.0163,
+       "step": 520
+     },
+     {
+       "epoch": 3.486842105263158,
+       "grad_norm": 0.046178512275218964,
+       "learning_rate": 2.565789473684211e-05,
+       "loss": 0.0157,
+       "step": 530
+     },
+     {
+       "epoch": 3.5526315789473686,
+       "grad_norm": 0.04182315245270729,
+       "learning_rate": 2.236842105263158e-05,
+       "loss": 0.0156,
+       "step": 540
+     },
+     {
+       "epoch": 3.6184210526315788,
+       "grad_norm": 0.04811399057507515,
+       "learning_rate": 1.9078947368421056e-05,
+       "loss": 0.0157,
+       "step": 550
+     },
+     {
+       "epoch": 3.6842105263157894,
+       "grad_norm": 0.04523231461644173,
+       "learning_rate": 1.5789473684210526e-05,
+       "loss": 0.0157,
+       "step": 560
+     },
+     {
+       "epoch": 3.75,
+       "grad_norm": 0.04799880087375641,
+       "learning_rate": 1.25e-05,
+       "loss": 0.0155,
+       "step": 570
+     },
+     {
+       "epoch": 3.8157894736842106,
+       "grad_norm": 0.04668057709932327,
+       "learning_rate": 9.210526315789474e-06,
+       "loss": 0.0154,
+       "step": 580
+     },
+     {
+       "epoch": 3.8815789473684212,
+       "grad_norm": 0.044472016394138336,
+       "learning_rate": 5.921052631578948e-06,
+       "loss": 0.0154,
+       "step": 590
+     },
+     {
+       "epoch": 3.9473684210526314,
+       "grad_norm": 0.05030672252178192,
+       "learning_rate": 2.631578947368421e-06,
+       "loss": 0.0153,
+       "step": 600
+     },
+     {
+       "epoch": 3.9473684210526314,
+       "eval_accuracy": 0.9236363636363636,
+       "eval_loss": 0.33738574385643005,
+       "eval_runtime": 6.0053,
+       "eval_samples_per_second": 91.586,
+       "eval_steps_per_second": 11.49,
+       "step": 600
+     },
+     {
+       "epoch": 4.0,
+       "step": 608,
+       "total_flos": 7.501829674622976e+17,
+       "train_loss": 0.22265003621578217,
+       "train_runtime": 237.6059,
+       "train_samples_per_second": 40.74,
+       "train_steps_per_second": 2.559
+     }
+   ],
+   "logging_steps": 10,
+   "max_steps": 608,
+   "num_input_tokens_seen": 0,
+   "num_train_epochs": 4,
+   "save_steps": 100,
+   "stateful_callbacks": {
+     "TrainerControl": {
+       "args": {
+         "should_epoch_stop": false,
+         "should_evaluate": false,
+         "should_log": false,
+         "should_save": true,
+         "should_training_stop": false
+       },
+       "attributes": {}
+     }
+   },
+   "total_flos": 7.501829674622976e+17,
+   "train_batch_size": 16,
+   "trial_name": null,
+   "trial_params": null
+ }
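
`log_history` interleaves training entries (every 10 steps, per `logging_steps`) with evaluation entries (every 100 steps); the eval records are the ones carrying an `eval_accuracy` key. A small sketch for pulling the validation curve out of this file, assuming it has been downloaded locally:

```python
# Extract the validation curve from trainer_state.json (local copy assumed).
import json

with open("trainer_state.json") as f:
    state = json.load(f)

for entry in state["log_history"]:
    if "eval_accuracy" in entry:                  # evaluation records only
        print(f'step {entry["step"]:4d}  '
              f'eval_loss {entry["eval_loss"]:.4f}  '
              f'eval_acc {entry["eval_accuracy"]:.4f}')

print("best:", state["best_metric"], "->", state["best_model_checkpoint"])
```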
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cff28f95376c6aecb268700119787cbf2428019aa507742603920dd75428cdb8
+ size 5112