Amanaccessassist committed on
Commit
4bcb5ce
1 Parent(s): 575be78

End of training

Browse files
README.md ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ base_model: google/vit-base-patch16-224-in21k
4
+ tags:
5
+ - generated_from_trainer
6
+ metrics:
7
+ - accuracy
8
+ model-index:
9
+ - name: finetune-apple-leaf
10
+ results: []
11
+ ---
12
+
13
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
14
+ should probably proofread and complete it, then remove this comment. -->
15
+
16
+ # finetune-apple-leaf
17
+
18
+ This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on an unknown dataset.
19
+ It achieves the following results on the evaluation set:
20
+ - Loss: 0.0682
21
+ - Accuracy: 0.9957
22
+
23
+ ## Model description
24
+
25
+ More information needed
26
+
27
+ ## Intended uses & limitations
28
+
29
+ More information needed
30
+
31
+ ## Training and evaluation data
32
+
33
+ More information needed
34
+
35
+ ## Training procedure
36
+
37
+ ### Training hyperparameters
38
+
39
+ The following hyperparameters were used during training:
40
+ - learning_rate: 2e-05
41
+ - train_batch_size: 64
42
+ - eval_batch_size: 64
43
+ - seed: 42
44
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
45
+ - lr_scheduler_type: linear
46
+ - num_epochs: 5
47
+ - mixed_precision_training: Native AMP
48
+
49
+ ### Training results
50
+
51
+ | Training Loss | Epoch | Step | Validation Loss | Accuracy |
52
+ |:-------------:|:-----:|:----:|:---------------:|:--------:|
53
+ | 0.1988 | 1.0 | 209 | 0.1881 | 0.9957 |
54
+ | 0.1012 | 2.0 | 418 | 0.1110 | 0.9953 |
55
+ | 0.1171 | 3.0 | 627 | 0.0925 | 0.9928 |
56
+ | 0.0766 | 4.0 | 836 | 0.0707 | 0.9966 |
57
+ | 0.0723 | 5.0 | 1045 | 0.0682 | 0.9957 |
58
+
59
+
60
+ ### Framework versions
61
+
62
+ - Transformers 4.41.1
63
+ - Pytorch 2.1.2
64
+ - Datasets 2.19.1
65
+ - Tokenizers 0.19.1
all_results.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 5.0,
3
+ "eval_accuracy": 0.9957482993197279,
4
+ "eval_loss": 0.0682438537478447,
5
+ "eval_runtime": 40.0762,
6
+ "eval_samples_per_second": 58.688,
7
+ "eval_steps_per_second": 0.923,
8
+ "total_flos": 5.1638175692258e+18,
9
+ "train_loss": 0.20529093819371821,
10
+ "train_runtime": 2500.4647,
11
+ "train_samples_per_second": 26.649,
12
+ "train_steps_per_second": 0.418
13
+ }
config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "google/vit-base-patch16-224-in21k",
3
+ "architectures": [
4
+ "ViTForImageClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.0,
7
+ "encoder_stride": 16,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.0,
10
+ "hidden_size": 768,
11
+ "id2label": {
12
+ "0": "Apple Rust",
13
+ "1": "Apple Scab",
14
+ "2": "Black Rot",
15
+ "3": "Healthy",
16
 + "4": "Unknown"
 17
 + },
 18
 + "image_size": 224,
 19
 + "initializer_range": 0.02,
 20
 + "intermediate_size": 3072,
 21
 + "label2id": {
 22
 + "Apple Rust": 0,
 23
 + "Apple Scab": 1,
 24
 + "Black Rot": 2,
 25
 + "Healthy": 3,
 26
 + "Unknown": 4
27
+ },
28
+ "layer_norm_eps": 1e-12,
29
+ "model_type": "vit",
30
+ "num_attention_heads": 12,
31
+ "num_channels": 3,
32
+ "num_hidden_layers": 12,
33
+ "patch_size": 16,
34
+ "problem_type": "single_label_classification",
35
+ "qkv_bias": true,
36
+ "torch_dtype": "float32",
37
+ "transformers_version": "4.41.1"
38
+ }
eval_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 5.0,
3
+ "eval_accuracy": 0.9957482993197279,
4
+ "eval_loss": 0.0682438537478447,
5
+ "eval_runtime": 40.0762,
6
+ "eval_samples_per_second": 58.688,
7
+ "eval_steps_per_second": 0.923
8
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1a2ce5434427e2be0a6ed1a16cec05fb055bf9718452c547c4554e4dd98a923
3
+ size 343233204
preprocessor_config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_valid_processor_keys": [
3
+ "images",
4
+ "do_resize",
5
+ "size",
6
+ "resample",
7
+ "do_rescale",
8
+ "rescale_factor",
9
+ "do_normalize",
10
+ "image_mean",
11
+ "image_std",
12
+ "return_tensors",
13
+ "data_format",
14
+ "input_data_format"
15
+ ],
16
+ "do_normalize": true,
17
+ "do_rescale": true,
18
+ "do_resize": true,
19
+ "image_mean": [
20
+ 0.5,
21
+ 0.5,
22
+ 0.5
23
+ ],
24
+ "image_processor_type": "ViTFeatureExtractor",
25
+ "image_std": [
26
+ 0.5,
27
+ 0.5,
28
+ 0.5
29
+ ],
30
+ "resample": 2,
31
+ "rescale_factor": 0.00392156862745098,
32
+ "size": {
33
+ "height": 224,
34
+ "width": 224
35
+ }
36
+ }
runs/Jun02_11-14-26_4d71aa98b00b/events.out.tfevents.1717326867.4d71aa98b00b.34.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b7ad77aac6ce72735bf044205bca45985c9551983091312aba1a18fe47a86477
3
+ size 28815
runs/Jun02_11-14-26_4d71aa98b00b/events.out.tfevents.1717329814.4d71aa98b00b.34.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf62c044a6ad9ffe88063353f89bcb9d21ecb4e3481e09d6b53f0b07e0f35b8c
3
+ size 411
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 5.0,
3
+ "total_flos": 5.1638175692258e+18,
4
+ "train_loss": 0.20529093819371821,
5
+ "train_runtime": 2500.4647,
6
+ "train_samples_per_second": 26.649,
7
+ "train_steps_per_second": 0.418
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,824 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.0682438537478447,
3
+ "best_model_checkpoint": "finetune-apple-leaf/checkpoint-1045",
4
+ "epoch": 5.0,
5
+ "eval_steps": 100,
6
+ "global_step": 1045,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.04784688995215311,
13
+ "grad_norm": 68452.109375,
14
+ "learning_rate": 1.980861244019139e-05,
15
+ "loss": 1.5577,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.09569377990430622,
20
+ "grad_norm": 81296.0546875,
21
+ "learning_rate": 1.9617224880382777e-05,
22
+ "loss": 1.4263,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.14354066985645933,
27
+ "grad_norm": 91526.734375,
28
+ "learning_rate": 1.9425837320574165e-05,
29
+ "loss": 1.2775,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.19138755980861244,
34
+ "grad_norm": 82238.640625,
35
+ "learning_rate": 1.9234449760765553e-05,
36
+ "loss": 1.144,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.23923444976076555,
41
+ "grad_norm": 93808.3671875,
42
+ "learning_rate": 1.904306220095694e-05,
43
+ "loss": 0.9826,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.28708133971291866,
48
+ "grad_norm": 92552.0625,
49
+ "learning_rate": 1.8851674641148328e-05,
50
+ "loss": 0.834,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.3349282296650718,
55
+ "grad_norm": 97152.6953125,
56
+ "learning_rate": 1.8660287081339713e-05,
57
+ "loss": 0.7036,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.3827751196172249,
62
+ "grad_norm": 72807.1796875,
63
+ "learning_rate": 1.8468899521531104e-05,
64
+ "loss": 0.5867,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.430622009569378,
69
+ "grad_norm": 80641.046875,
70
+ "learning_rate": 1.8277511961722488e-05,
71
+ "loss": 0.5116,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.4784688995215311,
76
+ "grad_norm": 88719.7265625,
77
+ "learning_rate": 1.8086124401913876e-05,
78
+ "loss": 0.4345,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.5263157894736842,
83
+ "grad_norm": 87248.7109375,
84
+ "learning_rate": 1.7894736842105264e-05,
85
+ "loss": 0.3765,
86
+ "step": 110
87
+ },
88
+ {
89
+ "epoch": 0.5741626794258373,
90
+ "grad_norm": 74184.46875,
91
+ "learning_rate": 1.770334928229665e-05,
92
+ "loss": 0.3363,
93
+ "step": 120
94
+ },
95
+ {
96
+ "epoch": 0.6220095693779905,
97
+ "grad_norm": 76457.8125,
98
+ "learning_rate": 1.751196172248804e-05,
99
+ "loss": 0.3341,
100
+ "step": 130
101
+ },
102
+ {
103
+ "epoch": 0.6698564593301436,
104
+ "grad_norm": 59443.3984375,
105
+ "learning_rate": 1.7320574162679427e-05,
106
+ "loss": 0.2943,
107
+ "step": 140
108
+ },
109
+ {
110
+ "epoch": 0.7177033492822966,
111
+ "grad_norm": 98078.953125,
112
+ "learning_rate": 1.7129186602870815e-05,
113
+ "loss": 0.2792,
114
+ "step": 150
115
+ },
116
+ {
117
+ "epoch": 0.7655502392344498,
118
+ "grad_norm": 68582.828125,
119
+ "learning_rate": 1.6937799043062203e-05,
120
+ "loss": 0.2567,
121
+ "step": 160
122
+ },
123
+ {
124
+ "epoch": 0.8133971291866029,
125
+ "grad_norm": 60306.1328125,
126
+ "learning_rate": 1.674641148325359e-05,
127
+ "loss": 0.2431,
128
+ "step": 170
129
+ },
130
+ {
131
+ "epoch": 0.861244019138756,
132
+ "grad_norm": 72700.328125,
133
+ "learning_rate": 1.6555023923444978e-05,
134
+ "loss": 0.2115,
135
+ "step": 180
136
+ },
137
+ {
138
+ "epoch": 0.9090909090909091,
139
+ "grad_norm": 152611.125,
140
+ "learning_rate": 1.6363636363636366e-05,
141
+ "loss": 0.222,
142
+ "step": 190
143
+ },
144
+ {
145
+ "epoch": 0.9569377990430622,
146
+ "grad_norm": 61888.64453125,
147
+ "learning_rate": 1.6172248803827754e-05,
148
+ "loss": 0.1988,
149
+ "step": 200
150
+ },
151
+ {
152
+ "epoch": 1.0,
153
+ "eval_accuracy": 0.9957482993197279,
154
+ "eval_loss": 0.18814513087272644,
155
+ "eval_runtime": 39.0452,
156
+ "eval_samples_per_second": 60.238,
157
+ "eval_steps_per_second": 0.948,
158
+ "step": 209
159
+ },
160
+ {
161
+ "epoch": 1.0047846889952152,
162
+ "grad_norm": 93927.0234375,
163
+ "learning_rate": 1.5980861244019138e-05,
164
+ "loss": 0.215,
165
+ "step": 210
166
+ },
167
+ {
168
+ "epoch": 1.0526315789473684,
169
+ "grad_norm": 50695.28125,
170
+ "learning_rate": 1.578947368421053e-05,
171
+ "loss": 0.1936,
172
+ "step": 220
173
+ },
174
+ {
175
+ "epoch": 1.1004784688995215,
176
+ "grad_norm": 150133.75,
177
+ "learning_rate": 1.5598086124401914e-05,
178
+ "loss": 0.1818,
179
+ "step": 230
180
+ },
181
+ {
182
+ "epoch": 1.1483253588516746,
183
+ "grad_norm": 91581.5078125,
184
+ "learning_rate": 1.5406698564593305e-05,
185
+ "loss": 0.1819,
186
+ "step": 240
187
+ },
188
+ {
189
+ "epoch": 1.1961722488038278,
190
+ "grad_norm": 53073.0234375,
191
+ "learning_rate": 1.5215311004784689e-05,
192
+ "loss": 0.1618,
193
+ "step": 250
194
+ },
195
+ {
196
+ "epoch": 1.244019138755981,
197
+ "grad_norm": 112323.625,
198
+ "learning_rate": 1.5023923444976079e-05,
199
+ "loss": 0.165,
200
+ "step": 260
201
+ },
202
+ {
203
+ "epoch": 1.291866028708134,
204
+ "grad_norm": 97414.734375,
205
+ "learning_rate": 1.4832535885167465e-05,
206
+ "loss": 0.2017,
207
+ "step": 270
208
+ },
209
+ {
210
+ "epoch": 1.339712918660287,
211
+ "grad_norm": 162907.59375,
212
+ "learning_rate": 1.4641148325358854e-05,
213
+ "loss": 0.1762,
214
+ "step": 280
215
+ },
216
+ {
217
+ "epoch": 1.38755980861244,
218
+ "grad_norm": 27552.251953125,
219
+ "learning_rate": 1.444976076555024e-05,
220
+ "loss": 0.1715,
221
+ "step": 290
222
+ },
223
+ {
224
+ "epoch": 1.4354066985645932,
225
+ "grad_norm": 108859.3671875,
226
+ "learning_rate": 1.4258373205741626e-05,
227
+ "loss": 0.153,
228
+ "step": 300
229
+ },
230
+ {
231
+ "epoch": 1.4832535885167464,
232
+ "grad_norm": 73719.96875,
233
+ "learning_rate": 1.4066985645933016e-05,
234
+ "loss": 0.14,
235
+ "step": 310
236
+ },
237
+ {
238
+ "epoch": 1.5311004784688995,
239
+ "grad_norm": 21314.03125,
240
+ "learning_rate": 1.3875598086124402e-05,
241
+ "loss": 0.1354,
242
+ "step": 320
243
+ },
244
+ {
245
+ "epoch": 1.5789473684210527,
246
+ "grad_norm": 58937.32421875,
247
+ "learning_rate": 1.3684210526315791e-05,
248
+ "loss": 0.1389,
249
+ "step": 330
250
+ },
251
+ {
252
+ "epoch": 1.6267942583732058,
253
+ "grad_norm": 56629.7578125,
254
+ "learning_rate": 1.3492822966507177e-05,
255
+ "loss": 0.1402,
256
+ "step": 340
257
+ },
258
+ {
259
+ "epoch": 1.674641148325359,
260
+ "grad_norm": 59540.6953125,
261
+ "learning_rate": 1.3301435406698567e-05,
262
+ "loss": 0.1347,
263
+ "step": 350
264
+ },
265
+ {
266
+ "epoch": 1.722488038277512,
267
+ "grad_norm": 81783.2421875,
268
+ "learning_rate": 1.3110047846889953e-05,
269
+ "loss": 0.1448,
270
+ "step": 360
271
+ },
272
+ {
273
+ "epoch": 1.7703349282296652,
274
+ "grad_norm": 90950.421875,
275
+ "learning_rate": 1.2918660287081342e-05,
276
+ "loss": 0.1503,
277
+ "step": 370
278
+ },
279
+ {
280
+ "epoch": 1.8181818181818183,
281
+ "grad_norm": 149309.421875,
282
+ "learning_rate": 1.2727272727272728e-05,
283
+ "loss": 0.1464,
284
+ "step": 380
285
+ },
286
+ {
287
+ "epoch": 1.8660287081339713,
288
+ "grad_norm": 83194.9375,
289
+ "learning_rate": 1.2535885167464116e-05,
290
+ "loss": 0.1394,
291
+ "step": 390
292
+ },
293
+ {
294
+ "epoch": 1.9138755980861244,
295
+ "grad_norm": 124922.578125,
296
+ "learning_rate": 1.2344497607655504e-05,
297
+ "loss": 0.1245,
298
+ "step": 400
299
+ },
300
+ {
301
+ "epoch": 1.9617224880382775,
302
+ "grad_norm": 52044.2578125,
303
+ "learning_rate": 1.215311004784689e-05,
304
+ "loss": 0.1012,
305
+ "step": 410
306
+ },
307
+ {
308
+ "epoch": 2.0,
309
+ "eval_accuracy": 0.9953231292517006,
310
+ "eval_loss": 0.1110498234629631,
311
+ "eval_runtime": 39.4692,
312
+ "eval_samples_per_second": 59.591,
313
+ "eval_steps_per_second": 0.937,
314
+ "step": 418
315
+ },
316
+ {
317
+ "epoch": 2.0095693779904304,
318
+ "grad_norm": 78857.1328125,
319
+ "learning_rate": 1.196172248803828e-05,
320
+ "loss": 0.1258,
321
+ "step": 420
322
+ },
323
+ {
324
+ "epoch": 2.0574162679425836,
325
+ "grad_norm": 74979.1796875,
326
+ "learning_rate": 1.1770334928229666e-05,
327
+ "loss": 0.1325,
328
+ "step": 430
329
+ },
330
+ {
331
+ "epoch": 2.1052631578947367,
332
+ "grad_norm": 85820.78125,
333
+ "learning_rate": 1.1578947368421053e-05,
334
+ "loss": 0.1231,
335
+ "step": 440
336
+ },
337
+ {
338
+ "epoch": 2.15311004784689,
339
+ "grad_norm": 58782.41015625,
340
+ "learning_rate": 1.1387559808612441e-05,
341
+ "loss": 0.1282,
342
+ "step": 450
343
+ },
344
+ {
345
+ "epoch": 2.200956937799043,
346
+ "grad_norm": 21210.45703125,
347
+ "learning_rate": 1.1196172248803829e-05,
348
+ "loss": 0.1087,
349
+ "step": 460
350
+ },
351
+ {
352
+ "epoch": 2.248803827751196,
353
+ "grad_norm": 80271.7265625,
354
+ "learning_rate": 1.1004784688995217e-05,
355
+ "loss": 0.1192,
356
+ "step": 470
357
+ },
358
+ {
359
+ "epoch": 2.2966507177033493,
360
+ "grad_norm": 32322.310546875,
361
+ "learning_rate": 1.0813397129186604e-05,
362
+ "loss": 0.1098,
363
+ "step": 480
364
+ },
365
+ {
366
+ "epoch": 2.3444976076555024,
367
+ "grad_norm": 113407.7578125,
368
+ "learning_rate": 1.062200956937799e-05,
369
+ "loss": 0.1216,
370
+ "step": 490
371
+ },
372
+ {
373
+ "epoch": 2.3923444976076556,
374
+ "grad_norm": 194755.9375,
375
+ "learning_rate": 1.0430622009569378e-05,
376
+ "loss": 0.116,
377
+ "step": 500
378
+ },
379
+ {
380
+ "epoch": 2.4401913875598087,
381
+ "grad_norm": 23810.296875,
382
+ "learning_rate": 1.0239234449760766e-05,
383
+ "loss": 0.1026,
384
+ "step": 510
385
+ },
386
+ {
387
+ "epoch": 2.488038277511962,
388
+ "grad_norm": 49451.3125,
389
+ "learning_rate": 1.0047846889952154e-05,
390
+ "loss": 0.1217,
391
+ "step": 520
392
+ },
393
+ {
394
+ "epoch": 2.535885167464115,
395
+ "grad_norm": 85535.9609375,
396
+ "learning_rate": 9.856459330143542e-06,
397
+ "loss": 0.1324,
398
+ "step": 530
399
+ },
400
+ {
401
+ "epoch": 2.583732057416268,
402
+ "grad_norm": 77596.9453125,
403
+ "learning_rate": 9.66507177033493e-06,
404
+ "loss": 0.1154,
405
+ "step": 540
406
+ },
407
+ {
408
+ "epoch": 2.6315789473684212,
409
+ "grad_norm": 196213.265625,
410
+ "learning_rate": 9.473684210526315e-06,
411
+ "loss": 0.1234,
412
+ "step": 550
413
+ },
414
+ {
415
+ "epoch": 2.679425837320574,
416
+ "grad_norm": 129738.171875,
417
+ "learning_rate": 9.282296650717703e-06,
418
+ "loss": 0.0983,
419
+ "step": 560
420
+ },
421
+ {
422
+ "epoch": 2.7272727272727275,
423
+ "grad_norm": 123404.078125,
424
+ "learning_rate": 9.090909090909091e-06,
425
+ "loss": 0.1187,
426
+ "step": 570
427
+ },
428
+ {
429
+ "epoch": 2.77511961722488,
430
+ "grad_norm": 163260.46875,
431
+ "learning_rate": 8.899521531100479e-06,
432
+ "loss": 0.1145,
433
+ "step": 580
434
+ },
435
+ {
436
+ "epoch": 2.8229665071770333,
437
+ "grad_norm": 92286.703125,
438
+ "learning_rate": 8.708133971291867e-06,
439
+ "loss": 0.1001,
440
+ "step": 590
441
+ },
442
+ {
443
+ "epoch": 2.8708133971291865,
444
+ "grad_norm": 107133.9375,
445
+ "learning_rate": 8.516746411483254e-06,
446
+ "loss": 0.0953,
447
+ "step": 600
448
+ },
449
+ {
450
+ "epoch": 2.9186602870813396,
451
+ "grad_norm": 52350.8359375,
452
+ "learning_rate": 8.325358851674642e-06,
453
+ "loss": 0.0981,
454
+ "step": 610
455
+ },
456
+ {
457
+ "epoch": 2.9665071770334928,
458
+ "grad_norm": 33242.46875,
459
+ "learning_rate": 8.13397129186603e-06,
460
+ "loss": 0.1171,
461
+ "step": 620
462
+ },
463
+ {
464
+ "epoch": 3.0,
465
+ "eval_accuracy": 0.9927721088435374,
466
+ "eval_loss": 0.0925019159913063,
467
+ "eval_runtime": 39.5447,
468
+ "eval_samples_per_second": 59.477,
469
+ "eval_steps_per_second": 0.936,
470
+ "step": 627
471
+ },
472
+ {
473
+ "epoch": 3.014354066985646,
474
+ "grad_norm": 24220.681640625,
475
+ "learning_rate": 7.942583732057418e-06,
476
+ "loss": 0.0879,
477
+ "step": 630
478
+ },
479
+ {
480
+ "epoch": 3.062200956937799,
481
+ "grad_norm": 161409.765625,
482
+ "learning_rate": 7.751196172248805e-06,
483
+ "loss": 0.098,
484
+ "step": 640
485
+ },
486
+ {
487
+ "epoch": 3.110047846889952,
488
+ "grad_norm": 137376.515625,
489
+ "learning_rate": 7.5598086124401915e-06,
490
+ "loss": 0.0991,
491
+ "step": 650
492
+ },
493
+ {
494
+ "epoch": 3.1578947368421053,
495
+ "grad_norm": 17668.015625,
496
+ "learning_rate": 7.368421052631579e-06,
497
+ "loss": 0.0898,
498
+ "step": 660
499
+ },
500
+ {
501
+ "epoch": 3.2057416267942584,
502
+ "grad_norm": 168850.53125,
503
+ "learning_rate": 7.177033492822967e-06,
504
+ "loss": 0.0787,
505
+ "step": 670
506
+ },
507
+ {
508
+ "epoch": 3.2535885167464116,
509
+ "grad_norm": 24098.3203125,
510
+ "learning_rate": 6.985645933014355e-06,
511
+ "loss": 0.1136,
512
+ "step": 680
513
+ },
514
+ {
515
+ "epoch": 3.3014354066985647,
516
+ "grad_norm": 184889.9375,
517
+ "learning_rate": 6.794258373205742e-06,
518
+ "loss": 0.0899,
519
+ "step": 690
520
+ },
521
+ {
522
+ "epoch": 3.349282296650718,
523
+ "grad_norm": 154068.140625,
524
+ "learning_rate": 6.6028708133971295e-06,
525
+ "loss": 0.072,
526
+ "step": 700
527
+ },
528
+ {
529
+ "epoch": 3.397129186602871,
530
+ "grad_norm": 213492.796875,
531
+ "learning_rate": 6.411483253588517e-06,
532
+ "loss": 0.095,
533
+ "step": 710
534
+ },
535
+ {
536
+ "epoch": 3.444976076555024,
537
+ "grad_norm": 79102.0859375,
538
+ "learning_rate": 6.220095693779905e-06,
539
+ "loss": 0.0852,
540
+ "step": 720
541
+ },
542
+ {
543
+ "epoch": 3.492822966507177,
544
+ "grad_norm": 158182.59375,
545
+ "learning_rate": 6.028708133971293e-06,
546
+ "loss": 0.079,
547
+ "step": 730
548
+ },
549
+ {
550
+ "epoch": 3.5406698564593304,
551
+ "grad_norm": 12833.8759765625,
552
+ "learning_rate": 5.837320574162681e-06,
553
+ "loss": 0.0881,
554
+ "step": 740
555
+ },
556
+ {
557
+ "epoch": 3.588516746411483,
558
+ "grad_norm": 124951.265625,
559
+ "learning_rate": 5.645933014354067e-06,
560
+ "loss": 0.0963,
561
+ "step": 750
562
+ },
563
+ {
564
+ "epoch": 3.6363636363636362,
565
+ "grad_norm": 207838.703125,
566
+ "learning_rate": 5.4545454545454545e-06,
567
+ "loss": 0.0967,
568
+ "step": 760
569
+ },
570
+ {
571
+ "epoch": 3.6842105263157894,
572
+ "grad_norm": 15068.2353515625,
573
+ "learning_rate": 5.263157894736842e-06,
574
+ "loss": 0.081,
575
+ "step": 770
576
+ },
577
+ {
578
+ "epoch": 3.7320574162679425,
579
+ "grad_norm": 20366.380859375,
580
+ "learning_rate": 5.07177033492823e-06,
581
+ "loss": 0.0824,
582
+ "step": 780
583
+ },
584
+ {
585
+ "epoch": 3.7799043062200957,
586
+ "grad_norm": 11184.1630859375,
587
+ "learning_rate": 4.880382775119618e-06,
588
+ "loss": 0.1021,
589
+ "step": 790
590
+ },
591
+ {
592
+ "epoch": 3.827751196172249,
593
+ "grad_norm": 41858.83203125,
594
+ "learning_rate": 4.6889952153110055e-06,
595
+ "loss": 0.071,
596
+ "step": 800
597
+ },
598
+ {
599
+ "epoch": 3.875598086124402,
600
+ "grad_norm": 11330.4501953125,
601
+ "learning_rate": 4.4976076555023925e-06,
602
+ "loss": 0.0725,
603
+ "step": 810
604
+ },
605
+ {
606
+ "epoch": 3.923444976076555,
607
+ "grad_norm": 143718.625,
608
+ "learning_rate": 4.30622009569378e-06,
609
+ "loss": 0.0938,
610
+ "step": 820
611
+ },
612
+ {
613
+ "epoch": 3.971291866028708,
614
+ "grad_norm": 89987.9453125,
615
+ "learning_rate": 4.114832535885168e-06,
616
+ "loss": 0.0766,
617
+ "step": 830
618
+ },
619
+ {
620
+ "epoch": 4.0,
621
+ "eval_accuracy": 0.9965986394557823,
622
+ "eval_loss": 0.07070581614971161,
623
+ "eval_runtime": 39.4603,
624
+ "eval_samples_per_second": 59.604,
625
+ "eval_steps_per_second": 0.938,
626
+ "step": 836
627
+ },
628
+ {
629
+ "epoch": 4.019138755980861,
630
+ "grad_norm": 57436.94140625,
631
+ "learning_rate": 3.923444976076555e-06,
632
+ "loss": 0.0945,
633
+ "step": 840
634
+ },
635
+ {
636
+ "epoch": 4.0669856459330145,
637
+ "grad_norm": 40826.1953125,
638
+ "learning_rate": 3.732057416267943e-06,
639
+ "loss": 0.0743,
640
+ "step": 850
641
+ },
642
+ {
643
+ "epoch": 4.114832535885167,
644
+ "grad_norm": 29963.001953125,
645
+ "learning_rate": 3.5406698564593305e-06,
646
+ "loss": 0.0962,
647
+ "step": 860
648
+ },
649
+ {
650
+ "epoch": 4.162679425837321,
651
+ "grad_norm": 32081.44921875,
652
+ "learning_rate": 3.3492822966507182e-06,
653
+ "loss": 0.0883,
654
+ "step": 870
655
+ },
656
+ {
657
+ "epoch": 4.2105263157894735,
658
+ "grad_norm": 65598.9375,
659
+ "learning_rate": 3.157894736842105e-06,
660
+ "loss": 0.0753,
661
+ "step": 880
662
+ },
663
+ {
664
+ "epoch": 4.258373205741627,
665
+ "grad_norm": 72391.234375,
666
+ "learning_rate": 2.966507177033493e-06,
667
+ "loss": 0.0764,
668
+ "step": 890
669
+ },
670
+ {
671
+ "epoch": 4.30622009569378,
672
+ "grad_norm": 82336.0625,
673
+ "learning_rate": 2.7751196172248807e-06,
674
+ "loss": 0.0594,
675
+ "step": 900
676
+ },
677
+ {
678
+ "epoch": 4.354066985645933,
679
+ "grad_norm": 126016.4609375,
680
+ "learning_rate": 2.5837320574162685e-06,
681
+ "loss": 0.0782,
682
+ "step": 910
683
+ },
684
+ {
685
+ "epoch": 4.401913875598086,
686
+ "grad_norm": 10192.6044921875,
687
+ "learning_rate": 2.392344497607656e-06,
688
+ "loss": 0.0705,
689
+ "step": 920
690
+ },
691
+ {
692
+ "epoch": 4.44976076555024,
693
+ "grad_norm": 226228.453125,
694
+ "learning_rate": 2.200956937799043e-06,
695
+ "loss": 0.0793,
696
+ "step": 930
697
+ },
698
+ {
699
+ "epoch": 4.497607655502392,
700
+ "grad_norm": 11644.4921875,
701
+ "learning_rate": 2.0095693779904305e-06,
702
+ "loss": 0.0687,
703
+ "step": 940
704
+ },
705
+ {
706
+ "epoch": 4.545454545454545,
707
+ "grad_norm": 71224.4609375,
708
+ "learning_rate": 1.8181818181818183e-06,
709
+ "loss": 0.0626,
710
+ "step": 950
711
+ },
712
+ {
713
+ "epoch": 4.5933014354066986,
714
+ "grad_norm": 15732.5244140625,
715
+ "learning_rate": 1.6267942583732059e-06,
716
+ "loss": 0.0764,
717
+ "step": 960
718
+ },
719
+ {
720
+ "epoch": 4.641148325358852,
721
+ "grad_norm": 49954.69921875,
722
+ "learning_rate": 1.4354066985645934e-06,
723
+ "loss": 0.0941,
724
+ "step": 970
725
+ },
726
+ {
727
+ "epoch": 4.688995215311005,
728
+ "grad_norm": 196565.375,
729
+ "learning_rate": 1.244019138755981e-06,
730
+ "loss": 0.0802,
731
+ "step": 980
732
+ },
733
+ {
734
+ "epoch": 4.7368421052631575,
735
+ "grad_norm": 29685.708984375,
736
+ "learning_rate": 1.0526315789473685e-06,
737
+ "loss": 0.0656,
738
+ "step": 990
739
+ },
740
+ {
741
+ "epoch": 4.784688995215311,
742
+ "grad_norm": 31311.546875,
743
+ "learning_rate": 8.612440191387561e-07,
744
+ "loss": 0.0908,
745
+ "step": 1000
746
+ },
747
+ {
748
+ "epoch": 4.832535885167464,
749
+ "grad_norm": 75467.4765625,
750
+ "learning_rate": 6.698564593301436e-07,
751
+ "loss": 0.0755,
752
+ "step": 1010
753
+ },
754
+ {
755
+ "epoch": 4.880382775119617,
756
+ "grad_norm": 51163.75390625,
757
+ "learning_rate": 4.784688995215311e-07,
758
+ "loss": 0.0836,
759
+ "step": 1020
760
+ },
761
+ {
762
+ "epoch": 4.92822966507177,
763
+ "grad_norm": 180410.21875,
764
+ "learning_rate": 2.870813397129187e-07,
765
+ "loss": 0.0815,
766
+ "step": 1030
767
+ },
768
+ {
769
+ "epoch": 4.976076555023924,
770
+ "grad_norm": 187452.96875,
771
+ "learning_rate": 9.569377990430622e-08,
772
+ "loss": 0.0723,
773
+ "step": 1040
774
+ },
775
+ {
776
+ "epoch": 5.0,
777
+ "eval_accuracy": 0.9957482993197279,
778
+ "eval_loss": 0.0682438537478447,
779
+ "eval_runtime": 38.7752,
780
+ "eval_samples_per_second": 60.657,
781
+ "eval_steps_per_second": 0.954,
782
+ "step": 1045
783
+ },
784
+ {
785
+ "epoch": 5.0,
786
+ "step": 1045,
787
+ "total_flos": 5.1638175692258e+18,
788
+ "train_loss": 0.20529093819371821,
789
+ "train_runtime": 2500.4647,
790
+ "train_samples_per_second": 26.649,
791
+ "train_steps_per_second": 0.418
792
+ }
793
+ ],
794
+ "logging_steps": 10,
795
+ "max_steps": 1045,
796
+ "num_input_tokens_seen": 0,
797
+ "num_train_epochs": 5,
798
+ "save_steps": 100,
799
+ "stateful_callbacks": {
800
+ "EarlyStoppingCallback": {
801
+ "args": {
802
+ "early_stopping_patience": 3,
803
+ "early_stopping_threshold": 0.0
804
+ },
805
+ "attributes": {
806
+ "early_stopping_patience_counter": 0
807
+ }
808
+ },
809
+ "TrainerControl": {
810
+ "args": {
811
+ "should_epoch_stop": false,
812
+ "should_evaluate": false,
813
+ "should_log": false,
814
+ "should_save": true,
815
+ "should_training_stop": true
816
+ },
817
+ "attributes": {}
818
+ }
819
+ },
820
+ "total_flos": 5.1638175692258e+18,
821
+ "train_batch_size": 64,
822
+ "trial_name": null,
823
+ "trial_params": null
824
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f7d7582e978604d3f252d76d6c8ac401fe7d5508d003dd429c9a25147cb9090
3
+ size 5112