gsandle92 committed
Commit dfb30c3 · verified
1 Parent(s): deb0d2c

End of training

.ipynb_checkpoints/README-checkpoint.md ADDED
@@ -0,0 +1,82 @@
+ ---
+ library_name: transformers
+ license: apache-2.0
+ base_model: google/vit-base-patch16-224-in21k
+ tags:
+ - image-classification
+ - generated_from_trainer
+ datasets:
+ - imagefolder
+ metrics:
+ - accuracy
+ model-index:
+ - name: vit-base-beans-demo-v5
+ results:
+ - task:
+ name: Image Classification
+ type: image-classification
+ dataset:
+ name: thumbnails
+ type: imagefolder
+ config: default
+ split: train
+ args: default
+ metrics:
+ - name: Accuracy
+ type: accuracy
+ value: 0.9761904761904762
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # vit-base-beans-demo-v5
+
+ This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the thumbnails dataset.
+ It achieves the following results on the evaluation set:
+ - Loss: 0.0984
+ - Accuracy: 0.9762
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 0.0002
+ - train_batch_size: 16
+ - eval_batch_size: 8
+ - seed: 42
+ - optimizer: AdamW (adamw_torch) with betas=(0.9, 0.999), epsilon=1e-08, and no additional optimizer arguments
+ - lr_scheduler_type: linear
+ - num_epochs: 1
+ - mixed_precision_training: Native AMP
+
+ ### Training results
+
+ | Training Loss | Epoch | Step | Validation Loss | Accuracy |
+ |:-------------:|:------:|:----:|:---------------:|:--------:|
+ | 0.1192 | 0.1764 | 100 | 0.1158 | 0.9692 |
+ | 0.0734 | 0.3527 | 200 | 0.1268 | 0.9702 |
+ | 0.0701 | 0.5291 | 300 | 0.1057 | 0.9673 |
+ | 0.1107 | 0.7055 | 400 | 0.1081 | 0.9722 |
+ | 0.0413 | 0.8818 | 500 | 0.0984 | 0.9762 |
+
+
+ ### Framework versions
+
+ - Transformers 4.47.0
+ - PyTorch 2.4.1+cu124
+ - Datasets 3.2.0
+ - Tokenizers 0.21.0
README.md ADDED
@@ -0,0 +1,81 @@
+ ---
+ library_name: transformers
+ license: apache-2.0
+ base_model: google/vit-base-patch16-224-in21k
+ tags:
+ - generated_from_trainer
+ datasets:
+ - imagefolder
+ metrics:
+ - accuracy
+ model-index:
+ - name: vit-base-beans-demo-v5
+ results:
+ - task:
+ name: Image Classification
+ type: image-classification
+ dataset:
+ name: imagefolder
+ type: imagefolder
+ config: default
+ split: train
+ args: default
+ metrics:
+ - name: Accuracy
+ type: accuracy
+ value: 0.9761904761904762
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # vit-base-beans-demo-v5
+
+ This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the imagefolder dataset.
+ It achieves the following results on the evaluation set:
+ - Loss: 0.0984
+ - Accuracy: 0.9762
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
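
Until the card is filled in, a minimal inference sketch follows. The hub repo id `gsandle92/vit-base-beans-demo-v5` is an assumption (commit author plus model name); substitute a local checkpoint path if that id does not resolve. The `negative`/`positive` labels come from the bundled `config.json`.

```python
from transformers import pipeline

# Assumed repo id; replace with a local path to this checkpoint if needed.
classifier = pipeline(
    "image-classification",
    model="gsandle92/vit-base-beans-demo-v5",
)

# Any image path or PIL.Image works; "thumbnail.jpg" is just a placeholder.
predictions = classifier("thumbnail.jpg")
print(predictions)
# e.g. [{'label': 'positive', 'score': 0.99}, {'label': 'negative', 'score': 0.01}]
```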
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training (a `TrainingArguments` sketch follows the list):
+ - learning_rate: 0.0002
+ - train_batch_size: 16
+ - eval_batch_size: 8
+ - seed: 42
+ - optimizer: AdamW (adamw_torch) with betas=(0.9, 0.999), epsilon=1e-08, and no additional optimizer arguments
+ - lr_scheduler_type: linear
+ - num_epochs: 1
+ - mixed_precision_training: Native AMP
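
For reference, a sketch of how these settings map onto `TrainingArguments`. The output directory mirrors the checkpoint path recorded in `trainer_state.json`, and the eval/save/logging cadence (100/100/10 steps) is taken from that file; anything not listed is left at its default.

```python
from transformers import TrainingArguments

# Sketch only: reproduces the hyperparameters listed above.
# output_dir and the step cadences come from trainer_state.json.
training_args = TrainingArguments(
    output_dir="./vit-base-beans-demo-v5",
    learning_rate=2e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    seed=42,
    num_train_epochs=1,
    lr_scheduler_type="linear",
    optim="adamw_torch",
    fp16=True,               # Native AMP mixed precision
    eval_strategy="steps",
    eval_steps=100,
    save_steps=100,
    logging_steps=10,
)
```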
+
+ ### Training results
+
+ | Training Loss | Epoch | Step | Validation Loss | Accuracy |
+ |:-------------:|:------:|:----:|:---------------:|:--------:|
+ | 0.1192 | 0.1764 | 100 | 0.1158 | 0.9692 |
+ | 0.0734 | 0.3527 | 200 | 0.1268 | 0.9702 |
+ | 0.0701 | 0.5291 | 300 | 0.1057 | 0.9673 |
+ | 0.1107 | 0.7055 | 400 | 0.1081 | 0.9722 |
+ | 0.0413 | 0.8818 | 500 | 0.0984 | 0.9762 |
+
+
+ ### Framework versions
+
+ - Transformers 4.47.0
+ - PyTorch 2.4.1+cu124
+ - Datasets 3.2.0
+ - Tokenizers 0.21.0
all_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "epoch": 1.0,
+ "total_flos": 7.023873938666619e+17,
+ "train_loss": 0.09499678115575612,
+ "train_runtime": 238.2789,
+ "train_samples_per_second": 38.039,
+ "train_steps_per_second": 2.38
+ }
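
As a quick sanity check, these throughput figures are mutually consistent: 567 optimizer steps / 238.2789 s ≈ 2.38 steps per second, and 2.38 steps/s × 16 samples per step ≈ 38.1 samples per second, in line with the reported `train_samples_per_second` of 38.039 (the small gap suggests the final batch was not full).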
config.json ADDED
@@ -0,0 +1,32 @@
+ {
+ "_name_or_path": "google/vit-base-patch16-224-in21k",
+ "architectures": [
+ "ViTForImageClassification"
+ ],
+ "attention_probs_dropout_prob": 0.0,
+ "encoder_stride": 16,
+ "hidden_act": "gelu",
+ "hidden_dropout_prob": 0.0,
+ "hidden_size": 768,
+ "id2label": {
+ "0": "negative",
+ "1": "positive"
+ },
+ "image_size": 224,
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "label2id": {
+ "negative": "0",
+ "positive": "1"
+ },
+ "layer_norm_eps": 1e-12,
+ "model_type": "vit",
+ "num_attention_heads": 12,
+ "num_channels": 3,
+ "num_hidden_layers": 12,
+ "patch_size": 16,
+ "problem_type": "single_label_classification",
+ "qkv_bias": true,
+ "torch_dtype": "float32",
+ "transformers_version": "4.47.0"
+ }
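
The config describes a standard ViT-Base encoder with a two-class classification head. A minimal sketch for loading it and inspecting the label mapping (repo id assumed, as above):

```python
from transformers import ViTForImageClassification

# Assumed repo id; a local path to this repository works as well.
model = ViTForImageClassification.from_pretrained("gsandle92/vit-base-beans-demo-v5")

print(model.config.id2label)    # {0: 'negative', 1: 'positive'}
print(model.config.num_labels)  # 2
print(model.config.hidden_size, model.config.num_hidden_layers)  # 768 12
```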
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5debb94f820dfbdbb03b6e6e875f9afdd644efe80fb1e0edae48cf65813bd4c8
+ size 343223968
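
The weights themselves live in Git LFS, so the repository only tracks the pointer above (spec version, SHA-256 oid, size ≈ 343 MB). A hedged sketch for fetching the actual file with `huggingface_hub` (repo id assumed):

```python
from huggingface_hub import hf_hub_download

# Assumed repo id; downloads the LFS-backed weights (~343 MB) into the local HF cache.
weights_path = hf_hub_download(
    repo_id="gsandle92/vit-base-beans-demo-v5",
    filename="model.safetensors",
)
print(weights_path)
```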
preprocessor_config.json ADDED
@@ -0,0 +1,23 @@
+ {
+ "do_convert_rgb": null,
+ "do_normalize": true,
+ "do_rescale": true,
+ "do_resize": true,
+ "image_mean": [
+ 0.5,
+ 0.5,
+ 0.5
+ ],
+ "image_processor_type": "ViTFeatureExtractor",
+ "image_std": [
+ 0.5,
+ 0.5,
+ 0.5
+ ],
+ "resample": 2,
+ "rescale_factor": 0.00392156862745098,
+ "size": {
+ "height": 224,
+ "width": 224
+ }
+ }
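
A sketch of applying this preprocessing (resize to 224×224, rescale by 1/255, normalize with mean = std = 0.5) via the image processor saved in this repo. The repo id and image path are assumptions; `ViTImageProcessor` is the current class behind the `ViTFeatureExtractor` entry recorded above.

```python
from PIL import Image
from transformers import ViTImageProcessor

# Assumed repo id; loads the preprocessor_config.json shown above.
processor = ViTImageProcessor.from_pretrained("gsandle92/vit-base-beans-demo-v5")

image = Image.open("thumbnail.jpg").convert("RGB")   # placeholder path
inputs = processor(images=image, return_tensors="pt")
print(inputs["pixel_values"].shape)  # torch.Size([1, 3, 224, 224])
```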
runs/Dec12_00-29-14_8d603ce634bd/events.out.tfevents.1733963658.8d603ce634bd ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1dd65655c9956d8c15c15c86602fafc43f96424ce1119b7647effe2591d82f7d
+ size 9996
runs/Dec12_01-01-17_8d603ce634bd/events.out.tfevents.1733965296.8d603ce634bd ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:720f0010caade76d5235809dec456b1c83ac321445b720f2c0aef3f3a651bb94
+ size 5018
runs/Dec12_01-04-35_8d603ce634bd/events.out.tfevents.1733965485.8d603ce634bd ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c9a5fe9a83efb291168fff5c22e11d3015ae101df0c626fd0d5ec0e9fe19c0f5
+ size 9996
runs/Dec12_01-13-50_8d603ce634bd/events.out.tfevents.1733966031.8d603ce634bd ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:013670751f6a0abfde00f99f6dfa3ad3184d77cec141361bcee0c11fdaf22472
+ size 7088
runs/Dec12_01-16-21_8d603ce634bd/events.out.tfevents.1733966187.8d603ce634bd ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fed525cdff0120f95430dc70848dd53f420bf9dd0f549c2c10a893bde24885af
+ size 32369
runs/Dec12_01-23-00_8d603ce634bd/events.out.tfevents.1733966588.8d603ce634bd ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7456a000d7a2ee0b2e45dd140ce30849b60d6a2ec769969627447ebd24868a29
+ size 18803
train_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "epoch": 1.0,
+ "total_flos": 7.023873938666619e+17,
+ "train_loss": 0.09499678115575612,
+ "train_runtime": 238.2789,
+ "train_samples_per_second": 38.039,
+ "train_steps_per_second": 2.38
+ }
trainer_state.json ADDED
@@ -0,0 +1,479 @@
+ {
+ "best_metric": 0.09835900366306305,
+ "best_model_checkpoint": "./vit-base-beans-demo-v5/checkpoint-500",
+ "epoch": 1.0,
+ "eval_steps": 100,
+ "global_step": 567,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.01763668430335097,
+ "grad_norm": 0.15796910226345062,
+ "learning_rate": 0.00019647266313932982,
+ "loss": 0.1281,
+ "step": 10
+ },
+ {
+ "epoch": 0.03527336860670194,
+ "grad_norm": 0.291827917098999,
+ "learning_rate": 0.0001929453262786596,
+ "loss": 0.1139,
+ "step": 20
+ },
+ {
+ "epoch": 0.05291005291005291,
+ "grad_norm": 0.489213764667511,
+ "learning_rate": 0.00018941798941798943,
+ "loss": 0.1035,
+ "step": 30
+ },
+ {
+ "epoch": 0.07054673721340388,
+ "grad_norm": 0.11527382582426071,
+ "learning_rate": 0.00018589065255731924,
+ "loss": 0.1207,
+ "step": 40
+ },
+ {
+ "epoch": 0.08818342151675485,
+ "grad_norm": 0.15595707297325134,
+ "learning_rate": 0.00018236331569664903,
+ "loss": 0.1186,
+ "step": 50
+ },
+ {
+ "epoch": 0.10582010582010581,
+ "grad_norm": 0.7361170053482056,
+ "learning_rate": 0.00017883597883597884,
+ "loss": 0.1058,
+ "step": 60
+ },
+ {
+ "epoch": 0.12345679012345678,
+ "grad_norm": 0.289522647857666,
+ "learning_rate": 0.00017530864197530866,
+ "loss": 0.0913,
+ "step": 70
+ },
+ {
+ "epoch": 0.14109347442680775,
+ "grad_norm": 0.0984591692686081,
+ "learning_rate": 0.00017178130511463847,
+ "loss": 0.0928,
+ "step": 80
+ },
+ {
+ "epoch": 0.15873015873015872,
+ "grad_norm": 0.08082891255617142,
+ "learning_rate": 0.00016825396825396826,
+ "loss": 0.0829,
+ "step": 90
+ },
+ {
+ "epoch": 0.1763668430335097,
+ "grad_norm": 0.5304596424102783,
+ "learning_rate": 0.00016472663139329807,
+ "loss": 0.1192,
+ "step": 100
+ },
+ {
+ "epoch": 0.1763668430335097,
+ "eval_accuracy": 0.9692460317460317,
+ "eval_loss": 0.1157514676451683,
+ "eval_runtime": 12.4102,
+ "eval_samples_per_second": 81.224,
+ "eval_steps_per_second": 10.153,
+ "step": 100
+ },
+ {
+ "epoch": 0.19400352733686066,
+ "grad_norm": 0.2734580636024475,
+ "learning_rate": 0.0001611992945326279,
+ "loss": 0.0381,
+ "step": 110
+ },
+ {
+ "epoch": 0.21164021164021163,
+ "grad_norm": 1.1482329368591309,
+ "learning_rate": 0.00015767195767195767,
+ "loss": 0.1579,
+ "step": 120
+ },
+ {
+ "epoch": 0.2292768959435626,
+ "grad_norm": 0.29367437958717346,
+ "learning_rate": 0.0001541446208112875,
+ "loss": 0.1296,
+ "step": 130
+ },
+ {
+ "epoch": 0.24691358024691357,
+ "grad_norm": 0.4296620190143585,
+ "learning_rate": 0.0001506172839506173,
+ "loss": 0.0876,
+ "step": 140
+ },
+ {
+ "epoch": 0.26455026455026454,
+ "grad_norm": 0.430569589138031,
+ "learning_rate": 0.0001470899470899471,
+ "loss": 0.1431,
+ "step": 150
+ },
+ {
+ "epoch": 0.2821869488536155,
+ "grad_norm": 0.17311625182628632,
+ "learning_rate": 0.0001435626102292769,
+ "loss": 0.1194,
+ "step": 160
+ },
+ {
+ "epoch": 0.2998236331569665,
+ "grad_norm": 0.21777546405792236,
+ "learning_rate": 0.00014003527336860672,
+ "loss": 0.1202,
+ "step": 170
+ },
+ {
+ "epoch": 0.31746031746031744,
+ "grad_norm": 2.8384833335876465,
+ "learning_rate": 0.0001365079365079365,
+ "loss": 0.1352,
+ "step": 180
+ },
+ {
+ "epoch": 0.3350970017636684,
+ "grad_norm": 0.3133362829685211,
+ "learning_rate": 0.00013298059964726632,
+ "loss": 0.0935,
+ "step": 190
+ },
+ {
+ "epoch": 0.3527336860670194,
+ "grad_norm": 0.27794110774993896,
+ "learning_rate": 0.00012945326278659614,
+ "loss": 0.0734,
+ "step": 200
+ },
+ {
+ "epoch": 0.3527336860670194,
+ "eval_accuracy": 0.9702380952380952,
+ "eval_loss": 0.1267656832933426,
+ "eval_runtime": 12.6893,
+ "eval_samples_per_second": 79.437,
+ "eval_steps_per_second": 9.93,
+ "step": 200
+ },
+ {
+ "epoch": 0.37037037037037035,
+ "grad_norm": 0.12122131884098053,
+ "learning_rate": 0.00012592592592592592,
+ "loss": 0.1548,
+ "step": 210
+ },
+ {
+ "epoch": 0.3880070546737213,
+ "grad_norm": 0.14606191217899323,
+ "learning_rate": 0.00012239858906525574,
+ "loss": 0.0166,
+ "step": 220
+ },
+ {
+ "epoch": 0.4056437389770723,
+ "grad_norm": 0.118919737637043,
+ "learning_rate": 0.00011887125220458555,
+ "loss": 0.1295,
+ "step": 230
+ },
+ {
+ "epoch": 0.42328042328042326,
+ "grad_norm": 0.14326384663581848,
+ "learning_rate": 0.00011534391534391535,
+ "loss": 0.1287,
+ "step": 240
+ },
+ {
+ "epoch": 0.4409171075837742,
+ "grad_norm": 1.4116628170013428,
+ "learning_rate": 0.00011181657848324515,
+ "loss": 0.148,
+ "step": 250
+ },
+ {
+ "epoch": 0.4585537918871252,
+ "grad_norm": 0.18267543613910675,
+ "learning_rate": 0.00010828924162257497,
+ "loss": 0.1161,
+ "step": 260
+ },
+ {
+ "epoch": 0.47619047619047616,
+ "grad_norm": 0.6114774346351624,
+ "learning_rate": 0.00010476190476190477,
+ "loss": 0.0778,
+ "step": 270
+ },
+ {
+ "epoch": 0.49382716049382713,
+ "grad_norm": 0.10120945423841476,
+ "learning_rate": 0.00010123456790123458,
+ "loss": 0.026,
+ "step": 280
+ },
+ {
+ "epoch": 0.5114638447971781,
+ "grad_norm": 0.5115650296211243,
+ "learning_rate": 9.770723104056437e-05,
+ "loss": 0.0964,
+ "step": 290
+ },
+ {
+ "epoch": 0.5291005291005291,
+ "grad_norm": 0.2577248513698578,
+ "learning_rate": 9.417989417989419e-05,
+ "loss": 0.0701,
+ "step": 300
+ },
+ {
+ "epoch": 0.5291005291005291,
+ "eval_accuracy": 0.9672619047619048,
+ "eval_loss": 0.1057305857539177,
+ "eval_runtime": 12.4383,
+ "eval_samples_per_second": 81.04,
+ "eval_steps_per_second": 10.13,
+ "step": 300
+ },
+ {
+ "epoch": 0.54673721340388,
+ "grad_norm": 1.4628558158874512,
+ "learning_rate": 9.065255731922399e-05,
+ "loss": 0.1241,
+ "step": 310
+ },
+ {
+ "epoch": 0.564373897707231,
+ "grad_norm": 0.4355175793170929,
+ "learning_rate": 8.712522045855379e-05,
+ "loss": 0.125,
+ "step": 320
+ },
+ {
+ "epoch": 0.582010582010582,
+ "grad_norm": 0.09273388981819153,
+ "learning_rate": 8.35978835978836e-05,
+ "loss": 0.0728,
+ "step": 330
+ },
+ {
+ "epoch": 0.599647266313933,
+ "grad_norm": 0.777564525604248,
+ "learning_rate": 8.00705467372134e-05,
+ "loss": 0.0974,
+ "step": 340
+ },
+ {
+ "epoch": 0.6172839506172839,
+ "grad_norm": 0.8731770515441895,
+ "learning_rate": 7.65432098765432e-05,
+ "loss": 0.1852,
+ "step": 350
+ },
+ {
+ "epoch": 0.6349206349206349,
+ "grad_norm": 0.18054471909999847,
+ "learning_rate": 7.301587301587302e-05,
+ "loss": 0.0722,
+ "step": 360
+ },
+ {
+ "epoch": 0.6525573192239859,
+ "grad_norm": 0.10277850925922394,
+ "learning_rate": 6.948853615520282e-05,
+ "loss": 0.0705,
+ "step": 370
+ },
+ {
+ "epoch": 0.6701940035273368,
+ "grad_norm": 0.2326202243566513,
+ "learning_rate": 6.596119929453263e-05,
+ "loss": 0.0813,
+ "step": 380
+ },
+ {
+ "epoch": 0.6878306878306878,
+ "grad_norm": 0.4453868865966797,
+ "learning_rate": 6.243386243386243e-05,
+ "loss": 0.0443,
+ "step": 390
+ },
+ {
+ "epoch": 0.7054673721340388,
+ "grad_norm": 0.18326041102409363,
+ "learning_rate": 5.890652557319224e-05,
+ "loss": 0.1107,
+ "step": 400
+ },
+ {
+ "epoch": 0.7054673721340388,
+ "eval_accuracy": 0.9722222222222222,
+ "eval_loss": 0.10807543247938156,
+ "eval_runtime": 12.4927,
+ "eval_samples_per_second": 80.687,
+ "eval_steps_per_second": 10.086,
+ "step": 400
+ },
+ {
+ "epoch": 0.7231040564373897,
+ "grad_norm": 0.08458270132541656,
+ "learning_rate": 5.537918871252204e-05,
+ "loss": 0.0699,
+ "step": 410
+ },
+ {
+ "epoch": 0.7407407407407407,
+ "grad_norm": 0.3042624294757843,
+ "learning_rate": 5.185185185185185e-05,
+ "loss": 0.0649,
+ "step": 420
+ },
+ {
+ "epoch": 0.7583774250440917,
+ "grad_norm": 0.1482134908437729,
+ "learning_rate": 4.832451499118166e-05,
+ "loss": 0.0652,
+ "step": 430
+ },
+ {
+ "epoch": 0.7760141093474426,
+ "grad_norm": 0.08534322679042816,
+ "learning_rate": 4.4797178130511465e-05,
+ "loss": 0.0572,
+ "step": 440
+ },
+ {
+ "epoch": 0.7936507936507936,
+ "grad_norm": 0.09549910575151443,
+ "learning_rate": 4.126984126984127e-05,
+ "loss": 0.0527,
+ "step": 450
+ },
+ {
+ "epoch": 0.8112874779541446,
+ "grad_norm": 1.4032962322235107,
+ "learning_rate": 3.7742504409171074e-05,
+ "loss": 0.0643,
+ "step": 460
+ },
+ {
+ "epoch": 0.8289241622574955,
+ "grad_norm": 0.5806692242622375,
+ "learning_rate": 3.421516754850088e-05,
+ "loss": 0.0699,
+ "step": 470
+ },
+ {
+ "epoch": 0.8465608465608465,
+ "grad_norm": 0.07148485630750656,
+ "learning_rate": 3.068783068783069e-05,
+ "loss": 0.1102,
+ "step": 480
+ },
+ {
+ "epoch": 0.8641975308641975,
+ "grad_norm": 0.6352106928825378,
+ "learning_rate": 2.7160493827160493e-05,
+ "loss": 0.0574,
+ "step": 490
+ },
+ {
+ "epoch": 0.8818342151675485,
+ "grad_norm": 0.8136057257652283,
+ "learning_rate": 2.36331569664903e-05,
+ "loss": 0.0413,
+ "step": 500
+ },
+ {
+ "epoch": 0.8818342151675485,
+ "eval_accuracy": 0.9761904761904762,
+ "eval_loss": 0.09835900366306305,
+ "eval_runtime": 12.4974,
+ "eval_samples_per_second": 80.657,
+ "eval_steps_per_second": 10.082,
+ "step": 500
+ },
+ {
+ "epoch": 0.8994708994708994,
+ "grad_norm": 0.11122062802314758,
+ "learning_rate": 2.0105820105820105e-05,
+ "loss": 0.0287,
+ "step": 510
+ },
+ {
+ "epoch": 0.9171075837742504,
+ "grad_norm": 0.08222879469394684,
+ "learning_rate": 1.6578483245149913e-05,
+ "loss": 0.0793,
+ "step": 520
+ },
+ {
+ "epoch": 0.9347442680776014,
+ "grad_norm": 0.08997409790754318,
+ "learning_rate": 1.3051146384479717e-05,
+ "loss": 0.0523,
+ "step": 530
+ },
+ {
+ "epoch": 0.9523809523809523,
+ "grad_norm": 0.14071638882160187,
+ "learning_rate": 9.523809523809523e-06,
+ "loss": 0.0828,
+ "step": 540
+ },
+ {
+ "epoch": 0.9700176366843033,
+ "grad_norm": 0.0937449261546135,
+ "learning_rate": 5.99647266313933e-06,
+ "loss": 0.1201,
+ "step": 550
+ },
+ {
+ "epoch": 0.9876543209876543,
+ "grad_norm": 2.3273768424987793,
+ "learning_rate": 2.469135802469136e-06,
+ "loss": 0.1994,
+ "step": 560
+ },
+ {
+ "epoch": 1.0,
+ "step": 567,
+ "total_flos": 7.023873938666619e+17,
+ "train_loss": 0.09499678115575612,
+ "train_runtime": 238.2789,
+ "train_samples_per_second": 38.039,
+ "train_steps_per_second": 2.38
+ }
+ ],
+ "logging_steps": 10,
+ "max_steps": 567,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 1,
+ "save_steps": 100,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": true
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 7.023873938666619e+17,
+ "train_batch_size": 16,
+ "trial_name": null,
+ "trial_params": null
+ }
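
trainer_state.json holds the full step-level log, including the five evaluation passes summarized in the README table. A small sketch for extracting that evaluation curve (assumes the file has been downloaded locally):

```python
import json

# Assumes trainer_state.json from this commit is available in the working directory.
with open("trainer_state.json") as f:
    state = json.load(f)

# Evaluation entries are the log_history records that carry eval_* keys.
eval_points = [
    (e["step"], e["eval_loss"], e["eval_accuracy"])
    for e in state["log_history"]
    if "eval_accuracy" in e
]
for step, loss, acc in eval_points:
    print(f"step {step:4d}  eval_loss={loss:.4f}  eval_accuracy={acc:.4f}")

print("best metric (eval loss):", state["best_metric"])
print("best checkpoint:", state["best_model_checkpoint"])
```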
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:317e053375713781e18532476ce219fb1b179036be89272eb6cb5737b34b20c6
+ size 5304