Augusto777 committed on
Commit
9d26d70
1 Parent(s): e6933e3

End of training

Browse files
README.md ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: apache-2.0
4
+ base_model: microsoft/swinv2-tiny-patch4-window8-256
5
+ tags:
6
+ - generated_from_trainer
7
+ datasets:
8
+ - imagefolder
9
+ metrics:
10
+ - accuracy
11
+ model-index:
12
+ - name: swinv2-tiny-patch4-window8-256-Ocular-Toxoplasmosis-DA
13
+ results:
14
+ - task:
15
+ name: Image Classification
16
+ type: image-classification
17
+ dataset:
18
+ name: imagefolder
19
+ type: imagefolder
20
+ config: default
21
+ split: validation
22
+ args: default
23
+ metrics:
24
+ - name: Accuracy
25
+ type: accuracy
26
+ value: 0.8548387096774194
27
+ ---
28
+
29
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
30
+ should probably proofread and complete it, then remove this comment. -->
31
+
32
+ # swinv2-tiny-patch4-window8-256-Ocular-Toxoplasmosis-DA
33
+
34
+ This model is a fine-tuned version of [microsoft/swinv2-tiny-patch4-window8-256](https://huggingface.co/microsoft/swinv2-tiny-patch4-window8-256) on the imagefolder dataset.
35
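+ It achieves the following results on the evaluation set (these are the metrics of the best checkpoint, step 256 / epoch 18.96, not of the final training step):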
36
+ - Loss: 0.5075
37
+ - Accuracy: 0.8548
38
+
39
+ ## Model description
40
+
41
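+ A Swin Transformer V2 image classifier (tiny variant, patch size 4, window size 8, 256×256 RGB input) fine-tuned for single-label classification into four classes: `active`, `active-inactive`, `healthy`, and `inactive`. Judging by the model name, the classes presumably describe ocular toxoplasmosis lesion status; more information is needed on the clinical definitions of each class.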
42
+
43
+ ## Intended uses & limitations
44
+
45
+ More information needed
46
+
47
+ ## Training and evaluation data
48
+
49
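+ The data was loaded with the `imagefolder` builder (config `default`) and evaluated on its `validation` split, which appears to contain 62 images (accuracy 0.8548 = 53/62). More information is needed on the data source, the training split size, and the class balance.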
50
+
51
+ ## Training procedure
52
+
53
+ ### Training hyperparameters
54
+
55
+ The following hyperparameters were used during training:
56
+ - learning_rate: 5e-05
57
+ - train_batch_size: 32
58
+ - eval_batch_size: 32
59
+ - seed: 42
60
+ - gradient_accumulation_steps: 4
61
+ - total_train_batch_size: 128
62
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
63
+ - lr_scheduler_type: linear
64
+ - lr_scheduler_warmup_ratio: 0.1
65
+ - num_epochs: 40
66
+
67
+ ### Training results
68
+
69
+ | Training Loss | Epoch | Step | Validation Loss | Accuracy |
70
+ |:-------------:|:-------:|:----:|:---------------:|:--------:|
71
+ | 1.3402 | 0.9630 | 13 | 1.1682 | 0.5484 |
72
+ | 1.1725 | 2.0 | 27 | 1.0025 | 0.6290 |
73
+ | 0.8824 | 2.9630 | 40 | 0.7644 | 0.6613 |
74
+ | 0.7342 | 4.0 | 54 | 0.5840 | 0.7258 |
75
+ | 0.6734 | 4.9630 | 67 | 0.6754 | 0.6452 |
76
+ | 0.5167 | 6.0 | 81 | 0.5904 | 0.6935 |
77
+ | 0.5009 | 6.9630 | 94 | 0.5549 | 0.6935 |
78
+ | 0.4988 | 8.0 | 108 | 0.6204 | 0.6774 |
79
+ | 0.3856 | 8.9630 | 121 | 0.4463 | 0.8226 |
80
+ | 0.4057 | 10.0 | 135 | 0.5232 | 0.7903 |
81
+ | 0.3929 | 10.9630 | 148 | 0.4580 | 0.8387 |
82
+ | 0.3638 | 12.0 | 162 | 0.5115 | 0.7742 |
83
+ | 0.3248 | 12.9630 | 175 | 0.5313 | 0.7742 |
84
+ | 0.2673 | 14.0 | 189 | 0.5203 | 0.7903 |
85
+ | 0.2922 | 14.9630 | 202 | 0.4315 | 0.8387 |
86
+ | 0.2803 | 16.0 | 216 | 0.4577 | 0.8387 |
87
+ | 0.2735 | 16.9630 | 229 | 0.5467 | 0.8065 |
88
+ | 0.2586 | 18.0 | 243 | 0.5236 | 0.8387 |
89
+ | 0.2366 | 18.9630 | 256 | 0.5075 | 0.8548 |
90
+ | 0.2347 | 20.0 | 270 | 0.5179 | 0.8387 |
91
+ | 0.2046 | 20.9630 | 283 | 0.5428 | 0.8387 |
92
+ | 0.2289 | 22.0 | 297 | 0.5748 | 0.8387 |
93
+ | 0.2195 | 22.9630 | 310 | 0.5969 | 0.8226 |
94
+ | 0.2224 | 24.0 | 324 | 0.6092 | 0.8226 |
95
+ | 0.2167 | 24.9630 | 337 | 0.6333 | 0.8226 |
96
+ | 0.1956 | 26.0 | 351 | 0.5993 | 0.8226 |
97
+ | 0.2174 | 26.9630 | 364 | 0.6063 | 0.8548 |
98
+ | 0.1999 | 28.0 | 378 | 0.6414 | 0.8387 |
99
+ | 0.1667 | 28.9630 | 391 | 0.6297 | 0.8387 |
100
+ | 0.1835 | 30.0 | 405 | 0.6149 | 0.8226 |
101
+ | 0.186 | 30.9630 | 418 | 0.6430 | 0.8387 |
102
+ | 0.1749 | 32.0 | 432 | 0.6678 | 0.8387 |
103
+ | 0.1663 | 32.9630 | 445 | 0.6829 | 0.8387 |
104
+ | 0.1557 | 34.0 | 459 | 0.6557 | 0.8387 |
105
+ | 0.1913 | 34.9630 | 472 | 0.6275 | 0.8387 |
106
+ | 0.1775 | 36.0 | 486 | 0.6555 | 0.8548 |
107
+ | 0.152 | 36.9630 | 499 | 0.6653 | 0.8548 |
108
+ | 0.1897 | 38.0 | 513 | 0.6682 | 0.8548 |
109
+ | 0.1589 | 38.5185 | 520 | 0.6679 | 0.8548 |
110
+
111
+
112
+ ### Framework versions
113
+
114
+ - Transformers 4.44.2
115
+ - Pytorch 2.4.1+cu121
116
+ - Datasets 3.0.1
117
+ - Tokenizers 0.19.1
all_results.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 38.51851851851852,
3
+ "eval_accuracy": 0.8548387096774194,
4
+ "eval_loss": 0.5075119137763977,
5
+ "eval_runtime": 2.3581,
6
+ "eval_samples_per_second": 26.293,
7
+ "eval_steps_per_second": 0.848,
8
+ "total_flos": 2.140878196703232e+18,
9
+ "train_loss": 0.35049390150950505,
10
+ "train_runtime": 3356.7171,
11
+ "train_samples_per_second": 20.353,
12
+ "train_steps_per_second": 0.155
13
+ }
config.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "microsoft/swinv2-tiny-patch4-window8-256",
3
+ "architectures": [
4
+ "Swinv2ForImageClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.0,
7
+ "depths": [
8
+ 2,
9
+ 2,
10
+ 6,
11
+ 2
12
+ ],
13
+ "drop_path_rate": 0.1,
14
+ "embed_dim": 96,
15
+ "encoder_stride": 32,
16
+ "hidden_act": "gelu",
17
+ "hidden_dropout_prob": 0.0,
18
+ "hidden_size": 768,
19
+ "id2label": {
20
+ "0": "active",
21
+ "1": "active-inactive",
22
+ "2": "healthy",
23
+ "3": "inactive"
24
+ },
25
+ "image_size": 256,
26
+ "initializer_range": 0.02,
27
+ "label2id": {
28
+ "active": 0,
29
+ "active-inactive": 1,
30
+ "healthy": 2,
31
+ "inactive": 3
32
+ },
33
+ "layer_norm_eps": 1e-05,
34
+ "mlp_ratio": 4.0,
35
+ "model_type": "swinv2",
36
+ "num_channels": 3,
37
+ "num_heads": [
38
+ 3,
39
+ 6,
40
+ 12,
41
+ 24
42
+ ],
43
+ "num_layers": 4,
44
+ "out_features": [
45
+ "stage4"
46
+ ],
47
+ "out_indices": [
48
+ 4
49
+ ],
50
+ "patch_size": 4,
51
+ "path_norm": true,
52
+ "pretrained_window_sizes": [
53
+ 0,
54
+ 0,
55
+ 0,
56
+ 0
57
+ ],
58
+ "problem_type": "single_label_classification",
59
+ "qkv_bias": true,
60
+ "stage_names": [
61
+ "stem",
62
+ "stage1",
63
+ "stage2",
64
+ "stage3",
65
+ "stage4"
66
+ ],
67
+ "torch_dtype": "float32",
68
+ "transformers_version": "4.44.2",
69
+ "use_absolute_embeddings": false,
70
+ "window_size": 8
71
+ }
eval_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 38.51851851851852,
3
+ "eval_accuracy": 0.8548387096774194,
4
+ "eval_loss": 0.5075119137763977,
5
+ "eval_runtime": 2.3581,
6
+ "eval_samples_per_second": 26.293,
7
+ "eval_steps_per_second": 0.848
8
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00b20367a51932debc487a0f982258dfd88b15d58596b4f5de73fdd897b2ee8e
3
+ size 110356296
preprocessor_config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "do_rescale": true,
4
+ "do_resize": true,
5
+ "image_mean": [
6
+ 0.485,
7
+ 0.456,
8
+ 0.406
9
+ ],
10
+ "image_processor_type": "ViTImageProcessor",
11
+ "image_std": [
12
+ 0.229,
13
+ 0.224,
14
+ 0.225
15
+ ],
16
+ "resample": 3,
17
+ "rescale_factor": 0.00392156862745098,
18
+ "size": {
19
+ "height": 256,
20
+ "width": 256
21
+ }
22
+ }
runs/Oct13_15-37-14_496ec48e7779/events.out.tfevents.1728833842.496ec48e7779.2126.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:930cd8a74a0a067ede15c21a4388bd1488e36af3369c4d196c3901505231464c
3
+ size 29395
runs/Oct13_15-37-14_496ec48e7779/events.out.tfevents.1728837202.496ec48e7779.2126.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b91f5baca1ecf4952c704c3b9e1b037a88008e9411beac13764f5df0ec37d18f
3
+ size 411
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 38.51851851851852,
3
+ "total_flos": 2.140878196703232e+18,
4
+ "train_loss": 0.35049390150950505,
5
+ "train_runtime": 3356.7171,
6
+ "train_samples_per_second": 20.353,
7
+ "train_steps_per_second": 0.155
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,757 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.8548387096774194,
3
+ "best_model_checkpoint": "swinv2-tiny-patch4-window8-256-Ocular-Toxoplasmosis-DA/checkpoint-256",
4
+ "epoch": 38.51851851851852,
5
+ "eval_steps": 500,
6
+ "global_step": 520,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.7407407407407407,
13
+ "grad_norm": 4.370074272155762,
14
+ "learning_rate": 9.615384615384616e-06,
15
+ "loss": 1.3402,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.9629629629629629,
20
+ "eval_accuracy": 0.5483870967741935,
21
+ "eval_loss": 1.1682088375091553,
22
+ "eval_runtime": 2.3908,
23
+ "eval_samples_per_second": 25.932,
24
+ "eval_steps_per_second": 0.837,
25
+ "step": 13
26
+ },
27
+ {
28
+ "epoch": 1.4814814814814814,
29
+ "grad_norm": 7.86944580078125,
30
+ "learning_rate": 1.923076923076923e-05,
31
+ "loss": 1.1725,
32
+ "step": 20
33
+ },
34
+ {
35
+ "epoch": 2.0,
36
+ "eval_accuracy": 0.6290322580645161,
37
+ "eval_loss": 1.0024793148040771,
38
+ "eval_runtime": 3.324,
39
+ "eval_samples_per_second": 18.652,
40
+ "eval_steps_per_second": 0.602,
41
+ "step": 27
42
+ },
43
+ {
44
+ "epoch": 2.2222222222222223,
45
+ "grad_norm": 4.94896936416626,
46
+ "learning_rate": 2.8846153846153845e-05,
47
+ "loss": 1.0671,
48
+ "step": 30
49
+ },
50
+ {
51
+ "epoch": 2.962962962962963,
52
+ "grad_norm": 4.811951160430908,
53
+ "learning_rate": 3.846153846153846e-05,
54
+ "loss": 0.8824,
55
+ "step": 40
56
+ },
57
+ {
58
+ "epoch": 2.962962962962963,
59
+ "eval_accuracy": 0.6612903225806451,
60
+ "eval_loss": 0.7644360065460205,
61
+ "eval_runtime": 3.388,
62
+ "eval_samples_per_second": 18.3,
63
+ "eval_steps_per_second": 0.59,
64
+ "step": 40
65
+ },
66
+ {
67
+ "epoch": 3.7037037037037037,
68
+ "grad_norm": 7.974093914031982,
69
+ "learning_rate": 4.8076923076923084e-05,
70
+ "loss": 0.7342,
71
+ "step": 50
72
+ },
73
+ {
74
+ "epoch": 4.0,
75
+ "eval_accuracy": 0.7258064516129032,
76
+ "eval_loss": 0.5839676260948181,
77
+ "eval_runtime": 3.0543,
78
+ "eval_samples_per_second": 20.299,
79
+ "eval_steps_per_second": 0.655,
80
+ "step": 54
81
+ },
82
+ {
83
+ "epoch": 4.444444444444445,
84
+ "grad_norm": 8.472794532775879,
85
+ "learning_rate": 4.9145299145299147e-05,
86
+ "loss": 0.6734,
87
+ "step": 60
88
+ },
89
+ {
90
+ "epoch": 4.962962962962963,
91
+ "eval_accuracy": 0.6451612903225806,
92
+ "eval_loss": 0.6753666400909424,
93
+ "eval_runtime": 2.3642,
94
+ "eval_samples_per_second": 26.225,
95
+ "eval_steps_per_second": 0.846,
96
+ "step": 67
97
+ },
98
+ {
99
+ "epoch": 5.185185185185185,
100
+ "grad_norm": 9.15774917602539,
101
+ "learning_rate": 4.8076923076923084e-05,
102
+ "loss": 0.6373,
103
+ "step": 70
104
+ },
105
+ {
106
+ "epoch": 5.925925925925926,
107
+ "grad_norm": 12.02450942993164,
108
+ "learning_rate": 4.700854700854701e-05,
109
+ "loss": 0.5167,
110
+ "step": 80
111
+ },
112
+ {
113
+ "epoch": 6.0,
114
+ "eval_accuracy": 0.6935483870967742,
115
+ "eval_loss": 0.5904402136802673,
116
+ "eval_runtime": 2.3866,
117
+ "eval_samples_per_second": 25.979,
118
+ "eval_steps_per_second": 0.838,
119
+ "step": 81
120
+ },
121
+ {
122
+ "epoch": 6.666666666666667,
123
+ "grad_norm": 7.348090648651123,
124
+ "learning_rate": 4.594017094017094e-05,
125
+ "loss": 0.5009,
126
+ "step": 90
127
+ },
128
+ {
129
+ "epoch": 6.962962962962963,
130
+ "eval_accuracy": 0.6935483870967742,
131
+ "eval_loss": 0.5549384355545044,
132
+ "eval_runtime": 2.9982,
133
+ "eval_samples_per_second": 20.679,
134
+ "eval_steps_per_second": 0.667,
135
+ "step": 94
136
+ },
137
+ {
138
+ "epoch": 7.407407407407407,
139
+ "grad_norm": 5.642479419708252,
140
+ "learning_rate": 4.4871794871794874e-05,
141
+ "loss": 0.4988,
142
+ "step": 100
143
+ },
144
+ {
145
+ "epoch": 8.0,
146
+ "eval_accuracy": 0.6774193548387096,
147
+ "eval_loss": 0.620449423789978,
148
+ "eval_runtime": 2.4283,
149
+ "eval_samples_per_second": 25.532,
150
+ "eval_steps_per_second": 0.824,
151
+ "step": 108
152
+ },
153
+ {
154
+ "epoch": 8.148148148148149,
155
+ "grad_norm": 6.128896713256836,
156
+ "learning_rate": 4.3803418803418805e-05,
157
+ "loss": 0.4619,
158
+ "step": 110
159
+ },
160
+ {
161
+ "epoch": 8.88888888888889,
162
+ "grad_norm": 7.555347919464111,
163
+ "learning_rate": 4.2735042735042735e-05,
164
+ "loss": 0.3856,
165
+ "step": 120
166
+ },
167
+ {
168
+ "epoch": 8.962962962962964,
169
+ "eval_accuracy": 0.8225806451612904,
170
+ "eval_loss": 0.44631102681159973,
171
+ "eval_runtime": 2.3506,
172
+ "eval_samples_per_second": 26.376,
173
+ "eval_steps_per_second": 0.851,
174
+ "step": 121
175
+ },
176
+ {
177
+ "epoch": 9.62962962962963,
178
+ "grad_norm": 9.627432823181152,
179
+ "learning_rate": 4.166666666666667e-05,
180
+ "loss": 0.4057,
181
+ "step": 130
182
+ },
183
+ {
184
+ "epoch": 10.0,
185
+ "eval_accuracy": 0.7903225806451613,
186
+ "eval_loss": 0.5231879353523254,
187
+ "eval_runtime": 3.1544,
188
+ "eval_samples_per_second": 19.655,
189
+ "eval_steps_per_second": 0.634,
190
+ "step": 135
191
+ },
192
+ {
193
+ "epoch": 10.37037037037037,
194
+ "grad_norm": 8.669109344482422,
195
+ "learning_rate": 4.05982905982906e-05,
196
+ "loss": 0.3929,
197
+ "step": 140
198
+ },
199
+ {
200
+ "epoch": 10.962962962962964,
201
+ "eval_accuracy": 0.8387096774193549,
202
+ "eval_loss": 0.45801177620887756,
203
+ "eval_runtime": 2.3878,
204
+ "eval_samples_per_second": 25.965,
205
+ "eval_steps_per_second": 0.838,
206
+ "step": 148
207
+ },
208
+ {
209
+ "epoch": 11.11111111111111,
210
+ "grad_norm": 6.289756774902344,
211
+ "learning_rate": 3.952991452991453e-05,
212
+ "loss": 0.3673,
213
+ "step": 150
214
+ },
215
+ {
216
+ "epoch": 11.851851851851851,
217
+ "grad_norm": 12.90579605102539,
218
+ "learning_rate": 3.846153846153846e-05,
219
+ "loss": 0.3638,
220
+ "step": 160
221
+ },
222
+ {
223
+ "epoch": 12.0,
224
+ "eval_accuracy": 0.7741935483870968,
225
+ "eval_loss": 0.5114619135856628,
226
+ "eval_runtime": 3.3569,
227
+ "eval_samples_per_second": 18.47,
228
+ "eval_steps_per_second": 0.596,
229
+ "step": 162
230
+ },
231
+ {
232
+ "epoch": 12.592592592592592,
233
+ "grad_norm": 10.698553085327148,
234
+ "learning_rate": 3.739316239316239e-05,
235
+ "loss": 0.3248,
236
+ "step": 170
237
+ },
238
+ {
239
+ "epoch": 12.962962962962964,
240
+ "eval_accuracy": 0.7741935483870968,
241
+ "eval_loss": 0.5312773585319519,
242
+ "eval_runtime": 2.4335,
243
+ "eval_samples_per_second": 25.478,
244
+ "eval_steps_per_second": 0.822,
245
+ "step": 175
246
+ },
247
+ {
248
+ "epoch": 13.333333333333334,
249
+ "grad_norm": 6.529489994049072,
250
+ "learning_rate": 3.6324786324786323e-05,
251
+ "loss": 0.2673,
252
+ "step": 180
253
+ },
254
+ {
255
+ "epoch": 14.0,
256
+ "eval_accuracy": 0.7903225806451613,
257
+ "eval_loss": 0.5203306674957275,
258
+ "eval_runtime": 3.4828,
259
+ "eval_samples_per_second": 17.802,
260
+ "eval_steps_per_second": 0.574,
261
+ "step": 189
262
+ },
263
+ {
264
+ "epoch": 14.074074074074074,
265
+ "grad_norm": 6.994911193847656,
266
+ "learning_rate": 3.525641025641026e-05,
267
+ "loss": 0.3216,
268
+ "step": 190
269
+ },
270
+ {
271
+ "epoch": 14.814814814814815,
272
+ "grad_norm": 9.194233894348145,
273
+ "learning_rate": 3.418803418803419e-05,
274
+ "loss": 0.2922,
275
+ "step": 200
276
+ },
277
+ {
278
+ "epoch": 14.962962962962964,
279
+ "eval_accuracy": 0.8387096774193549,
280
+ "eval_loss": 0.4315454959869385,
281
+ "eval_runtime": 2.3822,
282
+ "eval_samples_per_second": 26.026,
283
+ "eval_steps_per_second": 0.84,
284
+ "step": 202
285
+ },
286
+ {
287
+ "epoch": 15.555555555555555,
288
+ "grad_norm": 6.076256275177002,
289
+ "learning_rate": 3.311965811965812e-05,
290
+ "loss": 0.2803,
291
+ "step": 210
292
+ },
293
+ {
294
+ "epoch": 16.0,
295
+ "eval_accuracy": 0.8387096774193549,
296
+ "eval_loss": 0.4577220380306244,
297
+ "eval_runtime": 2.9439,
298
+ "eval_samples_per_second": 21.06,
299
+ "eval_steps_per_second": 0.679,
300
+ "step": 216
301
+ },
302
+ {
303
+ "epoch": 16.296296296296298,
304
+ "grad_norm": 12.038761138916016,
305
+ "learning_rate": 3.205128205128206e-05,
306
+ "loss": 0.2735,
307
+ "step": 220
308
+ },
309
+ {
310
+ "epoch": 16.962962962962962,
311
+ "eval_accuracy": 0.8064516129032258,
312
+ "eval_loss": 0.5466907024383545,
313
+ "eval_runtime": 2.3229,
314
+ "eval_samples_per_second": 26.691,
315
+ "eval_steps_per_second": 0.861,
316
+ "step": 229
317
+ },
318
+ {
319
+ "epoch": 17.037037037037038,
320
+ "grad_norm": 8.897506713867188,
321
+ "learning_rate": 3.098290598290599e-05,
322
+ "loss": 0.2776,
323
+ "step": 230
324
+ },
325
+ {
326
+ "epoch": 17.77777777777778,
327
+ "grad_norm": 9.66178035736084,
328
+ "learning_rate": 2.9914529914529915e-05,
329
+ "loss": 0.2586,
330
+ "step": 240
331
+ },
332
+ {
333
+ "epoch": 18.0,
334
+ "eval_accuracy": 0.8387096774193549,
335
+ "eval_loss": 0.5236416459083557,
336
+ "eval_runtime": 3.4253,
337
+ "eval_samples_per_second": 18.101,
338
+ "eval_steps_per_second": 0.584,
339
+ "step": 243
340
+ },
341
+ {
342
+ "epoch": 18.51851851851852,
343
+ "grad_norm": 7.729655742645264,
344
+ "learning_rate": 2.8846153846153845e-05,
345
+ "loss": 0.2366,
346
+ "step": 250
347
+ },
348
+ {
349
+ "epoch": 18.962962962962962,
350
+ "eval_accuracy": 0.8548387096774194,
351
+ "eval_loss": 0.5075119137763977,
352
+ "eval_runtime": 2.3943,
353
+ "eval_samples_per_second": 25.895,
354
+ "eval_steps_per_second": 0.835,
355
+ "step": 256
356
+ },
357
+ {
358
+ "epoch": 19.25925925925926,
359
+ "grad_norm": 11.543585777282715,
360
+ "learning_rate": 2.777777777777778e-05,
361
+ "loss": 0.252,
362
+ "step": 260
363
+ },
364
+ {
365
+ "epoch": 20.0,
366
+ "grad_norm": 7.877120494842529,
367
+ "learning_rate": 2.670940170940171e-05,
368
+ "loss": 0.2347,
369
+ "step": 270
370
+ },
371
+ {
372
+ "epoch": 20.0,
373
+ "eval_accuracy": 0.8387096774193549,
374
+ "eval_loss": 0.5178562998771667,
375
+ "eval_runtime": 2.4124,
376
+ "eval_samples_per_second": 25.701,
377
+ "eval_steps_per_second": 0.829,
378
+ "step": 270
379
+ },
380
+ {
381
+ "epoch": 20.74074074074074,
382
+ "grad_norm": 7.83768892288208,
383
+ "learning_rate": 2.564102564102564e-05,
384
+ "loss": 0.2046,
385
+ "step": 280
386
+ },
387
+ {
388
+ "epoch": 20.962962962962962,
389
+ "eval_accuracy": 0.8387096774193549,
390
+ "eval_loss": 0.5427502393722534,
391
+ "eval_runtime": 3.4728,
392
+ "eval_samples_per_second": 17.853,
393
+ "eval_steps_per_second": 0.576,
394
+ "step": 283
395
+ },
396
+ {
397
+ "epoch": 21.48148148148148,
398
+ "grad_norm": 7.919957637786865,
399
+ "learning_rate": 2.4572649572649573e-05,
400
+ "loss": 0.2289,
401
+ "step": 290
402
+ },
403
+ {
404
+ "epoch": 22.0,
405
+ "eval_accuracy": 0.8387096774193549,
406
+ "eval_loss": 0.57480788230896,
407
+ "eval_runtime": 2.4021,
408
+ "eval_samples_per_second": 25.811,
409
+ "eval_steps_per_second": 0.833,
410
+ "step": 297
411
+ },
412
+ {
413
+ "epoch": 22.22222222222222,
414
+ "grad_norm": 8.665252685546875,
415
+ "learning_rate": 2.3504273504273504e-05,
416
+ "loss": 0.2394,
417
+ "step": 300
418
+ },
419
+ {
420
+ "epoch": 22.962962962962962,
421
+ "grad_norm": 7.902819633483887,
422
+ "learning_rate": 2.2435897435897437e-05,
423
+ "loss": 0.2195,
424
+ "step": 310
425
+ },
426
+ {
427
+ "epoch": 22.962962962962962,
428
+ "eval_accuracy": 0.8225806451612904,
429
+ "eval_loss": 0.5968937277793884,
430
+ "eval_runtime": 3.4133,
431
+ "eval_samples_per_second": 18.164,
432
+ "eval_steps_per_second": 0.586,
433
+ "step": 310
434
+ },
435
+ {
436
+ "epoch": 23.703703703703702,
437
+ "grad_norm": 9.844597816467285,
438
+ "learning_rate": 2.1367521367521368e-05,
439
+ "loss": 0.2224,
440
+ "step": 320
441
+ },
442
+ {
443
+ "epoch": 24.0,
444
+ "eval_accuracy": 0.8225806451612904,
445
+ "eval_loss": 0.6092303991317749,
446
+ "eval_runtime": 2.3949,
447
+ "eval_samples_per_second": 25.888,
448
+ "eval_steps_per_second": 0.835,
449
+ "step": 324
450
+ },
451
+ {
452
+ "epoch": 24.444444444444443,
453
+ "grad_norm": 6.439063549041748,
454
+ "learning_rate": 2.02991452991453e-05,
455
+ "loss": 0.2167,
456
+ "step": 330
457
+ },
458
+ {
459
+ "epoch": 24.962962962962962,
460
+ "eval_accuracy": 0.8225806451612904,
461
+ "eval_loss": 0.6333113312721252,
462
+ "eval_runtime": 2.4482,
463
+ "eval_samples_per_second": 25.325,
464
+ "eval_steps_per_second": 0.817,
465
+ "step": 337
466
+ },
467
+ {
468
+ "epoch": 25.185185185185187,
469
+ "grad_norm": 8.865224838256836,
470
+ "learning_rate": 1.923076923076923e-05,
471
+ "loss": 0.2323,
472
+ "step": 340
473
+ },
474
+ {
475
+ "epoch": 25.925925925925927,
476
+ "grad_norm": 6.462991237640381,
477
+ "learning_rate": 1.8162393162393162e-05,
478
+ "loss": 0.1956,
479
+ "step": 350
480
+ },
481
+ {
482
+ "epoch": 26.0,
483
+ "eval_accuracy": 0.8225806451612904,
484
+ "eval_loss": 0.5993022322654724,
485
+ "eval_runtime": 2.3358,
486
+ "eval_samples_per_second": 26.543,
487
+ "eval_steps_per_second": 0.856,
488
+ "step": 351
489
+ },
490
+ {
491
+ "epoch": 26.666666666666668,
492
+ "grad_norm": 6.978143692016602,
493
+ "learning_rate": 1.7094017094017095e-05,
494
+ "loss": 0.2174,
495
+ "step": 360
496
+ },
497
+ {
498
+ "epoch": 26.962962962962962,
499
+ "eval_accuracy": 0.8548387096774194,
500
+ "eval_loss": 0.6063364744186401,
501
+ "eval_runtime": 2.3579,
502
+ "eval_samples_per_second": 26.295,
503
+ "eval_steps_per_second": 0.848,
504
+ "step": 364
505
+ },
506
+ {
507
+ "epoch": 27.40740740740741,
508
+ "grad_norm": 8.283989906311035,
509
+ "learning_rate": 1.602564102564103e-05,
510
+ "loss": 0.1999,
511
+ "step": 370
512
+ },
513
+ {
514
+ "epoch": 28.0,
515
+ "eval_accuracy": 0.8387096774193549,
516
+ "eval_loss": 0.6413679718971252,
517
+ "eval_runtime": 3.4435,
518
+ "eval_samples_per_second": 18.005,
519
+ "eval_steps_per_second": 0.581,
520
+ "step": 378
521
+ },
522
+ {
523
+ "epoch": 28.14814814814815,
524
+ "grad_norm": 5.77383279800415,
525
+ "learning_rate": 1.4957264957264958e-05,
526
+ "loss": 0.1783,
527
+ "step": 380
528
+ },
529
+ {
530
+ "epoch": 28.88888888888889,
531
+ "grad_norm": 7.4615654945373535,
532
+ "learning_rate": 1.388888888888889e-05,
533
+ "loss": 0.1667,
534
+ "step": 390
535
+ },
536
+ {
537
+ "epoch": 28.962962962962962,
538
+ "eval_accuracy": 0.8387096774193549,
539
+ "eval_loss": 0.6296666860580444,
540
+ "eval_runtime": 2.3485,
541
+ "eval_samples_per_second": 26.4,
542
+ "eval_steps_per_second": 0.852,
543
+ "step": 391
544
+ },
545
+ {
546
+ "epoch": 29.62962962962963,
547
+ "grad_norm": 9.373270034790039,
548
+ "learning_rate": 1.282051282051282e-05,
549
+ "loss": 0.1835,
550
+ "step": 400
551
+ },
552
+ {
553
+ "epoch": 30.0,
554
+ "eval_accuracy": 0.8225806451612904,
555
+ "eval_loss": 0.6148854494094849,
556
+ "eval_runtime": 3.1829,
557
+ "eval_samples_per_second": 19.479,
558
+ "eval_steps_per_second": 0.628,
559
+ "step": 405
560
+ },
561
+ {
562
+ "epoch": 30.37037037037037,
563
+ "grad_norm": 8.87562370300293,
564
+ "learning_rate": 1.1752136752136752e-05,
565
+ "loss": 0.186,
566
+ "step": 410
567
+ },
568
+ {
569
+ "epoch": 30.962962962962962,
570
+ "eval_accuracy": 0.8387096774193549,
571
+ "eval_loss": 0.6429581642150879,
572
+ "eval_runtime": 2.4503,
573
+ "eval_samples_per_second": 25.303,
574
+ "eval_steps_per_second": 0.816,
575
+ "step": 418
576
+ },
577
+ {
578
+ "epoch": 31.11111111111111,
579
+ "grad_norm": 5.281705856323242,
580
+ "learning_rate": 1.0683760683760684e-05,
581
+ "loss": 0.1706,
582
+ "step": 420
583
+ },
584
+ {
585
+ "epoch": 31.85185185185185,
586
+ "grad_norm": 4.753020286560059,
587
+ "learning_rate": 9.615384615384616e-06,
588
+ "loss": 0.1749,
589
+ "step": 430
590
+ },
591
+ {
592
+ "epoch": 32.0,
593
+ "eval_accuracy": 0.8387096774193549,
594
+ "eval_loss": 0.6677759885787964,
595
+ "eval_runtime": 2.3885,
596
+ "eval_samples_per_second": 25.957,
597
+ "eval_steps_per_second": 0.837,
598
+ "step": 432
599
+ },
600
+ {
601
+ "epoch": 32.592592592592595,
602
+ "grad_norm": 7.2512526512146,
603
+ "learning_rate": 8.547008547008548e-06,
604
+ "loss": 0.1663,
605
+ "step": 440
606
+ },
607
+ {
608
+ "epoch": 32.96296296296296,
609
+ "eval_accuracy": 0.8387096774193549,
610
+ "eval_loss": 0.6828835010528564,
611
+ "eval_runtime": 2.3483,
612
+ "eval_samples_per_second": 26.402,
613
+ "eval_steps_per_second": 0.852,
614
+ "step": 445
615
+ },
616
+ {
617
+ "epoch": 33.333333333333336,
618
+ "grad_norm": 9.678658485412598,
619
+ "learning_rate": 7.478632478632479e-06,
620
+ "loss": 0.1557,
621
+ "step": 450
622
+ },
623
+ {
624
+ "epoch": 34.0,
625
+ "eval_accuracy": 0.8387096774193549,
626
+ "eval_loss": 0.655702531337738,
627
+ "eval_runtime": 3.1084,
628
+ "eval_samples_per_second": 19.946,
629
+ "eval_steps_per_second": 0.643,
630
+ "step": 459
631
+ },
632
+ {
633
+ "epoch": 34.074074074074076,
634
+ "grad_norm": 5.886323928833008,
635
+ "learning_rate": 6.41025641025641e-06,
636
+ "loss": 0.2095,
637
+ "step": 460
638
+ },
639
+ {
640
+ "epoch": 34.81481481481482,
641
+ "grad_norm": 5.312963485717773,
642
+ "learning_rate": 5.341880341880342e-06,
643
+ "loss": 0.1913,
644
+ "step": 470
645
+ },
646
+ {
647
+ "epoch": 34.96296296296296,
648
+ "eval_accuracy": 0.8387096774193549,
649
+ "eval_loss": 0.6274862885475159,
650
+ "eval_runtime": 3.2878,
651
+ "eval_samples_per_second": 18.858,
652
+ "eval_steps_per_second": 0.608,
653
+ "step": 472
654
+ },
655
+ {
656
+ "epoch": 35.55555555555556,
657
+ "grad_norm": 7.064798355102539,
658
+ "learning_rate": 4.273504273504274e-06,
659
+ "loss": 0.1775,
660
+ "step": 480
661
+ },
662
+ {
663
+ "epoch": 36.0,
664
+ "eval_accuracy": 0.8548387096774194,
665
+ "eval_loss": 0.6554756760597229,
666
+ "eval_runtime": 2.4759,
667
+ "eval_samples_per_second": 25.041,
668
+ "eval_steps_per_second": 0.808,
669
+ "step": 486
670
+ },
671
+ {
672
+ "epoch": 36.2962962962963,
673
+ "grad_norm": 5.463845729827881,
674
+ "learning_rate": 3.205128205128205e-06,
675
+ "loss": 0.152,
676
+ "step": 490
677
+ },
678
+ {
679
+ "epoch": 36.96296296296296,
680
+ "eval_accuracy": 0.8548387096774194,
681
+ "eval_loss": 0.6653042435646057,
682
+ "eval_runtime": 3.3751,
683
+ "eval_samples_per_second": 18.37,
684
+ "eval_steps_per_second": 0.593,
685
+ "step": 499
686
+ },
687
+ {
688
+ "epoch": 37.03703703703704,
689
+ "grad_norm": 5.512512683868408,
690
+ "learning_rate": 2.136752136752137e-06,
691
+ "loss": 0.1681,
692
+ "step": 500
693
+ },
694
+ {
695
+ "epoch": 37.77777777777778,
696
+ "grad_norm": 6.535687446594238,
697
+ "learning_rate": 1.0683760683760685e-06,
698
+ "loss": 0.1897,
699
+ "step": 510
700
+ },
701
+ {
702
+ "epoch": 38.0,
703
+ "eval_accuracy": 0.8548387096774194,
704
+ "eval_loss": 0.6681959629058838,
705
+ "eval_runtime": 2.3421,
706
+ "eval_samples_per_second": 26.472,
707
+ "eval_steps_per_second": 0.854,
708
+ "step": 513
709
+ },
710
+ {
711
+ "epoch": 38.51851851851852,
712
+ "grad_norm": 8.290581703186035,
713
+ "learning_rate": 0.0,
714
+ "loss": 0.1589,
715
+ "step": 520
716
+ },
717
+ {
718
+ "epoch": 38.51851851851852,
719
+ "eval_accuracy": 0.8548387096774194,
720
+ "eval_loss": 0.6678970456123352,
721
+ "eval_runtime": 2.3455,
722
+ "eval_samples_per_second": 26.434,
723
+ "eval_steps_per_second": 0.853,
724
+ "step": 520
725
+ },
726
+ {
727
+ "epoch": 38.51851851851852,
728
+ "step": 520,
729
+ "total_flos": 2.140878196703232e+18,
730
+ "train_loss": 0.35049390150950505,
731
+ "train_runtime": 3356.7171,
732
+ "train_samples_per_second": 20.353,
733
+ "train_steps_per_second": 0.155
734
+ }
735
+ ],
736
+ "logging_steps": 10,
737
+ "max_steps": 520,
738
+ "num_input_tokens_seen": 0,
739
+ "num_train_epochs": 40,
740
+ "save_steps": 500,
741
+ "stateful_callbacks": {
742
+ "TrainerControl": {
743
+ "args": {
744
+ "should_epoch_stop": false,
745
+ "should_evaluate": false,
746
+ "should_log": false,
747
+ "should_save": true,
748
+ "should_training_stop": true
749
+ },
750
+ "attributes": {}
751
+ }
752
+ },
753
+ "total_flos": 2.140878196703232e+18,
754
+ "train_batch_size": 32,
755
+ "trial_name": null,
756
+ "trial_params": null
757
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9ed5fae65789d986275c6f5b5304c2bbfd8e92ec07663c5d5e7b7917f8ab2ef
3
+ size 5304