anh-dangminh commited on
Commit
36e1704
·
verified ·
1 Parent(s): 74239a2

End of training

Browse files
README.md ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: apache-2.0
4
+ base_model: microsoft/resnet-50
5
+ tags:
6
+ - generated_from_trainer
7
+ datasets:
8
+ - oxford102_flower_dataset
9
+ metrics:
10
+ - accuracy
11
+ model-index:
12
+ - name: resnet-50-finetuned-oxfordflowers
13
+ results:
14
+ - task:
15
+ name: Image Classification
16
+ type: image-classification
17
+ dataset:
18
+ name: oxford102_flower_dataset
19
+ type: oxford102_flower_dataset
20
+ config: default
21
+ split: validation
22
+ args: default
23
+ metrics:
24
+ - name: Accuracy
25
+ type: accuracy
26
+ value: 0.85
27
+ ---
28
+
29
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
30
+ should probably proofread and complete it, then remove this comment. -->
31
+
32
+ # resnet-50-finetuned-oxfordflowers
33
+
34
+ This model is a fine-tuned version of [microsoft/resnet-50](https://huggingface.co/microsoft/resnet-50) on the oxford102_flower_dataset dataset.
35
+ It achieves the following results on the evaluation set:
36
+ - Loss: 0.5915
37
+ - Accuracy: 0.85
38
+
39
+ ## Model description
40
+
41
+ More information needed
42
+
43
+ ## Intended uses & limitations
44
+
45
+ More information needed
46
+
47
+ ## Training and evaluation data
48
+
49
+ More information needed
50
+
51
+ ## Training procedure
52
+
53
+ ### Training hyperparameters
54
+
55
+ The following hyperparameters were used during training:
56
+ - learning_rate: 0.001
57
+ - train_batch_size: 32
58
+ - eval_batch_size: 32
59
+ - seed: 42
60
+ - optimizer: adamw_torch (betas=(0.9, 0.999), epsilon=1e-08); no additional optimizer arguments
61
+ - lr_scheduler_type: linear
62
+ - num_epochs: 20
63
+
64
+ ### Training results
65
+
66
+ | Training Loss | Epoch | Step | Validation Loss | Accuracy |
67
+ |:-------------:|:-----:|:----:|:---------------:|:--------:|
68
+ | 4.5224 | 1.0 | 32 | 4.2939 | 0.25 |
69
+ | 2.8139 | 2.0 | 64 | 2.1128 | 0.4892 |
70
+ | 1.4505 | 3.0 | 96 | 1.2261 | 0.6843 |
71
+ | 0.5751 | 4.0 | 128 | 1.0176 | 0.7441 |
72
+ | 0.2265 | 5.0 | 160 | 0.8487 | 0.7559 |
73
+ | 0.0531 | 6.0 | 192 | 0.7609 | 0.8 |
74
+ | 0.0411 | 7.0 | 224 | 0.7191 | 0.8029 |
75
+ | 0.0351 | 8.0 | 256 | 0.6987 | 0.8078 |
76
+ | 0.0107 | 9.0 | 288 | 0.6843 | 0.8225 |
77
+ | 0.0094 | 10.0 | 320 | 0.6314 | 0.8343 |
78
+ | 0.0081 | 11.0 | 352 | 0.6320 | 0.8353 |
79
+ | 0.0053 | 12.0 | 384 | 0.6049 | 0.8353 |
80
+ | 0.0048 | 13.0 | 416 | 0.5961 | 0.8373 |
81
+ | 0.0024 | 14.0 | 448 | 0.5880 | 0.8471 |
82
+ | 0.0028 | 15.0 | 480 | 0.5927 | 0.8441 |
83
+ | 0.0023 | 16.0 | 512 | 0.5878 | 0.8520 |
84
+ | 0.0027 | 17.0 | 544 | 0.5872 | 0.8471 |
85
+ | 0.0028 | 18.0 | 576 | 0.5892 | 0.8451 |
86
+ | 0.002 | 19.0 | 608 | 0.5933 | 0.8412 |
87
+ | 0.0017 | 20.0 | 640 | 0.5915 | 0.85 |
88
+
89
+
90
+ ### Framework versions
91
+
92
+ - Transformers 4.47.1
93
+ - Pytorch 2.5.1+cu121
94
+ - Datasets 3.2.0
95
+ - Tokenizers 0.21.0
all_results.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 20.0,
3
+ "eval_accuracy": 0.7801268498942917,
4
+ "eval_loss": 0.8647737503051758,
5
+ "eval_runtime": 123.7654,
6
+ "eval_samples_per_second": 49.683,
7
+ "eval_steps_per_second": 0.396,
8
+ "total_flos": 4.36977436041216e+17,
9
+ "train_loss": 0.5368185924002319,
10
+ "train_runtime": 902.6693,
11
+ "train_samples_per_second": 22.6,
12
+ "train_steps_per_second": 0.709
13
+ }
config.json ADDED
@@ -0,0 +1,249 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "microsoft/resnet-50",
3
+ "architectures": [
4
+ "ResNetForImageClassification"
5
+ ],
6
+ "depths": [
7
+ 3,
8
+ 4,
9
+ 6,
10
+ 3
11
+ ],
12
+ "downsample_in_bottleneck": false,
13
+ "downsample_in_first_stage": false,
14
+ "embedding_size": 64,
15
+ "hidden_act": "relu",
16
+ "hidden_sizes": [
17
+ 256,
18
+ 512,
19
+ 1024,
20
+ 2048
21
+ ],
22
+ "id2label": {
23
+ "0": "pink primrose",
24
+ "1": "hard-leaved pocket orchid",
25
+ "2": "canterbury bells",
26
+ "3": "sweet pea",
27
+ "4": "english marigold",
28
+ "5": "tiger lily",
29
+ "6": "moon orchid",
30
+ "7": "bird of paradise",
31
+ "8": "monkshood",
32
+ "9": "globe thistle",
33
+ "10": "snapdragon",
34
+ "11": "colt's foot",
35
+ "12": "king protea",
36
+ "13": "spear thistle",
37
+ "14": "yellow iris",
38
+ "15": "globe-flower",
39
+ "16": "purple coneflower",
40
+ "17": "peruvian lily",
41
+ "18": "balloon flower",
42
+ "19": "giant white arum lily",
43
+ "20": "fire lily",
44
+ "21": "pincushion flower",
45
+ "22": "fritillary",
46
+ "23": "red ginger",
47
+ "24": "grape hyacinth",
48
+ "25": "corn poppy",
49
+ "26": "prince of wales feathers",
50
+ "27": "stemless gentian",
51
+ "28": "artichoke",
52
+ "29": "sweet william",
53
+ "30": "carnation",
54
+ "31": "garden phlox",
55
+ "32": "love in the mist",
56
+ "33": "mexican aster",
57
+ "34": "alpine sea holly",
58
+ "35": "ruby-lipped cattleya",
59
+ "36": "cape flower",
60
+ "37": "great masterwort",
61
+ "38": "siam tulip",
62
+ "39": "lenten rose",
63
+ "40": "barbeton daisy",
64
+ "41": "daffodil",
65
+ "42": "sword lily",
66
+ "43": "poinsettia",
67
+ "44": "bolero deep blue",
68
+ "45": "wallflower",
69
+ "46": "marigold",
70
+ "47": "buttercup",
71
+ "48": "oxeye daisy",
72
+ "49": "common dandelion",
73
+ "50": "petunia",
74
+ "51": "wild pansy",
75
+ "52": "primula",
76
+ "53": "sunflower",
77
+ "54": "pelargonium",
78
+ "55": "bishop of llandaff",
79
+ "56": "gaura",
80
+ "57": "geranium",
81
+ "58": "orange dahlia",
82
+ "59": "pink-yellow dahlia?",
83
+ "60": "cautleya spicata",
84
+ "61": "japanese anemone",
85
+ "62": "black-eyed susan",
86
+ "63": "silverbush",
87
+ "64": "californian poppy",
88
+ "65": "osteospermum",
89
+ "66": "spring crocus",
90
+ "67": "bearded iris",
91
+ "68": "windflower",
92
+ "69": "tree poppy",
93
+ "70": "gazania",
94
+ "71": "azalea",
95
+ "72": "water lily",
96
+ "73": "rose",
97
+ "74": "thorn apple",
98
+ "75": "morning glory",
99
+ "76": "passion flower",
100
+ "77": "lotus",
101
+ "78": "toad lily",
102
+ "79": "anthurium",
103
+ "80": "frangipani",
104
+ "81": "clematis",
105
+ "82": "hibiscus",
106
+ "83": "columbine",
107
+ "84": "desert-rose",
108
+ "85": "tree mallow",
109
+ "86": "magnolia",
110
+ "87": "cyclamen",
111
+ "88": "watercress",
112
+ "89": "canna lily",
113
+ "90": "hippeastrum",
114
+ "91": "bee balm",
115
+ "92": "ball moss",
116
+ "93": "foxglove",
117
+ "94": "bougainvillea",
118
+ "95": "camellia",
119
+ "96": "mallow",
120
+ "97": "mexican petunia",
121
+ "98": "bromelia",
122
+ "99": "blanket flower",
123
+ "100": "trumpet creeper",
124
+ "101": "blackberry lily"
125
+ },
126
+ "label2id": {
127
+ "alpine sea holly": 34,
128
+ "anthurium": 79,
129
+ "artichoke": 28,
130
+ "azalea": 71,
131
+ "ball moss": 92,
132
+ "balloon flower": 18,
133
+ "barbeton daisy": 40,
134
+ "bearded iris": 67,
135
+ "bee balm": 91,
136
+ "bird of paradise": 7,
137
+ "bishop of llandaff": 55,
138
+ "black-eyed susan": 62,
139
+ "blackberry lily": 101,
140
+ "blanket flower": 99,
141
+ "bolero deep blue": 44,
142
+ "bougainvillea": 94,
143
+ "bromelia": 98,
144
+ "buttercup": 47,
145
+ "californian poppy": 64,
146
+ "camellia": 95,
147
+ "canna lily": 89,
148
+ "canterbury bells": 2,
149
+ "cape flower": 36,
150
+ "carnation": 30,
151
+ "cautleya spicata": 60,
152
+ "clematis": 81,
153
+ "colt's foot": 11,
154
+ "columbine": 83,
155
+ "common dandelion": 49,
156
+ "corn poppy": 25,
157
+ "cyclamen": 87,
158
+ "daffodil": 41,
159
+ "desert-rose": 84,
160
+ "english marigold": 4,
161
+ "fire lily": 20,
162
+ "foxglove": 93,
163
+ "frangipani": 80,
164
+ "fritillary": 22,
165
+ "garden phlox": 31,
166
+ "gaura": 56,
167
+ "gazania": 70,
168
+ "geranium": 57,
169
+ "giant white arum lily": 19,
170
+ "globe thistle": 9,
171
+ "globe-flower": 15,
172
+ "grape hyacinth": 24,
173
+ "great masterwort": 37,
174
+ "hard-leaved pocket orchid": 1,
175
+ "hibiscus": 82,
176
+ "hippeastrum": 90,
177
+ "japanese anemone": 61,
178
+ "king protea": 12,
179
+ "lenten rose": 39,
180
+ "lotus": 77,
181
+ "love in the mist": 32,
182
+ "magnolia": 86,
183
+ "mallow": 96,
184
+ "marigold": 46,
185
+ "mexican aster": 33,
186
+ "mexican petunia": 97,
187
+ "monkshood": 8,
188
+ "moon orchid": 6,
189
+ "morning glory": 75,
190
+ "orange dahlia": 58,
191
+ "osteospermum": 65,
192
+ "oxeye daisy": 48,
193
+ "passion flower": 76,
194
+ "pelargonium": 54,
195
+ "peruvian lily": 17,
196
+ "petunia": 50,
197
+ "pincushion flower": 21,
198
+ "pink primrose": 0,
199
+ "pink-yellow dahlia?": 59,
200
+ "poinsettia": 43,
201
+ "primula": 52,
202
+ "prince of wales feathers": 26,
203
+ "purple coneflower": 16,
204
+ "red ginger": 23,
205
+ "rose": 73,
206
+ "ruby-lipped cattleya": 35,
207
+ "siam tulip": 38,
208
+ "silverbush": 63,
209
+ "snapdragon": 10,
210
+ "spear thistle": 13,
211
+ "spring crocus": 66,
212
+ "stemless gentian": 27,
213
+ "sunflower": 53,
214
+ "sweet pea": 3,
215
+ "sweet william": 29,
216
+ "sword lily": 42,
217
+ "thorn apple": 74,
218
+ "tiger lily": 5,
219
+ "toad lily": 78,
220
+ "tree mallow": 85,
221
+ "tree poppy": 69,
222
+ "trumpet creeper": 100,
223
+ "wallflower": 45,
224
+ "water lily": 72,
225
+ "watercress": 88,
226
+ "wild pansy": 51,
227
+ "windflower": 68,
228
+ "yellow iris": 14
229
+ },
230
+ "layer_type": "bottleneck",
231
+ "model_type": "resnet",
232
+ "num_channels": 3,
233
+ "out_features": [
234
+ "stage4"
235
+ ],
236
+ "out_indices": [
237
+ 4
238
+ ],
239
+ "problem_type": "single_label_classification",
240
+ "stage_names": [
241
+ "stem",
242
+ "stage1",
243
+ "stage2",
244
+ "stage3",
245
+ "stage4"
246
+ ],
247
+ "torch_dtype": "float32",
248
+ "transformers_version": "4.47.1"
249
+ }
eval_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 20.0,
3
+ "eval_accuracy": 0.8205882352941176,
4
+ "eval_loss": 0.7496953010559082,
5
+ "eval_runtime": 16.2045,
6
+ "eval_samples_per_second": 62.945,
7
+ "eval_steps_per_second": 0.494
8
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9f6515139de92486682b8ab4b26eaa75f616f163f3c2214015877cb8aa5c5dc
3
+ size 95122680
preprocessor_config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "crop_pct": 0.875,
3
+ "do_normalize": true,
4
+ "do_rescale": true,
5
+ "do_resize": true,
6
+ "image_mean": [
7
+ 0.485,
8
+ 0.456,
9
+ 0.406
10
+ ],
11
+ "image_processor_type": "ConvNextImageProcessor",
12
+ "image_std": [
13
+ 0.229,
14
+ 0.224,
15
+ 0.225
16
+ ],
17
+ "resample": 3,
18
+ "rescale_factor": 0.00392156862745098,
19
+ "size": {
20
+ "shortest_edge": 224
21
+ }
22
+ }
runs/Dec21_13-14-05_3efe3f73ff0a/events.out.tfevents.1734787160.3efe3f73ff0a.417.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eed13b0b6740ca2aafd3c55fc15f80f69d064551d0ba61bc5c4fe86fa2fb9076
3
+ size 12593
runs/Dec21_13-24-50_3efe3f73ff0a/events.out.tfevents.1734787495.3efe3f73ff0a.3963.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b741d50eb6d5124cc34fda60cc47faab201680eacc036bf86e3e658fe0805fa6
3
+ size 22339
runs/Dec21_13-24-50_3efe3f73ff0a/events.out.tfevents.1734789288.3efe3f73ff0a.3963.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf552733c6dbfaa37c56e75633a279652fee24cee5ffc5a6293b620daddefd27
3
+ size 10182
runs/Dec21_13-24-50_3efe3f73ff0a/events.out.tfevents.1734789352.3efe3f73ff0a.3963.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:103e534027fa47e89b8f8988bcf7940a7d2feb0e8d07747ce76697c86e966ee0
3
+ size 10499
runs/Dec21_13-24-50_3efe3f73ff0a/events.out.tfevents.1734789453.3efe3f73ff0a.3963.3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:817cc2fb68a4a02bdc9ae2fb3dbaf2f701e7895599b9166fa72162195cca76fa
3
+ size 14493
runs/Dec21_14-02-33_3efe3f73ff0a/events.out.tfevents.1734789763.3efe3f73ff0a.3963.4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cff27986bd235dd32b2e32db30cd85b8280a934308adddf5bc31b5309a8ccc22
3
+ size 24806
runs/Dec21_14-27-10_3efe3f73ff0a/events.out.tfevents.1734791239.3efe3f73ff0a.3963.9 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:741849c9a7960bd00c9ef7b053acf8f82f53151c23efab3eadf07644e0278e8f
3
+ size 10190
runs/Dec21_14-27-10_3efe3f73ff0a/events.out.tfevents.1734791271.3efe3f73ff0a.3963.10 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93f6d2e497ce7bbdd00bfd4f1a4162272402541eaab5fa1782ad97bfa5afe1b2
3
+ size 10190
runs/Dec21_14-27-10_3efe3f73ff0a/events.out.tfevents.1734791296.3efe3f73ff0a.3963.11 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9c276c4c60478d4553671a521d4b4d6e7ebfbf491a1cf4f6ec317fa96894516
3
+ size 10190
runs/Dec21_14-35-24_3efe3f73ff0a/events.out.tfevents.1734791742.3efe3f73ff0a.21272.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fcac6354c9b53e02123c238a5d917abd048fc850df809a0598b153dda3d2b0f5
3
+ size 10496
runs/Dec21_14-35-24_3efe3f73ff0a/events.out.tfevents.1734791834.3efe3f73ff0a.21272.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b7ce7408d045a0d0446afbcc1c0a8184f5725d2bad105d5801508c6a2283dfb8
3
+ size 11139
test_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 20.0,
3
+ "eval_accuracy": 0.7801268498942917,
4
+ "eval_loss": 0.8647737503051758,
5
+ "eval_runtime": 123.7654,
6
+ "eval_samples_per_second": 49.683,
7
+ "eval_steps_per_second": 0.396
8
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 20.0,
3
+ "total_flos": 4.36977436041216e+17,
4
+ "train_loss": 0.5368185924002319,
5
+ "train_runtime": 902.6693,
6
+ "train_samples_per_second": 22.6,
7
+ "train_steps_per_second": 0.709
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,1118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.8519607843137255,
3
+ "best_model_checkpoint": "resnet-50-finetuned-oxfordflowers/checkpoint-512",
4
+ "epoch": 20.0,
5
+ "eval_steps": 500,
6
+ "global_step": 640,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.15625,
13
+ "grad_norm": 1.6800851821899414,
14
+ "learning_rate": 0.0009921875,
15
+ "loss": 4.6507,
16
+ "step": 5
17
+ },
18
+ {
19
+ "epoch": 0.3125,
20
+ "grad_norm": 1.4043323993682861,
21
+ "learning_rate": 0.000984375,
22
+ "loss": 4.6146,
23
+ "step": 10
24
+ },
25
+ {
26
+ "epoch": 0.46875,
27
+ "grad_norm": 1.4668281078338623,
28
+ "learning_rate": 0.0009765625,
29
+ "loss": 4.6484,
30
+ "step": 15
31
+ },
32
+ {
33
+ "epoch": 0.625,
34
+ "grad_norm": 1.4262796640396118,
35
+ "learning_rate": 0.00096875,
36
+ "loss": 4.6292,
37
+ "step": 20
38
+ },
39
+ {
40
+ "epoch": 0.78125,
41
+ "grad_norm": 1.1266566514968872,
42
+ "learning_rate": 0.0009609375,
43
+ "loss": 4.5702,
44
+ "step": 25
45
+ },
46
+ {
47
+ "epoch": 0.9375,
48
+ "grad_norm": 1.0406345129013062,
49
+ "learning_rate": 0.000953125,
50
+ "loss": 4.5224,
51
+ "step": 30
52
+ },
53
+ {
54
+ "epoch": 1.0,
55
+ "eval_accuracy": 0.25,
56
+ "eval_loss": 4.293937683105469,
57
+ "eval_runtime": 15.9684,
58
+ "eval_samples_per_second": 63.876,
59
+ "eval_steps_per_second": 2.004,
60
+ "step": 32
61
+ },
62
+ {
63
+ "epoch": 1.09375,
64
+ "grad_norm": 2.009798765182495,
65
+ "learning_rate": 0.0009453125,
66
+ "loss": 4.2571,
67
+ "step": 35
68
+ },
69
+ {
70
+ "epoch": 1.25,
71
+ "grad_norm": 1.697478175163269,
72
+ "learning_rate": 0.0009375,
73
+ "loss": 3.9421,
74
+ "step": 40
75
+ },
76
+ {
77
+ "epoch": 1.40625,
78
+ "grad_norm": 2.285863161087036,
79
+ "learning_rate": 0.0009296875000000001,
80
+ "loss": 3.6971,
81
+ "step": 45
82
+ },
83
+ {
84
+ "epoch": 1.5625,
85
+ "grad_norm": 2.211660623550415,
86
+ "learning_rate": 0.0009218750000000001,
87
+ "loss": 3.3611,
88
+ "step": 50
89
+ },
90
+ {
91
+ "epoch": 1.71875,
92
+ "grad_norm": 2.331829786300659,
93
+ "learning_rate": 0.0009140625,
94
+ "loss": 3.1108,
95
+ "step": 55
96
+ },
97
+ {
98
+ "epoch": 1.875,
99
+ "grad_norm": 2.799659013748169,
100
+ "learning_rate": 0.00090625,
101
+ "loss": 2.8139,
102
+ "step": 60
103
+ },
104
+ {
105
+ "epoch": 2.0,
106
+ "eval_accuracy": 0.4892156862745098,
107
+ "eval_loss": 2.112804651260376,
108
+ "eval_runtime": 16.5487,
109
+ "eval_samples_per_second": 61.636,
110
+ "eval_steps_per_second": 1.934,
111
+ "step": 64
112
+ },
113
+ {
114
+ "epoch": 2.03125,
115
+ "grad_norm": 2.7438573837280273,
116
+ "learning_rate": 0.0008984375,
117
+ "loss": 2.6103,
118
+ "step": 65
119
+ },
120
+ {
121
+ "epoch": 2.1875,
122
+ "grad_norm": 2.3204867839813232,
123
+ "learning_rate": 0.000890625,
124
+ "loss": 2.0599,
125
+ "step": 70
126
+ },
127
+ {
128
+ "epoch": 2.34375,
129
+ "grad_norm": 2.3990378379821777,
130
+ "learning_rate": 0.0008828125,
131
+ "loss": 1.7052,
132
+ "step": 75
133
+ },
134
+ {
135
+ "epoch": 2.5,
136
+ "grad_norm": 3.4195637702941895,
137
+ "learning_rate": 0.000875,
138
+ "loss": 1.5619,
139
+ "step": 80
140
+ },
141
+ {
142
+ "epoch": 2.65625,
143
+ "grad_norm": 2.6798551082611084,
144
+ "learning_rate": 0.0008671875,
145
+ "loss": 1.4689,
146
+ "step": 85
147
+ },
148
+ {
149
+ "epoch": 2.8125,
150
+ "grad_norm": 3.0105719566345215,
151
+ "learning_rate": 0.000859375,
152
+ "loss": 1.4125,
153
+ "step": 90
154
+ },
155
+ {
156
+ "epoch": 2.96875,
157
+ "grad_norm": 3.218193531036377,
158
+ "learning_rate": 0.0008515625,
159
+ "loss": 1.4505,
160
+ "step": 95
161
+ },
162
+ {
163
+ "epoch": 3.0,
164
+ "eval_accuracy": 0.6843137254901961,
165
+ "eval_loss": 1.226142406463623,
166
+ "eval_runtime": 16.601,
167
+ "eval_samples_per_second": 61.442,
168
+ "eval_steps_per_second": 1.928,
169
+ "step": 96
170
+ },
171
+ {
172
+ "epoch": 3.125,
173
+ "grad_norm": 2.6317319869995117,
174
+ "learning_rate": 0.00084375,
175
+ "loss": 0.7778,
176
+ "step": 100
177
+ },
178
+ {
179
+ "epoch": 3.28125,
180
+ "grad_norm": 2.3407766819000244,
181
+ "learning_rate": 0.0008359375,
182
+ "loss": 0.8178,
183
+ "step": 105
184
+ },
185
+ {
186
+ "epoch": 3.4375,
187
+ "grad_norm": 2.060016632080078,
188
+ "learning_rate": 0.000828125,
189
+ "loss": 0.7545,
190
+ "step": 110
191
+ },
192
+ {
193
+ "epoch": 3.59375,
194
+ "grad_norm": 2.2562413215637207,
195
+ "learning_rate": 0.0008203125,
196
+ "loss": 0.6023,
197
+ "step": 115
198
+ },
199
+ {
200
+ "epoch": 3.75,
201
+ "grad_norm": 2.7784945964813232,
202
+ "learning_rate": 0.0008125000000000001,
203
+ "loss": 0.5268,
204
+ "step": 120
205
+ },
206
+ {
207
+ "epoch": 3.90625,
208
+ "grad_norm": 2.47145676612854,
209
+ "learning_rate": 0.0008046875000000001,
210
+ "loss": 0.5751,
211
+ "step": 125
212
+ },
213
+ {
214
+ "epoch": 4.0,
215
+ "eval_accuracy": 0.7441176470588236,
216
+ "eval_loss": 1.0175817012786865,
217
+ "eval_runtime": 18.5424,
218
+ "eval_samples_per_second": 55.009,
219
+ "eval_steps_per_second": 1.726,
220
+ "step": 128
221
+ },
222
+ {
223
+ "epoch": 4.0625,
224
+ "grad_norm": 1.2707927227020264,
225
+ "learning_rate": 0.0007968750000000001,
226
+ "loss": 0.41,
227
+ "step": 130
228
+ },
229
+ {
230
+ "epoch": 4.21875,
231
+ "grad_norm": 2.2418272495269775,
232
+ "learning_rate": 0.0007890625,
233
+ "loss": 0.2771,
234
+ "step": 135
235
+ },
236
+ {
237
+ "epoch": 4.375,
238
+ "grad_norm": 1.0117669105529785,
239
+ "learning_rate": 0.00078125,
240
+ "loss": 0.2848,
241
+ "step": 140
242
+ },
243
+ {
244
+ "epoch": 4.53125,
245
+ "grad_norm": 1.5163785219192505,
246
+ "learning_rate": 0.0007734375,
247
+ "loss": 0.2322,
248
+ "step": 145
249
+ },
250
+ {
251
+ "epoch": 4.6875,
252
+ "grad_norm": 1.693102478981018,
253
+ "learning_rate": 0.000765625,
254
+ "loss": 0.292,
255
+ "step": 150
256
+ },
257
+ {
258
+ "epoch": 4.84375,
259
+ "grad_norm": 1.6366838216781616,
260
+ "learning_rate": 0.0007578125,
261
+ "loss": 0.2391,
262
+ "step": 155
263
+ },
264
+ {
265
+ "epoch": 5.0,
266
+ "grad_norm": 1.1743065118789673,
267
+ "learning_rate": 0.00075,
268
+ "loss": 0.2265,
269
+ "step": 160
270
+ },
271
+ {
272
+ "epoch": 5.0,
273
+ "eval_accuracy": 0.7558823529411764,
274
+ "eval_loss": 0.8487027287483215,
275
+ "eval_runtime": 20.2945,
276
+ "eval_samples_per_second": 50.26,
277
+ "eval_steps_per_second": 1.577,
278
+ "step": 160
279
+ },
280
+ {
281
+ "epoch": 5.15625,
282
+ "grad_norm": 0.5249314308166504,
283
+ "learning_rate": 0.0007421875,
284
+ "loss": 0.1254,
285
+ "step": 165
286
+ },
287
+ {
288
+ "epoch": 5.3125,
289
+ "grad_norm": 0.41112297773361206,
290
+ "learning_rate": 0.000734375,
291
+ "loss": 0.0917,
292
+ "step": 170
293
+ },
294
+ {
295
+ "epoch": 5.46875,
296
+ "grad_norm": 1.9200881719589233,
297
+ "learning_rate": 0.0007265625,
298
+ "loss": 0.1139,
299
+ "step": 175
300
+ },
301
+ {
302
+ "epoch": 5.625,
303
+ "grad_norm": 0.7506140470504761,
304
+ "learning_rate": 0.00071875,
305
+ "loss": 0.116,
306
+ "step": 180
307
+ },
308
+ {
309
+ "epoch": 5.78125,
310
+ "grad_norm": 1.2240333557128906,
311
+ "learning_rate": 0.0007109375,
312
+ "loss": 0.1251,
313
+ "step": 185
314
+ },
315
+ {
316
+ "epoch": 5.9375,
317
+ "grad_norm": 1.3143774271011353,
318
+ "learning_rate": 0.000703125,
319
+ "loss": 0.0531,
320
+ "step": 190
321
+ },
322
+ {
323
+ "epoch": 6.0,
324
+ "eval_accuracy": 0.8,
325
+ "eval_loss": 0.7608510255813599,
326
+ "eval_runtime": 24.6873,
327
+ "eval_samples_per_second": 41.317,
328
+ "eval_steps_per_second": 1.296,
329
+ "step": 192
330
+ },
331
+ {
332
+ "epoch": 6.09375,
333
+ "grad_norm": 1.9283502101898193,
334
+ "learning_rate": 0.0006953125,
335
+ "loss": 0.1274,
336
+ "step": 195
337
+ },
338
+ {
339
+ "epoch": 6.25,
340
+ "grad_norm": 0.5062114000320435,
341
+ "learning_rate": 0.0006875,
342
+ "loss": 0.0358,
343
+ "step": 200
344
+ },
345
+ {
346
+ "epoch": 6.40625,
347
+ "grad_norm": 1.057132601737976,
348
+ "learning_rate": 0.0006796875000000001,
349
+ "loss": 0.0426,
350
+ "step": 205
351
+ },
352
+ {
353
+ "epoch": 6.5625,
354
+ "grad_norm": 0.2724122107028961,
355
+ "learning_rate": 0.0006718750000000001,
356
+ "loss": 0.0668,
357
+ "step": 210
358
+ },
359
+ {
360
+ "epoch": 6.71875,
361
+ "grad_norm": 0.3335299789905548,
362
+ "learning_rate": 0.0006640625,
363
+ "loss": 0.0838,
364
+ "step": 215
365
+ },
366
+ {
367
+ "epoch": 6.875,
368
+ "grad_norm": 0.5840352177619934,
369
+ "learning_rate": 0.00065625,
370
+ "loss": 0.0411,
371
+ "step": 220
372
+ },
373
+ {
374
+ "epoch": 7.0,
375
+ "eval_accuracy": 0.8029411764705883,
376
+ "eval_loss": 0.7190886735916138,
377
+ "eval_runtime": 18.919,
378
+ "eval_samples_per_second": 53.914,
379
+ "eval_steps_per_second": 1.691,
380
+ "step": 224
381
+ },
382
+ {
383
+ "epoch": 7.03125,
384
+ "grad_norm": 0.6974908709526062,
385
+ "learning_rate": 0.0006484375,
386
+ "loss": 0.0412,
387
+ "step": 225
388
+ },
389
+ {
390
+ "epoch": 7.1875,
391
+ "grad_norm": 0.27331459522247314,
392
+ "learning_rate": 0.000640625,
393
+ "loss": 0.0238,
394
+ "step": 230
395
+ },
396
+ {
397
+ "epoch": 7.34375,
398
+ "grad_norm": 0.26315683126449585,
399
+ "learning_rate": 0.0006328125,
400
+ "loss": 0.0181,
401
+ "step": 235
402
+ },
403
+ {
404
+ "epoch": 7.5,
405
+ "grad_norm": 0.979246199131012,
406
+ "learning_rate": 0.000625,
407
+ "loss": 0.0368,
408
+ "step": 240
409
+ },
410
+ {
411
+ "epoch": 7.65625,
412
+ "grad_norm": 0.18979792296886444,
413
+ "learning_rate": 0.0006171875,
414
+ "loss": 0.0293,
415
+ "step": 245
416
+ },
417
+ {
418
+ "epoch": 7.8125,
419
+ "grad_norm": 2.098189115524292,
420
+ "learning_rate": 0.000609375,
421
+ "loss": 0.0263,
422
+ "step": 250
423
+ },
424
+ {
425
+ "epoch": 7.96875,
426
+ "grad_norm": 0.20951713621616364,
427
+ "learning_rate": 0.0006015625,
428
+ "loss": 0.0351,
429
+ "step": 255
430
+ },
431
+ {
432
+ "epoch": 8.0,
433
+ "eval_accuracy": 0.807843137254902,
434
+ "eval_loss": 0.698701798915863,
435
+ "eval_runtime": 20.5747,
436
+ "eval_samples_per_second": 49.575,
437
+ "eval_steps_per_second": 1.555,
438
+ "step": 256
439
+ },
440
+ {
441
+ "epoch": 8.125,
442
+ "grad_norm": 0.08398638665676117,
443
+ "learning_rate": 0.00059375,
444
+ "loss": 0.0138,
445
+ "step": 260
446
+ },
447
+ {
448
+ "epoch": 8.28125,
449
+ "grad_norm": 0.94996577501297,
450
+ "learning_rate": 0.0005859375,
451
+ "loss": 0.0176,
452
+ "step": 265
453
+ },
454
+ {
455
+ "epoch": 8.4375,
456
+ "grad_norm": 0.14498768746852875,
457
+ "learning_rate": 0.000578125,
458
+ "loss": 0.0149,
459
+ "step": 270
460
+ },
461
+ {
462
+ "epoch": 8.59375,
463
+ "grad_norm": 0.1302383691072464,
464
+ "learning_rate": 0.0005703125,
465
+ "loss": 0.0146,
466
+ "step": 275
467
+ },
468
+ {
469
+ "epoch": 8.75,
470
+ "grad_norm": 0.3484581708908081,
471
+ "learning_rate": 0.0005625000000000001,
472
+ "loss": 0.0183,
473
+ "step": 280
474
+ },
475
+ {
476
+ "epoch": 8.90625,
477
+ "grad_norm": 0.1543685644865036,
478
+ "learning_rate": 0.0005546875000000001,
479
+ "loss": 0.0107,
480
+ "step": 285
481
+ },
482
+ {
483
+ "epoch": 9.0,
484
+ "eval_accuracy": 0.8225490196078431,
485
+ "eval_loss": 0.6843494176864624,
486
+ "eval_runtime": 16.0725,
487
+ "eval_samples_per_second": 63.462,
488
+ "eval_steps_per_second": 1.991,
489
+ "step": 288
490
+ },
491
+ {
492
+ "epoch": 9.0625,
493
+ "grad_norm": 0.9732298851013184,
494
+ "learning_rate": 0.000546875,
495
+ "loss": 0.0156,
496
+ "step": 290
497
+ },
498
+ {
499
+ "epoch": 9.21875,
500
+ "grad_norm": 0.09730440378189087,
501
+ "learning_rate": 0.0005390625,
502
+ "loss": 0.0114,
503
+ "step": 295
504
+ },
505
+ {
506
+ "epoch": 9.375,
507
+ "grad_norm": 0.41419529914855957,
508
+ "learning_rate": 0.00053125,
509
+ "loss": 0.0101,
510
+ "step": 300
511
+ },
512
+ {
513
+ "epoch": 9.53125,
514
+ "grad_norm": 0.055323634296655655,
515
+ "learning_rate": 0.0005234375,
516
+ "loss": 0.0074,
517
+ "step": 305
518
+ },
519
+ {
520
+ "epoch": 9.6875,
521
+ "grad_norm": 0.07538346946239471,
522
+ "learning_rate": 0.000515625,
523
+ "loss": 0.0051,
524
+ "step": 310
525
+ },
526
+ {
527
+ "epoch": 9.84375,
528
+ "grad_norm": 0.037017084658145905,
529
+ "learning_rate": 0.0005078125,
530
+ "loss": 0.0133,
531
+ "step": 315
532
+ },
533
+ {
534
+ "epoch": 10.0,
535
+ "grad_norm": 0.05076463520526886,
536
+ "learning_rate": 0.0005,
537
+ "loss": 0.0094,
538
+ "step": 320
539
+ },
540
+ {
541
+ "epoch": 10.0,
542
+ "eval_accuracy": 0.8343137254901961,
543
+ "eval_loss": 0.6314178109169006,
544
+ "eval_runtime": 16.9633,
545
+ "eval_samples_per_second": 60.13,
546
+ "eval_steps_per_second": 1.886,
547
+ "step": 320
548
+ },
549
+ {
550
+ "epoch": 10.15625,
551
+ "grad_norm": 0.04146264120936394,
552
+ "learning_rate": 0.0004921875,
553
+ "loss": 0.0045,
554
+ "step": 325
555
+ },
556
+ {
557
+ "epoch": 10.3125,
558
+ "grad_norm": 1.678152084350586,
559
+ "learning_rate": 0.000484375,
560
+ "loss": 0.0111,
561
+ "step": 330
562
+ },
563
+ {
564
+ "epoch": 10.46875,
565
+ "grad_norm": 0.08414560556411743,
566
+ "learning_rate": 0.0004765625,
567
+ "loss": 0.004,
568
+ "step": 335
569
+ },
570
+ {
571
+ "epoch": 10.625,
572
+ "grad_norm": 0.062152933329343796,
573
+ "learning_rate": 0.00046875,
574
+ "loss": 0.0058,
575
+ "step": 340
576
+ },
577
+ {
578
+ "epoch": 10.78125,
579
+ "grad_norm": 0.18813878297805786,
580
+ "learning_rate": 0.00046093750000000003,
581
+ "loss": 0.0059,
582
+ "step": 345
583
+ },
584
+ {
585
+ "epoch": 10.9375,
586
+ "grad_norm": 0.03264420107007027,
587
+ "learning_rate": 0.000453125,
588
+ "loss": 0.0081,
589
+ "step": 350
590
+ },
591
+ {
592
+ "epoch": 11.0,
593
+ "eval_accuracy": 0.8352941176470589,
594
+ "eval_loss": 0.6319591999053955,
595
+ "eval_runtime": 16.0721,
596
+ "eval_samples_per_second": 63.464,
597
+ "eval_steps_per_second": 1.991,
598
+ "step": 352
599
+ },
600
+ {
601
+ "epoch": 11.09375,
602
+ "grad_norm": 0.01650502346456051,
603
+ "learning_rate": 0.0004453125,
604
+ "loss": 0.0058,
605
+ "step": 355
606
+ },
607
+ {
608
+ "epoch": 11.25,
609
+ "grad_norm": 0.03100210428237915,
610
+ "learning_rate": 0.0004375,
611
+ "loss": 0.0032,
612
+ "step": 360
613
+ },
614
+ {
615
+ "epoch": 11.40625,
616
+ "grad_norm": 0.31530651450157166,
617
+ "learning_rate": 0.0004296875,
618
+ "loss": 0.0055,
619
+ "step": 365
620
+ },
621
+ {
622
+ "epoch": 11.5625,
623
+ "grad_norm": 0.018279677256941795,
624
+ "learning_rate": 0.000421875,
625
+ "loss": 0.0042,
626
+ "step": 370
627
+ },
628
+ {
629
+ "epoch": 11.71875,
630
+ "grad_norm": 0.039065517485141754,
631
+ "learning_rate": 0.0004140625,
632
+ "loss": 0.004,
633
+ "step": 375
634
+ },
635
+ {
636
+ "epoch": 11.875,
637
+ "grad_norm": 0.17956194281578064,
638
+ "learning_rate": 0.00040625000000000004,
639
+ "loss": 0.0053,
640
+ "step": 380
641
+ },
642
+ {
643
+ "epoch": 12.0,
644
+ "eval_accuracy": 0.8352941176470589,
645
+ "eval_loss": 0.6048569679260254,
646
+ "eval_runtime": 16.0302,
647
+ "eval_samples_per_second": 63.63,
648
+ "eval_steps_per_second": 1.996,
649
+ "step": 384
650
+ },
651
+ {
652
+ "epoch": 12.03125,
653
+ "grad_norm": 0.0491081103682518,
654
+ "learning_rate": 0.00039843750000000003,
655
+ "loss": 0.004,
656
+ "step": 385
657
+ },
658
+ {
659
+ "epoch": 12.1875,
660
+ "grad_norm": 0.06726662814617157,
661
+ "learning_rate": 0.000390625,
662
+ "loss": 0.0032,
663
+ "step": 390
664
+ },
665
+ {
666
+ "epoch": 12.34375,
667
+ "grad_norm": 0.0226299911737442,
668
+ "learning_rate": 0.0003828125,
669
+ "loss": 0.0027,
670
+ "step": 395
671
+ },
672
+ {
673
+ "epoch": 12.5,
674
+ "grad_norm": 0.021714534610509872,
675
+ "learning_rate": 0.000375,
676
+ "loss": 0.0029,
677
+ "step": 400
678
+ },
679
+ {
680
+ "epoch": 12.65625,
681
+ "grad_norm": 0.07769683748483658,
682
+ "learning_rate": 0.0003671875,
683
+ "loss": 0.0034,
684
+ "step": 405
685
+ },
686
+ {
687
+ "epoch": 12.8125,
688
+ "grad_norm": 0.017162494361400604,
689
+ "learning_rate": 0.000359375,
690
+ "loss": 0.0029,
691
+ "step": 410
692
+ },
693
+ {
694
+ "epoch": 12.96875,
695
+ "grad_norm": 0.08164256066083908,
696
+ "learning_rate": 0.0003515625,
697
+ "loss": 0.0048,
698
+ "step": 415
699
+ },
700
+ {
701
+ "epoch": 13.0,
702
+ "eval_accuracy": 0.8372549019607843,
703
+ "eval_loss": 0.5961340665817261,
704
+ "eval_runtime": 18.3515,
705
+ "eval_samples_per_second": 55.581,
706
+ "eval_steps_per_second": 1.744,
707
+ "step": 416
708
+ },
709
+ {
710
+ "epoch": 13.125,
711
+ "grad_norm": 0.05423242226243019,
712
+ "learning_rate": 0.00034375,
713
+ "loss": 0.0024,
714
+ "step": 420
715
+ },
716
+ {
717
+ "epoch": 13.28125,
718
+ "grad_norm": 0.19008223712444305,
719
+ "learning_rate": 0.00033593750000000003,
720
+ "loss": 0.0083,
721
+ "step": 425
722
+ },
723
+ {
724
+ "epoch": 13.4375,
725
+ "grad_norm": 0.0373542457818985,
726
+ "learning_rate": 0.000328125,
727
+ "loss": 0.002,
728
+ "step": 430
729
+ },
730
+ {
731
+ "epoch": 13.59375,
732
+ "grad_norm": 0.014899961650371552,
733
+ "learning_rate": 0.0003203125,
734
+ "loss": 0.0029,
735
+ "step": 435
736
+ },
737
+ {
738
+ "epoch": 13.75,
739
+ "grad_norm": 0.03342936560511589,
740
+ "learning_rate": 0.0003125,
741
+ "loss": 0.0031,
742
+ "step": 440
743
+ },
744
+ {
745
+ "epoch": 13.90625,
746
+ "grad_norm": 0.018663976341485977,
747
+ "learning_rate": 0.0003046875,
748
+ "loss": 0.0024,
749
+ "step": 445
750
+ },
751
+ {
752
+ "epoch": 14.0,
753
+ "eval_accuracy": 0.8470588235294118,
754
+ "eval_loss": 0.588026225566864,
755
+ "eval_runtime": 16.6123,
756
+ "eval_samples_per_second": 61.4,
757
+ "eval_steps_per_second": 1.926,
758
+ "step": 448
759
+ },
760
+ {
761
+ "epoch": 14.0625,
762
+ "grad_norm": 0.03626991808414459,
763
+ "learning_rate": 0.000296875,
764
+ "loss": 0.004,
765
+ "step": 450
766
+ },
767
+ {
768
+ "epoch": 14.21875,
769
+ "grad_norm": 0.021257249638438225,
770
+ "learning_rate": 0.0002890625,
771
+ "loss": 0.0026,
772
+ "step": 455
773
+ },
774
+ {
775
+ "epoch": 14.375,
776
+ "grad_norm": 0.032649360597133636,
777
+ "learning_rate": 0.00028125000000000003,
778
+ "loss": 0.002,
779
+ "step": 460
780
+ },
781
+ {
782
+ "epoch": 14.53125,
783
+ "grad_norm": 0.022741030901670456,
784
+ "learning_rate": 0.0002734375,
785
+ "loss": 0.007,
786
+ "step": 465
787
+ },
788
+ {
789
+ "epoch": 14.6875,
790
+ "grad_norm": 0.020442800596356392,
791
+ "learning_rate": 0.000265625,
792
+ "loss": 0.0023,
793
+ "step": 470
794
+ },
795
+ {
796
+ "epoch": 14.84375,
797
+ "grad_norm": 0.022834857925772667,
798
+ "learning_rate": 0.0002578125,
799
+ "loss": 0.0031,
800
+ "step": 475
801
+ },
802
+ {
803
+ "epoch": 15.0,
804
+ "grad_norm": 0.014007111079990864,
805
+ "learning_rate": 0.00025,
806
+ "loss": 0.0028,
807
+ "step": 480
808
+ },
809
+ {
810
+ "epoch": 15.0,
811
+ "eval_accuracy": 0.8441176470588235,
812
+ "eval_loss": 0.5926622748374939,
813
+ "eval_runtime": 16.2356,
814
+ "eval_samples_per_second": 62.825,
815
+ "eval_steps_per_second": 1.971,
816
+ "step": 480
817
+ },
818
+ {
819
+ "epoch": 15.15625,
820
+ "grad_norm": 0.00949984509497881,
821
+ "learning_rate": 0.0002421875,
822
+ "loss": 0.0023,
823
+ "step": 485
824
+ },
825
+ {
826
+ "epoch": 15.3125,
827
+ "grad_norm": 0.04143200442194939,
828
+ "learning_rate": 0.000234375,
829
+ "loss": 0.0021,
830
+ "step": 490
831
+ },
832
+ {
833
+ "epoch": 15.46875,
834
+ "grad_norm": 0.012401225045323372,
835
+ "learning_rate": 0.0002265625,
836
+ "loss": 0.0021,
837
+ "step": 495
838
+ },
839
+ {
840
+ "epoch": 15.625,
841
+ "grad_norm": 0.040582917630672455,
842
+ "learning_rate": 0.00021875,
843
+ "loss": 0.0031,
844
+ "step": 500
845
+ },
846
+ {
847
+ "epoch": 15.78125,
848
+ "grad_norm": 0.025907032191753387,
849
+ "learning_rate": 0.0002109375,
850
+ "loss": 0.0021,
851
+ "step": 505
852
+ },
853
+ {
854
+ "epoch": 15.9375,
855
+ "grad_norm": 0.008175536058843136,
856
+ "learning_rate": 0.00020312500000000002,
857
+ "loss": 0.0023,
858
+ "step": 510
859
+ },
860
+ {
861
+ "epoch": 16.0,
862
+ "eval_accuracy": 0.8519607843137255,
863
+ "eval_loss": 0.5878445506095886,
864
+ "eval_runtime": 16.1518,
865
+ "eval_samples_per_second": 63.151,
866
+ "eval_steps_per_second": 1.981,
867
+ "step": 512
868
+ },
869
+ {
870
+ "epoch": 16.09375,
871
+ "grad_norm": 0.3129185140132904,
872
+ "learning_rate": 0.0001953125,
873
+ "loss": 0.0044,
874
+ "step": 515
875
+ },
876
+ {
877
+ "epoch": 16.25,
878
+ "grad_norm": 0.030808325856924057,
879
+ "learning_rate": 0.0001875,
880
+ "loss": 0.0036,
881
+ "step": 520
882
+ },
883
+ {
884
+ "epoch": 16.40625,
885
+ "grad_norm": 0.019886957481503487,
886
+ "learning_rate": 0.0001796875,
887
+ "loss": 0.0026,
888
+ "step": 525
889
+ },
890
+ {
891
+ "epoch": 16.5625,
892
+ "grad_norm": 0.019268082454800606,
893
+ "learning_rate": 0.000171875,
894
+ "loss": 0.0034,
895
+ "step": 530
896
+ },
897
+ {
898
+ "epoch": 16.71875,
899
+ "grad_norm": 0.025241246446967125,
900
+ "learning_rate": 0.0001640625,
901
+ "loss": 0.0019,
902
+ "step": 535
903
+ },
904
+ {
905
+ "epoch": 16.875,
906
+ "grad_norm": 0.01479440089315176,
907
+ "learning_rate": 0.00015625,
908
+ "loss": 0.0027,
909
+ "step": 540
910
+ },
911
+ {
912
+ "epoch": 17.0,
913
+ "eval_accuracy": 0.8470588235294118,
914
+ "eval_loss": 0.5872153043746948,
915
+ "eval_runtime": 16.056,
916
+ "eval_samples_per_second": 63.528,
917
+ "eval_steps_per_second": 1.993,
918
+ "step": 544
919
+ },
920
+ {
921
+ "epoch": 17.03125,
922
+ "grad_norm": 0.01011387724429369,
923
+ "learning_rate": 0.0001484375,
924
+ "loss": 0.0019,
925
+ "step": 545
926
+ },
927
+ {
928
+ "epoch": 17.1875,
929
+ "grad_norm": 0.020896941423416138,
930
+ "learning_rate": 0.00014062500000000002,
931
+ "loss": 0.0022,
932
+ "step": 550
933
+ },
934
+ {
935
+ "epoch": 17.34375,
936
+ "grad_norm": 0.040105391293764114,
937
+ "learning_rate": 0.0001328125,
938
+ "loss": 0.002,
939
+ "step": 555
940
+ },
941
+ {
942
+ "epoch": 17.5,
943
+ "grad_norm": 0.016236811876296997,
944
+ "learning_rate": 0.000125,
945
+ "loss": 0.0024,
946
+ "step": 560
947
+ },
948
+ {
949
+ "epoch": 17.65625,
950
+ "grad_norm": 0.010203810408711433,
951
+ "learning_rate": 0.0001171875,
952
+ "loss": 0.002,
953
+ "step": 565
954
+ },
955
+ {
956
+ "epoch": 17.8125,
957
+ "grad_norm": 0.01675267145037651,
958
+ "learning_rate": 0.000109375,
959
+ "loss": 0.0019,
960
+ "step": 570
961
+ },
962
+ {
963
+ "epoch": 17.96875,
964
+ "grad_norm": 0.08755680918693542,
965
+ "learning_rate": 0.00010156250000000001,
966
+ "loss": 0.0028,
967
+ "step": 575
968
+ },
969
+ {
970
+ "epoch": 18.0,
971
+ "eval_accuracy": 0.8450980392156863,
972
+ "eval_loss": 0.5891793966293335,
973
+ "eval_runtime": 15.883,
974
+ "eval_samples_per_second": 64.22,
975
+ "eval_steps_per_second": 2.015,
976
+ "step": 576
977
+ },
978
+ {
979
+ "epoch": 18.125,
980
+ "grad_norm": 0.060470979660749435,
981
+ "learning_rate": 9.375e-05,
982
+ "loss": 0.003,
983
+ "step": 580
984
+ },
985
+ {
986
+ "epoch": 18.28125,
987
+ "grad_norm": 0.02452988736331463,
988
+ "learning_rate": 8.59375e-05,
989
+ "loss": 0.0017,
990
+ "step": 585
991
+ },
992
+ {
993
+ "epoch": 18.4375,
994
+ "grad_norm": 0.02058909274637699,
995
+ "learning_rate": 7.8125e-05,
996
+ "loss": 0.002,
997
+ "step": 590
998
+ },
999
+ {
1000
+ "epoch": 18.59375,
1001
+ "grad_norm": 0.01303939614444971,
1002
+ "learning_rate": 7.031250000000001e-05,
1003
+ "loss": 0.0025,
1004
+ "step": 595
1005
+ },
1006
+ {
1007
+ "epoch": 18.75,
1008
+ "grad_norm": 0.006279917433857918,
1009
+ "learning_rate": 6.25e-05,
1010
+ "loss": 0.0027,
1011
+ "step": 600
1012
+ },
1013
+ {
1014
+ "epoch": 18.90625,
1015
+ "grad_norm": 0.022672630846500397,
1016
+ "learning_rate": 5.46875e-05,
1017
+ "loss": 0.002,
1018
+ "step": 605
1019
+ },
1020
+ {
1021
+ "epoch": 19.0,
1022
+ "eval_accuracy": 0.8411764705882353,
1023
+ "eval_loss": 0.5932831764221191,
1024
+ "eval_runtime": 16.4628,
1025
+ "eval_samples_per_second": 61.958,
1026
+ "eval_steps_per_second": 1.944,
1027
+ "step": 608
1028
+ },
1029
+ {
1030
+ "epoch": 19.0625,
1031
+ "grad_norm": 0.15350750088691711,
1032
+ "learning_rate": 4.6875e-05,
1033
+ "loss": 0.0034,
1034
+ "step": 610
1035
+ },
1036
+ {
1037
+ "epoch": 19.21875,
1038
+ "grad_norm": 0.01092343870550394,
1039
+ "learning_rate": 3.90625e-05,
1040
+ "loss": 0.002,
1041
+ "step": 615
1042
+ },
1043
+ {
1044
+ "epoch": 19.375,
1045
+ "grad_norm": 0.008441799320280552,
1046
+ "learning_rate": 3.125e-05,
1047
+ "loss": 0.0022,
1048
+ "step": 620
1049
+ },
1050
+ {
1051
+ "epoch": 19.53125,
1052
+ "grad_norm": 0.012427592650055885,
1053
+ "learning_rate": 2.34375e-05,
1054
+ "loss": 0.0026,
1055
+ "step": 625
1056
+ },
1057
+ {
1058
+ "epoch": 19.6875,
1059
+ "grad_norm": 0.019600288942456245,
1060
+ "learning_rate": 1.5625e-05,
1061
+ "loss": 0.0016,
1062
+ "step": 630
1063
+ },
1064
+ {
1065
+ "epoch": 19.84375,
1066
+ "grad_norm": 0.0400865413248539,
1067
+ "learning_rate": 7.8125e-06,
1068
+ "loss": 0.0031,
1069
+ "step": 635
1070
+ },
1071
+ {
1072
+ "epoch": 20.0,
1073
+ "grad_norm": 0.03250521048903465,
1074
+ "learning_rate": 0.0,
1075
+ "loss": 0.0017,
1076
+ "step": 640
1077
+ },
1078
+ {
1079
+ "epoch": 20.0,
1080
+ "eval_accuracy": 0.85,
1081
+ "eval_loss": 0.5915272235870361,
1082
+ "eval_runtime": 17.7209,
1083
+ "eval_samples_per_second": 57.559,
1084
+ "eval_steps_per_second": 1.806,
1085
+ "step": 640
1086
+ },
1087
+ {
1088
+ "epoch": 20.0,
1089
+ "step": 640,
1090
+ "total_flos": 4.36977436041216e+17,
1091
+ "train_loss": 0.5368185924002319,
1092
+ "train_runtime": 902.6693,
1093
+ "train_samples_per_second": 22.6,
1094
+ "train_steps_per_second": 0.709
1095
+ }
1096
+ ],
1097
+ "logging_steps": 5,
1098
+ "max_steps": 640,
1099
+ "num_input_tokens_seen": 0,
1100
+ "num_train_epochs": 20,
1101
+ "save_steps": 500,
1102
+ "stateful_callbacks": {
1103
+ "TrainerControl": {
1104
+ "args": {
1105
+ "should_epoch_stop": false,
1106
+ "should_evaluate": false,
1107
+ "should_log": false,
1108
+ "should_save": true,
1109
+ "should_training_stop": true
1110
+ },
1111
+ "attributes": {}
1112
+ }
1113
+ },
1114
+ "total_flos": 4.36977436041216e+17,
1115
+ "train_batch_size": 32,
1116
+ "trial_name": null,
1117
+ "trial_params": null
1118
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:672d131f53e4147ca9ceb033df2f0fab539acde03a45ba383fa02daeb0f762a2
3
+ size 5304