hogru commited on
Commit
dcb0e52
·
1 Parent(s): 64cff3d

Initial commit

Browse files
all_results.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 43.0,
3
+ "perplexity": 1.1020061050262677,
4
+ "test_accuracy": 0.5884773413073098,
5
+ "test_loss": 0.09713225066661835,
6
+ "test_runtime": 9.4021,
7
+ "test_samples_per_second": 248.562,
8
+ "test_steps_per_second": 15.635,
9
+ "train_loss": 0.2331255912600619,
10
+ "train_runtime": 2746.0841,
11
+ "train_samples": 7878,
12
+ "train_samples_per_second": 143.441,
13
+ "train_steps_per_second": 4.479
14
+ }
config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 0,
8
+ "embd_pdrop": 0.1,
9
+ "eos_token_id": 1,
10
+ "initializer_range": 0.02,
11
+ "layer_norm_epsilon": 1e-05,
12
+ "model_type": "gpt2",
13
+ "n_embd": 144,
14
+ "n_head": 12,
15
+ "n_inner": null,
16
+ "n_layer": 12,
17
+ "n_positions": 896,
18
+ "pad_token": " ",
19
+ "reorder_and_upcast_attn": false,
20
+ "resid_pdrop": 0.1,
21
+ "scale_attn_by_inverse_layer_idx": false,
22
+ "scale_attn_weights": true,
23
+ "summary_activation": null,
24
+ "summary_first_dropout": 0.1,
25
+ "summary_proj_to_labels": true,
26
+ "summary_type": "cls_index",
27
+ "summary_use_proj": true,
28
+ "torch_dtype": "float32",
29
+ "transformers_version": "4.27.1",
30
+ "use_cache": true,
31
+ "vocab_size": 52
32
+ }
generation_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_sample": true,
3
+ "eos_token_id": 1,
4
+ "length_penalty": 0.0,
5
+ "max_new_tokens": 896,
6
+ "min_new_tokens": 1,
7
+ "num_return_sequences": 100,
8
+ "pad_token_id": 2,
9
+ "transformers_version": "4.27.1"
10
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:731f207586a9dda4728f79a89e66fbbe0cc2b23b3921ec47fe995fc5aeb2c63b
3
+ size 22270333
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "^",
3
+ "eos_token": "_",
4
+ "pad_token": " ",
5
+ "unk_token": "§"
6
+ }
test_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 43.0,
3
+ "perplexity": 1.1020061050262677,
4
+ "test_accuracy": 0.5884773413073098,
5
+ "test_loss": 0.09713225066661835,
6
+ "test_runtime": 9.4021,
7
+ "test_samples_per_second": 248.562,
8
+ "test_steps_per_second": 15.635
9
+ }
tokenizer.json ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "^",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "_",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": " ",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 3,
35
+ "content": "§",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ },
42
+ {
43
+ "id": 4,
44
+ "content": "°",
45
+ "single_word": false,
46
+ "lstrip": false,
47
+ "rstrip": false,
48
+ "normalized": false,
49
+ "special": true
50
+ }
51
+ ],
52
+ "normalizer": null,
53
+ "pre_tokenizer": {
54
+ "type": "Split",
55
+ "pattern": {
56
+ "Regex": ""
57
+ },
58
+ "behavior": "Isolated",
59
+ "invert": false
60
+ },
61
+ "post_processor": {
62
+ "type": "TemplateProcessing",
63
+ "single": [
64
+ {
65
+ "SpecialToken": {
66
+ "id": "^",
67
+ "type_id": 0
68
+ }
69
+ },
70
+ {
71
+ "Sequence": {
72
+ "id": "A",
73
+ "type_id": 0
74
+ }
75
+ },
76
+ {
77
+ "SpecialToken": {
78
+ "id": "_",
79
+ "type_id": 0
80
+ }
81
+ }
82
+ ],
83
+ "pair": [
84
+ {
85
+ "Sequence": {
86
+ "id": "A",
87
+ "type_id": 0
88
+ }
89
+ },
90
+ {
91
+ "Sequence": {
92
+ "id": "B",
93
+ "type_id": 1
94
+ }
95
+ }
96
+ ],
97
+ "special_tokens": {
98
+ "^": {
99
+ "id": "^",
100
+ "ids": [
101
+ 0
102
+ ],
103
+ "tokens": [
104
+ "^"
105
+ ]
106
+ },
107
+ "_": {
108
+ "id": "_",
109
+ "ids": [
110
+ 1
111
+ ],
112
+ "tokens": [
113
+ "_"
114
+ ]
115
+ }
116
+ }
117
+ },
118
+ "decoder": null,
119
+ "model": {
120
+ "type": "WordLevel",
121
+ "vocab": {
122
+ "^": 0,
123
+ "_": 1,
124
+ " ": 2,
125
+ "§": 3,
126
+ "°": 4,
127
+ ":": 5,
128
+ ";": 6,
129
+ "[": 7,
130
+ "]": 8,
131
+ "0": 9,
132
+ "-": 10,
133
+ "1": 11,
134
+ "H": 12,
135
+ "D": 13,
136
+ "C": 14,
137
+ "2": 15,
138
+ "c": 16,
139
+ "3": 17,
140
+ "+": 18,
141
+ "7": 19,
142
+ "(": 20,
143
+ ")": 21,
144
+ "#": 22,
145
+ "O": 23,
146
+ "4": 24,
147
+ "a": 25,
148
+ ">": 26,
149
+ "=": 27,
150
+ "6": 28,
151
+ "5": 29,
152
+ "8": 30,
153
+ "N": 31,
154
+ ".": 32,
155
+ "9": 33,
156
+ "B": 34,
157
+ "l": 35,
158
+ "n": 36,
159
+ "r": 37,
160
+ "S": 38,
161
+ "F": 39,
162
+ "/": 40,
163
+ "@": 41,
164
+ "I": 42,
165
+ "\\": 43,
166
+ "P": 44,
167
+ "M": 45,
168
+ "g": 46,
169
+ "s": 47,
170
+ "Z": 48,
171
+ "i": 49,
172
+ "o": 50,
173
+ "u": 51
174
+ },
175
+ "unk_token": "§"
176
+ }
177
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "^",
3
+ "eos_token": "_",
4
+ "model_max_length": 896,
5
+ "pad_token": " ",
6
+ "padding_side": "right",
7
+ "tokenizer_class": "PreTrainedTokenizerFast",
8
+ "truncation_side": "left",
9
+ "unk_token": "§"
10
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 43.0,
3
+ "train_loss": 0.2331255912600619,
4
+ "train_runtime": 2746.0841,
5
+ "train_samples": 7878,
6
+ "train_samples_per_second": 143.441,
7
+ "train_steps_per_second": 4.479
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,1042 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.0970187559723854,
3
+ "best_model_checkpoint": "/home/stephan/code/molreactgen/checkpoints/2023-05-12_13-06-58_experiment/checkpoint-9850",
4
+ "epoch": 42.99695431472081,
5
+ "global_step": 10588,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.41,
12
+ "learning_rate": 4.065040650406504e-05,
13
+ "loss": 3.0981,
14
+ "step": 100
15
+ },
16
+ {
17
+ "epoch": 0.81,
18
+ "learning_rate": 8.130081300813008e-05,
19
+ "loss": 2.1418,
20
+ "step": 200
21
+ },
22
+ {
23
+ "epoch": 1.0,
24
+ "eval_accuracy": 0.2829720403871191,
25
+ "eval_loss": 1.5341426134109497,
26
+ "eval_runtime": 6.1779,
27
+ "eval_samples_per_second": 390.745,
28
+ "eval_steps_per_second": 24.442,
29
+ "step": 246
30
+ },
31
+ {
32
+ "epoch": 1.22,
33
+ "learning_rate": 0.00012195121951219512,
34
+ "loss": 1.588,
35
+ "step": 300
36
+ },
37
+ {
38
+ "epoch": 1.62,
39
+ "learning_rate": 0.00016260162601626016,
40
+ "loss": 1.1354,
41
+ "step": 400
42
+ },
43
+ {
44
+ "epoch": 2.0,
45
+ "eval_accuracy": 0.32898021613845974,
46
+ "eval_loss": 0.7312331795692444,
47
+ "eval_runtime": 6.0715,
48
+ "eval_samples_per_second": 397.598,
49
+ "eval_steps_per_second": 24.87,
50
+ "step": 492
51
+ },
52
+ {
53
+ "epoch": 2.03,
54
+ "learning_rate": 0.0002032520325203252,
55
+ "loss": 0.8911,
56
+ "step": 500
57
+ },
58
+ {
59
+ "epoch": 2.44,
60
+ "learning_rate": 0.00024390243902439024,
61
+ "loss": 0.743,
62
+ "step": 600
63
+ },
64
+ {
65
+ "epoch": 2.84,
66
+ "learning_rate": 0.0002845528455284553,
67
+ "loss": 0.664,
68
+ "step": 700
69
+ },
70
+ {
71
+ "epoch": 3.0,
72
+ "eval_accuracy": 0.3475759534583064,
73
+ "eval_loss": 0.5133360028266907,
74
+ "eval_runtime": 6.1012,
75
+ "eval_samples_per_second": 395.662,
76
+ "eval_steps_per_second": 24.749,
77
+ "step": 738
78
+ },
79
+ {
80
+ "epoch": 3.25,
81
+ "learning_rate": 0.0003252032520325203,
82
+ "loss": 0.6077,
83
+ "step": 800
84
+ },
85
+ {
86
+ "epoch": 3.65,
87
+ "learning_rate": 0.00036585365853658537,
88
+ "loss": 0.5517,
89
+ "step": 900
90
+ },
91
+ {
92
+ "epoch": 4.0,
93
+ "eval_accuracy": 0.35866693372907127,
94
+ "eval_loss": 0.40151524543762207,
95
+ "eval_runtime": 6.0531,
96
+ "eval_samples_per_second": 398.803,
97
+ "eval_steps_per_second": 24.946,
98
+ "step": 985
99
+ },
100
+ {
101
+ "epoch": 4.06,
102
+ "learning_rate": 0.0004065040650406504,
103
+ "loss": 0.5002,
104
+ "step": 1000
105
+ },
106
+ {
107
+ "epoch": 4.47,
108
+ "learning_rate": 0.00044715447154471545,
109
+ "loss": 0.4461,
110
+ "step": 1100
111
+ },
112
+ {
113
+ "epoch": 4.87,
114
+ "learning_rate": 0.0004878048780487805,
115
+ "loss": 0.3986,
116
+ "step": 1200
117
+ },
118
+ {
119
+ "epoch": 5.0,
120
+ "eval_accuracy": 0.36889936906506915,
121
+ "eval_loss": 0.30596020817756653,
122
+ "eval_runtime": 6.0649,
123
+ "eval_samples_per_second": 398.03,
124
+ "eval_steps_per_second": 24.897,
125
+ "step": 1231
126
+ },
127
+ {
128
+ "epoch": 5.28,
129
+ "learning_rate": 0.0004999506716812021,
130
+ "loss": 0.3583,
131
+ "step": 1300
132
+ },
133
+ {
134
+ "epoch": 5.69,
135
+ "learning_rate": 0.0004997091104496882,
136
+ "loss": 0.3313,
137
+ "step": 1400
138
+ },
139
+ {
140
+ "epoch": 6.0,
141
+ "eval_accuracy": 0.3759261451059297,
142
+ "eval_loss": 0.241128608584404,
143
+ "eval_runtime": 6.1261,
144
+ "eval_samples_per_second": 394.054,
145
+ "eval_steps_per_second": 24.649,
146
+ "step": 1477
147
+ },
148
+ {
149
+ "epoch": 6.09,
150
+ "learning_rate": 0.0004992664502959351,
151
+ "loss": 0.3005,
152
+ "step": 1500
153
+ },
154
+ {
155
+ "epoch": 6.5,
156
+ "learning_rate": 0.0004986230477086575,
157
+ "loss": 0.2758,
158
+ "step": 1600
159
+ },
160
+ {
161
+ "epoch": 6.9,
162
+ "learning_rate": 0.0004977794208410241,
163
+ "loss": 0.259,
164
+ "step": 1700
165
+ },
166
+ {
167
+ "epoch": 7.0,
168
+ "eval_accuracy": 0.3807942678696614,
169
+ "eval_loss": 0.19934649765491486,
170
+ "eval_runtime": 6.0599,
171
+ "eval_samples_per_second": 398.358,
172
+ "eval_steps_per_second": 24.918,
173
+ "step": 1723
174
+ },
175
+ {
176
+ "epoch": 7.31,
177
+ "learning_rate": 0.0004967362490933723,
178
+ "loss": 0.2409,
179
+ "step": 1800
180
+ },
181
+ {
182
+ "epoch": 7.72,
183
+ "learning_rate": 0.0004954943725660643,
184
+ "loss": 0.2288,
185
+ "step": 1900
186
+ },
187
+ {
188
+ "epoch": 8.0,
189
+ "eval_accuracy": 0.38369857151961545,
190
+ "eval_loss": 0.17374461889266968,
191
+ "eval_runtime": 6.1335,
192
+ "eval_samples_per_second": 393.573,
193
+ "eval_steps_per_second": 24.619,
194
+ "step": 1970
195
+ },
196
+ {
197
+ "epoch": 8.12,
198
+ "learning_rate": 0.0004940547913829275,
199
+ "loss": 0.2201,
200
+ "step": 2000
201
+ },
202
+ {
203
+ "epoch": 8.53,
204
+ "learning_rate": 0.0004924186648858207,
205
+ "loss": 0.2064,
206
+ "step": 2100
207
+ },
208
+ {
209
+ "epoch": 8.93,
210
+ "learning_rate": 0.0004905873107009799,
211
+ "loss": 0.1992,
212
+ "step": 2200
213
+ },
214
+ {
215
+ "epoch": 9.0,
216
+ "eval_accuracy": 0.385633256552892,
217
+ "eval_loss": 0.15620127320289612,
218
+ "eval_runtime": 6.1272,
219
+ "eval_samples_per_second": 393.981,
220
+ "eval_steps_per_second": 24.644,
221
+ "step": 2216
222
+ },
223
+ {
224
+ "epoch": 9.34,
225
+ "learning_rate": 0.0004885622036778897,
226
+ "loss": 0.1894,
227
+ "step": 2300
228
+ },
229
+ {
230
+ "epoch": 9.75,
231
+ "learning_rate": 0.0004863449747015384,
232
+ "loss": 0.182,
233
+ "step": 2400
234
+ },
235
+ {
236
+ "epoch": 10.0,
237
+ "eval_accuracy": 0.38733486894215974,
238
+ "eval_loss": 0.1415354460477829,
239
+ "eval_runtime": 6.1286,
240
+ "eval_samples_per_second": 393.894,
241
+ "eval_steps_per_second": 24.639,
242
+ "step": 2462
243
+ },
244
+ {
245
+ "epoch": 10.15,
246
+ "learning_rate": 0.0004839374093790139,
247
+ "loss": 0.181,
248
+ "step": 2500
249
+ },
250
+ {
251
+ "epoch": 10.56,
252
+ "learning_rate": 0.00048134144660149535,
253
+ "loss": 0.1695,
254
+ "step": 2600
255
+ },
256
+ {
257
+ "epoch": 10.96,
258
+ "learning_rate": 0.0004785591769828005,
259
+ "loss": 0.1687,
260
+ "step": 2700
261
+ },
262
+ {
263
+ "epoch": 11.0,
264
+ "eval_accuracy": 0.3880878028351102,
265
+ "eval_loss": 0.13479964435100555,
266
+ "eval_runtime": 6.1178,
267
+ "eval_samples_per_second": 394.588,
268
+ "eval_steps_per_second": 24.682,
269
+ "step": 2708
270
+ },
271
+ {
272
+ "epoch": 11.37,
273
+ "learning_rate": 0.00047559284117574613,
274
+ "loss": 0.1629,
275
+ "step": 2800
276
+ },
277
+ {
278
+ "epoch": 11.78,
279
+ "learning_rate": 0.0004724448280676768,
280
+ "loss": 0.1592,
281
+ "step": 2900
282
+ },
283
+ {
284
+ "epoch": 12.0,
285
+ "eval_accuracy": 0.38904467529156844,
286
+ "eval_loss": 0.12684421241283417,
287
+ "eval_runtime": 6.118,
288
+ "eval_samples_per_second": 394.571,
289
+ "eval_steps_per_second": 24.681,
290
+ "step": 2955
291
+ },
292
+ {
293
+ "epoch": 12.18,
294
+ "learning_rate": 0.00046911767285661587,
295
+ "loss": 0.1527,
296
+ "step": 3000
297
+ },
298
+ {
299
+ "epoch": 12.59,
300
+ "learning_rate": 0.0004656140550095876,
301
+ "loss": 0.1516,
302
+ "step": 3100
303
+ },
304
+ {
305
+ "epoch": 12.99,
306
+ "learning_rate": 0.00046193679610475414,
307
+ "loss": 0.1476,
308
+ "step": 3200
309
+ },
310
+ {
311
+ "epoch": 13.0,
312
+ "eval_accuracy": 0.3893742545772372,
313
+ "eval_loss": 0.12305640429258347,
314
+ "eval_runtime": 6.1457,
315
+ "eval_samples_per_second": 392.794,
316
+ "eval_steps_per_second": 24.57,
317
+ "step": 3201
318
+ },
319
+ {
320
+ "epoch": 13.4,
321
+ "learning_rate": 0.0004580888575591068,
322
+ "loss": 0.1447,
323
+ "step": 3300
324
+ },
325
+ {
326
+ "epoch": 13.81,
327
+ "learning_rate": 0.00045407333824353966,
328
+ "loss": 0.1417,
329
+ "step": 3400
330
+ },
331
+ {
332
+ "epoch": 14.0,
333
+ "eval_accuracy": 0.3899678614674472,
334
+ "eval_loss": 0.11744081974029541,
335
+ "eval_runtime": 6.1575,
336
+ "eval_samples_per_second": 392.044,
337
+ "eval_steps_per_second": 24.523,
338
+ "step": 3447
339
+ },
340
+ {
341
+ "epoch": 14.21,
342
+ "learning_rate": 0.00044989347198722777,
343
+ "loss": 0.1412,
344
+ "step": 3500
345
+ },
346
+ {
347
+ "epoch": 14.62,
348
+ "learning_rate": 0.00044555262497331783,
349
+ "loss": 0.1403,
350
+ "step": 3600
351
+ },
352
+ {
353
+ "epoch": 15.0,
354
+ "eval_accuracy": 0.3901754417910176,
355
+ "eval_loss": 0.11636122316122055,
356
+ "eval_runtime": 6.1213,
357
+ "eval_samples_per_second": 394.36,
358
+ "eval_steps_per_second": 24.668,
359
+ "step": 3693
360
+ },
361
+ {
362
+ "epoch": 15.03,
363
+ "learning_rate": 0.0004410542930280316,
364
+ "loss": 0.1354,
365
+ "step": 3700
366
+ },
367
+ {
368
+ "epoch": 15.43,
369
+ "learning_rate": 0.0004364020988053623,
370
+ "loss": 0.1314,
371
+ "step": 3800
372
+ },
373
+ {
374
+ "epoch": 15.84,
375
+ "learning_rate": 0.00043159978886963223,
376
+ "loss": 0.1342,
377
+ "step": 3900
378
+ },
379
+ {
380
+ "epoch": 16.0,
381
+ "eval_accuracy": 0.3905068419567177,
382
+ "eval_loss": 0.11423930525779724,
383
+ "eval_runtime": 6.1098,
384
+ "eval_samples_per_second": 395.1,
385
+ "eval_steps_per_second": 24.714,
386
+ "step": 3940
387
+ },
388
+ {
389
+ "epoch": 16.24,
390
+ "learning_rate": 0.0004266512306782628,
391
+ "loss": 0.1307,
392
+ "step": 4000
393
+ },
394
+ {
395
+ "epoch": 16.65,
396
+ "learning_rate": 0.00042156040946718344,
397
+ "loss": 0.1273,
398
+ "step": 4100
399
+ },
400
+ {
401
+ "epoch": 17.0,
402
+ "eval_accuracy": 0.39085189872265264,
403
+ "eval_loss": 0.1110914945602417,
404
+ "eval_runtime": 6.0921,
405
+ "eval_samples_per_second": 396.254,
406
+ "eval_steps_per_second": 24.786,
407
+ "step": 4186
408
+ },
409
+ {
410
+ "epoch": 17.06,
411
+ "learning_rate": 0.00041633142504139133,
412
+ "loss": 0.128,
413
+ "step": 4200
414
+ },
415
+ {
416
+ "epoch": 17.46,
417
+ "learning_rate": 0.00041096848847324417,
418
+ "loss": 0.1247,
419
+ "step": 4300
420
+ },
421
+ {
422
+ "epoch": 17.87,
423
+ "learning_rate": 0.0004054759187111451,
424
+ "loss": 0.1243,
425
+ "step": 4400
426
+ },
427
+ {
428
+ "epoch": 18.0,
429
+ "eval_accuracy": 0.39107313564645796,
430
+ "eval_loss": 0.10974010080099106,
431
+ "eval_runtime": 6.1271,
432
+ "eval_samples_per_second": 393.988,
433
+ "eval_steps_per_second": 24.645,
434
+ "step": 4432
435
+ },
436
+ {
437
+ "epoch": 18.27,
438
+ "learning_rate": 0.00039985813910135305,
439
+ "loss": 0.1231,
440
+ "step": 4500
441
+ },
442
+ {
443
+ "epoch": 18.68,
444
+ "learning_rate": 0.00039411967382571643,
445
+ "loss": 0.1205,
446
+ "step": 4600
447
+ },
448
+ {
449
+ "epoch": 19.0,
450
+ "eval_accuracy": 0.39124429836940194,
451
+ "eval_loss": 0.10775511711835861,
452
+ "eval_runtime": 6.146,
453
+ "eval_samples_per_second": 392.775,
454
+ "eval_steps_per_second": 24.569,
455
+ "step": 4678
456
+ },
457
+ {
458
+ "epoch": 19.09,
459
+ "learning_rate": 0.0003882651442582019,
460
+ "loss": 0.1193,
461
+ "step": 4700
462
+ },
463
+ {
464
+ "epoch": 19.49,
465
+ "learning_rate": 0.00038229926524315015,
466
+ "loss": 0.1175,
467
+ "step": 4800
468
+ },
469
+ {
470
+ "epoch": 19.9,
471
+ "learning_rate": 0.0003762268412982577,
472
+ "loss": 0.1202,
473
+ "step": 4900
474
+ },
475
+ {
476
+ "epoch": 20.0,
477
+ "eval_accuracy": 0.3915374600544443,
478
+ "eval_loss": 0.10535065084695816,
479
+ "eval_runtime": 6.1087,
480
+ "eval_samples_per_second": 395.174,
481
+ "eval_steps_per_second": 24.719,
482
+ "step": 4925
483
+ },
484
+ {
485
+ "epoch": 20.3,
486
+ "learning_rate": 0.00037005276274534144,
487
+ "loss": 0.1151,
488
+ "step": 5000
489
+ },
490
+ {
491
+ "epoch": 20.71,
492
+ "learning_rate": 0.0003637820017720022,
493
+ "loss": 0.1146,
494
+ "step": 5100
495
+ },
496
+ {
497
+ "epoch": 21.0,
498
+ "eval_accuracy": 0.39146917705326983,
499
+ "eval_loss": 0.10494475811719894,
500
+ "eval_runtime": 6.135,
501
+ "eval_samples_per_second": 393.48,
502
+ "eval_steps_per_second": 24.613,
503
+ "step": 5171
504
+ },
505
+ {
506
+ "epoch": 21.12,
507
+ "learning_rate": 0.00035741960842735953,
508
+ "loss": 0.1152,
509
+ "step": 5200
510
+ },
511
+ {
512
+ "epoch": 21.52,
513
+ "learning_rate": 0.0003509707065550817,
514
+ "loss": 0.1133,
515
+ "step": 5300
516
+ },
517
+ {
518
+ "epoch": 21.93,
519
+ "learning_rate": 0.00034444048966698643,
520
+ "loss": 0.1119,
521
+ "step": 5400
522
+ },
523
+ {
524
+ "epoch": 22.0,
525
+ "eval_accuracy": 0.39147099793330115,
526
+ "eval_loss": 0.10487605631351471,
527
+ "eval_runtime": 6.1054,
528
+ "eval_samples_per_second": 395.389,
529
+ "eval_steps_per_second": 24.732,
530
+ "step": 5417
531
+ },
532
+ {
533
+ "epoch": 22.34,
534
+ "learning_rate": 0.0003378342167605362,
535
+ "loss": 0.11,
536
+ "step": 5500
537
+ },
538
+ {
539
+ "epoch": 22.74,
540
+ "learning_rate": 0.00033115720808359495,
541
+ "loss": 0.1107,
542
+ "step": 5600
543
+ },
544
+ {
545
+ "epoch": 23.0,
546
+ "eval_accuracy": 0.39174139861795204,
547
+ "eval_loss": 0.10271785408258438,
548
+ "eval_runtime": 6.1004,
549
+ "eval_samples_per_second": 395.715,
550
+ "eval_steps_per_second": 24.753,
551
+ "step": 5663
552
+ },
553
+ {
554
+ "epoch": 23.15,
555
+ "learning_rate": 0.0003244148408498587,
556
+ "loss": 0.1095,
557
+ "step": 5700
558
+ },
559
+ {
560
+ "epoch": 23.55,
561
+ "learning_rate": 0.000317612544908409,
562
+ "loss": 0.1067,
563
+ "step": 5800
564
+ },
565
+ {
566
+ "epoch": 23.96,
567
+ "learning_rate": 0.000310755798370878,
568
+ "loss": 0.1085,
569
+ "step": 5900
570
+ },
571
+ {
572
+ "epoch": 24.0,
573
+ "eval_accuracy": 0.391701339257263,
574
+ "eval_loss": 0.10227391123771667,
575
+ "eval_runtime": 6.1722,
576
+ "eval_samples_per_second": 391.109,
577
+ "eval_steps_per_second": 24.465,
578
+ "step": 5910
579
+ },
580
+ {
581
+ "epoch": 24.37,
582
+ "learning_rate": 0.00030385012319974537,
583
+ "loss": 0.1055,
584
+ "step": 6000
585
+ },
586
+ {
587
+ "epoch": 24.77,
588
+ "learning_rate": 0.00029690108076132154,
589
+ "loss": 0.1068,
590
+ "step": 6100
591
+ },
592
+ {
593
+ "epoch": 25.0,
594
+ "eval_accuracy": 0.3917787266585941,
595
+ "eval_loss": 0.10184060782194138,
596
+ "eval_runtime": 6.1443,
597
+ "eval_samples_per_second": 392.883,
598
+ "eval_steps_per_second": 24.576,
599
+ "step": 6156
600
+ },
601
+ {
602
+ "epoch": 25.18,
603
+ "learning_rate": 0.0002899142673469971,
604
+ "loss": 0.1049,
605
+ "step": 6200
606
+ },
607
+ {
608
+ "epoch": 25.58,
609
+ "learning_rate": 0.00028289530966636625,
610
+ "loss": 0.1038,
611
+ "step": 6300
612
+ },
613
+ {
614
+ "epoch": 25.99,
615
+ "learning_rate": 0.000275849860315853,
616
+ "loss": 0.1045,
617
+ "step": 6400
618
+ },
619
+ {
620
+ "epoch": 26.0,
621
+ "eval_accuracy": 0.3920181723827126,
622
+ "eval_loss": 0.1005973368883133,
623
+ "eval_runtime": 6.0791,
624
+ "eval_samples_per_second": 397.098,
625
+ "eval_steps_per_second": 24.839,
626
+ "step": 6402
627
+ },
628
+ {
629
+ "epoch": 26.4,
630
+ "learning_rate": 0.0002687835932264908,
631
+ "loss": 0.1008,
632
+ "step": 6500
633
+ },
634
+ {
635
+ "epoch": 26.8,
636
+ "learning_rate": 0.0002617021990945197,
637
+ "loss": 0.1022,
638
+ "step": 6600
639
+ },
640
+ {
641
+ "epoch": 27.0,
642
+ "eval_accuracy": 0.39193259102124056,
643
+ "eval_loss": 0.10028840601444244,
644
+ "eval_runtime": 6.0776,
645
+ "eval_samples_per_second": 397.197,
646
+ "eval_steps_per_second": 24.845,
647
+ "step": 6648
648
+ },
649
+ {
650
+ "epoch": 27.21,
651
+ "learning_rate": 0.0002546113807984821,
652
+ "loss": 0.1011,
653
+ "step": 6700
654
+ },
655
+ {
656
+ "epoch": 27.61,
657
+ "learning_rate": 0.00024751684880650884,
658
+ "loss": 0.0995,
659
+ "step": 6800
660
+ },
661
+ {
662
+ "epoch": 28.0,
663
+ "eval_accuracy": 0.39223394666642386,
664
+ "eval_loss": 0.09900100529193878,
665
+ "eval_runtime": 6.1395,
666
+ "eval_samples_per_second": 393.191,
667
+ "eval_steps_per_second": 24.595,
668
+ "step": 6895
669
+ },
670
+ {
671
+ "epoch": 28.02,
672
+ "learning_rate": 0.00024042431657749118,
673
+ "loss": 0.1009,
674
+ "step": 6900
675
+ },
676
+ {
677
+ "epoch": 28.43,
678
+ "learning_rate": 0.0002333394959598461,
679
+ "loss": 0.0977,
680
+ "step": 7000
681
+ },
682
+ {
683
+ "epoch": 28.83,
684
+ "learning_rate": 0.00022626809259157726,
685
+ "loss": 0.0989,
686
+ "step": 7100
687
+ },
688
+ {
689
+ "epoch": 29.0,
690
+ "eval_accuracy": 0.39221664830612635,
691
+ "eval_loss": 0.09937664866447449,
692
+ "eval_runtime": 6.1179,
693
+ "eval_samples_per_second": 394.581,
694
+ "eval_steps_per_second": 24.682,
695
+ "step": 7141
696
+ },
697
+ {
698
+ "epoch": 29.24,
699
+ "learning_rate": 0.00021921580130533828,
700
+ "loss": 0.0977,
701
+ "step": 7200
702
+ },
703
+ {
704
+ "epoch": 29.64,
705
+ "learning_rate": 0.0002121883015421973,
706
+ "loss": 0.0975,
707
+ "step": 7300
708
+ },
709
+ {
710
+ "epoch": 30.0,
711
+ "eval_accuracy": 0.3923249906679898,
712
+ "eval_loss": 0.09780476242303848,
713
+ "eval_runtime": 6.0619,
714
+ "eval_samples_per_second": 398.224,
715
+ "eval_steps_per_second": 24.91,
716
+ "step": 7387
717
+ },
718
+ {
719
+ "epoch": 30.05,
720
+ "learning_rate": 0.00020519125277779733,
721
+ "loss": 0.0964,
722
+ "step": 7400
723
+ },
724
+ {
725
+ "epoch": 30.46,
726
+ "learning_rate": 0.00019823028996459485,
727
+ "loss": 0.0941,
728
+ "step": 7500
729
+ },
730
+ {
731
+ "epoch": 30.86,
732
+ "learning_rate": 0.00019131101899384867,
733
+ "loss": 0.0956,
734
+ "step": 7600
735
+ },
736
+ {
737
+ "epoch": 31.0,
738
+ "eval_accuracy": 0.39216930542531203,
739
+ "eval_loss": 0.0990176796913147,
740
+ "eval_runtime": 6.1212,
741
+ "eval_samples_per_second": 394.37,
742
+ "eval_steps_per_second": 24.669,
743
+ "step": 7633
744
+ },
745
+ {
746
+ "epoch": 31.27,
747
+ "learning_rate": 0.00018443901218101152,
748
+ "loss": 0.0942,
749
+ "step": 7700
750
+ },
751
+ {
752
+ "epoch": 31.68,
753
+ "learning_rate": 0.00017761980377816285,
754
+ "loss": 0.0931,
755
+ "step": 7800
756
+ },
757
+ {
758
+ "epoch": 32.0,
759
+ "eval_accuracy": 0.39236231870863186,
760
+ "eval_loss": 0.09768786281347275,
761
+ "eval_runtime": 6.0866,
762
+ "eval_samples_per_second": 396.608,
763
+ "eval_steps_per_second": 24.809,
764
+ "step": 7880
765
+ },
766
+ {
767
+ "epoch": 32.08,
768
+ "learning_rate": 0.00017085888551709338,
769
+ "loss": 0.0945,
770
+ "step": 7900
771
+ },
772
+ {
773
+ "epoch": 32.49,
774
+ "learning_rate": 0.00016416170218663446,
775
+ "loss": 0.0916,
776
+ "step": 8000
777
+ },
778
+ {
779
+ "epoch": 32.89,
780
+ "learning_rate": 0.0001575336472477909,
781
+ "loss": 0.0926,
782
+ "step": 8100
783
+ },
784
+ {
785
+ "epoch": 33.0,
786
+ "eval_accuracy": 0.3922312153463769,
787
+ "eval_loss": 0.09789934009313583,
788
+ "eval_runtime": 6.1446,
789
+ "eval_samples_per_second": 392.866,
790
+ "eval_steps_per_second": 24.574,
791
+ "step": 8126
792
+ },
793
+ {
794
+ "epoch": 33.3,
795
+ "learning_rate": 0.0001509800584902108,
796
+ "loss": 0.0912,
797
+ "step": 8200
798
+ },
799
+ {
800
+ "epoch": 33.71,
801
+ "learning_rate": 0.00014450621373348819,
802
+ "loss": 0.0908,
803
+ "step": 8300
804
+ },
805
+ {
806
+ "epoch": 34.0,
807
+ "eval_accuracy": 0.3923732439888198,
808
+ "eval_loss": 0.09784437716007233,
809
+ "eval_runtime": 6.1319,
810
+ "eval_samples_per_second": 393.678,
811
+ "eval_steps_per_second": 24.625,
812
+ "step": 8372
813
+ },
814
+ {
815
+ "epoch": 34.11,
816
+ "learning_rate": 0.0001381173265767623,
817
+ "loss": 0.0905,
818
+ "step": 8400
819
+ },
820
+ {
821
+ "epoch": 34.52,
822
+ "learning_rate": 0.00013181854220003567,
823
+ "loss": 0.0895,
824
+ "step": 8500
825
+ },
826
+ {
827
+ "epoch": 34.92,
828
+ "learning_rate": 0.00012561493322059202,
829
+ "loss": 0.0896,
830
+ "step": 8600
831
+ },
832
+ {
833
+ "epoch": 35.0,
834
+ "eval_accuracy": 0.3924324225898377,
835
+ "eval_loss": 0.09740225225687027,
836
+ "eval_runtime": 6.1438,
837
+ "eval_samples_per_second": 392.916,
838
+ "eval_steps_per_second": 24.578,
839
+ "step": 8618
840
+ },
841
+ {
842
+ "epoch": 35.33,
843
+ "learning_rate": 0.00011951149560785166,
844
+ "loss": 0.0892,
845
+ "step": 8700
846
+ },
847
+ {
848
+ "epoch": 35.74,
849
+ "learning_rate": 0.0001135131446599535,
850
+ "loss": 0.0883,
851
+ "step": 8800
852
+ },
853
+ {
854
+ "epoch": 36.0,
855
+ "eval_accuracy": 0.3924469896300882,
856
+ "eval_loss": 0.09739205241203308,
857
+ "eval_runtime": 6.1304,
858
+ "eval_samples_per_second": 393.776,
859
+ "eval_steps_per_second": 24.631,
860
+ "step": 8865
861
+ },
862
+ {
863
+ "epoch": 36.14,
864
+ "learning_rate": 0.00010762471104530472,
865
+ "loss": 0.0877,
866
+ "step": 8900
867
+ },
868
+ {
869
+ "epoch": 36.55,
870
+ "learning_rate": 0.00010185093691228534,
871
+ "loss": 0.0866,
872
+ "step": 9000
873
+ },
874
+ {
875
+ "epoch": 36.95,
876
+ "learning_rate": 9.619647207024071e-05,
877
+ "loss": 0.0878,
878
+ "step": 9100
879
+ },
880
+ {
881
+ "epoch": 37.0,
882
+ "eval_accuracy": 0.3924242286296967,
883
+ "eval_loss": 0.0978410392999649,
884
+ "eval_runtime": 6.1135,
885
+ "eval_samples_per_second": 394.865,
886
+ "eval_steps_per_second": 24.699,
887
+ "step": 9111
888
+ },
889
+ {
890
+ "epoch": 37.36,
891
+ "learning_rate": 9.0665870244838e-05,
892
+ "loss": 0.0859,
893
+ "step": 9200
894
+ },
895
+ {
896
+ "epoch": 37.77,
897
+ "learning_rate": 8.526358541080173e-05,
898
+ "loss": 0.0865,
899
+ "step": 9300
900
+ },
901
+ {
902
+ "epoch": 38.0,
903
+ "eval_accuracy": 0.3925216457113723,
904
+ "eval_loss": 0.09711020439863205,
905
+ "eval_runtime": 6.1232,
906
+ "eval_samples_per_second": 394.237,
907
+ "eval_steps_per_second": 24.66,
908
+ "step": 9357
909
+ },
910
+ {
911
+ "epoch": 38.17,
912
+ "learning_rate": 7.999396820498208e-05,
913
+ "loss": 0.0857,
914
+ "step": 9400
915
+ },
916
+ {
917
+ "epoch": 38.58,
918
+ "learning_rate": 7.486126242264468e-05,
919
+ "loss": 0.085,
920
+ "step": 9500
921
+ },
922
+ {
923
+ "epoch": 38.98,
924
+ "learning_rate": 6.986960159980326e-05,
925
+ "loss": 0.0855,
926
+ "step": 9600
927
+ },
928
+ {
929
+ "epoch": 39.0,
930
+ "eval_accuracy": 0.3925689885921866,
931
+ "eval_loss": 0.09748318791389465,
932
+ "eval_runtime": 6.1096,
933
+ "eval_samples_per_second": 395.116,
934
+ "eval_steps_per_second": 24.715,
935
+ "step": 9603
936
+ },
937
+ {
938
+ "epoch": 39.39,
939
+ "learning_rate": 6.502300568434777e-05,
940
+ "loss": 0.0838,
941
+ "step": 9700
942
+ },
943
+ {
944
+ "epoch": 39.8,
945
+ "learning_rate": 6.0325377798648745e-05,
946
+ "loss": 0.0847,
947
+ "step": 9800
948
+ },
949
+ {
950
+ "epoch": 40.0,
951
+ "eval_accuracy": 0.3925434962717481,
952
+ "eval_loss": 0.0970187559723854,
953
+ "eval_runtime": 6.2108,
954
+ "eval_samples_per_second": 388.675,
955
+ "eval_steps_per_second": 24.312,
956
+ "step": 9850
957
+ },
958
+ {
959
+ "epoch": 40.2,
960
+ "learning_rate": 5.578050109624511e-05,
961
+ "loss": 0.0827,
962
+ "step": 9900
963
+ },
964
+ {
965
+ "epoch": 40.61,
966
+ "learning_rate": 5.139203571514672e-05,
967
+ "loss": 0.0835,
968
+ "step": 10000
969
+ },
970
+ {
971
+ "epoch": 41.0,
972
+ "eval_accuracy": 0.3925671677121553,
973
+ "eval_loss": 0.09726004302501678,
974
+ "eval_runtime": 6.0996,
975
+ "eval_samples_per_second": 395.763,
976
+ "eval_steps_per_second": 24.756,
977
+ "step": 10096
978
+ },
979
+ {
980
+ "epoch": 41.02,
981
+ "learning_rate": 4.716351583020542e-05,
982
+ "loss": 0.0831,
983
+ "step": 10100
984
+ },
985
+ {
986
+ "epoch": 41.42,
987
+ "learning_rate": 4.3098346806928315e-05,
988
+ "loss": 0.0821,
989
+ "step": 10200
990
+ },
991
+ {
992
+ "epoch": 41.83,
993
+ "learning_rate": 3.919980245902524e-05,
994
+ "loss": 0.0825,
995
+ "step": 10300
996
+ },
997
+ {
998
+ "epoch": 42.0,
999
+ "eval_accuracy": 0.392557152871983,
1000
+ "eval_loss": 0.09756684303283691,
1001
+ "eval_runtime": 6.1355,
1002
+ "eval_samples_per_second": 393.449,
1003
+ "eval_steps_per_second": 24.611,
1004
+ "step": 10342
1005
+ },
1006
+ {
1007
+ "epoch": 42.23,
1008
+ "learning_rate": 3.5471022411899925e-05,
1009
+ "loss": 0.0825,
1010
+ "step": 10400
1011
+ },
1012
+ {
1013
+ "epoch": 42.64,
1014
+ "learning_rate": 3.1915009574206264e-05,
1015
+ "loss": 0.0814,
1016
+ "step": 10500
1017
+ },
1018
+ {
1019
+ "epoch": 43.0,
1020
+ "eval_accuracy": 0.3925817347524058,
1021
+ "eval_loss": 0.09748771786689758,
1022
+ "eval_runtime": 6.1103,
1023
+ "eval_samples_per_second": 395.068,
1024
+ "eval_steps_per_second": 24.712,
1025
+ "step": 10588
1026
+ },
1027
+ {
1028
+ "epoch": 43.0,
1029
+ "step": 10588,
1030
+ "total_flos": 3084220612961280.0,
1031
+ "train_loss": 0.2331255912600619,
1032
+ "train_runtime": 2746.0841,
1033
+ "train_samples_per_second": 143.441,
1034
+ "train_steps_per_second": 4.479
1035
+ }
1036
+ ],
1037
+ "max_steps": 12300,
1038
+ "num_train_epochs": 50,
1039
+ "total_flos": 3084220612961280.0,
1040
+ "trial_name": null,
1041
+ "trial_params": null
1042
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0db7b0d2599b04668617c424a3337e9f602ee02316377c982902ccf0e0c3346
3
+ size 3643