kevinzyz commited on
Commit
82d48a2
·
1 Parent(s): 0590763

Training in progress, epoch 1

Browse files
all_results.json CHANGED
@@ -1,15 +1,15 @@
1
  {
2
- "epoch": 10.0,
3
- "eval_accuracy": 0.27000001072883606,
4
- "eval_loss": 1.60148286819458,
5
- "eval_runtime": 0.3331,
6
  "eval_samples": 500,
7
- "eval_samples_per_second": 1501.132,
8
- "eval_steps_per_second": 24.018,
9
- "total_flos": 22540141814400.0,
10
- "train_loss": 1.6052362956697979,
11
- "train_runtime": 55.5553,
12
  "train_samples": 4000,
13
- "train_samples_per_second": 720.003,
14
- "train_steps_per_second": 11.34
15
  }
 
1
  {
2
+ "epoch": 5.0,
3
+ "eval_accuracy": 0.2919999957084656,
4
+ "eval_loss": 1.569107174873352,
5
+ "eval_runtime": 0.7522,
6
  "eval_samples": 500,
7
+ "eval_samples_per_second": 664.689,
8
+ "eval_steps_per_second": 42.54,
9
+ "total_flos": 10492735676640.0,
10
+ "train_loss": 1.5705763679504394,
11
+ "train_runtime": 78.9966,
12
  "train_samples": 4000,
13
+ "train_samples_per_second": 253.176,
14
+ "train_steps_per_second": 15.823
15
  }
eval_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 10.0,
3
- "eval_accuracy": 0.27000001072883606,
4
- "eval_loss": 1.60148286819458,
5
- "eval_runtime": 0.3331,
6
  "eval_samples": 500,
7
- "eval_samples_per_second": 1501.132,
8
- "eval_steps_per_second": 24.018
9
  }
 
1
  {
2
+ "epoch": 5.0,
3
+ "eval_accuracy": 0.2919999957084656,
4
+ "eval_loss": 1.569107174873352,
5
+ "eval_runtime": 0.7522,
6
  "eval_samples": 500,
7
+ "eval_samples_per_second": 664.689,
8
+ "eval_steps_per_second": 42.54
9
  }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e0096f1a45fa31f275d0de57dd202bc75ce557882075f303de9bcaf314c72d4e
3
  size 12755881
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb9208d3ee68d75842fcb7a1f0a04d765ee4bf462a4876219b125988db0d436c
3
  size 12755881
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 10.0,
3
- "total_flos": 22540141814400.0,
4
- "train_loss": 1.6052362956697979,
5
- "train_runtime": 55.5553,
6
  "train_samples": 4000,
7
- "train_samples_per_second": 720.003,
8
- "train_steps_per_second": 11.34
9
  }
 
1
  {
2
+ "epoch": 5.0,
3
+ "total_flos": 10492735676640.0,
4
+ "train_loss": 1.5705763679504394,
5
+ "train_runtime": 78.9966,
6
  "train_samples": 4000,
7
+ "train_samples_per_second": 253.176,
8
+ "train_steps_per_second": 15.823
9
  }
trainer_state.json CHANGED
@@ -1,301 +1,442 @@
1
  {
2
- "best_metric": 1.60148286819458,
3
- "best_model_checkpoint": "chinese_roberta_L-2_H-128-finetuned-MC-hyper/checkpoint-630",
4
- "epoch": 10.0,
5
- "global_step": 630,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
- "epoch": 0.32,
12
- "learning_rate": 9.682539682539683e-06,
13
- "loss": 1.6084,
14
  "step": 20
15
  },
16
  {
17
- "epoch": 0.63,
18
- "learning_rate": 9.365079365079366e-06,
19
- "loss": 1.6085,
20
  "step": 40
21
  },
22
  {
23
- "epoch": 0.95,
24
- "learning_rate": 9.047619047619049e-06,
25
- "loss": 1.6093,
26
  "step": 60
27
  },
28
  {
29
- "epoch": 1.0,
30
- "eval_accuracy": 0.27799999713897705,
31
- "eval_loss": 1.6079264879226685,
32
- "eval_runtime": 0.3306,
33
- "eval_samples_per_second": 1512.408,
34
- "eval_steps_per_second": 24.199,
35
- "step": 63
36
- },
37
- {
38
- "epoch": 1.27,
39
- "learning_rate": 8.730158730158731e-06,
40
- "loss": 1.6092,
41
  "step": 80
42
  },
43
  {
44
- "epoch": 1.59,
45
- "learning_rate": 8.412698412698414e-06,
46
- "loss": 1.6084,
47
  "step": 100
48
  },
49
  {
50
- "epoch": 1.9,
51
- "learning_rate": 8.095238095238097e-06,
52
- "loss": 1.6083,
53
  "step": 120
54
  },
55
  {
56
- "epoch": 2.0,
57
- "eval_accuracy": 0.27799999713897705,
58
- "eval_loss": 1.6071099042892456,
59
- "eval_runtime": 0.3325,
60
- "eval_samples_per_second": 1503.597,
61
- "eval_steps_per_second": 24.058,
62
- "step": 126
63
- },
64
- {
65
- "epoch": 2.22,
66
- "learning_rate": 7.77777777777778e-06,
67
- "loss": 1.6073,
68
  "step": 140
69
  },
70
  {
71
- "epoch": 2.54,
72
- "learning_rate": 7.460317460317461e-06,
73
- "loss": 1.6081,
74
  "step": 160
75
  },
76
  {
77
- "epoch": 2.86,
78
- "learning_rate": 7.1428571428571436e-06,
79
- "loss": 1.6077,
80
  "step": 180
81
  },
82
  {
83
- "epoch": 3.0,
84
- "eval_accuracy": 0.28600001335144043,
85
- "eval_loss": 1.6063222885131836,
86
- "eval_runtime": 0.3267,
87
- "eval_samples_per_second": 1530.439,
88
- "eval_steps_per_second": 24.487,
89
- "step": 189
90
- },
91
- {
92
- "epoch": 3.17,
93
- "learning_rate": 6.825396825396826e-06,
94
- "loss": 1.6061,
95
  "step": 200
96
  },
97
  {
98
- "epoch": 3.49,
99
- "learning_rate": 6.507936507936509e-06,
100
- "loss": 1.6081,
101
  "step": 220
102
  },
103
  {
104
- "epoch": 3.81,
105
- "learning_rate": 6.1904761904761914e-06,
106
- "loss": 1.6078,
107
  "step": 240
108
  },
109
  {
110
- "epoch": 4.0,
111
- "eval_accuracy": 0.2919999957084656,
112
- "eval_loss": 1.605468511581421,
113
- "eval_runtime": 0.391,
114
- "eval_samples_per_second": 1278.612,
115
- "eval_steps_per_second": 20.458,
116
- "step": 252
117
  },
118
  {
119
- "epoch": 4.13,
120
- "learning_rate": 5.873015873015874e-06,
121
- "loss": 1.606,
122
  "step": 260
123
  },
124
  {
125
- "epoch": 4.44,
126
- "learning_rate": 5.555555555555557e-06,
127
- "loss": 1.6059,
128
  "step": 280
129
  },
130
  {
131
- "epoch": 4.76,
132
- "learning_rate": 5.2380952380952384e-06,
133
- "loss": 1.6047,
134
  "step": 300
135
  },
136
  {
137
- "epoch": 5.0,
138
- "eval_accuracy": 0.28200000524520874,
139
- "eval_loss": 1.6045148372650146,
140
- "eval_runtime": 0.3348,
141
- "eval_samples_per_second": 1493.284,
142
- "eval_steps_per_second": 23.893,
143
- "step": 315
144
- },
145
- {
146
- "epoch": 5.08,
147
- "learning_rate": 4.920634920634921e-06,
148
- "loss": 1.6059,
149
  "step": 320
150
  },
151
  {
152
- "epoch": 5.4,
153
- "learning_rate": 4.603174603174604e-06,
154
- "loss": 1.6057,
155
  "step": 340
156
  },
157
  {
158
- "epoch": 5.71,
159
- "learning_rate": 4.2857142857142855e-06,
160
- "loss": 1.6042,
161
  "step": 360
162
  },
163
  {
164
- "epoch": 6.0,
165
- "eval_accuracy": 0.27799999713897705,
166
- "eval_loss": 1.60362708568573,
167
- "eval_runtime": 0.3262,
168
- "eval_samples_per_second": 1532.953,
169
- "eval_steps_per_second": 24.527,
170
- "step": 378
171
- },
172
- {
173
- "epoch": 6.03,
174
- "learning_rate": 3.968253968253968e-06,
175
- "loss": 1.6031,
176
  "step": 380
177
  },
178
  {
179
- "epoch": 6.35,
180
- "learning_rate": 3.6507936507936507e-06,
181
- "loss": 1.6043,
182
  "step": 400
183
  },
184
  {
185
- "epoch": 6.67,
186
- "learning_rate": 3.3333333333333333e-06,
187
- "loss": 1.604,
188
  "step": 420
189
  },
190
  {
191
- "epoch": 6.98,
192
- "learning_rate": 3.015873015873016e-06,
193
- "loss": 1.6029,
194
  "step": 440
195
  },
196
  {
197
- "epoch": 7.0,
198
- "eval_accuracy": 0.27399998903274536,
199
- "eval_loss": 1.6027634143829346,
200
- "eval_runtime": 0.3276,
201
- "eval_samples_per_second": 1526.145,
202
- "eval_steps_per_second": 24.418,
203
- "step": 441
204
- },
205
- {
206
- "epoch": 7.3,
207
- "learning_rate": 2.6984126984126986e-06,
208
- "loss": 1.6022,
209
  "step": 460
210
  },
211
  {
212
- "epoch": 7.62,
213
- "learning_rate": 2.380952380952381e-06,
214
- "loss": 1.6041,
215
  "step": 480
216
  },
217
  {
218
- "epoch": 7.94,
219
- "learning_rate": 2.0634920634920634e-06,
220
- "loss": 1.6007,
221
  "step": 500
222
  },
223
  {
224
- "epoch": 8.0,
225
- "eval_accuracy": 0.27000001072883606,
226
- "eval_loss": 1.6020437479019165,
227
- "eval_runtime": 0.3322,
228
- "eval_samples_per_second": 1505.137,
229
- "eval_steps_per_second": 24.082,
230
- "step": 504
231
  },
232
  {
233
- "epoch": 8.25,
234
- "learning_rate": 1.746031746031746e-06,
235
- "loss": 1.6023,
236
  "step": 520
237
  },
238
  {
239
- "epoch": 8.57,
240
- "learning_rate": 1.4285714285714286e-06,
241
- "loss": 1.6026,
242
  "step": 540
243
  },
244
  {
245
- "epoch": 8.89,
246
- "learning_rate": 1.111111111111111e-06,
247
- "loss": 1.6015,
248
  "step": 560
249
  },
250
  {
251
- "epoch": 9.0,
252
- "eval_accuracy": 0.27000001072883606,
253
- "eval_loss": 1.6016141176223755,
254
- "eval_runtime": 0.3269,
255
- "eval_samples_per_second": 1529.374,
256
- "eval_steps_per_second": 24.47,
257
- "step": 567
258
- },
259
- {
260
- "epoch": 9.21,
261
- "learning_rate": 7.936507936507937e-07,
262
- "loss": 1.6016,
263
  "step": 580
264
  },
265
  {
266
- "epoch": 9.52,
267
- "learning_rate": 4.7619047619047623e-07,
268
- "loss": 1.6024,
269
  "step": 600
270
  },
271
  {
272
- "epoch": 9.84,
273
- "learning_rate": 1.5873015873015874e-07,
274
- "loss": 1.6017,
275
  "step": 620
276
  },
277
  {
278
- "epoch": 10.0,
279
- "eval_accuracy": 0.27000001072883606,
280
- "eval_loss": 1.60148286819458,
281
- "eval_runtime": 0.3257,
282
- "eval_samples_per_second": 1535.263,
283
- "eval_steps_per_second": 24.564,
284
- "step": 630
285
  },
286
  {
287
- "epoch": 10.0,
288
- "step": 630,
289
- "total_flos": 22540141814400.0,
290
- "train_loss": 1.6052362956697979,
291
- "train_runtime": 55.5553,
292
- "train_samples_per_second": 720.003,
293
- "train_steps_per_second": 11.34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
294
  }
295
  ],
296
- "max_steps": 630,
297
- "num_train_epochs": 10,
298
- "total_flos": 22540141814400.0,
299
  "trial_name": null,
300
  "trial_params": null
301
  }
 
1
  {
2
+ "best_metric": 1.569107174873352,
3
+ "best_model_checkpoint": "chinese_roberta_L-2_H-128-finetuned-MC-hyper/checkpoint-1250",
4
+ "epoch": 5.0,
5
+ "global_step": 1250,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
+ "epoch": 0.08,
12
+ "learning_rate": 4.92e-05,
13
+ "loss": 1.6109,
14
  "step": 20
15
  },
16
  {
17
+ "epoch": 0.16,
18
+ "learning_rate": 4.8400000000000004e-05,
19
+ "loss": 1.6093,
20
  "step": 40
21
  },
22
  {
23
+ "epoch": 0.24,
24
+ "learning_rate": 4.76e-05,
25
+ "loss": 1.6059,
26
  "step": 60
27
  },
28
  {
29
+ "epoch": 0.32,
30
+ "learning_rate": 4.6800000000000006e-05,
31
+ "loss": 1.6083,
 
 
 
 
 
 
 
 
 
32
  "step": 80
33
  },
34
  {
35
+ "epoch": 0.4,
36
+ "learning_rate": 4.600000000000001e-05,
37
+ "loss": 1.6063,
38
  "step": 100
39
  },
40
  {
41
+ "epoch": 0.48,
42
+ "learning_rate": 4.52e-05,
43
+ "loss": 1.606,
44
  "step": 120
45
  },
46
  {
47
+ "epoch": 0.56,
48
+ "learning_rate": 4.44e-05,
49
+ "loss": 1.6085,
 
 
 
 
 
 
 
 
 
50
  "step": 140
51
  },
52
  {
53
+ "epoch": 0.64,
54
+ "learning_rate": 4.36e-05,
55
+ "loss": 1.609,
56
  "step": 160
57
  },
58
  {
59
+ "epoch": 0.72,
60
+ "learning_rate": 4.2800000000000004e-05,
61
+ "loss": 1.6032,
62
  "step": 180
63
  },
64
  {
65
+ "epoch": 0.8,
66
+ "learning_rate": 4.2e-05,
67
+ "loss": 1.609,
 
 
 
 
 
 
 
 
 
68
  "step": 200
69
  },
70
  {
71
+ "epoch": 0.88,
72
+ "learning_rate": 4.12e-05,
73
+ "loss": 1.6022,
74
  "step": 220
75
  },
76
  {
77
+ "epoch": 0.96,
78
+ "learning_rate": 4.0400000000000006e-05,
79
+ "loss": 1.5993,
80
  "step": 240
81
  },
82
  {
83
+ "epoch": 1.0,
84
+ "eval_accuracy": 0.27799999713897705,
85
+ "eval_loss": 1.596663236618042,
86
+ "eval_runtime": 0.7533,
87
+ "eval_samples_per_second": 663.789,
88
+ "eval_steps_per_second": 42.483,
89
+ "step": 250
90
  },
91
  {
92
+ "epoch": 1.04,
93
+ "learning_rate": 3.960000000000001e-05,
94
+ "loss": 1.6047,
95
  "step": 260
96
  },
97
  {
98
+ "epoch": 1.12,
99
+ "learning_rate": 3.88e-05,
100
+ "loss": 1.6005,
101
  "step": 280
102
  },
103
  {
104
+ "epoch": 1.2,
105
+ "learning_rate": 3.8e-05,
106
+ "loss": 1.5939,
107
  "step": 300
108
  },
109
  {
110
+ "epoch": 1.28,
111
+ "learning_rate": 3.72e-05,
112
+ "loss": 1.5916,
 
 
 
 
 
 
 
 
 
113
  "step": 320
114
  },
115
  {
116
+ "epoch": 1.36,
117
+ "learning_rate": 3.6400000000000004e-05,
118
+ "loss": 1.5908,
119
  "step": 340
120
  },
121
  {
122
+ "epoch": 1.44,
123
+ "learning_rate": 3.56e-05,
124
+ "loss": 1.59,
125
  "step": 360
126
  },
127
  {
128
+ "epoch": 1.52,
129
+ "learning_rate": 3.48e-05,
130
+ "loss": 1.5996,
 
 
 
 
 
 
 
 
 
131
  "step": 380
132
  },
133
  {
134
+ "epoch": 1.6,
135
+ "learning_rate": 3.4000000000000007e-05,
136
+ "loss": 1.586,
137
  "step": 400
138
  },
139
  {
140
+ "epoch": 1.68,
141
+ "learning_rate": 3.32e-05,
142
+ "loss": 1.5712,
143
  "step": 420
144
  },
145
  {
146
+ "epoch": 1.76,
147
+ "learning_rate": 3.24e-05,
148
+ "loss": 1.5726,
149
  "step": 440
150
  },
151
  {
152
+ "epoch": 1.84,
153
+ "learning_rate": 3.16e-05,
154
+ "loss": 1.5727,
 
 
 
 
 
 
 
 
 
155
  "step": 460
156
  },
157
  {
158
+ "epoch": 1.92,
159
+ "learning_rate": 3.08e-05,
160
+ "loss": 1.5822,
161
  "step": 480
162
  },
163
  {
164
+ "epoch": 2.0,
165
+ "learning_rate": 3e-05,
166
+ "loss": 1.5731,
167
  "step": 500
168
  },
169
  {
170
+ "epoch": 2.0,
171
+ "eval_accuracy": 0.3019999861717224,
172
+ "eval_loss": 1.5877653360366821,
173
+ "eval_runtime": 0.647,
174
+ "eval_samples_per_second": 772.82,
175
+ "eval_steps_per_second": 49.46,
176
+ "step": 500
177
  },
178
  {
179
+ "epoch": 2.08,
180
+ "learning_rate": 2.9199999999999998e-05,
181
+ "loss": 1.5914,
182
  "step": 520
183
  },
184
  {
185
+ "epoch": 2.16,
186
+ "learning_rate": 2.84e-05,
187
+ "loss": 1.5627,
188
  "step": 540
189
  },
190
  {
191
+ "epoch": 2.24,
192
+ "learning_rate": 2.7600000000000003e-05,
193
+ "loss": 1.5672,
194
  "step": 560
195
  },
196
  {
197
+ "epoch": 2.32,
198
+ "learning_rate": 2.6800000000000004e-05,
199
+ "loss": 1.564,
 
 
 
 
 
 
 
 
 
200
  "step": 580
201
  },
202
  {
203
+ "epoch": 2.4,
204
+ "learning_rate": 2.6000000000000002e-05,
205
+ "loss": 1.577,
206
  "step": 600
207
  },
208
  {
209
+ "epoch": 2.48,
210
+ "learning_rate": 2.5200000000000003e-05,
211
+ "loss": 1.5712,
212
  "step": 620
213
  },
214
  {
215
+ "epoch": 2.56,
216
+ "learning_rate": 2.44e-05,
217
+ "loss": 1.5725,
218
+ "step": 640
 
 
 
219
  },
220
  {
221
+ "epoch": 2.64,
222
+ "learning_rate": 2.36e-05,
223
+ "loss": 1.5532,
224
+ "step": 660
225
+ },
226
+ {
227
+ "epoch": 2.72,
228
+ "learning_rate": 2.2800000000000002e-05,
229
+ "loss": 1.5703,
230
+ "step": 680
231
+ },
232
+ {
233
+ "epoch": 2.8,
234
+ "learning_rate": 2.2000000000000003e-05,
235
+ "loss": 1.5803,
236
+ "step": 700
237
+ },
238
+ {
239
+ "epoch": 2.88,
240
+ "learning_rate": 2.12e-05,
241
+ "loss": 1.5728,
242
+ "step": 720
243
+ },
244
+ {
245
+ "epoch": 2.96,
246
+ "learning_rate": 2.04e-05,
247
+ "loss": 1.5434,
248
+ "step": 740
249
+ },
250
+ {
251
+ "epoch": 3.0,
252
+ "eval_accuracy": 0.29600000381469727,
253
+ "eval_loss": 1.5791035890579224,
254
+ "eval_runtime": 0.7356,
255
+ "eval_samples_per_second": 679.717,
256
+ "eval_steps_per_second": 43.502,
257
+ "step": 750
258
+ },
259
+ {
260
+ "epoch": 3.04,
261
+ "learning_rate": 1.9600000000000002e-05,
262
+ "loss": 1.5569,
263
+ "step": 760
264
+ },
265
+ {
266
+ "epoch": 3.12,
267
+ "learning_rate": 1.88e-05,
268
+ "loss": 1.5382,
269
+ "step": 780
270
+ },
271
+ {
272
+ "epoch": 3.2,
273
+ "learning_rate": 1.8e-05,
274
+ "loss": 1.5478,
275
+ "step": 800
276
+ },
277
+ {
278
+ "epoch": 3.28,
279
+ "learning_rate": 1.7199999999999998e-05,
280
+ "loss": 1.5665,
281
+ "step": 820
282
+ },
283
+ {
284
+ "epoch": 3.36,
285
+ "learning_rate": 1.6400000000000002e-05,
286
+ "loss": 1.5494,
287
+ "step": 840
288
+ },
289
+ {
290
+ "epoch": 3.44,
291
+ "learning_rate": 1.56e-05,
292
+ "loss": 1.5489,
293
+ "step": 860
294
+ },
295
+ {
296
+ "epoch": 3.52,
297
+ "learning_rate": 1.48e-05,
298
+ "loss": 1.5483,
299
+ "step": 880
300
+ },
301
+ {
302
+ "epoch": 3.6,
303
+ "learning_rate": 1.4000000000000001e-05,
304
+ "loss": 1.5388,
305
+ "step": 900
306
+ },
307
+ {
308
+ "epoch": 3.68,
309
+ "learning_rate": 1.32e-05,
310
+ "loss": 1.5563,
311
+ "step": 920
312
+ },
313
+ {
314
+ "epoch": 3.76,
315
+ "learning_rate": 1.24e-05,
316
+ "loss": 1.5616,
317
+ "step": 940
318
+ },
319
+ {
320
+ "epoch": 3.84,
321
+ "learning_rate": 1.16e-05,
322
+ "loss": 1.5512,
323
+ "step": 960
324
+ },
325
+ {
326
+ "epoch": 3.92,
327
+ "learning_rate": 1.08e-05,
328
+ "loss": 1.5288,
329
+ "step": 980
330
+ },
331
+ {
332
+ "epoch": 4.0,
333
+ "learning_rate": 1e-05,
334
+ "loss": 1.5473,
335
+ "step": 1000
336
+ },
337
+ {
338
+ "epoch": 4.0,
339
+ "eval_accuracy": 0.2939999997615814,
340
+ "eval_loss": 1.5701889991760254,
341
+ "eval_runtime": 0.7589,
342
+ "eval_samples_per_second": 658.833,
343
+ "eval_steps_per_second": 42.165,
344
+ "step": 1000
345
+ },
346
+ {
347
+ "epoch": 4.08,
348
+ "learning_rate": 9.2e-06,
349
+ "loss": 1.5642,
350
+ "step": 1020
351
+ },
352
+ {
353
+ "epoch": 4.16,
354
+ "learning_rate": 8.400000000000001e-06,
355
+ "loss": 1.5334,
356
+ "step": 1040
357
+ },
358
+ {
359
+ "epoch": 4.24,
360
+ "learning_rate": 7.6e-06,
361
+ "loss": 1.526,
362
+ "step": 1060
363
+ },
364
+ {
365
+ "epoch": 4.32,
366
+ "learning_rate": 6.800000000000001e-06,
367
+ "loss": 1.5361,
368
+ "step": 1080
369
+ },
370
+ {
371
+ "epoch": 4.4,
372
+ "learning_rate": 6e-06,
373
+ "loss": 1.573,
374
+ "step": 1100
375
+ },
376
+ {
377
+ "epoch": 4.48,
378
+ "learning_rate": 5.2e-06,
379
+ "loss": 1.5543,
380
+ "step": 1120
381
+ },
382
+ {
383
+ "epoch": 4.56,
384
+ "learning_rate": 4.4e-06,
385
+ "loss": 1.5193,
386
+ "step": 1140
387
+ },
388
+ {
389
+ "epoch": 4.64,
390
+ "learning_rate": 3.6e-06,
391
+ "loss": 1.5401,
392
+ "step": 1160
393
+ },
394
+ {
395
+ "epoch": 4.72,
396
+ "learning_rate": 2.8000000000000003e-06,
397
+ "loss": 1.5373,
398
+ "step": 1180
399
+ },
400
+ {
401
+ "epoch": 4.8,
402
+ "learning_rate": 2.0000000000000003e-06,
403
+ "loss": 1.5279,
404
+ "step": 1200
405
+ },
406
+ {
407
+ "epoch": 4.88,
408
+ "learning_rate": 1.2000000000000002e-06,
409
+ "loss": 1.5503,
410
+ "step": 1220
411
+ },
412
+ {
413
+ "epoch": 4.96,
414
+ "learning_rate": 4.0000000000000003e-07,
415
+ "loss": 1.5422,
416
+ "step": 1240
417
+ },
418
+ {
419
+ "epoch": 5.0,
420
+ "eval_accuracy": 0.2919999957084656,
421
+ "eval_loss": 1.569107174873352,
422
+ "eval_runtime": 0.7454,
423
+ "eval_samples_per_second": 670.819,
424
+ "eval_steps_per_second": 42.932,
425
+ "step": 1250
426
+ },
427
+ {
428
+ "epoch": 5.0,
429
+ "step": 1250,
430
+ "total_flos": 10492735676640.0,
431
+ "train_loss": 1.5705763679504394,
432
+ "train_runtime": 78.9966,
433
+ "train_samples_per_second": 253.176,
434
+ "train_steps_per_second": 15.823
435
  }
436
  ],
437
+ "max_steps": 1250,
438
+ "num_train_epochs": 5,
439
+ "total_flos": 10492735676640.0,
440
  "trial_name": null,
441
  "trial_params": null
442
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:369f79da83f13b5b29eb52d0370d4338999b9bc2fbbcd98395648ddfc7a6d687
3
  size 2799
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1eb0324ba0a73ee29fb2559c05fad4e6c64a97f34e7bb4de884b1c0c9cc415f
3
  size 2799