yyx123 committed on
Commit
efe5471
1 Parent(s): 0782c3f

Model save

Browse files
Files changed (5) hide show
  1. README.md +22 -25
  2. all_results.json +8 -8
  3. eval_results.json +4 -4
  4. train_results.json +4 -4
  5. trainer_state.json +246 -138
README.md CHANGED
@@ -2,13 +2,9 @@
2
  license: other
3
  library_name: peft
4
  tags:
5
- - alignment-handbook
6
- - generated_from_trainer
7
  - trl
8
  - sft
9
  - generated_from_trainer
10
- datasets:
11
- - ruozhiba
12
  base_model: 01-ai/Yi-6B
13
  model-index:
14
  - name: Yi-6B-ruozhiba3
@@ -20,9 +16,9 @@ should probably proofread and complete it, then remove this comment. -->
20
 
21
  # Yi-6B-ruozhiba3
22
 
23
- This model is a fine-tuned version of [01-ai/Yi-6B](https://huggingface.co/01-ai/Yi-6B) on the ruozhiba dataset.
24
  It achieves the following results on the evaluation set:
25
- - Loss: 4.3351
26
 
27
  ## Model description
28
 
@@ -54,25 +50,26 @@ The following hyperparameters were used during training:
54
 
55
  | Training Loss | Epoch | Step | Validation Loss |
56
  |:-------------:|:-----:|:----:|:---------------:|
57
- | 2.644 | 1.0 | 55 | 2.3047 |
58
- | 1.9548 | 2.0 | 110 | 1.9419 |
59
- | 1.788 | 3.0 | 165 | 1.9135 |
60
- | 1.6342 | 4.0 | 220 | 1.9499 |
61
- | 1.3781 | 5.0 | 275 | 2.1321 |
62
- | 1.0617 | 6.0 | 330 | 2.3518 |
63
- | 0.8104 | 7.0 | 385 | 2.6090 |
64
- | 0.5864 | 8.0 | 440 | 2.8890 |
65
- | 0.4159 | 9.0 | 495 | 3.1356 |
66
- | 0.3344 | 10.0 | 550 | 3.3190 |
67
- | 0.2446 | 11.0 | 605 | 3.5470 |
68
- | 0.199 | 12.0 | 660 | 3.6840 |
69
- | 0.1245 | 13.0 | 715 | 3.7653 |
70
- | 0.1208 | 14.0 | 770 | 3.8722 |
71
- | 0.1003 | 15.0 | 825 | 3.9575 |
72
- | 0.0767 | 16.0 | 880 | 3.9671 |
73
- | 0.0913 | 17.0 | 935 | 3.9921 |
74
- | 0.0895 | 18.0 | 990 | 3.9940 |
75
- | 0.0671 | 19.0 | 1045 | 3.9915 |
 
76
 
77
 
78
  ### Framework versions
 
2
  license: other
3
  library_name: peft
4
  tags:
 
 
5
  - trl
6
  - sft
7
  - generated_from_trainer
 
 
8
  base_model: 01-ai/Yi-6B
9
  model-index:
10
  - name: Yi-6B-ruozhiba3
 
16
 
17
  # Yi-6B-ruozhiba3
18
 
19
+ This model is a fine-tuned version of [01-ai/Yi-6B](https://huggingface.co/01-ai/Yi-6B) on an unspecified dataset.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 4.1874
22
 
23
  ## Model description
24
 
 
50
 
51
  | Training Loss | Epoch | Step | Validation Loss |
52
  |:-------------:|:-----:|:----:|:---------------:|
53
+ | 2.7671 | 1.0 | 55 | 2.3123 |
54
+ | 2.0319 | 2.0 | 110 | 1.9679 |
55
+ | 1.7972 | 3.0 | 165 | 1.9426 |
56
+ | 1.5841 | 4.0 | 220 | 2.0110 |
57
+ | 1.2842 | 5.0 | 275 | 2.2671 |
58
+ | 0.9305 | 6.0 | 330 | 2.5263 |
59
+ | 0.6734 | 7.0 | 385 | 2.7798 |
60
+ | 0.4579 | 8.0 | 440 | 3.1052 |
61
+ | 0.3091 | 9.0 | 495 | 3.3409 |
62
+ | 0.2418 | 10.0 | 550 | 3.4999 |
63
+ | 0.1718 | 11.0 | 605 | 3.6688 |
64
+ | 0.1555 | 12.0 | 660 | 3.7819 |
65
+ | 0.1191 | 13.0 | 715 | 3.9108 |
66
+ | 0.1291 | 14.0 | 770 | 3.9953 |
67
+ | 0.1213 | 15.0 | 825 | 4.1020 |
68
+ | 0.1 | 16.0 | 880 | 4.1205 |
69
+ | 0.115 | 17.0 | 935 | 4.1606 |
70
+ | 0.1076 | 18.0 | 990 | 4.1839 |
71
+ | 0.0962 | 19.0 | 1045 | 4.1873 |
72
+ | 0.0917 | 20.0 | 1100 | 4.1874 |
73
 
74
 
75
  ### Framework versions
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 20.0,
3
- "eval_loss": 4.335062026977539,
4
- "eval_runtime": 6.2462,
5
  "eval_samples": 23,
6
- "eval_samples_per_second": 3.682,
7
- "eval_steps_per_second": 0.961,
8
- "train_loss": 0.0,
9
- "train_runtime": 10.1895,
10
  "train_samples": 217,
11
- "train_samples_per_second": 425.927,
12
- "train_steps_per_second": 107.954
13
  }
 
1
  {
2
  "epoch": 20.0,
3
+ "eval_loss": 4.187403678894043,
4
+ "eval_runtime": 4.9245,
5
  "eval_samples": 23,
6
+ "eval_samples_per_second": 4.671,
7
+ "eval_steps_per_second": 1.218,
8
+ "train_loss": 0.660087760145014,
9
+ "train_runtime": 8612.1752,
10
  "train_samples": 217,
11
+ "train_samples_per_second": 0.504,
12
+ "train_steps_per_second": 0.128
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 20.0,
3
- "eval_loss": 4.335062026977539,
4
- "eval_runtime": 6.2462,
5
  "eval_samples": 23,
6
- "eval_samples_per_second": 3.682,
7
- "eval_steps_per_second": 0.961
8
  }
 
1
  {
2
  "epoch": 20.0,
3
+ "eval_loss": 4.187403678894043,
4
+ "eval_runtime": 4.9245,
5
  "eval_samples": 23,
6
+ "eval_samples_per_second": 4.671,
7
+ "eval_steps_per_second": 1.218
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 20.0,
3
- "train_loss": 0.0,
4
- "train_runtime": 10.1895,
5
  "train_samples": 217,
6
- "train_samples_per_second": 425.927,
7
- "train_steps_per_second": 107.954
8
  }
 
1
  {
2
  "epoch": 20.0,
3
+ "train_loss": 0.660087760145014,
4
+ "train_runtime": 8612.1752,
5
  "train_samples": 217,
6
+ "train_samples_per_second": 0.504,
7
+ "train_steps_per_second": 0.128
8
  }
trainer_state.json CHANGED
@@ -11,499 +11,607 @@
11
  {
12
  "epoch": 0.02,
13
  "learning_rate": 4.545454545454545e-07,
14
- "loss": 2.5611,
15
  "step": 1
16
  },
17
  {
18
  "epoch": 0.36,
19
  "learning_rate": 9.090909090909091e-06,
20
- "loss": 2.5692,
21
  "step": 20
22
  },
23
  {
24
  "epoch": 0.73,
25
  "learning_rate": 1.8181818181818182e-05,
26
- "loss": 2.644,
27
  "step": 40
28
  },
29
  {
30
  "epoch": 1.0,
31
- "eval_loss": 2.304738759994507,
32
- "eval_runtime": 1.217,
33
- "eval_samples_per_second": 18.899,
34
- "eval_steps_per_second": 4.93,
 
 
 
 
 
35
  "step": 55
36
  },
37
  {
38
  "epoch": 1.09,
39
  "learning_rate": 2.7272727272727273e-05,
40
- "loss": 2.3827,
41
  "step": 60
42
  },
43
  {
44
  "epoch": 1.45,
45
  "learning_rate": 3.6363636363636364e-05,
46
- "loss": 2.0781,
47
  "step": 80
48
  },
49
  {
50
  "epoch": 1.82,
51
  "learning_rate": 4.545454545454546e-05,
52
- "loss": 1.9548,
53
  "step": 100
54
  },
55
  {
56
  "epoch": 2.0,
57
- "eval_loss": 1.9419394731521606,
58
- "eval_runtime": 1.2171,
59
- "eval_samples_per_second": 18.898,
60
- "eval_steps_per_second": 4.93,
 
 
 
 
 
61
  "step": 110
62
  },
63
  {
64
  "epoch": 2.18,
65
  "learning_rate": 4.9987413559579636e-05,
66
- "loss": 1.8022,
67
  "step": 120
68
  },
69
  {
70
  "epoch": 2.55,
71
  "learning_rate": 4.988679806432712e-05,
72
- "loss": 1.8295,
73
  "step": 140
74
  },
75
  {
76
  "epoch": 2.91,
77
  "learning_rate": 4.968597221690986e-05,
78
- "loss": 1.788,
79
  "step": 160
80
  },
81
  {
82
  "epoch": 3.0,
83
- "eval_loss": 1.9134601354599,
84
- "eval_runtime": 1.2132,
85
- "eval_samples_per_second": 18.958,
86
- "eval_steps_per_second": 4.946,
 
 
 
 
 
87
  "step": 165
88
  },
89
  {
90
  "epoch": 3.27,
91
  "learning_rate": 4.938574467213518e-05,
92
- "loss": 1.6784,
93
  "step": 180
94
  },
95
  {
96
  "epoch": 3.64,
97
  "learning_rate": 4.898732434036244e-05,
98
- "loss": 1.6528,
99
  "step": 200
100
  },
101
  {
102
  "epoch": 4.0,
103
  "learning_rate": 4.849231551964771e-05,
104
- "loss": 1.6342,
 
 
 
 
 
105
  "step": 220
106
  },
107
  {
108
  "epoch": 4.0,
109
- "eval_loss": 1.9498955011367798,
110
- "eval_runtime": 1.2502,
111
- "eval_samples_per_second": 18.397,
112
- "eval_steps_per_second": 4.799,
113
  "step": 220
114
  },
115
  {
116
  "epoch": 4.36,
117
  "learning_rate": 4.790271143580174e-05,
118
- "loss": 1.3818,
119
  "step": 240
120
  },
121
  {
122
  "epoch": 4.73,
123
  "learning_rate": 4.722088621637309e-05,
124
- "loss": 1.3781,
125
  "step": 260
126
  },
127
  {
128
  "epoch": 5.0,
129
- "eval_loss": 2.1321451663970947,
130
- "eval_runtime": 1.2118,
131
- "eval_samples_per_second": 18.98,
132
- "eval_steps_per_second": 4.951,
 
 
 
 
 
133
  "step": 275
134
  },
135
  {
136
  "epoch": 5.09,
137
  "learning_rate": 4.644958533087443e-05,
138
- "loss": 1.2512,
139
  "step": 280
140
  },
141
  {
142
  "epoch": 5.45,
143
  "learning_rate": 4.559191453574582e-05,
144
- "loss": 1.0475,
145
  "step": 300
146
  },
147
  {
148
  "epoch": 5.82,
149
  "learning_rate": 4.465132736856969e-05,
150
- "loss": 1.0617,
151
  "step": 320
152
  },
153
  {
154
  "epoch": 6.0,
155
- "eval_loss": 2.3518364429473877,
156
- "eval_runtime": 1.216,
157
- "eval_samples_per_second": 18.914,
158
- "eval_steps_per_second": 4.934,
 
 
 
 
 
159
  "step": 330
160
  },
161
  {
162
  "epoch": 6.18,
163
  "learning_rate": 4.3631611241893874e-05,
164
- "loss": 0.9003,
165
  "step": 340
166
  },
167
  {
168
  "epoch": 6.55,
169
  "learning_rate": 4.2536872192658036e-05,
170
- "loss": 0.7805,
171
  "step": 360
172
  },
173
  {
174
  "epoch": 6.91,
175
  "learning_rate": 4.137151834863213e-05,
176
- "loss": 0.8104,
177
  "step": 380
178
  },
179
  {
180
  "epoch": 7.0,
181
- "eval_loss": 2.609004259109497,
182
- "eval_runtime": 1.2167,
183
- "eval_samples_per_second": 18.904,
184
- "eval_steps_per_second": 4.932,
 
 
 
 
 
185
  "step": 385
186
  },
187
  {
188
  "epoch": 7.27,
189
  "learning_rate": 4.014024217844167e-05,
190
- "loss": 0.6542,
191
  "step": 400
192
  },
193
  {
194
  "epoch": 7.64,
195
  "learning_rate": 3.884800159665276e-05,
196
- "loss": 0.5753,
197
  "step": 420
198
  },
199
  {
200
  "epoch": 8.0,
201
  "learning_rate": 3.7500000000000003e-05,
202
- "loss": 0.5864,
 
 
 
 
 
203
  "step": 440
204
  },
205
  {
206
  "epoch": 8.0,
207
- "eval_loss": 2.8889544010162354,
208
- "eval_runtime": 1.2434,
209
- "eval_samples_per_second": 18.498,
210
- "eval_steps_per_second": 4.825,
211
  "step": 440
212
  },
213
  {
214
  "epoch": 8.36,
215
  "learning_rate": 3.610166531514436e-05,
216
- "loss": 0.4181,
217
  "step": 460
218
  },
219
  {
220
  "epoch": 8.73,
221
  "learning_rate": 3.465862814232822e-05,
222
- "loss": 0.4159,
223
  "step": 480
224
  },
225
  {
226
  "epoch": 9.0,
227
- "eval_loss": 3.1356287002563477,
228
- "eval_runtime": 1.2192,
229
- "eval_samples_per_second": 18.865,
230
- "eval_steps_per_second": 4.921,
 
 
 
 
 
231
  "step": 495
232
  },
233
  {
234
  "epoch": 9.09,
235
  "learning_rate": 3.3176699082935545e-05,
236
- "loss": 0.4188,
237
  "step": 500
238
  },
239
  {
240
  "epoch": 9.45,
241
  "learning_rate": 3.166184534225087e-05,
242
- "loss": 0.3131,
243
  "step": 520
244
  },
245
  {
246
  "epoch": 9.82,
247
  "learning_rate": 3.012016670162977e-05,
248
- "loss": 0.3344,
249
  "step": 540
250
  },
251
  {
252
  "epoch": 10.0,
253
- "eval_loss": 3.3189520835876465,
254
- "eval_runtime": 1.2131,
255
- "eval_samples_per_second": 18.96,
256
- "eval_steps_per_second": 4.946,
 
 
 
 
 
257
  "step": 550
258
  },
259
  {
260
  "epoch": 10.18,
261
  "learning_rate": 2.8557870956832132e-05,
262
- "loss": 0.3005,
263
  "step": 560
264
  },
265
  {
266
  "epoch": 10.55,
267
  "learning_rate": 2.698124892141971e-05,
268
- "loss": 0.2527,
269
  "step": 580
270
  },
271
  {
272
  "epoch": 10.91,
273
  "learning_rate": 2.5396649095870202e-05,
274
- "loss": 0.2446,
275
  "step": 600
276
  },
277
  {
278
  "epoch": 11.0,
279
- "eval_loss": 3.5470495223999023,
280
- "eval_runtime": 1.216,
281
- "eval_samples_per_second": 18.914,
282
- "eval_steps_per_second": 4.934,
 
 
 
 
 
283
  "step": 605
284
  },
285
  {
286
  "epoch": 11.27,
287
  "learning_rate": 2.3810452104406444e-05,
288
- "loss": 0.1855,
289
  "step": 620
290
  },
291
  {
292
  "epoch": 11.64,
293
  "learning_rate": 2.222904500247473e-05,
294
- "loss": 0.1745,
295
  "step": 640
296
  },
297
  {
298
  "epoch": 12.0,
299
  "learning_rate": 2.0658795558326743e-05,
300
- "loss": 0.199,
 
 
 
 
 
301
  "step": 660
302
  },
303
  {
304
  "epoch": 12.0,
305
- "eval_loss": 3.68398118019104,
306
- "eval_runtime": 1.2409,
307
- "eval_samples_per_second": 18.535,
308
- "eval_steps_per_second": 4.835,
309
  "step": 660
310
  },
311
  {
312
  "epoch": 12.36,
313
  "learning_rate": 1.9106026612264316e-05,
314
- "loss": 0.1455,
315
  "step": 680
316
  },
317
  {
318
  "epoch": 12.73,
319
  "learning_rate": 1.7576990616793137e-05,
320
- "loss": 0.1245,
321
  "step": 700
322
  },
323
  {
324
  "epoch": 13.0,
325
- "eval_loss": 3.765277147293091,
326
- "eval_runtime": 1.2162,
327
- "eval_samples_per_second": 18.911,
328
- "eval_steps_per_second": 4.933,
 
 
 
 
 
329
  "step": 715
330
  },
331
  {
332
  "epoch": 13.09,
333
  "learning_rate": 1.6077844460203206e-05,
334
- "loss": 0.1351,
335
  "step": 720
336
  },
337
  {
338
  "epoch": 13.45,
339
  "learning_rate": 1.4614624674952842e-05,
340
- "loss": 0.0967,
341
  "step": 740
342
  },
343
  {
344
  "epoch": 13.82,
345
  "learning_rate": 1.3193223130682936e-05,
346
- "loss": 0.1208,
347
  "step": 760
348
  },
349
  {
350
  "epoch": 14.0,
351
- "eval_loss": 3.8721702098846436,
352
- "eval_runtime": 1.2167,
353
- "eval_samples_per_second": 18.903,
354
- "eval_steps_per_second": 4.931,
 
 
 
 
 
355
  "step": 770
356
  },
357
  {
358
  "epoch": 14.18,
359
  "learning_rate": 1.181936330973744e-05,
360
- "loss": 0.0853,
361
  "step": 780
362
  },
363
  {
364
  "epoch": 14.55,
365
  "learning_rate": 1.049857726072005e-05,
366
- "loss": 0.0854,
367
  "step": 800
368
  },
369
  {
370
  "epoch": 14.91,
371
  "learning_rate": 9.236183322886945e-06,
372
- "loss": 0.1003,
373
  "step": 820
374
  },
375
  {
376
  "epoch": 15.0,
377
- "eval_loss": 3.9574601650238037,
378
- "eval_runtime": 1.2205,
379
- "eval_samples_per_second": 18.844,
380
- "eval_steps_per_second": 4.916,
 
 
 
 
 
381
  "step": 825
382
  },
383
  {
384
  "epoch": 15.27,
385
  "learning_rate": 8.0372647110717e-06,
386
- "loss": 0.0753,
387
  "step": 840
388
  },
389
  {
390
  "epoch": 15.64,
391
  "learning_rate": 6.906649047373246e-06,
392
- "loss": 0.0928,
393
  "step": 860
394
  },
395
  {
396
  "epoch": 16.0,
397
  "learning_rate": 5.848888922025553e-06,
398
- "loss": 0.0767,
 
 
 
 
 
399
  "step": 880
400
  },
401
  {
402
  "epoch": 16.0,
403
- "eval_loss": 3.9671382904052734,
404
- "eval_runtime": 1.2561,
405
- "eval_samples_per_second": 18.311,
406
- "eval_steps_per_second": 4.777,
407
  "step": 880
408
  },
409
  {
410
  "epoch": 16.36,
411
  "learning_rate": 4.868243561723535e-06,
412
- "loss": 0.0702,
413
  "step": 900
414
  },
415
  {
416
  "epoch": 16.73,
417
  "learning_rate": 3.968661679220468e-06,
418
- "loss": 0.0913,
419
  "step": 920
420
  },
421
  {
422
  "epoch": 17.0,
423
- "eval_loss": 3.9921278953552246,
424
- "eval_runtime": 1.2166,
425
- "eval_samples_per_second": 18.905,
426
- "eval_steps_per_second": 4.932,
 
 
 
 
 
427
  "step": 935
428
  },
429
  {
430
  "epoch": 17.09,
431
  "learning_rate": 3.1537655732553768e-06,
432
- "loss": 0.0698,
433
  "step": 940
434
  },
435
  {
436
  "epoch": 17.45,
437
  "learning_rate": 2.4268365428344736e-06,
438
- "loss": 0.0661,
439
  "step": 960
440
  },
441
  {
442
  "epoch": 17.82,
443
  "learning_rate": 1.790801674598186e-06,
444
- "loss": 0.0895,
445
  "step": 980
446
  },
447
  {
448
  "epoch": 18.0,
449
- "eval_loss": 3.9939558506011963,
450
- "eval_runtime": 1.2161,
451
- "eval_samples_per_second": 18.913,
452
- "eval_steps_per_second": 4.934,
 
 
 
 
 
453
  "step": 990
454
  },
455
  {
456
  "epoch": 18.18,
457
  "learning_rate": 1.248222056476367e-06,
458
- "loss": 0.0695,
459
  "step": 1000
460
  },
461
  {
462
  "epoch": 18.55,
463
  "learning_rate": 8.012824650910938e-07,
464
- "loss": 0.086,
465
  "step": 1020
466
  },
467
  {
468
  "epoch": 18.91,
469
  "learning_rate": 4.517825684323324e-07,
470
- "loss": 0.0671,
471
  "step": 1040
472
  },
473
  {
474
  "epoch": 19.0,
475
- "eval_loss": 3.9915316104888916,
476
- "eval_runtime": 1.2143,
477
- "eval_samples_per_second": 18.941,
478
- "eval_steps_per_second": 4.941,
 
 
 
 
 
479
  "step": 1045
480
  },
481
  {
482
  "epoch": 19.27,
483
  "learning_rate": 2.011296792301165e-07,
484
- "loss": 0.0681,
485
  "step": 1060
486
  },
487
  {
488
  "epoch": 19.64,
489
  "learning_rate": 5.033308820289184e-08,
490
- "loss": 0.09,
491
  "step": 1080
492
  },
493
  {
494
  "epoch": 20.0,
495
  "learning_rate": 0.0,
496
- "loss": 0.0671,
 
 
 
 
 
 
 
 
 
 
 
 
 
497
  "step": 1100
498
  },
499
  {
500
  "epoch": 20.0,
501
  "step": 1100,
502
- "total_flos": 3.807078373542298e+16,
503
- "train_loss": 0.0,
504
- "train_runtime": 10.1895,
505
- "train_samples_per_second": 425.927,
506
- "train_steps_per_second": 107.954
507
  }
508
  ],
509
  "logging_steps": 20,
@@ -511,7 +619,7 @@
511
  "num_input_tokens_seen": 0,
512
  "num_train_epochs": 20,
513
  "save_steps": 20,
514
- "total_flos": 3.807078373542298e+16,
515
  "train_batch_size": 4,
516
  "trial_name": null,
517
  "trial_params": null
 
11
  {
12
  "epoch": 0.02,
13
  "learning_rate": 4.545454545454545e-07,
14
+ "loss": 2.7297,
15
  "step": 1
16
  },
17
  {
18
  "epoch": 0.36,
19
  "learning_rate": 9.090909090909091e-06,
20
+ "loss": 2.7244,
21
  "step": 20
22
  },
23
  {
24
  "epoch": 0.73,
25
  "learning_rate": 1.8181818181818182e-05,
26
+ "loss": 2.7671,
27
  "step": 40
28
  },
29
  {
30
  "epoch": 1.0,
31
+ "gpt4_scores": 0.43333333333333335,
32
+ "step": 55
33
+ },
34
+ {
35
+ "epoch": 1.0,
36
+ "eval_loss": 2.312265396118164,
37
+ "eval_runtime": 4.9266,
38
+ "eval_samples_per_second": 4.669,
39
+ "eval_steps_per_second": 1.218,
40
  "step": 55
41
  },
42
  {
43
  "epoch": 1.09,
44
  "learning_rate": 2.7272727272727273e-05,
45
+ "loss": 2.4172,
46
  "step": 60
47
  },
48
  {
49
  "epoch": 1.45,
50
  "learning_rate": 3.6363636363636364e-05,
51
+ "loss": 2.1566,
52
  "step": 80
53
  },
54
  {
55
  "epoch": 1.82,
56
  "learning_rate": 4.545454545454546e-05,
57
+ "loss": 2.0319,
58
  "step": 100
59
  },
60
  {
61
  "epoch": 2.0,
62
+ "gpt4_scores": 0.6,
63
+ "step": 110
64
+ },
65
+ {
66
+ "epoch": 2.0,
67
+ "eval_loss": 1.9679468870162964,
68
+ "eval_runtime": 4.9521,
69
+ "eval_samples_per_second": 4.644,
70
+ "eval_steps_per_second": 1.212,
71
  "step": 110
72
  },
73
  {
74
  "epoch": 2.18,
75
  "learning_rate": 4.9987413559579636e-05,
76
+ "loss": 1.8348,
77
  "step": 120
78
  },
79
  {
80
  "epoch": 2.55,
81
  "learning_rate": 4.988679806432712e-05,
82
+ "loss": 1.8492,
83
  "step": 140
84
  },
85
  {
86
  "epoch": 2.91,
87
  "learning_rate": 4.968597221690986e-05,
88
+ "loss": 1.7972,
89
  "step": 160
90
  },
91
  {
92
  "epoch": 3.0,
93
+ "gpt4_scores": 0.5166666666666666,
94
+ "step": 165
95
+ },
96
+ {
97
+ "epoch": 3.0,
98
+ "eval_loss": 1.942632794380188,
99
+ "eval_runtime": 4.9711,
100
+ "eval_samples_per_second": 4.627,
101
+ "eval_steps_per_second": 1.207,
102
  "step": 165
103
  },
104
  {
105
  "epoch": 3.27,
106
  "learning_rate": 4.938574467213518e-05,
107
+ "loss": 1.6588,
108
  "step": 180
109
  },
110
  {
111
  "epoch": 3.64,
112
  "learning_rate": 4.898732434036244e-05,
113
+ "loss": 1.6187,
114
  "step": 200
115
  },
116
  {
117
  "epoch": 4.0,
118
  "learning_rate": 4.849231551964771e-05,
119
+ "loss": 1.5841,
120
+ "step": 220
121
+ },
122
+ {
123
+ "epoch": 4.0,
124
+ "gpt4_scores": 0.6833333333333332,
125
  "step": 220
126
  },
127
  {
128
  "epoch": 4.0,
129
+ "eval_loss": 2.0110199451446533,
130
+ "eval_runtime": 4.9304,
131
+ "eval_samples_per_second": 4.665,
132
+ "eval_steps_per_second": 1.217,
133
  "step": 220
134
  },
135
  {
136
  "epoch": 4.36,
137
  "learning_rate": 4.790271143580174e-05,
138
+ "loss": 1.2998,
139
  "step": 240
140
  },
141
  {
142
  "epoch": 4.73,
143
  "learning_rate": 4.722088621637309e-05,
144
+ "loss": 1.2842,
145
  "step": 260
146
  },
147
  {
148
  "epoch": 5.0,
149
+ "gpt4_scores": 0.5166666666666667,
150
+ "step": 275
151
+ },
152
+ {
153
+ "epoch": 5.0,
154
+ "eval_loss": 2.2670648097991943,
155
+ "eval_runtime": 4.9304,
156
+ "eval_samples_per_second": 4.665,
157
+ "eval_steps_per_second": 1.217,
158
  "step": 275
159
  },
160
  {
161
  "epoch": 5.09,
162
  "learning_rate": 4.644958533087443e-05,
163
+ "loss": 1.1442,
164
  "step": 280
165
  },
166
  {
167
  "epoch": 5.45,
168
  "learning_rate": 4.559191453574582e-05,
169
+ "loss": 0.9076,
170
  "step": 300
171
  },
172
  {
173
  "epoch": 5.82,
174
  "learning_rate": 4.465132736856969e-05,
175
+ "loss": 0.9305,
176
  "step": 320
177
  },
178
  {
179
  "epoch": 6.0,
180
+ "gpt4_scores": 0.75,
181
+ "step": 330
182
+ },
183
+ {
184
+ "epoch": 6.0,
185
+ "eval_loss": 2.5263493061065674,
186
+ "eval_runtime": 4.9267,
187
+ "eval_samples_per_second": 4.668,
188
+ "eval_steps_per_second": 1.218,
189
  "step": 330
190
  },
191
  {
192
  "epoch": 6.18,
193
  "learning_rate": 4.3631611241893874e-05,
194
+ "loss": 0.7708,
195
  "step": 340
196
  },
197
  {
198
  "epoch": 6.55,
199
  "learning_rate": 4.2536872192658036e-05,
200
+ "loss": 0.6432,
201
  "step": 360
202
  },
203
  {
204
  "epoch": 6.91,
205
  "learning_rate": 4.137151834863213e-05,
206
+ "loss": 0.6734,
207
  "step": 380
208
  },
209
  {
210
  "epoch": 7.0,
211
+ "gpt4_scores": 0.7333333333333334,
212
+ "step": 385
213
+ },
214
+ {
215
+ "epoch": 7.0,
216
+ "eval_loss": 2.7797744274139404,
217
+ "eval_runtime": 4.982,
218
+ "eval_samples_per_second": 4.617,
219
+ "eval_steps_per_second": 1.204,
220
  "step": 385
221
  },
222
  {
223
  "epoch": 7.27,
224
  "learning_rate": 4.014024217844167e-05,
225
+ "loss": 0.5284,
226
  "step": 400
227
  },
228
  {
229
  "epoch": 7.64,
230
  "learning_rate": 3.884800159665276e-05,
231
+ "loss": 0.4544,
232
  "step": 420
233
  },
234
  {
235
  "epoch": 8.0,
236
  "learning_rate": 3.7500000000000003e-05,
237
+ "loss": 0.4579,
238
+ "step": 440
239
+ },
240
+ {
241
+ "epoch": 8.0,
242
+ "gpt4_scores": 0.5499999999999999,
243
  "step": 440
244
  },
245
  {
246
  "epoch": 8.0,
247
+ "eval_loss": 3.1051599979400635,
248
+ "eval_runtime": 4.9596,
249
+ "eval_samples_per_second": 4.637,
250
+ "eval_steps_per_second": 1.21,
251
  "step": 440
252
  },
253
  {
254
  "epoch": 8.36,
255
  "learning_rate": 3.610166531514436e-05,
256
+ "loss": 0.3129,
257
  "step": 460
258
  },
259
  {
260
  "epoch": 8.73,
261
  "learning_rate": 3.465862814232822e-05,
262
+ "loss": 0.3091,
263
  "step": 480
264
  },
265
  {
266
  "epoch": 9.0,
267
+ "gpt4_scores": 0.5666666666666668,
268
+ "step": 495
269
+ },
270
+ {
271
+ "epoch": 9.0,
272
+ "eval_loss": 3.3408806324005127,
273
+ "eval_runtime": 4.9659,
274
+ "eval_samples_per_second": 4.632,
275
+ "eval_steps_per_second": 1.208,
276
  "step": 495
277
  },
278
  {
279
  "epoch": 9.09,
280
  "learning_rate": 3.3176699082935545e-05,
281
+ "loss": 0.3082,
282
  "step": 500
283
  },
284
  {
285
  "epoch": 9.45,
286
  "learning_rate": 3.166184534225087e-05,
287
+ "loss": 0.2184,
288
  "step": 520
289
  },
290
  {
291
  "epoch": 9.82,
292
  "learning_rate": 3.012016670162977e-05,
293
+ "loss": 0.2418,
294
  "step": 540
295
  },
296
  {
297
  "epoch": 10.0,
298
+ "gpt4_scores": 0.48333333333333334,
299
+ "step": 550
300
+ },
301
+ {
302
+ "epoch": 10.0,
303
+ "eval_loss": 3.499851703643799,
304
+ "eval_runtime": 4.9663,
305
+ "eval_samples_per_second": 4.631,
306
+ "eval_steps_per_second": 1.208,
307
  "step": 550
308
  },
309
  {
310
  "epoch": 10.18,
311
  "learning_rate": 2.8557870956832132e-05,
312
+ "loss": 0.1996,
313
  "step": 560
314
  },
315
  {
316
  "epoch": 10.55,
317
  "learning_rate": 2.698124892141971e-05,
318
+ "loss": 0.1778,
319
  "step": 580
320
  },
321
  {
322
  "epoch": 10.91,
323
  "learning_rate": 2.5396649095870202e-05,
324
+ "loss": 0.1718,
325
  "step": 600
326
  },
327
  {
328
  "epoch": 11.0,
329
+ "gpt4_scores": 0.6333333333333334,
330
+ "step": 605
331
+ },
332
+ {
333
+ "epoch": 11.0,
334
+ "eval_loss": 3.6687815189361572,
335
+ "eval_runtime": 4.9484,
336
+ "eval_samples_per_second": 4.648,
337
+ "eval_steps_per_second": 1.213,
338
  "step": 605
339
  },
340
  {
341
  "epoch": 11.27,
342
  "learning_rate": 2.3810452104406444e-05,
343
+ "loss": 0.1401,
344
  "step": 620
345
  },
346
  {
347
  "epoch": 11.64,
348
  "learning_rate": 2.222904500247473e-05,
349
+ "loss": 0.1344,
350
  "step": 640
351
  },
352
  {
353
  "epoch": 12.0,
354
  "learning_rate": 2.0658795558326743e-05,
355
+ "loss": 0.1555,
356
+ "step": 660
357
+ },
358
+ {
359
+ "epoch": 12.0,
360
+ "gpt4_scores": 0.6,
361
  "step": 660
362
  },
363
  {
364
  "epoch": 12.0,
365
+ "eval_loss": 3.78193998336792,
366
+ "eval_runtime": 4.9817,
367
+ "eval_samples_per_second": 4.617,
368
+ "eval_steps_per_second": 1.204,
369
  "step": 660
370
  },
371
  {
372
  "epoch": 12.36,
373
  "learning_rate": 1.9106026612264316e-05,
374
+ "loss": 0.1414,
375
  "step": 680
376
  },
377
  {
378
  "epoch": 12.73,
379
  "learning_rate": 1.7576990616793137e-05,
380
+ "loss": 0.1191,
381
  "step": 700
382
  },
383
  {
384
  "epoch": 13.0,
385
+ "gpt4_scores": 0.3666666666666667,
386
+ "step": 715
387
+ },
388
+ {
389
+ "epoch": 13.0,
390
+ "eval_loss": 3.910775899887085,
391
+ "eval_runtime": 4.9709,
392
+ "eval_samples_per_second": 4.627,
393
+ "eval_steps_per_second": 1.207,
394
  "step": 715
395
  },
396
  {
397
  "epoch": 13.09,
398
  "learning_rate": 1.6077844460203206e-05,
399
+ "loss": 0.1191,
400
  "step": 720
401
  },
402
  {
403
  "epoch": 13.45,
404
  "learning_rate": 1.4614624674952842e-05,
405
+ "loss": 0.1069,
406
  "step": 740
407
  },
408
  {
409
  "epoch": 13.82,
410
  "learning_rate": 1.3193223130682936e-05,
411
+ "loss": 0.1291,
412
  "step": 760
413
  },
414
  {
415
  "epoch": 14.0,
416
+ "gpt4_scores": 0.3666666666666667,
417
+ "step": 770
418
+ },
419
+ {
420
+ "epoch": 14.0,
421
+ "eval_loss": 3.995321750640869,
422
+ "eval_runtime": 4.9833,
423
+ "eval_samples_per_second": 4.615,
424
+ "eval_steps_per_second": 1.204,
425
  "step": 770
426
  },
427
  {
428
  "epoch": 14.18,
429
  "learning_rate": 1.181936330973744e-05,
430
+ "loss": 0.1075,
431
  "step": 780
432
  },
433
  {
434
  "epoch": 14.55,
435
  "learning_rate": 1.049857726072005e-05,
436
+ "loss": 0.1058,
437
  "step": 800
438
  },
439
  {
440
  "epoch": 14.91,
441
  "learning_rate": 9.236183322886945e-06,
442
+ "loss": 0.1213,
443
  "step": 820
444
  },
445
  {
446
  "epoch": 15.0,
447
+ "gpt4_scores": 0.3333333333333333,
448
+ "step": 825
449
+ },
450
+ {
451
+ "epoch": 15.0,
452
+ "eval_loss": 4.102020740509033,
453
+ "eval_runtime": 4.931,
454
+ "eval_samples_per_second": 4.664,
455
+ "eval_steps_per_second": 1.217,
456
  "step": 825
457
  },
458
  {
459
  "epoch": 15.27,
460
  "learning_rate": 8.0372647110717e-06,
461
+ "loss": 0.1013,
462
  "step": 840
463
  },
464
  {
465
  "epoch": 15.64,
466
  "learning_rate": 6.906649047373246e-06,
467
+ "loss": 0.1176,
468
  "step": 860
469
  },
470
  {
471
  "epoch": 16.0,
472
  "learning_rate": 5.848888922025553e-06,
473
+ "loss": 0.1,
474
+ "step": 880
475
+ },
476
+ {
477
+ "epoch": 16.0,
478
+ "gpt4_scores": 0.43333333333333335,
479
  "step": 880
480
  },
481
  {
482
  "epoch": 16.0,
483
+ "eval_loss": 4.120510578155518,
484
+ "eval_runtime": 4.9308,
485
+ "eval_samples_per_second": 4.665,
486
+ "eval_steps_per_second": 1.217,
487
  "step": 880
488
  },
489
  {
490
  "epoch": 16.36,
491
  "learning_rate": 4.868243561723535e-06,
492
+ "loss": 0.0974,
493
  "step": 900
494
  },
495
  {
496
  "epoch": 16.73,
497
  "learning_rate": 3.968661679220468e-06,
498
+ "loss": 0.115,
499
  "step": 920
500
  },
501
  {
502
  "epoch": 17.0,
503
+ "gpt4_scores": 0.45,
504
+ "step": 935
505
+ },
506
+ {
507
+ "epoch": 17.0,
508
+ "eval_loss": 4.160642623901367,
509
+ "eval_runtime": 4.9259,
510
+ "eval_samples_per_second": 4.669,
511
+ "eval_steps_per_second": 1.218,
512
  "step": 935
513
  },
514
  {
515
  "epoch": 17.09,
516
  "learning_rate": 3.1537655732553768e-06,
517
+ "loss": 0.1013,
518
  "step": 940
519
  },
520
  {
521
  "epoch": 17.45,
522
  "learning_rate": 2.4268365428344736e-06,
523
+ "loss": 0.0971,
524
  "step": 960
525
  },
526
  {
527
  "epoch": 17.82,
528
  "learning_rate": 1.790801674598186e-06,
529
+ "loss": 0.1076,
530
  "step": 980
531
  },
532
  {
533
  "epoch": 18.0,
534
+ "gpt4_scores": 0.31666666666666665,
535
+ "step": 990
536
+ },
537
+ {
538
+ "epoch": 18.0,
539
+ "eval_loss": 4.183867454528809,
540
+ "eval_runtime": 4.9845,
541
+ "eval_samples_per_second": 4.614,
542
+ "eval_steps_per_second": 1.204,
543
  "step": 990
544
  },
545
  {
546
  "epoch": 18.18,
547
  "learning_rate": 1.248222056476367e-06,
548
+ "loss": 0.103,
549
  "step": 1000
550
  },
551
  {
552
  "epoch": 18.55,
553
  "learning_rate": 8.012824650910938e-07,
554
+ "loss": 0.1082,
555
  "step": 1020
556
  },
557
  {
558
  "epoch": 18.91,
559
  "learning_rate": 4.517825684323324e-07,
560
+ "loss": 0.0962,
561
  "step": 1040
562
  },
563
  {
564
  "epoch": 19.0,
565
+ "gpt4_scores": 0.4666666666666666,
566
+ "step": 1045
567
+ },
568
+ {
569
+ "epoch": 19.0,
570
+ "eval_loss": 4.187291622161865,
571
+ "eval_runtime": 4.9682,
572
+ "eval_samples_per_second": 4.629,
573
+ "eval_steps_per_second": 1.208,
574
  "step": 1045
575
  },
576
  {
577
  "epoch": 19.27,
578
  "learning_rate": 2.011296792301165e-07,
579
+ "loss": 0.0981,
580
  "step": 1060
581
  },
582
  {
583
  "epoch": 19.64,
584
  "learning_rate": 5.033308820289184e-08,
585
+ "loss": 0.1154,
586
  "step": 1080
587
  },
588
  {
589
  "epoch": 20.0,
590
  "learning_rate": 0.0,
591
+ "loss": 0.0917,
592
+ "step": 1100
593
+ },
594
+ {
595
+ "epoch": 20.0,
596
+ "gpt4_scores": 0.31666666666666665,
597
+ "step": 1100
598
+ },
599
+ {
600
+ "epoch": 20.0,
601
+ "eval_loss": 4.187403678894043,
602
+ "eval_runtime": 4.9884,
603
+ "eval_samples_per_second": 4.611,
604
+ "eval_steps_per_second": 1.203,
605
  "step": 1100
606
  },
607
  {
608
  "epoch": 20.0,
609
  "step": 1100,
610
+ "total_flos": 3.836907908090266e+16,
611
+ "train_loss": 0.660087760145014,
612
+ "train_runtime": 8612.1752,
613
+ "train_samples_per_second": 0.504,
614
+ "train_steps_per_second": 0.128
615
  }
616
  ],
617
  "logging_steps": 20,
 
619
  "num_input_tokens_seen": 0,
620
  "num_train_epochs": 20,
621
  "save_steps": 20,
622
+ "total_flos": 3.836907908090266e+16,
623
  "train_batch_size": 4,
624
  "trial_name": null,
625
  "trial_params": null