yyx123 commited on
Commit
60b730c
1 Parent(s): d2ef6fa

Model save

Browse files
README.md CHANGED
@@ -2,13 +2,9 @@
2
  license: other
3
  library_name: peft
4
  tags:
5
- - alignment-handbook
6
- - generated_from_trainer
7
  - trl
8
  - sft
9
  - generated_from_trainer
10
- datasets:
11
- - ruozhiba
12
  base_model: 01-ai/Yi-6B
13
  model-index:
14
  - name: Yi-6B-ruozhiba-1e-5-50
@@ -20,9 +16,9 @@ should probably proofread and complete it, then remove this comment. -->
20
 
21
  # Yi-6B-ruozhiba-1e-5-50
22
 
23
- This model is a fine-tuned version of [01-ai/Yi-6B](https://huggingface.co/01-ai/Yi-6B) on the ruozhiba dataset.
24
  It achieves the following results on the evaluation set:
25
- - Loss: 3.5129
26
 
27
  ## Model description
28
 
@@ -41,7 +37,7 @@ More information needed
41
  ### Training hyperparameters
42
 
43
  The following hyperparameters were used during training:
44
- - learning_rate: 0.0005
45
  - train_batch_size: 4
46
  - eval_batch_size: 4
47
  - seed: 42
@@ -54,14 +50,14 @@ The following hyperparameters were used during training:
54
 
55
  | Training Loss | Epoch | Step | Validation Loss |
56
  |:-------------:|:-----:|:----:|:---------------:|
57
- | 1.5902 | 2.0 | 110 | 2.0347 |
58
- | 0.4146 | 4.0 | 220 | 2.7832 |
59
- | 0.2193 | 5.0 | 275 | 3.0240 |
60
- | 0.1547 | 6.0 | 330 | 3.2444 |
61
- | 0.1096 | 7.0 | 385 | 3.3953 |
62
- | 0.1053 | 8.0 | 440 | 3.4807 |
63
- | 0.0969 | 9.0 | 495 | 3.5086 |
64
- | 0.1768 | 10.0 | 550 | 3.5129 |
65
 
66
 
67
  ### Framework versions
 
2
  license: other
3
  library_name: peft
4
  tags:
 
 
5
  - trl
6
  - sft
7
  - generated_from_trainer
 
 
8
  base_model: 01-ai/Yi-6B
9
  model-index:
10
  - name: Yi-6B-ruozhiba-1e-5-50
 
16
 
17
  # Yi-6B-ruozhiba-1e-5-50
18
 
19
+ This model is a fine-tuned version of [01-ai/Yi-6B](https://huggingface.co/01-ai/Yi-6B) on the None dataset.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 1.9851
22
 
23
  ## Model description
24
 
 
37
  ### Training hyperparameters
38
 
39
  The following hyperparameters were used during training:
40
+ - learning_rate: 1e-05
41
  - train_batch_size: 4
42
  - eval_batch_size: 4
43
  - seed: 42
 
50
 
51
  | Training Loss | Epoch | Step | Validation Loss |
52
  |:-------------:|:-----:|:----:|:---------------:|
53
+ | 2.6749 | 1.0 | 55 | 2.5387 |
54
+ | 2.0502 | 3.0 | 165 | 2.0924 |
55
+ | 1.9287 | 5.0 | 275 | 2.0103 |
56
+ | 2.0021 | 6.0 | 330 | 1.9935 |
57
+ | 1.8199 | 7.0 | 385 | 1.9878 |
58
+ | 1.9559 | 8.0 | 440 | 1.9858 |
59
+ | 1.8229 | 9.0 | 495 | 1.9853 |
60
+ | 1.8178 | 10.0 | 550 | 1.9851 |
61
 
62
 
63
  ### Framework versions
adapter_config.json CHANGED
@@ -19,12 +19,12 @@
19
  "rank_pattern": {},
20
  "revision": null,
21
  "target_modules": [
22
- "gate_proj",
23
- "q_proj",
24
- "up_proj",
25
  "v_proj",
26
  "k_proj",
27
  "o_proj",
 
 
 
28
  "down_proj"
29
  ],
30
  "task_type": "CAUSAL_LM"
 
19
  "rank_pattern": {},
20
  "revision": null,
21
  "target_modules": [
 
 
 
22
  "v_proj",
23
  "k_proj",
24
  "o_proj",
25
+ "gate_proj",
26
+ "q_proj",
27
+ "up_proj",
28
  "down_proj"
29
  ],
30
  "task_type": "CAUSAL_LM"
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:100ba1f2d2db812411e9024ecd40df04fae1757af141c2d75c573a9f711135e6
3
  size 145287696
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a7400933ae66031ef5dd58be0dff5a4913af8976f677bd919c53f2f056545d7
3
  size 145287696
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 10.0,
3
- "eval_loss": 3.5129005908966064,
4
- "eval_runtime": 4.948,
5
  "eval_samples": 23,
6
- "eval_samples_per_second": 4.648,
7
- "eval_steps_per_second": 1.213,
8
- "train_loss": 0.1272620278596878,
9
- "train_runtime": 22126.0761,
10
  "train_samples": 217,
11
- "train_samples_per_second": 0.098,
12
- "train_steps_per_second": 0.025
13
  }
 
1
  {
2
  "epoch": 10.0,
3
+ "eval_loss": 1.9850994348526,
4
+ "eval_runtime": 4.9118,
5
  "eval_samples": 23,
6
+ "eval_samples_per_second": 4.683,
7
+ "eval_steps_per_second": 1.222,
8
+ "train_loss": 1.1392531923814253,
9
+ "train_runtime": 18256.4954,
10
  "train_samples": 217,
11
+ "train_samples_per_second": 0.119,
12
+ "train_steps_per_second": 0.03
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 10.0,
3
- "eval_loss": 3.5129005908966064,
4
- "eval_runtime": 4.948,
5
  "eval_samples": 23,
6
- "eval_samples_per_second": 4.648,
7
- "eval_steps_per_second": 1.213
8
  }
 
1
  {
2
  "epoch": 10.0,
3
+ "eval_loss": 1.9850994348526,
4
+ "eval_runtime": 4.9118,
5
  "eval_samples": 23,
6
+ "eval_samples_per_second": 4.683,
7
+ "eval_steps_per_second": 1.222
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 10.0,
3
- "train_loss": 0.1272620278596878,
4
- "train_runtime": 22126.0761,
5
  "train_samples": 217,
6
- "train_samples_per_second": 0.098,
7
- "train_steps_per_second": 0.025
8
  }
 
1
  {
2
  "epoch": 10.0,
3
+ "train_loss": 1.1392531923814253,
4
+ "train_runtime": 18256.4954,
5
  "train_samples": 217,
6
+ "train_samples_per_second": 0.119,
7
+ "train_steps_per_second": 0.03
8
  }
trainer_state.json CHANGED
@@ -10,952 +10,952 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.02,
13
- "learning_rate": 9.090909090909091e-06,
14
  "loss": 2.7431,
15
  "step": 1
16
  },
17
  {
18
  "epoch": 0.07,
19
- "learning_rate": 3.6363636363636364e-05,
20
- "loss": 2.8776,
21
  "step": 4
22
  },
23
  {
24
  "epoch": 0.15,
25
- "learning_rate": 7.272727272727273e-05,
26
- "loss": 2.634,
27
  "step": 8
28
  },
29
  {
30
  "epoch": 0.22,
31
- "learning_rate": 0.00010909090909090909,
32
- "loss": 2.5152,
33
  "step": 12
34
  },
35
  {
36
  "epoch": 0.29,
37
- "learning_rate": 0.00014545454545454546,
38
- "loss": 2.3073,
39
  "step": 16
40
  },
41
  {
42
  "epoch": 0.36,
43
- "learning_rate": 0.00018181818181818183,
44
- "loss": 2.2473,
45
  "step": 20
46
  },
47
  {
48
  "epoch": 0.44,
49
- "learning_rate": 0.00021818181818181818,
50
- "loss": 2.1606,
51
  "step": 24
52
  },
53
  {
54
  "epoch": 0.51,
55
- "learning_rate": 0.0002545454545454545,
56
- "loss": 2.1845,
57
  "step": 28
58
  },
59
  {
60
  "epoch": 0.58,
61
- "learning_rate": 0.0002909090909090909,
62
- "loss": 2.0583,
63
  "step": 32
64
  },
65
  {
66
  "epoch": 0.65,
67
- "learning_rate": 0.00032727272727272726,
68
- "loss": 2.0335,
69
  "step": 36
70
  },
71
  {
72
  "epoch": 0.73,
73
- "learning_rate": 0.00036363636363636367,
74
- "loss": 1.924,
75
  "step": 40
76
  },
77
  {
78
  "epoch": 0.8,
79
- "learning_rate": 0.0004,
80
- "loss": 1.9703,
81
  "step": 44
82
  },
83
  {
84
  "epoch": 0.87,
85
- "learning_rate": 0.00043636363636363637,
86
- "loss": 1.9575,
87
  "step": 48
88
  },
89
  {
90
  "epoch": 0.95,
91
- "learning_rate": 0.0004727272727272727,
92
- "loss": 2.0255,
93
  "step": 52
94
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  {
96
  "epoch": 1.02,
97
- "learning_rate": 0.0004999949650182266,
98
- "loss": 1.9794,
99
  "step": 56
100
  },
101
  {
102
  "epoch": 1.09,
103
- "learning_rate": 0.0004998741355957963,
104
- "loss": 1.789,
105
  "step": 60
106
  },
107
  {
108
  "epoch": 1.16,
109
- "learning_rate": 0.0004995922759815339,
110
- "loss": 1.7866,
111
  "step": 64
112
  },
113
  {
114
  "epoch": 1.24,
115
- "learning_rate": 0.0004991495678185201,
116
- "loss": 1.7061,
117
  "step": 68
118
  },
119
  {
120
  "epoch": 1.31,
121
- "learning_rate": 0.0004985462964079136,
122
- "loss": 1.6782,
123
  "step": 72
124
  },
125
  {
126
  "epoch": 1.38,
127
- "learning_rate": 0.0004977828505250904,
128
- "loss": 1.5574,
129
  "step": 76
130
  },
131
  {
132
  "epoch": 1.45,
133
- "learning_rate": 0.0004968597221690986,
134
- "loss": 1.5883,
135
  "step": 80
136
  },
137
  {
138
  "epoch": 1.53,
139
- "learning_rate": 0.0004957775062455933,
140
- "loss": 1.7155,
141
  "step": 84
142
  },
143
  {
144
  "epoch": 1.6,
145
- "learning_rate": 0.0004945369001834514,
146
- "loss": 1.6924,
147
  "step": 88
148
  },
149
  {
150
  "epoch": 1.67,
151
- "learning_rate": 0.0004931387034853173,
152
- "loss": 1.6414,
153
  "step": 92
154
  },
155
  {
156
  "epoch": 1.75,
157
- "learning_rate": 0.0004915838172123671,
158
- "loss": 1.6989,
159
  "step": 96
160
  },
161
  {
162
  "epoch": 1.82,
163
- "learning_rate": 0.0004898732434036243,
164
- "loss": 1.7026,
165
  "step": 100
166
  },
167
  {
168
  "epoch": 1.89,
169
- "learning_rate": 0.0004880080844302004,
170
- "loss": 1.538,
171
  "step": 104
172
  },
173
  {
174
  "epoch": 1.96,
175
- "learning_rate": 0.0004859895422848767,
176
- "loss": 1.5902,
177
  "step": 108
178
  },
179
- {
180
- "epoch": 2.0,
181
- "pls_score": 62.8,
182
- "std": 4.561052510112109,
183
- "step": 110
184
- },
185
- {
186
- "epoch": 2.0,
187
- "eval_loss": 2.0346944332122803,
188
- "eval_runtime": 4.9191,
189
- "eval_samples_per_second": 4.676,
190
- "eval_steps_per_second": 1.22,
191
- "step": 110
192
- },
193
  {
194
  "epoch": 2.04,
195
- "learning_rate": 0.00048381891780748665,
196
- "loss": 1.5041,
197
  "step": 112
198
  },
199
  {
200
  "epoch": 2.11,
201
- "learning_rate": 0.0004814976098465951,
202
- "loss": 0.9413,
203
  "step": 116
204
  },
205
  {
206
  "epoch": 2.18,
207
- "learning_rate": 0.0004790271143580174,
208
- "loss": 0.9638,
209
  "step": 120
210
  },
211
  {
212
  "epoch": 2.25,
213
- "learning_rate": 0.0004764090234407577,
214
- "loss": 0.8674,
215
  "step": 124
216
  },
217
  {
218
  "epoch": 2.33,
219
- "learning_rate": 0.0004736450243109884,
220
- "loss": 0.8621,
221
  "step": 128
222
  },
223
  {
224
  "epoch": 2.4,
225
- "learning_rate": 0.00047073689821473173,
226
- "loss": 0.86,
227
  "step": 132
228
  },
229
  {
230
  "epoch": 2.47,
231
- "learning_rate": 0.00046768651927994433,
232
- "loss": 0.7993,
233
  "step": 136
234
  },
235
  {
236
  "epoch": 2.55,
237
- "learning_rate": 0.0004644958533087443,
238
- "loss": 0.7921,
239
  "step": 140
240
  },
241
  {
242
  "epoch": 2.62,
243
- "learning_rate": 0.0004611669565105596,
244
- "loss": 0.9535,
245
  "step": 144
246
  },
247
  {
248
  "epoch": 2.69,
249
- "learning_rate": 0.00045770197417701366,
250
- "loss": 0.823,
251
  "step": 148
252
  },
253
  {
254
  "epoch": 2.76,
255
- "learning_rate": 0.00045410313929940244,
256
- "loss": 0.8705,
257
  "step": 152
258
  },
259
  {
260
  "epoch": 2.84,
261
- "learning_rate": 0.00045037277112965383,
262
- "loss": 0.9627,
263
  "step": 156
264
  },
265
  {
266
  "epoch": 2.91,
267
- "learning_rate": 0.0004465132736856969,
268
- "loss": 1.0076,
269
  "step": 160
270
  },
271
  {
272
  "epoch": 2.98,
273
- "learning_rate": 0.00044252713420220394,
274
- "loss": 0.9706,
275
  "step": 164
276
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
277
  {
278
  "epoch": 3.05,
279
- "learning_rate": 0.00043841692152770415,
280
- "loss": 0.4535,
281
  "step": 168
282
  },
283
  {
284
  "epoch": 3.13,
285
- "learning_rate": 0.00043418528446910123,
286
- "loss": 0.3861,
287
  "step": 172
288
  },
289
  {
290
  "epoch": 3.2,
291
- "learning_rate": 0.0004298349500846628,
292
- "loss": 0.4925,
293
  "step": 176
294
  },
295
  {
296
  "epoch": 3.27,
297
- "learning_rate": 0.00042536872192658034,
298
- "loss": 0.4753,
299
  "step": 180
300
  },
301
  {
302
  "epoch": 3.35,
303
- "learning_rate": 0.00042078947823423365,
304
- "loss": 0.4131,
305
  "step": 184
306
  },
307
  {
308
  "epoch": 3.42,
309
- "learning_rate": 0.0004161001700793231,
310
- "loss": 0.3977,
311
  "step": 188
312
  },
313
  {
314
  "epoch": 3.49,
315
- "learning_rate": 0.00041130381946406574,
316
- "loss": 0.4816,
317
  "step": 192
318
  },
319
  {
320
  "epoch": 3.56,
321
- "learning_rate": 0.0004064035173736804,
322
- "loss": 0.4944,
323
  "step": 196
324
  },
325
  {
326
  "epoch": 3.64,
327
- "learning_rate": 0.00040140242178441667,
328
- "loss": 0.4889,
329
  "step": 200
330
  },
331
  {
332
  "epoch": 3.71,
333
- "learning_rate": 0.0003963037556284129,
334
- "loss": 0.449,
335
  "step": 204
336
  },
337
  {
338
  "epoch": 3.78,
339
- "learning_rate": 0.0003911108047166924,
340
- "loss": 0.5103,
341
  "step": 208
342
  },
343
  {
344
  "epoch": 3.85,
345
- "learning_rate": 0.00038582691562163827,
346
- "loss": 0.4971,
347
  "step": 212
348
  },
349
  {
350
  "epoch": 3.93,
351
- "learning_rate": 0.0003804554935203115,
352
- "loss": 0.4087,
353
  "step": 216
354
  },
355
  {
356
  "epoch": 4.0,
357
- "learning_rate": 0.000375,
358
- "loss": 0.4146,
359
- "step": 220
360
- },
361
- {
362
- "epoch": 4.0,
363
- "pls_score": 63.6,
364
- "std": 4.067038234391212,
365
- "step": 220
366
- },
367
- {
368
- "epoch": 4.0,
369
- "eval_loss": 2.783196449279785,
370
- "eval_runtime": 4.9791,
371
- "eval_samples_per_second": 4.619,
372
- "eval_steps_per_second": 1.205,
373
  "step": 220
374
  },
375
  {
376
  "epoch": 4.07,
377
- "learning_rate": 0.0003694639508274158,
378
- "loss": 0.2699,
379
  "step": 224
380
  },
381
  {
382
  "epoch": 4.15,
383
- "learning_rate": 0.0003638509136829758,
384
- "loss": 0.24,
385
  "step": 228
386
  },
387
  {
388
  "epoch": 4.22,
389
- "learning_rate": 0.00035816450586162706,
390
- "loss": 0.2368,
391
  "step": 232
392
  },
393
  {
394
  "epoch": 4.29,
395
- "learning_rate": 0.00035240839194169884,
396
- "loss": 0.1893,
397
  "step": 236
398
  },
399
  {
400
  "epoch": 4.36,
401
- "learning_rate": 0.00034658628142328216,
402
- "loss": 0.2246,
403
  "step": 240
404
  },
405
  {
406
  "epoch": 4.44,
407
- "learning_rate": 0.00034070192633766023,
408
- "loss": 0.3669,
409
  "step": 244
410
  },
411
  {
412
  "epoch": 4.51,
413
- "learning_rate": 0.0003347591188293301,
414
- "loss": 0.2311,
415
  "step": 248
416
  },
417
  {
418
  "epoch": 4.58,
419
- "learning_rate": 0.00032876168871217323,
420
- "loss": 0.213,
421
  "step": 252
422
  },
423
  {
424
  "epoch": 4.65,
425
- "learning_rate": 0.00032271350100134975,
426
- "loss": 0.2392,
427
  "step": 256
428
  },
429
  {
430
  "epoch": 4.73,
431
- "learning_rate": 0.0003166184534225087,
432
- "loss": 0.2525,
433
  "step": 260
434
  },
435
  {
436
  "epoch": 4.8,
437
- "learning_rate": 0.0003104804738999169,
438
- "loss": 0.2326,
439
  "step": 264
440
  },
441
  {
442
  "epoch": 4.87,
443
- "learning_rate": 0.00030430351802512693,
444
- "loss": 0.2374,
445
  "step": 268
446
  },
447
  {
448
  "epoch": 4.95,
449
- "learning_rate": 0.00029809156650781527,
450
- "loss": 0.2193,
451
  "step": 272
452
  },
453
  {
454
  "epoch": 5.0,
455
- "pls_score": 62.0,
456
- "std": 3.676955262170047,
457
  "step": 275
458
  },
459
  {
460
  "epoch": 5.0,
461
- "eval_loss": 3.024041175842285,
462
- "eval_runtime": 4.9575,
463
- "eval_samples_per_second": 4.639,
464
- "eval_steps_per_second": 1.21,
465
  "step": 275
466
  },
467
  {
468
  "epoch": 5.02,
469
- "learning_rate": 0.0002918486226104327,
470
- "loss": 0.2179,
471
  "step": 276
472
  },
473
  {
474
  "epoch": 5.09,
475
- "learning_rate": 0.00028557870956832135,
476
- "loss": 0.1443,
477
  "step": 280
478
  },
479
  {
480
  "epoch": 5.16,
481
- "learning_rate": 0.0002792858679969596,
482
- "loss": 0.1345,
483
  "step": 284
484
  },
485
  {
486
  "epoch": 5.24,
487
- "learning_rate": 0.0002729741532880069,
488
- "loss": 0.1421,
489
  "step": 288
490
  },
491
  {
492
  "epoch": 5.31,
493
- "learning_rate": 0.000266647632995826,
494
- "loss": 0.1247,
495
  "step": 292
496
  },
497
  {
498
  "epoch": 5.38,
499
- "learning_rate": 0.00026031038421616684,
500
- "loss": 0.1541,
501
  "step": 296
502
  },
503
  {
504
  "epoch": 5.45,
505
- "learning_rate": 0.000253966490958702,
506
- "loss": 0.1539,
507
  "step": 300
508
  },
509
  {
510
  "epoch": 5.53,
511
- "learning_rate": 0.00024762004151510585,
512
- "loss": 0.1498,
513
  "step": 304
514
  },
515
  {
516
  "epoch": 5.6,
517
- "learning_rate": 0.00024127512582437484,
518
- "loss": 0.1678,
519
  "step": 308
520
  },
521
  {
522
  "epoch": 5.67,
523
- "learning_rate": 0.00023493583283708543,
524
- "loss": 0.2514,
525
  "step": 312
526
  },
527
  {
528
  "epoch": 5.75,
529
- "learning_rate": 0.00022860624788029015,
530
- "loss": 0.1418,
531
  "step": 316
532
  },
533
  {
534
  "epoch": 5.82,
535
- "learning_rate": 0.00022229045002474727,
536
- "loss": 0.1455,
537
  "step": 320
538
  },
539
  {
540
  "epoch": 5.89,
541
- "learning_rate": 0.000215992509456184,
542
- "loss": 0.1259,
543
  "step": 324
544
  },
545
  {
546
  "epoch": 5.96,
547
- "learning_rate": 0.000209716484852284,
548
- "loss": 0.1547,
549
  "step": 328
550
  },
551
  {
552
  "epoch": 6.0,
553
- "pls_score": 60.8,
554
- "std": 4.155382052230576,
555
  "step": 330
556
  },
557
  {
558
  "epoch": 6.0,
559
- "eval_loss": 3.2443737983703613,
560
- "eval_runtime": 4.9618,
561
- "eval_samples_per_second": 4.635,
562
- "eval_steps_per_second": 1.209,
563
  "step": 330
564
  },
565
  {
566
  "epoch": 6.04,
567
- "learning_rate": 0.0002034664207670925,
568
- "loss": 0.1248,
569
  "step": 332
570
  },
571
  {
572
  "epoch": 6.11,
573
- "learning_rate": 0.0001972463450245226,
574
- "loss": 0.1031,
575
  "step": 336
576
  },
577
  {
578
  "epoch": 6.18,
579
- "learning_rate": 0.00019106026612264316,
580
- "loss": 0.1077,
581
  "step": 340
582
  },
583
  {
584
  "epoch": 6.25,
585
- "learning_rate": 0.00018491217065042198,
586
- "loss": 0.1014,
587
  "step": 344
588
  },
589
  {
590
  "epoch": 6.33,
591
- "learning_rate": 0.00017880602071858692,
592
- "loss": 0.1129,
593
  "step": 348
594
  },
595
  {
596
  "epoch": 6.4,
597
- "learning_rate": 0.00017274575140626317,
598
- "loss": 0.1099,
599
  "step": 352
600
  },
601
  {
602
  "epoch": 6.47,
603
- "learning_rate": 0.00016673526822502983,
604
- "loss": 0.1073,
605
  "step": 356
606
  },
607
  {
608
  "epoch": 6.55,
609
- "learning_rate": 0.00016077844460203207,
610
- "loss": 0.1967,
611
  "step": 360
612
  },
613
  {
614
  "epoch": 6.62,
615
- "learning_rate": 0.00015487911938376925,
616
- "loss": 0.1046,
617
  "step": 364
618
  },
619
  {
620
  "epoch": 6.69,
621
- "learning_rate": 0.00014904109436216883,
622
- "loss": 0.1225,
623
  "step": 368
624
  },
625
  {
626
  "epoch": 6.76,
627
- "learning_rate": 0.00014326813182453956,
628
- "loss": 0.1107,
629
  "step": 372
630
  },
631
  {
632
  "epoch": 6.84,
633
- "learning_rate": 0.0001375639521289836,
634
- "loss": 0.106,
635
  "step": 376
636
  },
637
  {
638
  "epoch": 6.91,
639
- "learning_rate": 0.00013193223130682935,
640
- "loss": 0.1025,
641
  "step": 380
642
  },
643
  {
644
  "epoch": 6.98,
645
- "learning_rate": 0.00012637659869363084,
646
- "loss": 0.1096,
647
  "step": 384
648
  },
649
  {
650
  "epoch": 7.0,
651
- "pls_score": 59.183673469387756,
652
- "std": 4.240153693923696,
653
  "step": 385
654
  },
655
  {
656
  "epoch": 7.0,
657
- "eval_loss": 3.395291566848755,
658
- "eval_runtime": 4.9577,
659
- "eval_samples_per_second": 4.639,
660
- "eval_steps_per_second": 1.21,
661
  "step": 385
662
  },
663
  {
664
  "epoch": 7.05,
665
- "learning_rate": 0.00012090063459025954,
666
- "loss": 0.0881,
667
  "step": 388
668
  },
669
  {
670
  "epoch": 7.13,
671
- "learning_rate": 0.0001155078679555969,
672
- "loss": 0.0862,
673
  "step": 392
674
  },
675
  {
676
  "epoch": 7.2,
677
- "learning_rate": 0.00011020177413231333,
678
- "loss": 0.1033,
679
  "step": 396
680
  },
681
  {
682
  "epoch": 7.27,
683
- "learning_rate": 0.00010498577260720049,
684
- "loss": 0.0953,
685
  "step": 400
686
  },
687
  {
688
  "epoch": 7.35,
689
- "learning_rate": 9.986322480749927e-05,
690
- "loss": 0.0981,
691
  "step": 404
692
  },
693
  {
694
  "epoch": 7.42,
695
- "learning_rate": 9.483743193464408e-05,
696
- "loss": 0.1042,
697
  "step": 408
698
  },
699
  {
700
  "epoch": 7.49,
701
- "learning_rate": 8.991163283681945e-05,
702
- "loss": 0.0992,
703
  "step": 412
704
  },
705
  {
706
  "epoch": 7.56,
707
- "learning_rate": 8.508900192169963e-05,
708
- "loss": 0.0904,
709
  "step": 416
710
  },
711
  {
712
  "epoch": 7.64,
713
- "learning_rate": 8.037264711071699e-05,
714
- "loss": 0.1085,
715
  "step": 420
716
  },
717
  {
718
  "epoch": 7.71,
719
- "learning_rate": 7.576560783617667e-05,
720
- "loss": 0.0924,
721
  "step": 424
722
  },
723
  {
724
  "epoch": 7.78,
725
- "learning_rate": 7.127085308250913e-05,
726
- "loss": 0.213,
727
  "step": 428
728
  },
729
  {
730
  "epoch": 7.85,
731
- "learning_rate": 6.689127947292231e-05,
732
- "loss": 0.0925,
733
  "step": 432
734
  },
735
  {
736
  "epoch": 7.93,
737
- "learning_rate": 6.262970940268654e-05,
738
- "loss": 0.1037,
739
  "step": 436
740
  },
741
  {
742
  "epoch": 8.0,
743
- "learning_rate": 5.848888922025553e-05,
744
- "loss": 0.1053,
745
  "step": 440
746
  },
747
  {
748
  "epoch": 8.0,
749
- "pls_score": 58.0,
750
- "std": 3.7629775444453557,
751
  "step": 440
752
  },
753
  {
754
  "epoch": 8.0,
755
- "eval_loss": 3.4806602001190186,
756
- "eval_runtime": 4.9487,
757
- "eval_samples_per_second": 4.648,
758
- "eval_steps_per_second": 1.212,
759
  "step": 440
760
  },
761
  {
762
  "epoch": 8.07,
763
- "learning_rate": 5.4471487457395216e-05,
764
- "loss": 0.0906,
765
  "step": 444
766
  },
767
  {
768
  "epoch": 8.15,
769
- "learning_rate": 5.058009310946118e-05,
770
- "loss": 0.1902,
771
  "step": 448
772
  },
773
  {
774
  "epoch": 8.22,
775
- "learning_rate": 4.6817213966933034e-05,
776
- "loss": 0.093,
777
  "step": 452
778
  },
779
  {
780
  "epoch": 8.29,
781
- "learning_rate": 4.318527499928074e-05,
782
- "loss": 0.0911,
783
  "step": 456
784
  },
785
  {
786
  "epoch": 8.36,
787
- "learning_rate": 3.968661679220467e-05,
788
- "loss": 0.0958,
789
  "step": 460
790
  },
791
  {
792
  "epoch": 8.44,
793
- "learning_rate": 3.632349403925664e-05,
794
- "loss": 0.0956,
795
  "step": 464
796
  },
797
  {
798
  "epoch": 8.51,
799
- "learning_rate": 3.309807408881269e-05,
800
- "loss": 0.094,
801
  "step": 468
802
  },
803
  {
804
  "epoch": 8.58,
805
- "learning_rate": 3.0012435547336736e-05,
806
- "loss": 0.097,
807
  "step": 472
808
  },
809
  {
810
  "epoch": 8.65,
811
- "learning_rate": 2.7068566939831645e-05,
812
- "loss": 0.0919,
813
  "step": 476
814
  },
815
  {
816
  "epoch": 8.73,
817
- "learning_rate": 2.4268365428344735e-05,
818
- "loss": 0.0937,
819
  "step": 480
820
  },
821
  {
822
  "epoch": 8.8,
823
- "learning_rate": 2.1613635589349755e-05,
824
- "loss": 0.1222,
825
  "step": 484
826
  },
827
  {
828
  "epoch": 8.87,
829
- "learning_rate": 1.9106088250797264e-05,
830
- "loss": 0.0907,
831
  "step": 488
832
  },
833
  {
834
  "epoch": 8.95,
835
- "learning_rate": 1.674733938957873e-05,
836
- "loss": 0.0969,
837
  "step": 492
838
  },
839
  {
840
  "epoch": 9.0,
841
- "pls_score": 55.2,
842
- "std": 4.022337628792492,
843
  "step": 495
844
  },
845
  {
846
  "epoch": 9.0,
847
- "eval_loss": 3.5086076259613037,
848
- "eval_runtime": 4.9635,
849
- "eval_samples_per_second": 4.634,
850
- "eval_steps_per_second": 1.209,
851
  "step": 495
852
  },
853
  {
854
  "epoch": 9.02,
855
- "learning_rate": 1.4538909090118846e-05,
856
- "loss": 0.0869,
857
  "step": 496
858
  },
859
  {
860
  "epoch": 9.09,
861
- "learning_rate": 1.2482220564763668e-05,
862
- "loss": 0.1033,
863
  "step": 500
864
  },
865
  {
866
  "epoch": 9.16,
867
- "learning_rate": 1.0578599236598707e-05,
868
- "loss": 0.105,
869
  "step": 504
870
  },
871
  {
872
  "epoch": 9.24,
873
- "learning_rate": 8.829271885286095e-06,
874
- "loss": 0.0989,
875
  "step": 508
876
  },
877
  {
878
  "epoch": 9.31,
879
- "learning_rate": 7.235365856472442e-06,
880
- "loss": 0.0901,
881
  "step": 512
882
  },
883
  {
884
  "epoch": 9.38,
885
- "learning_rate": 5.797908335276214e-06,
886
- "loss": 0.0912,
887
  "step": 516
888
  },
889
  {
890
  "epoch": 9.45,
891
- "learning_rate": 4.517825684323323e-06,
892
- "loss": 0.0857,
893
  "step": 520
894
  },
895
  {
896
  "epoch": 9.53,
897
- "learning_rate": 3.3959428467570664e-06,
898
- "loss": 0.112,
899
  "step": 524
900
  },
901
  {
902
  "epoch": 9.6,
903
- "learning_rate": 2.4329828146074094e-06,
904
- "loss": 0.0845,
905
  "step": 528
906
  },
907
  {
908
  "epoch": 9.67,
909
- "learning_rate": 1.6295661628624448e-06,
910
- "loss": 0.0966,
911
  "step": 532
912
  },
913
  {
914
  "epoch": 9.75,
915
- "learning_rate": 9.862106495415469e-07,
916
- "loss": 0.0973,
917
  "step": 536
918
  },
919
  {
920
  "epoch": 9.82,
921
- "learning_rate": 5.033308820289185e-07,
922
- "loss": 0.0894,
923
  "step": 540
924
  },
925
  {
926
  "epoch": 9.89,
927
- "learning_rate": 1.8123804988159908e-07,
928
- "loss": 0.0867,
929
  "step": 544
930
  },
931
  {
932
  "epoch": 9.96,
933
- "learning_rate": 2.0139724285161975e-08,
934
- "loss": 0.1768,
935
  "step": 548
936
  },
937
  {
938
  "epoch": 10.0,
939
- "pls_score": 57.95918367346939,
940
- "std": 4.172255118366954,
941
  "step": 550
942
  },
943
  {
944
  "epoch": 10.0,
945
- "eval_loss": 3.5129005908966064,
946
- "eval_runtime": 4.9489,
947
- "eval_samples_per_second": 4.648,
948
- "eval_steps_per_second": 1.212,
949
  "step": 550
950
  },
951
  {
952
  "epoch": 10.0,
953
  "step": 550,
954
- "total_flos": 1.8757245462528e+16,
955
- "train_loss": 0.1272620278596878,
956
- "train_runtime": 22126.0761,
957
- "train_samples_per_second": 0.098,
958
- "train_steps_per_second": 0.025
959
  }
960
  ],
961
  "logging_steps": 4,
@@ -963,7 +963,7 @@
963
  "num_input_tokens_seen": 0,
964
  "num_train_epochs": 10,
965
  "save_steps": 55,
966
- "total_flos": 1.8757245462528e+16,
967
  "train_batch_size": 4,
968
  "trial_name": null,
969
  "trial_params": null
 
10
  "log_history": [
11
  {
12
  "epoch": 0.02,
13
+ "learning_rate": 1.8181818181818183e-07,
14
  "loss": 2.7431,
15
  "step": 1
16
  },
17
  {
18
  "epoch": 0.07,
19
+ "learning_rate": 7.272727272727273e-07,
20
+ "loss": 2.89,
21
  "step": 4
22
  },
23
  {
24
  "epoch": 0.15,
25
+ "learning_rate": 1.4545454545454546e-06,
26
+ "loss": 2.7283,
27
  "step": 8
28
  },
29
  {
30
  "epoch": 0.22,
31
+ "learning_rate": 2.181818181818182e-06,
32
+ "loss": 2.6956,
33
  "step": 12
34
  },
35
  {
36
  "epoch": 0.29,
37
+ "learning_rate": 2.9090909090909093e-06,
38
+ "loss": 2.6684,
39
  "step": 16
40
  },
41
  {
42
  "epoch": 0.36,
43
+ "learning_rate": 3.6363636363636366e-06,
44
+ "loss": 2.7545,
45
  "step": 20
46
  },
47
  {
48
  "epoch": 0.44,
49
+ "learning_rate": 4.363636363636364e-06,
50
+ "loss": 2.8797,
51
  "step": 24
52
  },
53
  {
54
  "epoch": 0.51,
55
+ "learning_rate": 5.090909090909091e-06,
56
+ "loss": 3.0007,
57
  "step": 28
58
  },
59
  {
60
  "epoch": 0.58,
61
+ "learning_rate": 5.8181818181818185e-06,
62
+ "loss": 2.9172,
63
  "step": 32
64
  },
65
  {
66
  "epoch": 0.65,
67
+ "learning_rate": 6.545454545454546e-06,
68
+ "loss": 2.8011,
69
  "step": 36
70
  },
71
  {
72
  "epoch": 0.73,
73
+ "learning_rate": 7.272727272727273e-06,
74
+ "loss": 2.6401,
75
  "step": 40
76
  },
77
  {
78
  "epoch": 0.8,
79
+ "learning_rate": 8.000000000000001e-06,
80
+ "loss": 2.7162,
81
  "step": 44
82
  },
83
  {
84
  "epoch": 0.87,
85
+ "learning_rate": 8.727272727272728e-06,
86
+ "loss": 2.6036,
87
  "step": 48
88
  },
89
  {
90
  "epoch": 0.95,
91
+ "learning_rate": 9.454545454545456e-06,
92
+ "loss": 2.6749,
93
  "step": 52
94
  },
95
+ {
96
+ "epoch": 1.0,
97
+ "pls_score": 26.262626262626263,
98
+ "std": 2.1312214954012014,
99
+ "step": 55
100
+ },
101
+ {
102
+ "epoch": 1.0,
103
+ "eval_loss": 2.5386734008789062,
104
+ "eval_runtime": 4.9212,
105
+ "eval_samples_per_second": 4.674,
106
+ "eval_steps_per_second": 1.219,
107
+ "step": 55
108
+ },
109
  {
110
  "epoch": 1.02,
111
+ "learning_rate": 9.999899300364534e-06,
112
+ "loss": 2.6648,
113
  "step": 56
114
  },
115
  {
116
  "epoch": 1.09,
117
+ "learning_rate": 9.997482711915926e-06,
118
+ "loss": 2.5038,
119
  "step": 60
120
  },
121
  {
122
  "epoch": 1.16,
123
+ "learning_rate": 9.991845519630679e-06,
124
+ "loss": 2.424,
125
  "step": 64
126
  },
127
  {
128
  "epoch": 1.24,
129
+ "learning_rate": 9.982991356370404e-06,
130
+ "loss": 2.492,
131
  "step": 68
132
  },
133
  {
134
  "epoch": 1.31,
135
+ "learning_rate": 9.970925928158275e-06,
136
+ "loss": 2.5985,
137
  "step": 72
138
  },
139
  {
140
  "epoch": 1.38,
141
+ "learning_rate": 9.955657010501807e-06,
142
+ "loss": 2.2698,
143
  "step": 76
144
  },
145
  {
146
  "epoch": 1.45,
147
+ "learning_rate": 9.937194443381972e-06,
148
+ "loss": 2.3577,
149
  "step": 80
150
  },
151
  {
152
  "epoch": 1.53,
153
+ "learning_rate": 9.915550124911866e-06,
154
+ "loss": 2.4562,
155
  "step": 84
156
  },
157
  {
158
  "epoch": 1.6,
159
+ "learning_rate": 9.890738003669029e-06,
160
+ "loss": 2.266,
161
  "step": 88
162
  },
163
  {
164
  "epoch": 1.67,
165
+ "learning_rate": 9.862774069706346e-06,
166
+ "loss": 2.2448,
167
  "step": 92
168
  },
169
  {
170
  "epoch": 1.75,
171
+ "learning_rate": 9.831676344247343e-06,
172
+ "loss": 2.2374,
173
  "step": 96
174
  },
175
  {
176
  "epoch": 1.82,
177
+ "learning_rate": 9.797464868072489e-06,
178
+ "loss": 2.2036,
179
  "step": 100
180
  },
181
  {
182
  "epoch": 1.89,
183
+ "learning_rate": 9.760161688604008e-06,
184
+ "loss": 2.1167,
185
  "step": 104
186
  },
187
  {
188
  "epoch": 1.96,
189
+ "learning_rate": 9.719790845697534e-06,
190
+ "loss": 2.133,
191
  "step": 108
192
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
  {
194
  "epoch": 2.04,
195
+ "learning_rate": 9.676378356149733e-06,
196
+ "loss": 2.2253,
197
  "step": 112
198
  },
199
  {
200
  "epoch": 2.11,
201
+ "learning_rate": 9.629952196931902e-06,
202
+ "loss": 2.1681,
203
  "step": 116
204
  },
205
  {
206
  "epoch": 2.18,
207
+ "learning_rate": 9.580542287160348e-06,
208
+ "loss": 2.3558,
209
  "step": 120
210
  },
211
  {
212
  "epoch": 2.25,
213
+ "learning_rate": 9.528180468815155e-06,
214
+ "loss": 2.1262,
215
  "step": 124
216
  },
217
  {
218
  "epoch": 2.33,
219
+ "learning_rate": 9.47290048621977e-06,
220
+ "loss": 2.0675,
221
  "step": 128
222
  },
223
  {
224
  "epoch": 2.4,
225
+ "learning_rate": 9.414737964294636e-06,
226
+ "loss": 2.0984,
227
  "step": 132
228
  },
229
  {
230
  "epoch": 2.47,
231
+ "learning_rate": 9.353730385598887e-06,
232
+ "loss": 2.0763,
233
  "step": 136
234
  },
235
  {
236
  "epoch": 2.55,
237
+ "learning_rate": 9.289917066174887e-06,
238
+ "loss": 2.0899,
239
  "step": 140
240
  },
241
  {
242
  "epoch": 2.62,
243
+ "learning_rate": 9.223339130211194e-06,
244
+ "loss": 1.9647,
245
  "step": 144
246
  },
247
  {
248
  "epoch": 2.69,
249
+ "learning_rate": 9.154039483540273e-06,
250
+ "loss": 2.0805,
251
  "step": 148
252
  },
253
  {
254
  "epoch": 2.76,
255
+ "learning_rate": 9.08206278598805e-06,
256
+ "loss": 1.996,
257
  "step": 152
258
  },
259
  {
260
  "epoch": 2.84,
261
+ "learning_rate": 9.007455422593077e-06,
262
+ "loss": 2.1735,
263
  "step": 156
264
  },
265
  {
266
  "epoch": 2.91,
267
+ "learning_rate": 8.930265473713939e-06,
268
+ "loss": 2.197,
269
  "step": 160
270
  },
271
  {
272
  "epoch": 2.98,
273
+ "learning_rate": 8.850542684044078e-06,
274
+ "loss": 2.0502,
275
  "step": 164
276
  },
277
+ {
278
+ "epoch": 3.0,
279
+ "pls_score": 60.4,
280
+ "std": 4.326291714621197,
281
+ "step": 165
282
+ },
283
+ {
284
+ "epoch": 3.0,
285
+ "eval_loss": 2.0924267768859863,
286
+ "eval_runtime": 4.9011,
287
+ "eval_samples_per_second": 4.693,
288
+ "eval_steps_per_second": 1.224,
289
+ "step": 165
290
+ },
291
  {
292
  "epoch": 3.05,
293
+ "learning_rate": 8.768338430554083e-06,
294
+ "loss": 1.8989,
295
  "step": 168
296
  },
297
  {
298
  "epoch": 3.13,
299
+ "learning_rate": 8.683705689382025e-06,
300
+ "loss": 2.149,
301
  "step": 172
302
  },
303
  {
304
  "epoch": 3.2,
305
+ "learning_rate": 8.596699001693257e-06,
306
+ "loss": 2.0518,
307
  "step": 176
308
  },
309
  {
310
  "epoch": 3.27,
311
+ "learning_rate": 8.507374438531606e-06,
312
+ "loss": 2.1007,
313
  "step": 180
314
  },
315
  {
316
  "epoch": 3.35,
317
+ "learning_rate": 8.415789564684673e-06,
318
+ "loss": 2.0187,
319
  "step": 184
320
  },
321
  {
322
  "epoch": 3.42,
323
+ "learning_rate": 8.322003401586463e-06,
324
+ "loss": 1.9728,
325
  "step": 188
326
  },
327
  {
328
  "epoch": 3.49,
329
+ "learning_rate": 8.226076389281316e-06,
330
+ "loss": 1.9981,
331
  "step": 192
332
  },
333
  {
334
  "epoch": 3.56,
335
+ "learning_rate": 8.128070347473609e-06,
336
+ "loss": 2.0167,
337
  "step": 196
338
  },
339
  {
340
  "epoch": 3.64,
341
+ "learning_rate": 8.028048435688333e-06,
342
+ "loss": 2.023,
343
  "step": 200
344
  },
345
  {
346
  "epoch": 3.71,
347
+ "learning_rate": 7.92607511256826e-06,
348
+ "loss": 2.0933,
349
  "step": 204
350
  },
351
  {
352
  "epoch": 3.78,
353
+ "learning_rate": 7.822216094333847e-06,
354
+ "loss": 2.0343,
355
  "step": 208
356
  },
357
  {
358
  "epoch": 3.85,
359
+ "learning_rate": 7.716538312432767e-06,
360
+ "loss": 2.0147,
361
  "step": 212
362
  },
363
  {
364
  "epoch": 3.93,
365
+ "learning_rate": 7.60910987040623e-06,
366
+ "loss": 2.0385,
367
  "step": 216
368
  },
369
  {
370
  "epoch": 4.0,
371
+ "learning_rate": 7.500000000000001e-06,
372
+ "loss": 1.953,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
373
  "step": 220
374
  },
375
  {
376
  "epoch": 4.07,
377
+ "learning_rate": 7.3892790165483164e-06,
378
+ "loss": 1.9079,
379
  "step": 224
380
  },
381
  {
382
  "epoch": 4.15,
383
+ "learning_rate": 7.2770182736595164e-06,
384
+ "loss": 2.0103,
385
  "step": 228
386
  },
387
  {
388
  "epoch": 4.22,
389
+ "learning_rate": 7.163290117232542e-06,
390
+ "loss": 2.0291,
391
  "step": 232
392
  },
393
  {
394
  "epoch": 4.29,
395
+ "learning_rate": 7.048167838833977e-06,
396
+ "loss": 1.9614,
397
  "step": 236
398
  },
399
  {
400
  "epoch": 4.36,
401
+ "learning_rate": 6.931725628465643e-06,
402
+ "loss": 1.9674,
403
  "step": 240
404
  },
405
  {
406
  "epoch": 4.44,
407
+ "learning_rate": 6.814038526753205e-06,
408
+ "loss": 1.8811,
409
  "step": 244
410
  },
411
  {
412
  "epoch": 4.51,
413
+ "learning_rate": 6.695182376586603e-06,
414
+ "loss": 1.9888,
415
  "step": 248
416
  },
417
  {
418
  "epoch": 4.58,
419
+ "learning_rate": 6.5752337742434644e-06,
420
+ "loss": 1.8913,
421
  "step": 252
422
  },
423
  {
424
  "epoch": 4.65,
425
+ "learning_rate": 6.454270020026996e-06,
426
+ "loss": 2.0448,
427
  "step": 256
428
  },
429
  {
430
  "epoch": 4.73,
431
+ "learning_rate": 6.332369068450175e-06,
432
+ "loss": 1.9602,
433
  "step": 260
434
  },
435
  {
436
  "epoch": 4.8,
437
+ "learning_rate": 6.209609477998339e-06,
438
+ "loss": 2.0635,
439
  "step": 264
440
  },
441
  {
442
  "epoch": 4.87,
443
+ "learning_rate": 6.08607036050254e-06,
444
+ "loss": 1.9178,
445
  "step": 268
446
  },
447
  {
448
  "epoch": 4.95,
449
+ "learning_rate": 5.961831330156306e-06,
450
+ "loss": 1.9287,
451
  "step": 272
452
  },
453
  {
454
  "epoch": 5.0,
455
+ "pls_score": 57.4,
456
+ "std": 3.9880822458921283,
457
  "step": 275
458
  },
459
  {
460
  "epoch": 5.0,
461
+ "eval_loss": 2.0102691650390625,
462
+ "eval_runtime": 4.9168,
463
+ "eval_samples_per_second": 4.678,
464
+ "eval_steps_per_second": 1.22,
465
  "step": 275
466
  },
467
  {
468
  "epoch": 5.02,
469
+ "learning_rate": 5.8369724522086545e-06,
470
+ "loss": 1.8716,
471
  "step": 276
472
  },
473
  {
474
  "epoch": 5.09,
475
+ "learning_rate": 5.711574191366427e-06,
476
+ "loss": 1.9052,
477
  "step": 280
478
  },
479
  {
480
  "epoch": 5.16,
481
+ "learning_rate": 5.585717359939192e-06,
482
+ "loss": 1.8104,
483
  "step": 284
484
  },
485
  {
486
  "epoch": 5.24,
487
+ "learning_rate": 5.459483065760138e-06,
488
+ "loss": 1.9212,
489
  "step": 288
490
  },
491
  {
492
  "epoch": 5.31,
493
+ "learning_rate": 5.33295265991652e-06,
494
+ "loss": 1.9339,
495
  "step": 292
496
  },
497
  {
498
  "epoch": 5.38,
499
+ "learning_rate": 5.206207684323337e-06,
500
+ "loss": 1.9414,
501
  "step": 296
502
  },
503
  {
504
  "epoch": 5.45,
505
+ "learning_rate": 5.07932981917404e-06,
506
+ "loss": 1.8935,
507
  "step": 300
508
  },
509
  {
510
  "epoch": 5.53,
511
+ "learning_rate": 4.952400830302117e-06,
512
+ "loss": 2.0557,
513
  "step": 304
514
  },
515
  {
516
  "epoch": 5.6,
517
+ "learning_rate": 4.825502516487497e-06,
518
+ "loss": 1.8989,
519
  "step": 308
520
  },
521
  {
522
  "epoch": 5.67,
523
+ "learning_rate": 4.6987166567417085e-06,
524
+ "loss": 1.8834,
525
  "step": 312
526
  },
527
  {
528
  "epoch": 5.75,
529
+ "learning_rate": 4.572124957605803e-06,
530
+ "loss": 1.8287,
531
  "step": 316
532
  },
533
  {
534
  "epoch": 5.82,
535
+ "learning_rate": 4.445809000494945e-06,
536
+ "loss": 1.944,
537
  "step": 320
538
  },
539
  {
540
  "epoch": 5.89,
541
+ "learning_rate": 4.319850189123681e-06,
542
+ "loss": 1.9083,
543
  "step": 324
544
  },
545
  {
546
  "epoch": 5.96,
547
+ "learning_rate": 4.194329697045681e-06,
548
+ "loss": 2.0021,
549
  "step": 328
550
  },
551
  {
552
  "epoch": 6.0,
553
+ "pls_score": 58.4,
554
+ "std": 4.227150340359329,
555
  "step": 330
556
  },
557
  {
558
  "epoch": 6.0,
559
+ "eval_loss": 1.9934663772583008,
560
+ "eval_runtime": 4.9133,
561
+ "eval_samples_per_second": 4.681,
562
+ "eval_steps_per_second": 1.221,
563
  "step": 330
564
  },
565
  {
566
  "epoch": 6.04,
567
+ "learning_rate": 4.06932841534185e-06,
568
+ "loss": 2.0032,
569
  "step": 332
570
  },
571
  {
572
  "epoch": 6.11,
573
+ "learning_rate": 3.944926900490452e-06,
574
+ "loss": 1.968,
575
  "step": 336
576
  },
577
  {
578
  "epoch": 6.18,
579
+ "learning_rate": 3.821205322452863e-06,
580
+ "loss": 1.8771,
581
  "step": 340
582
  },
583
  {
584
  "epoch": 6.25,
585
+ "learning_rate": 3.69824341300844e-06,
586
+ "loss": 1.948,
587
  "step": 344
588
  },
589
  {
590
  "epoch": 6.33,
591
+ "learning_rate": 3.5761204143717387e-06,
592
+ "loss": 1.9014,
593
  "step": 348
594
  },
595
  {
596
  "epoch": 6.4,
597
+ "learning_rate": 3.4549150281252635e-06,
598
+ "loss": 1.9815,
599
  "step": 352
600
  },
601
  {
602
  "epoch": 6.47,
603
+ "learning_rate": 3.3347053645005965e-06,
604
+ "loss": 1.9389,
605
  "step": 356
606
  },
607
  {
608
  "epoch": 6.55,
609
+ "learning_rate": 3.2155688920406415e-06,
610
+ "loss": 1.7294,
611
  "step": 360
612
  },
613
  {
614
  "epoch": 6.62,
615
+ "learning_rate": 3.097582387675385e-06,
616
+ "loss": 1.8354,
617
  "step": 364
618
  },
619
  {
620
  "epoch": 6.69,
621
+ "learning_rate": 2.980821887243377e-06,
622
+ "loss": 1.9155,
623
  "step": 368
624
  },
625
  {
626
  "epoch": 6.76,
627
+ "learning_rate": 2.8653626364907918e-06,
628
+ "loss": 1.9047,
629
  "step": 372
630
  },
631
  {
632
  "epoch": 6.84,
633
+ "learning_rate": 2.751279042579672e-06,
634
+ "loss": 1.949,
635
  "step": 376
636
  },
637
  {
638
  "epoch": 6.91,
639
+ "learning_rate": 2.6386446261365874e-06,
640
+ "loss": 1.8363,
641
  "step": 380
642
  },
643
  {
644
  "epoch": 6.98,
645
+ "learning_rate": 2.527531973872617e-06,
646
+ "loss": 1.8199,
647
  "step": 384
648
  },
649
  {
650
  "epoch": 7.0,
651
+ "pls_score": 59.6,
652
+ "std": 4.098389927764319,
653
  "step": 385
654
  },
655
  {
656
  "epoch": 7.0,
657
+ "eval_loss": 1.9878090620040894,
658
+ "eval_runtime": 4.9106,
659
+ "eval_samples_per_second": 4.684,
660
+ "eval_steps_per_second": 1.222,
661
  "step": 385
662
  },
663
  {
664
  "epoch": 7.05,
665
+ "learning_rate": 2.418012691805191e-06,
666
+ "loss": 2.0762,
667
  "step": 388
668
  },
669
  {
670
  "epoch": 7.13,
671
+ "learning_rate": 2.310157359111938e-06,
672
+ "loss": 1.8485,
673
  "step": 392
674
  },
675
  {
676
  "epoch": 7.2,
677
+ "learning_rate": 2.204035482646267e-06,
678
+ "loss": 1.9232,
679
  "step": 396
680
  },
681
  {
682
  "epoch": 7.27,
683
+ "learning_rate": 2.09971545214401e-06,
684
+ "loss": 1.8724,
685
  "step": 400
686
  },
687
  {
688
  "epoch": 7.35,
689
+ "learning_rate": 1.9972644961499853e-06,
690
+ "loss": 1.8075,
691
  "step": 404
692
  },
693
  {
694
  "epoch": 7.42,
695
+ "learning_rate": 1.8967486386928819e-06,
696
+ "loss": 1.8526,
697
  "step": 408
698
  },
699
  {
700
  "epoch": 7.49,
701
+ "learning_rate": 1.798232656736389e-06,
702
+ "loss": 1.899,
703
  "step": 412
704
  },
705
  {
706
  "epoch": 7.56,
707
+ "learning_rate": 1.7017800384339928e-06,
708
+ "loss": 1.7487,
709
  "step": 416
710
  },
711
  {
712
  "epoch": 7.64,
713
+ "learning_rate": 1.6074529422143398e-06,
714
+ "loss": 1.9218,
715
  "step": 420
716
  },
717
  {
718
  "epoch": 7.71,
719
+ "learning_rate": 1.5153121567235334e-06,
720
+ "loss": 1.8776,
721
  "step": 424
722
  },
723
  {
724
  "epoch": 7.78,
725
+ "learning_rate": 1.4254170616501828e-06,
726
+ "loss": 1.761,
727
  "step": 428
728
  },
729
  {
730
  "epoch": 7.85,
731
+ "learning_rate": 1.3378255894584463e-06,
732
+ "loss": 1.9254,
733
  "step": 432
734
  },
735
  {
736
  "epoch": 7.93,
737
+ "learning_rate": 1.2525941880537307e-06,
738
+ "loss": 1.8895,
739
  "step": 436
740
  },
741
  {
742
  "epoch": 8.0,
743
+ "learning_rate": 1.1697777844051105e-06,
744
+ "loss": 1.9559,
745
  "step": 440
746
  },
747
  {
748
  "epoch": 8.0,
749
+ "pls_score": 60.0,
750
+ "std": 4.233202097703344,
751
  "step": 440
752
  },
753
  {
754
  "epoch": 8.0,
755
+ "eval_loss": 1.9857509136199951,
756
+ "eval_runtime": 4.912,
757
+ "eval_samples_per_second": 4.682,
758
+ "eval_steps_per_second": 1.222,
759
  "step": 440
760
  },
761
  {
762
  "epoch": 8.07,
763
+ "learning_rate": 1.0894297491479044e-06,
764
+ "loss": 1.8595,
765
  "step": 444
766
  },
767
  {
768
  "epoch": 8.15,
769
+ "learning_rate": 1.0116018621892237e-06,
770
+ "loss": 1.7881,
771
  "step": 448
772
  },
773
  {
774
  "epoch": 8.22,
775
+ "learning_rate": 9.363442793386606e-07,
776
+ "loss": 1.8575,
777
  "step": 452
778
  },
779
  {
780
  "epoch": 8.29,
781
+ "learning_rate": 8.637054999856148e-07,
782
+ "loss": 1.8457,
783
  "step": 456
784
  },
785
  {
786
  "epoch": 8.36,
787
+ "learning_rate": 7.937323358440935e-07,
788
+ "loss": 1.8717,
789
  "step": 460
790
  },
791
  {
792
  "epoch": 8.44,
793
+ "learning_rate": 7.264698807851328e-07,
794
+ "loss": 1.887,
795
  "step": 464
796
  },
797
  {
798
  "epoch": 8.51,
799
+ "learning_rate": 6.619614817762537e-07,
800
+ "loss": 1.8931,
801
  "step": 468
802
  },
803
  {
804
  "epoch": 8.58,
805
+ "learning_rate": 6.002487109467347e-07,
806
+ "loss": 1.8454,
807
  "step": 472
808
  },
809
  {
810
  "epoch": 8.65,
811
+ "learning_rate": 5.413713387966329e-07,
812
+ "loss": 1.9584,
813
  "step": 476
814
  },
815
  {
816
  "epoch": 8.73,
817
+ "learning_rate": 4.853673085668947e-07,
818
+ "loss": 1.9234,
819
  "step": 480
820
  },
821
  {
822
  "epoch": 8.8,
823
+ "learning_rate": 4.322727117869951e-07,
824
+ "loss": 1.8912,
825
  "step": 484
826
  },
827
  {
828
  "epoch": 8.87,
829
+ "learning_rate": 3.821217650159453e-07,
830
+ "loss": 1.8707,
831
  "step": 488
832
  },
833
  {
834
  "epoch": 8.95,
835
+ "learning_rate": 3.3494678779157464e-07,
836
+ "loss": 1.8229,
837
  "step": 492
838
  },
839
  {
840
  "epoch": 9.0,
841
+ "pls_score": 59.2,
842
+ "std": 4.343639027359432,
843
  "step": 495
844
  },
845
  {
846
  "epoch": 9.0,
847
+ "eval_loss": 1.9853376150131226,
848
+ "eval_runtime": 4.916,
849
+ "eval_samples_per_second": 4.679,
850
+ "eval_steps_per_second": 1.221,
851
  "step": 495
852
  },
853
  {
854
  "epoch": 9.02,
855
+ "learning_rate": 2.9077818180237693e-07,
856
+ "loss": 1.7843,
857
  "step": 496
858
  },
859
  {
860
  "epoch": 9.09,
861
+ "learning_rate": 2.4964441129527337e-07,
862
+ "loss": 1.9182,
863
  "step": 500
864
  },
865
  {
866
  "epoch": 9.16,
867
+ "learning_rate": 2.1157198473197417e-07,
868
+ "loss": 1.9246,
869
  "step": 504
870
  },
871
  {
872
  "epoch": 9.24,
873
+ "learning_rate": 1.765854377057219e-07,
874
+ "loss": 1.8744,
875
  "step": 508
876
  },
877
  {
878
  "epoch": 9.31,
879
+ "learning_rate": 1.4470731712944885e-07,
880
+ "loss": 1.872,
881
  "step": 512
882
  },
883
  {
884
  "epoch": 9.38,
885
+ "learning_rate": 1.1595816670552429e-07,
886
+ "loss": 1.8631,
887
  "step": 516
888
  },
889
  {
890
  "epoch": 9.45,
891
+ "learning_rate": 9.035651368646647e-08,
892
+ "loss": 1.889,
893
  "step": 520
894
  },
895
  {
896
  "epoch": 9.53,
897
+ "learning_rate": 6.791885693514134e-08,
898
+ "loss": 1.9402,
899
  "step": 524
900
  },
901
  {
902
  "epoch": 9.6,
903
+ "learning_rate": 4.865965629214819e-08,
904
+ "loss": 1.7325,
905
  "step": 528
906
  },
907
  {
908
  "epoch": 9.67,
909
+ "learning_rate": 3.25913232572489e-08,
910
+ "loss": 1.8187,
911
  "step": 532
912
  },
913
  {
914
  "epoch": 9.75,
915
+ "learning_rate": 1.9724212990830938e-08,
916
+ "loss": 1.877,
917
  "step": 536
918
  },
919
  {
920
  "epoch": 9.82,
921
+ "learning_rate": 1.006661764057837e-08,
922
+ "loss": 1.9245,
923
  "step": 540
924
  },
925
  {
926
  "epoch": 9.89,
927
+ "learning_rate": 3.6247609976319818e-09,
928
+ "loss": 1.8485,
929
  "step": 544
930
  },
931
  {
932
  "epoch": 9.96,
933
+ "learning_rate": 4.027944857032395e-10,
934
+ "loss": 1.8178,
935
  "step": 548
936
  },
937
  {
938
  "epoch": 10.0,
939
+ "pls_score": 56.326530612244895,
940
+ "std": 4.189332932171676,
941
  "step": 550
942
  },
943
  {
944
  "epoch": 10.0,
945
+ "eval_loss": 1.9850994348526,
946
+ "eval_runtime": 4.916,
947
+ "eval_samples_per_second": 4.679,
948
+ "eval_steps_per_second": 1.22,
949
  "step": 550
950
  },
951
  {
952
  "epoch": 10.0,
953
  "step": 550,
954
+ "total_flos": 1.8777166830084096e+16,
955
+ "train_loss": 1.1392531923814253,
956
+ "train_runtime": 18256.4954,
957
+ "train_samples_per_second": 0.119,
958
+ "train_steps_per_second": 0.03
959
  }
960
  ],
961
  "logging_steps": 4,
 
963
  "num_input_tokens_seen": 0,
964
  "num_train_epochs": 10,
965
  "save_steps": 55,
966
+ "total_flos": 1.8777166830084096e+16,
967
  "train_batch_size": 4,
968
  "trial_name": null,
969
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c677dd79b462fc73f2789bacd7347362aa793784208e35b1e034cb575ba3a945
3
  size 4792
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ba76d65b0f816a12acd6deb020064c70467d4ba65c899f758b8c99c2c6e5f2d
3
  size 4792