alexue4 committed on
Commit
60aaa0e
1 Parent(s): 1306317

End of training

Browse files
Files changed (4) hide show
  1. README.md +21 -11
  2. pytorch_model.bin +1 -1
  3. trainer_state.json +952 -852
  4. training_args.bin +1 -1
README.md CHANGED
@@ -43,22 +43,32 @@ The following hyperparameters were used during training:
43
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
44
  - lr_scheduler_type: linear
45
  - lr_scheduler_warmup_ratio: 0.1
46
- - num_epochs: 10
47
 
48
  ### Training results
49
 
50
  | Training Loss | Epoch | Step | Validation Loss | Mean Distance | Max Distance |
51
  |:-------------:|:-----:|:-----:|:---------------:|:-------------:|:------------:|
52
- | 0.0023 | 1.0 | 3572 | 0.0001 | 0 | 0 |
53
- | 0.0017 | 2.0 | 7144 | 0.0000 | 0 | 0 |
54
- | 0.0017 | 3.0 | 10716 | 0.0000 | 0 | 0 |
55
- | 0.001 | 4.0 | 14288 | 0.0000 | 0 | 0 |
56
- | 0.0008 | 5.0 | 17860 | 0.0000 | 0 | 0 |
57
- | 0.0007 | 6.0 | 21432 | 0.0000 | 0 | 0 |
58
- | 0.0007 | 7.0 | 25004 | 0.0000 | 0 | 0 |
59
- | 0.0008 | 8.0 | 28576 | 0.0000 | 0 | 0 |
60
- | 0.0005 | 9.0 | 32148 | 0.0000 | 0 | 0 |
61
- | 0.0007 | 10.0 | 35720 | 0.0000 | 0 | 0 |
 
 
 
 
 
 
 
 
 
 
62
 
63
 
64
  ### Framework versions
 
43
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
44
  - lr_scheduler_type: linear
45
  - lr_scheduler_warmup_ratio: 0.1
46
+ - num_epochs: 20
47
 
48
  ### Training results
49
 
50
  | Training Loss | Epoch | Step | Validation Loss | Mean Distance | Max Distance |
51
  |:-------------:|:-----:|:-----:|:---------------:|:-------------:|:------------:|
52
+ | 0.0006 | 1.0 | 3574 | 0.0000 | 0 | 0 |
53
+ | 0.0005 | 2.0 | 7148 | 0.0000 | 0 | 0 |
54
+ | 0.0007 | 3.0 | 10722 | 0.0000 | 0 | 0 |
55
+ | 0.0008 | 4.0 | 14296 | 0.0000 | 0 | 0 |
56
+ | 0.0006 | 5.0 | 17870 | 0.0000 | 0 | 0 |
57
+ | 0.0006 | 6.0 | 21444 | 0.0000 | 0 | 0 |
58
+ | 0.0005 | 7.0 | 25018 | 0.0000 | 0 | 0 |
59
+ | 0.0005 | 8.0 | 28592 | 0.0000 | 0 | 0 |
60
+ | 0.0005 | 9.0 | 32166 | 0.0000 | 0 | 0 |
61
+ | 0.0004 | 10.0 | 35740 | 0.0000 | 0 | 0 |
62
+ | 0.0005 | 11.0 | 39314 | 0.0000 | 0 | 0 |
63
+ | 0.0004 | 12.0 | 42888 | 0.0000 | 0 | 0 |
64
+ | 0.0004 | 13.0 | 46462 | 0.0000 | 0 | 0 |
65
+ | 0.0004 | 14.0 | 50036 | 0.0000 | 0 | 0 |
66
+ | 0.0004 | 15.0 | 53610 | 0.0000 | 0 | 0 |
67
+ | 0.0004 | 16.0 | 57184 | 0.0000 | 0 | 0 |
68
+ | 0.0003 | 17.0 | 60758 | 0.0000 | 0 | 0 |
69
+ | 0.0003 | 18.0 | 64332 | 0.0000 | 0 | 0 |
70
+ | 0.0003 | 19.0 | 67906 | 0.0000 | 0 | 0 |
71
+ | 0.0004 | 20.0 | 71480 | 0.0000 | 0 | 0 |
72
 
73
 
74
  ### Framework versions
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:11a07597ec89dc7de75db1cbdfb6734ffd642e668d8be3f0de777528a7673b07
3
  size 258643461
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21c49758897d50ad7e34d2f2e332d6496fcdfefab4b62a0755a6998165dd2526
3
  size 258643461
trainer_state.json CHANGED
@@ -1,1328 +1,1428 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 10.0,
5
  "eval_steps": 500,
6
- "global_step": 35720,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
- "learning_rate": 2.799552071668533e-08,
14
- "loss": 0.0143,
15
  "step": 1
16
  },
17
- {
18
- "epoch": 0.05,
19
- "learning_rate": 5.011198208286674e-06,
20
- "loss": 0.0113,
21
- "step": 179
22
- },
23
  {
24
  "epoch": 0.1,
25
- "learning_rate": 1.0022396416573348e-05,
26
- "loss": 0.0079,
27
  "step": 358
28
  },
29
- {
30
- "epoch": 0.15,
31
- "learning_rate": 1.5033594624860023e-05,
32
- "loss": 0.0086,
33
- "step": 537
34
- },
35
  {
36
  "epoch": 0.2,
37
- "learning_rate": 2.0044792833146696e-05,
38
- "loss": 0.0089,
39
  "step": 716
40
  },
41
- {
42
- "epoch": 0.25,
43
- "learning_rate": 2.5055991041433373e-05,
44
- "loss": 0.0065,
45
- "step": 895
46
- },
47
  {
48
  "epoch": 0.3,
49
- "learning_rate": 3.0067189249720047e-05,
50
- "loss": 0.0049,
51
  "step": 1074
52
  },
53
- {
54
- "epoch": 0.35,
55
- "learning_rate": 3.5078387458006724e-05,
56
- "loss": 0.0043,
57
- "step": 1253
58
- },
59
  {
60
  "epoch": 0.4,
61
- "learning_rate": 4.008958566629339e-05,
62
- "loss": 0.0038,
63
  "step": 1432
64
  },
65
- {
66
- "epoch": 0.45,
67
- "learning_rate": 4.510078387458007e-05,
68
- "loss": 0.0035,
69
- "step": 1611
70
- },
71
  {
72
  "epoch": 0.5,
73
- "learning_rate": 5.0111982082866746e-05,
74
- "loss": 0.0034,
75
  "step": 1790
76
  },
77
- {
78
- "epoch": 0.55,
79
- "learning_rate": 5.512318029115342e-05,
80
- "loss": 0.0026,
81
- "step": 1969
82
- },
83
  {
84
  "epoch": 0.6,
85
- "learning_rate": 6.0134378499440094e-05,
86
- "loss": 0.0022,
87
  "step": 2148
88
  },
89
- {
90
- "epoch": 0.65,
91
- "learning_rate": 6.514557670772677e-05,
92
- "loss": 0.0022,
93
- "step": 2327
94
- },
95
  {
96
  "epoch": 0.7,
97
- "learning_rate": 7.015677491601345e-05,
98
- "loss": 0.0029,
99
  "step": 2506
100
  },
101
- {
102
- "epoch": 0.75,
103
- "learning_rate": 7.516797312430012e-05,
104
- "loss": 0.0029,
105
- "step": 2685
106
- },
107
  {
108
  "epoch": 0.8,
109
- "learning_rate": 8.017917133258678e-05,
110
- "loss": 0.0027,
111
  "step": 2864
112
  },
113
- {
114
- "epoch": 0.85,
115
- "learning_rate": 8.519036954087346e-05,
116
- "loss": 0.0018,
117
- "step": 3043
118
- },
119
  {
120
  "epoch": 0.9,
121
- "learning_rate": 9.020156774916014e-05,
122
- "loss": 0.0023,
123
  "step": 3222
124
  },
125
- {
126
- "epoch": 0.95,
127
- "learning_rate": 9.521276595744681e-05,
128
- "loss": 0.0023,
129
- "step": 3401
130
- },
131
  {
132
  "epoch": 1.0,
133
- "eval_loss": 5.158845306141302e-05,
134
  "eval_max_distance": 0,
135
  "eval_mean_distance": 0,
136
- "eval_runtime": 2.7067,
137
- "eval_samples_per_second": 18.473,
138
- "eval_steps_per_second": 0.739,
139
- "step": 3572
140
  },
141
  {
142
  "epoch": 1.0,
143
- "learning_rate": 9.997511509269628e-05,
144
- "loss": 0.0016,
145
  "step": 3580
146
  },
147
- {
148
- "epoch": 1.05,
149
- "learning_rate": 9.941831529177555e-05,
150
- "loss": 0.0021,
151
- "step": 3759
152
- },
153
  {
154
  "epoch": 1.1,
155
- "learning_rate": 9.886151549085481e-05,
156
- "loss": 0.0012,
157
  "step": 3938
158
  },
159
- {
160
- "epoch": 1.15,
161
- "learning_rate": 9.830471568993406e-05,
162
- "loss": 0.0018,
163
- "step": 4117
164
- },
165
  {
166
  "epoch": 1.2,
167
- "learning_rate": 9.774791588901332e-05,
168
- "loss": 0.0017,
169
  "step": 4296
170
  },
171
- {
172
- "epoch": 1.25,
173
- "learning_rate": 9.719111608809259e-05,
174
- "loss": 0.002,
175
- "step": 4475
176
- },
177
  {
178
  "epoch": 1.3,
179
- "learning_rate": 9.663431628717184e-05,
180
- "loss": 0.0022,
181
  "step": 4654
182
  },
183
- {
184
- "epoch": 1.35,
185
- "learning_rate": 9.60775164862511e-05,
186
- "loss": 0.0015,
187
- "step": 4833
188
- },
189
  {
190
  "epoch": 1.4,
191
- "learning_rate": 9.552071668533035e-05,
192
- "loss": 0.002,
193
  "step": 5012
194
  },
195
- {
196
- "epoch": 1.45,
197
- "learning_rate": 9.496391688440962e-05,
198
- "loss": 0.0013,
199
- "step": 5191
200
- },
201
  {
202
  "epoch": 1.5,
203
- "learning_rate": 9.440711708348887e-05,
204
- "loss": 0.0017,
205
  "step": 5370
206
  },
207
- {
208
- "epoch": 1.55,
209
- "learning_rate": 9.385031728256813e-05,
210
- "loss": 0.0016,
211
- "step": 5549
212
- },
213
  {
214
  "epoch": 1.6,
215
- "learning_rate": 9.329351748164738e-05,
216
- "loss": 0.0014,
217
  "step": 5728
218
  },
219
- {
220
- "epoch": 1.65,
221
- "learning_rate": 9.273671768072664e-05,
222
- "loss": 0.0019,
223
- "step": 5907
224
- },
225
  {
226
  "epoch": 1.7,
227
- "learning_rate": 9.21799178798059e-05,
228
- "loss": 0.0015,
229
  "step": 6086
230
  },
231
- {
232
- "epoch": 1.75,
233
- "learning_rate": 9.162311807888516e-05,
234
- "loss": 0.0014,
235
- "step": 6265
236
- },
237
  {
238
  "epoch": 1.8,
239
- "learning_rate": 9.106631827796442e-05,
240
- "loss": 0.0015,
241
  "step": 6444
242
  },
243
- {
244
- "epoch": 1.85,
245
- "learning_rate": 9.050951847704367e-05,
246
- "loss": 0.001,
247
- "step": 6623
248
- },
249
  {
250
  "epoch": 1.9,
251
- "learning_rate": 8.995271867612293e-05,
252
- "loss": 0.002,
253
  "step": 6802
254
  },
255
- {
256
- "epoch": 1.95,
257
- "learning_rate": 8.939591887520218e-05,
258
- "loss": 0.0017,
259
- "step": 6981
260
- },
261
  {
262
  "epoch": 2.0,
263
- "eval_loss": 9.73735313891666e-06,
264
  "eval_max_distance": 0,
265
  "eval_mean_distance": 0,
266
- "eval_runtime": 2.711,
267
- "eval_samples_per_second": 18.443,
268
- "eval_steps_per_second": 0.738,
269
- "step": 7144
270
  },
271
  {
272
  "epoch": 2.0,
273
- "learning_rate": 8.883911907428145e-05,
274
- "loss": 0.0019,
275
  "step": 7160
276
  },
277
- {
278
- "epoch": 2.05,
279
- "learning_rate": 8.828231927336071e-05,
280
- "loss": 0.001,
281
- "step": 7339
282
- },
283
  {
284
  "epoch": 2.1,
285
- "learning_rate": 8.772551947243996e-05,
286
- "loss": 0.0011,
287
  "step": 7518
288
  },
289
- {
290
- "epoch": 2.15,
291
- "learning_rate": 8.716871967151923e-05,
292
- "loss": 0.0012,
293
- "step": 7697
294
- },
295
  {
296
  "epoch": 2.2,
297
- "learning_rate": 8.661191987059849e-05,
298
- "loss": 0.0009,
299
  "step": 7876
300
  },
301
  {
302
- "epoch": 2.26,
303
- "learning_rate": 8.605512006967774e-05,
304
- "loss": 0.0012,
305
- "step": 8055
306
- },
307
- {
308
- "epoch": 2.31,
309
- "learning_rate": 8.5498320268757e-05,
310
- "loss": 0.001,
311
  "step": 8234
312
  },
313
  {
314
- "epoch": 2.36,
315
- "learning_rate": 8.494152046783627e-05,
316
- "loss": 0.0015,
317
- "step": 8413
318
- },
319
- {
320
- "epoch": 2.41,
321
- "learning_rate": 8.438472066691552e-05,
322
- "loss": 0.0012,
323
  "step": 8592
324
  },
325
  {
326
- "epoch": 2.46,
327
- "learning_rate": 8.382792086599478e-05,
328
- "loss": 0.0017,
329
- "step": 8771
330
- },
331
- {
332
- "epoch": 2.51,
333
- "learning_rate": 8.327112106507403e-05,
334
- "loss": 0.0011,
335
  "step": 8950
336
  },
337
  {
338
- "epoch": 2.56,
339
- "learning_rate": 8.27143212641533e-05,
340
- "loss": 0.0018,
341
- "step": 9129
342
- },
343
- {
344
- "epoch": 2.61,
345
- "learning_rate": 8.215752146323256e-05,
346
- "loss": 0.0012,
347
  "step": 9308
348
  },
349
  {
350
- "epoch": 2.66,
351
- "learning_rate": 8.160072166231181e-05,
352
- "loss": 0.0012,
353
- "step": 9487
354
- },
355
- {
356
- "epoch": 2.71,
357
- "learning_rate": 8.104392186139107e-05,
358
- "loss": 0.0012,
359
  "step": 9666
360
  },
361
  {
362
- "epoch": 2.76,
363
- "learning_rate": 8.048712206047034e-05,
364
- "loss": 0.0011,
365
- "step": 9845
366
- },
367
- {
368
- "epoch": 2.81,
369
- "learning_rate": 7.993032225954959e-05,
370
- "loss": 0.0012,
371
  "step": 10024
372
  },
373
  {
374
- "epoch": 2.86,
375
- "learning_rate": 7.937352245862885e-05,
376
- "loss": 0.0012,
377
- "step": 10203
378
- },
379
- {
380
- "epoch": 2.91,
381
- "learning_rate": 7.881672265770811e-05,
382
- "loss": 0.0012,
383
  "step": 10382
384
  },
385
- {
386
- "epoch": 2.96,
387
- "learning_rate": 7.825992285678736e-05,
388
- "loss": 0.0017,
389
- "step": 10561
390
- },
391
  {
392
  "epoch": 3.0,
393
- "eval_loss": 2.2839917619421612e-06,
394
  "eval_max_distance": 0,
395
  "eval_mean_distance": 0,
396
- "eval_runtime": 2.7296,
397
- "eval_samples_per_second": 18.318,
398
- "eval_steps_per_second": 0.733,
399
- "step": 10716
400
  },
401
  {
402
  "epoch": 3.01,
403
- "learning_rate": 7.770312305586663e-05,
404
- "loss": 0.0012,
405
- "step": 10740
406
- },
407
- {
408
- "epoch": 3.06,
409
- "learning_rate": 7.714632325494588e-05,
410
  "loss": 0.0008,
411
- "step": 10919
412
  },
413
  {
414
  "epoch": 3.11,
415
- "learning_rate": 7.658952345402514e-05,
416
- "loss": 0.0014,
417
  "step": 11098
418
  },
419
- {
420
- "epoch": 3.16,
421
- "learning_rate": 7.603272365310439e-05,
422
- "loss": 0.0009,
423
- "step": 11277
424
- },
425
  {
426
  "epoch": 3.21,
427
- "learning_rate": 7.547592385218366e-05,
428
- "loss": 0.0009,
429
  "step": 11456
430
  },
431
- {
432
- "epoch": 3.26,
433
- "learning_rate": 7.49191240512629e-05,
434
- "loss": 0.001,
435
- "step": 11635
436
- },
437
  {
438
  "epoch": 3.31,
439
- "learning_rate": 7.436232425034217e-05,
440
- "loss": 0.001,
441
  "step": 11814
442
  },
443
- {
444
- "epoch": 3.36,
445
- "learning_rate": 7.380552444942142e-05,
446
- "loss": 0.0011,
447
- "step": 11993
448
- },
449
  {
450
  "epoch": 3.41,
451
- "learning_rate": 7.324872464850068e-05,
452
- "loss": 0.001,
453
  "step": 12172
454
  },
455
- {
456
- "epoch": 3.46,
457
- "learning_rate": 7.269192484757995e-05,
458
- "loss": 0.0009,
459
- "step": 12351
460
- },
461
  {
462
  "epoch": 3.51,
463
- "learning_rate": 7.21351250466592e-05,
464
- "loss": 0.0009,
465
  "step": 12530
466
  },
467
- {
468
- "epoch": 3.56,
469
- "learning_rate": 7.157832524573846e-05,
470
- "loss": 0.0011,
471
- "step": 12709
472
- },
473
  {
474
  "epoch": 3.61,
475
- "learning_rate": 7.102152544481773e-05,
476
- "loss": 0.001,
477
  "step": 12888
478
  },
479
- {
480
- "epoch": 3.66,
481
- "learning_rate": 7.046472564389698e-05,
482
- "loss": 0.0013,
483
- "step": 13067
484
- },
485
  {
486
  "epoch": 3.71,
487
- "learning_rate": 6.990792584297624e-05,
488
- "loss": 0.0008,
489
  "step": 13246
490
  },
491
- {
492
- "epoch": 3.76,
493
- "learning_rate": 6.935112604205549e-05,
494
- "loss": 0.001,
495
- "step": 13425
496
- },
497
  {
498
  "epoch": 3.81,
499
- "learning_rate": 6.879432624113475e-05,
500
- "loss": 0.0008,
501
  "step": 13604
502
  },
503
- {
504
- "epoch": 3.86,
505
- "learning_rate": 6.823752644021402e-05,
506
- "loss": 0.0007,
507
- "step": 13783
508
- },
509
  {
510
  "epoch": 3.91,
511
- "learning_rate": 6.768072663929327e-05,
512
- "loss": 0.001,
513
  "step": 13962
514
  },
515
- {
516
- "epoch": 3.96,
517
- "learning_rate": 6.712392683837253e-05,
518
- "loss": 0.001,
519
- "step": 14141
520
- },
521
  {
522
  "epoch": 4.0,
523
- "eval_loss": 2.3551267531729536e-06,
524
  "eval_max_distance": 0,
525
  "eval_mean_distance": 0,
526
- "eval_runtime": 2.712,
527
- "eval_samples_per_second": 18.437,
528
- "eval_steps_per_second": 0.737,
529
- "step": 14288
530
  },
531
  {
532
  "epoch": 4.01,
533
- "learning_rate": 6.65671270374518e-05,
534
- "loss": 0.0016,
535
  "step": 14320
536
  },
537
- {
538
- "epoch": 4.06,
539
- "learning_rate": 6.601032723653104e-05,
540
- "loss": 0.0009,
541
- "step": 14499
542
- },
543
  {
544
  "epoch": 4.11,
545
- "learning_rate": 6.545352743561031e-05,
546
- "loss": 0.0008,
547
  "step": 14678
548
  },
549
- {
550
- "epoch": 4.16,
551
- "learning_rate": 6.489672763468957e-05,
552
- "loss": 0.0006,
553
- "step": 14857
554
- },
555
  {
556
  "epoch": 4.21,
557
- "learning_rate": 6.433992783376882e-05,
558
- "loss": 0.0007,
559
  "step": 15036
560
  },
561
- {
562
- "epoch": 4.26,
563
- "learning_rate": 6.378312803284809e-05,
564
- "loss": 0.0008,
565
- "step": 15215
566
- },
567
  {
568
  "epoch": 4.31,
569
- "learning_rate": 6.322632823192734e-05,
570
- "loss": 0.0011,
571
  "step": 15394
572
  },
573
- {
574
- "epoch": 4.36,
575
- "learning_rate": 6.26695284310066e-05,
576
- "loss": 0.0007,
577
- "step": 15573
578
- },
579
  {
580
  "epoch": 4.41,
581
- "learning_rate": 6.211272863008586e-05,
582
- "loss": 0.001,
583
  "step": 15752
584
  },
585
- {
586
- "epoch": 4.46,
587
- "learning_rate": 6.155592882916511e-05,
588
- "loss": 0.0012,
589
- "step": 15931
590
- },
591
  {
592
  "epoch": 4.51,
593
- "learning_rate": 6.099912902824437e-05,
594
- "loss": 0.0012,
595
  "step": 16110
596
  },
597
- {
598
- "epoch": 4.56,
599
- "learning_rate": 6.0442329227323634e-05,
600
- "loss": 0.0008,
601
- "step": 16289
602
- },
603
  {
604
  "epoch": 4.61,
605
- "learning_rate": 5.9885529426402885e-05,
606
- "loss": 0.0009,
607
- "step": 16468
608
- },
609
- {
610
- "epoch": 4.66,
611
- "learning_rate": 5.932872962548215e-05,
612
  "loss": 0.0006,
613
- "step": 16647
614
  },
615
  {
616
  "epoch": 4.71,
617
- "learning_rate": 5.877192982456141e-05,
618
- "loss": 0.0008,
619
  "step": 16826
620
  },
621
- {
622
- "epoch": 4.76,
623
- "learning_rate": 5.821513002364066e-05,
624
- "loss": 0.0008,
625
- "step": 17005
626
- },
627
  {
628
  "epoch": 4.81,
629
- "learning_rate": 5.7658330222719926e-05,
630
- "loss": 0.0008,
631
  "step": 17184
632
  },
633
- {
634
- "epoch": 4.86,
635
- "learning_rate": 5.7101530421799176e-05,
636
- "loss": 0.0011,
637
- "step": 17363
638
- },
639
  {
640
  "epoch": 4.91,
641
- "learning_rate": 5.654473062087844e-05,
642
- "loss": 0.0008,
643
  "step": 17542
644
  },
645
- {
646
- "epoch": 4.96,
647
- "learning_rate": 5.5987930819957704e-05,
648
- "loss": 0.0008,
649
- "step": 17721
650
- },
651
  {
652
  "epoch": 5.0,
653
- "eval_loss": 2.0127217794652097e-06,
654
  "eval_max_distance": 0,
655
  "eval_mean_distance": 0,
656
- "eval_runtime": 2.7158,
657
- "eval_samples_per_second": 18.411,
658
- "eval_steps_per_second": 0.736,
659
- "step": 17860
660
  },
661
  {
662
  "epoch": 5.01,
663
- "learning_rate": 5.5431131019036954e-05,
664
- "loss": 0.0009,
665
  "step": 17900
666
  },
667
- {
668
- "epoch": 5.06,
669
- "learning_rate": 5.487433121811622e-05,
670
- "loss": 0.0012,
671
- "step": 18079
672
- },
673
  {
674
  "epoch": 5.11,
675
- "learning_rate": 5.4317531417195474e-05,
676
  "loss": 0.0007,
677
  "step": 18258
678
  },
679
- {
680
- "epoch": 5.16,
681
- "learning_rate": 5.376073161627473e-05,
682
- "loss": 0.0009,
683
- "step": 18437
684
- },
685
  {
686
  "epoch": 5.21,
687
- "learning_rate": 5.320393181535399e-05,
688
- "loss": 0.0007,
689
  "step": 18616
690
  },
691
- {
692
- "epoch": 5.26,
693
- "learning_rate": 5.264713201443325e-05,
694
- "loss": 0.0011,
695
- "step": 18795
696
- },
697
  {
698
  "epoch": 5.31,
699
- "learning_rate": 5.20903322135125e-05,
700
  "loss": 0.0006,
701
  "step": 18974
702
  },
703
- {
704
- "epoch": 5.36,
705
- "learning_rate": 5.1533532412591766e-05,
706
- "loss": 0.0008,
707
- "step": 19153
708
- },
709
  {
710
  "epoch": 5.41,
711
- "learning_rate": 5.0976732611671016e-05,
712
- "loss": 0.0008,
713
  "step": 19332
714
  },
715
- {
716
- "epoch": 5.46,
717
- "learning_rate": 5.041993281075028e-05,
718
- "loss": 0.0007,
719
- "step": 19511
720
- },
721
  {
722
  "epoch": 5.51,
723
- "learning_rate": 4.9863133009829544e-05,
724
- "loss": 0.0006,
725
  "step": 19690
726
  },
727
- {
728
- "epoch": 5.56,
729
- "learning_rate": 4.93063332089088e-05,
730
- "loss": 0.0007,
731
- "step": 19869
732
- },
733
  {
734
  "epoch": 5.61,
735
- "learning_rate": 4.874953340798806e-05,
736
- "loss": 0.0006,
737
  "step": 20048
738
  },
739
- {
740
- "epoch": 5.66,
741
- "learning_rate": 4.8192733607067314e-05,
742
- "loss": 0.0007,
743
- "step": 20227
744
- },
745
  {
746
  "epoch": 5.71,
747
- "learning_rate": 4.763593380614658e-05,
748
- "loss": 0.0008,
749
  "step": 20406
750
  },
751
- {
752
- "epoch": 5.76,
753
- "learning_rate": 4.7079134005225835e-05,
754
- "loss": 0.0008,
755
- "step": 20585
756
- },
757
  {
758
  "epoch": 5.81,
759
- "learning_rate": 4.652233420430509e-05,
760
- "loss": 0.0008,
761
  "step": 20764
762
  },
763
- {
764
- "epoch": 5.86,
765
- "learning_rate": 4.596553440338435e-05,
766
- "loss": 0.0008,
767
- "step": 20943
768
- },
769
  {
770
  "epoch": 5.91,
771
- "learning_rate": 4.540873460246361e-05,
772
- "loss": 0.0007,
773
  "step": 21122
774
  },
775
- {
776
- "epoch": 5.96,
777
- "learning_rate": 4.485193480154287e-05,
778
- "loss": 0.0007,
779
- "step": 21301
780
- },
781
  {
782
  "epoch": 6.0,
783
- "eval_loss": 3.2845150599314366e-06,
784
  "eval_max_distance": 0,
785
  "eval_mean_distance": 0,
786
- "eval_runtime": 2.7124,
787
- "eval_samples_per_second": 18.434,
788
- "eval_steps_per_second": 0.737,
789
- "step": 21432
790
  },
791
  {
792
  "epoch": 6.01,
793
- "learning_rate": 4.4295135000622127e-05,
794
- "loss": 0.0007,
795
  "step": 21480
796
  },
797
- {
798
- "epoch": 6.06,
799
- "learning_rate": 4.3738335199701384e-05,
800
- "loss": 0.0006,
801
- "step": 21659
802
- },
803
  {
804
  "epoch": 6.11,
805
- "learning_rate": 4.318153539878064e-05,
806
  "loss": 0.0005,
807
  "step": 21838
808
  },
809
- {
810
- "epoch": 6.16,
811
- "learning_rate": 4.26247355978599e-05,
812
- "loss": 0.0007,
813
- "step": 22017
814
- },
815
  {
816
  "epoch": 6.21,
817
- "learning_rate": 4.2067935796939154e-05,
818
  "loss": 0.0006,
819
  "step": 22196
820
  },
821
- {
822
- "epoch": 6.26,
823
- "learning_rate": 4.151113599601842e-05,
824
- "loss": 0.0008,
825
- "step": 22375
826
- },
827
  {
828
  "epoch": 6.31,
829
- "learning_rate": 4.0954336195097675e-05,
830
- "loss": 0.0008,
831
  "step": 22554
832
  },
833
- {
834
- "epoch": 6.36,
835
- "learning_rate": 4.039753639417693e-05,
836
- "loss": 0.0006,
837
- "step": 22733
838
- },
839
  {
840
  "epoch": 6.41,
841
- "learning_rate": 3.984073659325619e-05,
842
- "loss": 0.0007,
843
  "step": 22912
844
  },
845
- {
846
- "epoch": 6.46,
847
- "learning_rate": 3.928393679233545e-05,
848
- "loss": 0.0007,
849
- "step": 23091
850
- },
851
  {
852
  "epoch": 6.51,
853
- "learning_rate": 3.872713699141471e-05,
854
- "loss": 0.0011,
855
  "step": 23270
856
  },
857
- {
858
- "epoch": 6.56,
859
- "learning_rate": 3.817033719049397e-05,
860
- "loss": 0.001,
861
- "step": 23449
862
- },
863
  {
864
  "epoch": 6.61,
865
- "learning_rate": 3.761353738957323e-05,
866
  "loss": 0.0007,
867
  "step": 23628
868
  },
869
  {
870
- "epoch": 6.66,
871
- "learning_rate": 3.705673758865249e-05,
872
- "loss": 0.0006,
873
- "step": 23807
874
- },
875
- {
876
- "epoch": 6.72,
877
- "learning_rate": 3.6499937787731744e-05,
878
- "loss": 0.0006,
879
  "step": 23986
880
  },
881
  {
882
- "epoch": 6.77,
883
- "learning_rate": 3.5943137986811e-05,
884
- "loss": 0.0008,
885
- "step": 24165
886
  },
887
  {
888
- "epoch": 6.82,
889
- "learning_rate": 3.538633818589026e-05,
890
- "loss": 0.0008,
891
- "step": 24344
892
  },
893
  {
894
- "epoch": 6.87,
895
- "learning_rate": 3.4829538384969515e-05,
896
- "loss": 0.0008,
897
- "step": 24523
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
898
  },
899
  {
900
- "epoch": 6.92,
901
- "learning_rate": 3.427273858404877e-05,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
902
  "loss": 0.0006,
903
- "step": 24702
904
  },
905
  {
906
- "epoch": 6.97,
907
- "learning_rate": 3.3715938783128036e-05,
908
- "loss": 0.0007,
909
- "step": 24881
910
  },
911
  {
912
- "epoch": 7.0,
913
- "eval_loss": 3.155769718432566e-06,
 
 
 
 
 
 
914
  "eval_max_distance": 0,
915
  "eval_mean_distance": 0,
916
- "eval_runtime": 2.7115,
917
- "eval_samples_per_second": 18.44,
918
- "eval_steps_per_second": 0.738,
919
- "step": 25004
920
  },
921
  {
922
- "epoch": 7.02,
923
- "learning_rate": 3.315913898220729e-05,
924
- "loss": 0.0005,
925
- "step": 25060
926
  },
927
  {
928
- "epoch": 7.07,
929
- "learning_rate": 3.260233918128655e-05,
930
- "loss": 0.0007,
931
- "step": 25239
932
  },
933
  {
934
- "epoch": 7.12,
935
- "learning_rate": 3.204553938036581e-05,
936
- "loss": 0.0006,
937
- "step": 25418
938
  },
939
  {
940
- "epoch": 7.17,
941
- "learning_rate": 3.148873957944507e-05,
942
- "loss": 0.0007,
943
- "step": 25597
944
  },
945
  {
946
- "epoch": 7.22,
947
- "learning_rate": 3.093193977852433e-05,
948
- "loss": 0.0006,
949
- "step": 25776
950
  },
951
  {
952
- "epoch": 7.27,
953
- "learning_rate": 3.0375139977603584e-05,
954
- "loss": 0.0006,
955
- "step": 25955
956
  },
957
  {
958
- "epoch": 7.32,
959
- "learning_rate": 2.981834017668284e-05,
960
  "loss": 0.0006,
961
- "step": 26134
962
  },
963
  {
964
- "epoch": 7.37,
965
- "learning_rate": 2.92615403757621e-05,
966
- "loss": 0.0007,
967
- "step": 26313
968
  },
969
  {
970
- "epoch": 7.42,
971
- "learning_rate": 2.870474057484136e-05,
972
  "loss": 0.0005,
973
- "step": 26492
974
  },
975
  {
976
- "epoch": 7.47,
977
- "learning_rate": 2.8147940773920615e-05,
978
- "loss": 0.0006,
979
- "step": 26671
980
  },
981
  {
982
- "epoch": 7.52,
983
- "learning_rate": 2.759114097299988e-05,
984
- "loss": 0.0006,
985
- "step": 26850
 
 
 
 
986
  },
987
  {
988
- "epoch": 7.57,
989
- "learning_rate": 2.7034341172079136e-05,
990
- "loss": 0.0006,
991
- "step": 27029
992
  },
993
  {
994
- "epoch": 7.62,
995
- "learning_rate": 2.6477541371158393e-05,
996
  "loss": 0.0005,
997
- "step": 27208
998
  },
999
  {
1000
- "epoch": 7.67,
1001
- "learning_rate": 2.592074157023765e-05,
1002
  "loss": 0.0004,
1003
- "step": 27387
1004
  },
1005
  {
1006
- "epoch": 7.72,
1007
- "learning_rate": 2.5363941769316914e-05,
1008
- "loss": 0.0009,
1009
- "step": 27566
1010
  },
1011
  {
1012
- "epoch": 7.77,
1013
- "learning_rate": 2.480714196839617e-05,
1014
- "loss": 0.0007,
1015
- "step": 27745
1016
  },
1017
  {
1018
- "epoch": 7.82,
1019
- "learning_rate": 2.4250342167475428e-05,
1020
- "loss": 0.0007,
1021
- "step": 27924
1022
  },
1023
  {
1024
- "epoch": 7.87,
1025
- "learning_rate": 2.3693542366554685e-05,
1026
- "loss": 0.0006,
1027
- "step": 28103
1028
  },
1029
  {
1030
- "epoch": 7.92,
1031
- "learning_rate": 2.3136742565633945e-05,
1032
- "loss": 0.0007,
1033
- "step": 28282
1034
  },
1035
  {
1036
- "epoch": 7.97,
1037
- "learning_rate": 2.2579942764713202e-05,
1038
- "loss": 0.0008,
1039
- "step": 28461
1040
  },
1041
  {
1042
- "epoch": 8.0,
1043
- "eval_loss": 1.7536920040583936e-06,
 
 
 
 
 
 
1044
  "eval_max_distance": 0,
1045
  "eval_mean_distance": 0,
1046
- "eval_runtime": 2.7014,
1047
- "eval_samples_per_second": 18.509,
1048
- "eval_steps_per_second": 0.74,
1049
- "step": 28576
1050
  },
1051
  {
1052
- "epoch": 8.02,
1053
- "learning_rate": 2.2023142963792462e-05,
1054
- "loss": 0.0009,
1055
- "step": 28640
1056
  },
1057
  {
1058
- "epoch": 8.07,
1059
- "learning_rate": 2.146634316287172e-05,
1060
- "loss": 0.0008,
1061
- "step": 28819
1062
  },
1063
  {
1064
- "epoch": 8.12,
1065
- "learning_rate": 2.090954336195098e-05,
1066
- "loss": 0.0008,
1067
- "step": 28998
1068
  },
1069
  {
1070
- "epoch": 8.17,
1071
- "learning_rate": 2.0352743561030236e-05,
1072
  "loss": 0.0005,
1073
- "step": 29177
1074
  },
1075
  {
1076
- "epoch": 8.22,
1077
- "learning_rate": 1.9795943760109493e-05,
1078
- "loss": 0.0006,
1079
- "step": 29356
1080
  },
1081
  {
1082
- "epoch": 8.27,
1083
- "learning_rate": 1.923914395918875e-05,
1084
- "loss": 0.0007,
1085
- "step": 29535
1086
  },
1087
  {
1088
- "epoch": 8.32,
1089
- "learning_rate": 1.868234415826801e-05,
1090
- "loss": 0.0008,
1091
- "step": 29714
1092
  },
1093
  {
1094
- "epoch": 8.37,
1095
- "learning_rate": 1.812554435734727e-05,
1096
- "loss": 0.0006,
1097
- "step": 29893
1098
  },
1099
  {
1100
- "epoch": 8.42,
1101
- "learning_rate": 1.7568744556426528e-05,
1102
  "loss": 0.0006,
1103
- "step": 30072
1104
  },
1105
  {
1106
- "epoch": 8.47,
1107
- "learning_rate": 1.701194475550579e-05,
1108
  "loss": 0.0005,
1109
- "step": 30251
1110
  },
1111
  {
1112
- "epoch": 8.52,
1113
- "learning_rate": 1.6455144954585045e-05,
1114
- "loss": 0.0008,
1115
- "step": 30430
 
 
 
 
1116
  },
1117
  {
1118
- "epoch": 8.57,
1119
- "learning_rate": 1.5898345153664306e-05,
1120
  "loss": 0.0004,
1121
- "step": 30609
1122
  },
1123
  {
1124
- "epoch": 8.62,
1125
- "learning_rate": 1.5341545352743563e-05,
1126
- "loss": 0.0005,
1127
- "step": 30788
1128
  },
1129
  {
1130
- "epoch": 8.67,
1131
- "learning_rate": 1.4784745551822821e-05,
1132
- "loss": 0.0006,
1133
- "step": 30967
1134
  },
1135
  {
1136
- "epoch": 8.72,
1137
- "learning_rate": 1.4227945750902078e-05,
1138
- "loss": 0.0007,
1139
- "step": 31146
1140
  },
1141
  {
1142
- "epoch": 8.77,
1143
- "learning_rate": 1.3671145949981337e-05,
1144
  "loss": 0.0005,
1145
- "step": 31325
1146
  },
1147
  {
1148
- "epoch": 8.82,
1149
- "learning_rate": 1.3114346149060594e-05,
1150
- "loss": 0.001,
1151
- "step": 31504
1152
  },
1153
  {
1154
- "epoch": 8.87,
1155
- "learning_rate": 1.2557546348139854e-05,
1156
- "loss": 0.0005,
1157
- "step": 31683
1158
  },
1159
  {
1160
- "epoch": 8.92,
1161
- "learning_rate": 1.2000746547219113e-05,
1162
  "loss": 0.0005,
1163
- "step": 31862
1164
  },
1165
  {
1166
- "epoch": 8.97,
1167
- "learning_rate": 1.144394674629837e-05,
1168
- "loss": 0.0005,
1169
- "step": 32041
1170
  },
1171
  {
1172
- "epoch": 9.0,
1173
- "eval_loss": 2.188417056459002e-06,
 
 
 
 
 
 
1174
  "eval_max_distance": 0,
1175
  "eval_mean_distance": 0,
1176
- "eval_runtime": 2.7007,
1177
- "eval_samples_per_second": 18.514,
1178
- "eval_steps_per_second": 0.741,
1179
- "step": 32148
1180
  },
1181
  {
1182
- "epoch": 9.02,
1183
- "learning_rate": 1.0887146945377628e-05,
1184
  "loss": 0.0005,
1185
- "step": 32220
1186
  },
1187
  {
1188
- "epoch": 9.07,
1189
- "learning_rate": 1.0330347144456889e-05,
1190
- "loss": 0.0005,
1191
- "step": 32399
1192
  },
1193
  {
1194
- "epoch": 9.12,
1195
- "learning_rate": 9.773547343536146e-06,
1196
  "loss": 0.0004,
1197
- "step": 32578
1198
  },
1199
  {
1200
- "epoch": 9.17,
1201
- "learning_rate": 9.216747542615404e-06,
1202
- "loss": 0.0005,
1203
- "step": 32757
1204
  },
1205
  {
1206
- "epoch": 9.22,
1207
- "learning_rate": 8.659947741694663e-06,
1208
- "loss": 0.0005,
1209
- "step": 32936
1210
  },
1211
  {
1212
- "epoch": 9.27,
1213
- "learning_rate": 8.103147940773922e-06,
1214
- "loss": 0.0008,
1215
- "step": 33115
1216
  },
1217
  {
1218
- "epoch": 9.32,
1219
- "learning_rate": 7.546348139853179e-06,
1220
- "loss": 0.0009,
1221
- "step": 33294
1222
  },
1223
  {
1224
- "epoch": 9.37,
1225
- "learning_rate": 6.989548338932438e-06,
1226
- "loss": 0.0007,
1227
- "step": 33473
1228
  },
1229
  {
1230
- "epoch": 9.42,
1231
- "learning_rate": 6.432748538011696e-06,
1232
- "loss": 0.0006,
1233
- "step": 33652
1234
  },
1235
  {
1236
- "epoch": 9.47,
1237
- "learning_rate": 5.875948737090954e-06,
1238
- "loss": 0.0005,
1239
- "step": 33831
1240
  },
1241
  {
1242
- "epoch": 9.52,
1243
- "learning_rate": 5.319148936170213e-06,
 
 
 
 
 
 
 
 
 
 
1244
  "loss": 0.0005,
1245
- "step": 34010
1246
  },
1247
  {
1248
- "epoch": 9.57,
1249
- "learning_rate": 4.762349135249472e-06,
1250
- "loss": 0.0008,
1251
- "step": 34189
1252
  },
1253
  {
1254
- "epoch": 9.62,
1255
- "learning_rate": 4.2055493343287295e-06,
1256
- "loss": 0.0006,
1257
- "step": 34368
 
 
 
 
 
 
1258
  },
1259
  {
1260
- "epoch": 9.67,
1261
- "learning_rate": 3.6487495334079886e-06,
 
 
 
 
 
 
1262
  "loss": 0.0005,
1263
- "step": 34547
1264
  },
1265
  {
1266
- "epoch": 9.72,
1267
- "learning_rate": 3.0919497324872468e-06,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1268
  "loss": 0.0005,
1269
- "step": 34726
1270
  },
1271
  {
1272
- "epoch": 9.77,
1273
- "learning_rate": 2.535149931566505e-06,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1274
  "loss": 0.0005,
1275
- "step": 34905
1276
  },
1277
  {
1278
- "epoch": 9.82,
1279
- "learning_rate": 1.9783501306457632e-06,
1280
- "loss": 0.0007,
1281
- "step": 35084
1282
  },
1283
  {
1284
- "epoch": 9.87,
1285
- "learning_rate": 1.4215503297250219e-06,
1286
- "loss": 0.0007,
1287
- "step": 35263
 
 
 
 
1288
  },
1289
  {
1290
- "epoch": 9.92,
1291
- "learning_rate": 8.647505288042802e-07,
1292
- "loss": 0.0006,
1293
- "step": 35442
1294
  },
1295
  {
1296
- "epoch": 9.97,
1297
- "learning_rate": 3.0795072788353863e-07,
1298
- "loss": 0.0007,
1299
- "step": 35621
1300
  },
1301
  {
1302
- "epoch": 10.0,
1303
- "eval_loss": 1.828535118875152e-06,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1304
  "eval_max_distance": 0,
1305
  "eval_mean_distance": 0,
1306
- "eval_runtime": 2.7105,
1307
- "eval_samples_per_second": 18.446,
1308
- "eval_steps_per_second": 0.738,
1309
- "step": 35720
1310
  },
1311
  {
1312
- "epoch": 10.0,
1313
- "step": 35720,
1314
- "total_flos": 2.1372031794806784e+16,
1315
- "train_loss": 0.001247283529092882,
1316
- "train_runtime": 2969.2819,
1317
- "train_samples_per_second": 360.875,
1318
- "train_steps_per_second": 12.03
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1319
  }
1320
  ],
1321
- "logging_steps": 179,
1322
- "max_steps": 35720,
1323
- "num_train_epochs": 10,
1324
- "save_steps": 358,
1325
- "total_flos": 2.1372031794806784e+16,
1326
  "trial_name": null,
1327
  "trial_params": null
1328
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 20.0,
5
  "eval_steps": 500,
6
+ "global_step": 71480,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
+ "learning_rate": 1.3989927252378289e-08,
14
+ "loss": 0.0035,
15
  "step": 1
16
  },
 
 
 
 
 
 
17
  {
18
  "epoch": 0.1,
19
+ "learning_rate": 5.008393956351427e-06,
20
+ "loss": 0.0009,
21
  "step": 358
22
  },
 
 
 
 
 
 
23
  {
24
  "epoch": 0.2,
25
+ "learning_rate": 1.0016787912702854e-05,
26
+ "loss": 0.001,
27
  "step": 716
28
  },
 
 
 
 
 
 
29
  {
30
  "epoch": 0.3,
31
+ "learning_rate": 1.502518186905428e-05,
32
+ "loss": 0.0008,
33
  "step": 1074
34
  },
 
 
 
 
 
 
35
  {
36
  "epoch": 0.4,
37
+ "learning_rate": 2.0033575825405708e-05,
38
+ "loss": 0.0007,
39
  "step": 1432
40
  },
 
 
 
 
 
 
41
  {
42
  "epoch": 0.5,
43
+ "learning_rate": 2.5041969781757136e-05,
44
+ "loss": 0.0007,
45
  "step": 1790
46
  },
 
 
 
 
 
 
47
  {
48
  "epoch": 0.6,
49
+ "learning_rate": 3.005036373810856e-05,
50
+ "loss": 0.0007,
51
  "step": 2148
52
  },
 
 
 
 
 
 
53
  {
54
  "epoch": 0.7,
55
+ "learning_rate": 3.505875769445999e-05,
56
+ "loss": 0.0007,
57
  "step": 2506
58
  },
 
 
 
 
 
 
59
  {
60
  "epoch": 0.8,
61
+ "learning_rate": 4.0067151650811416e-05,
62
+ "loss": 0.0006,
63
  "step": 2864
64
  },
 
 
 
 
 
 
65
  {
66
  "epoch": 0.9,
67
+ "learning_rate": 4.507554560716285e-05,
68
+ "loss": 0.0006,
69
  "step": 3222
70
  },
 
 
 
 
 
 
71
  {
72
  "epoch": 1.0,
73
+ "eval_loss": 4.419351284923323e-07,
74
  "eval_max_distance": 0,
75
  "eval_mean_distance": 0,
76
+ "eval_runtime": 2.5506,
77
+ "eval_samples_per_second": 19.603,
78
+ "eval_steps_per_second": 0.784,
79
+ "step": 3574
80
  },
81
  {
82
  "epoch": 1.0,
83
+ "learning_rate": 5.008393956351427e-05,
84
+ "loss": 0.0007,
85
  "step": 3580
86
  },
 
 
 
 
 
 
87
  {
88
  "epoch": 1.1,
89
+ "learning_rate": 5.5092333519865704e-05,
90
+ "loss": 0.0006,
91
  "step": 3938
92
  },
 
 
 
 
 
 
93
  {
94
  "epoch": 1.2,
95
+ "learning_rate": 6.010072747621712e-05,
96
+ "loss": 0.0007,
97
  "step": 4296
98
  },
 
 
 
 
 
 
99
  {
100
  "epoch": 1.3,
101
+ "learning_rate": 6.510912143256855e-05,
102
+ "loss": 0.0007,
103
  "step": 4654
104
  },
 
 
 
 
 
 
105
  {
106
  "epoch": 1.4,
107
+ "learning_rate": 7.011751538891998e-05,
108
+ "loss": 0.0006,
109
  "step": 5012
110
  },
 
 
 
 
 
 
111
  {
112
  "epoch": 1.5,
113
+ "learning_rate": 7.51259093452714e-05,
114
+ "loss": 0.0006,
115
  "step": 5370
116
  },
 
 
 
 
 
 
117
  {
118
  "epoch": 1.6,
119
+ "learning_rate": 8.013430330162283e-05,
120
+ "loss": 0.0006,
121
  "step": 5728
122
  },
 
 
 
 
 
 
123
  {
124
  "epoch": 1.7,
125
+ "learning_rate": 8.514269725797426e-05,
126
+ "loss": 0.0007,
127
  "step": 6086
128
  },
 
 
 
 
 
 
129
  {
130
  "epoch": 1.8,
131
+ "learning_rate": 9.01510912143257e-05,
132
+ "loss": 0.0006,
133
  "step": 6444
134
  },
 
 
 
 
 
 
135
  {
136
  "epoch": 1.9,
137
+ "learning_rate": 9.515948517067713e-05,
138
+ "loss": 0.0005,
139
  "step": 6802
140
  },
 
 
 
 
 
 
141
  {
142
  "epoch": 2.0,
143
+ "eval_loss": 7.104979999894567e-07,
144
  "eval_max_distance": 0,
145
  "eval_mean_distance": 0,
146
+ "eval_runtime": 2.5602,
147
+ "eval_samples_per_second": 19.53,
148
+ "eval_steps_per_second": 0.781,
149
+ "step": 7148
150
  },
151
  {
152
  "epoch": 2.0,
153
+ "learning_rate": 9.99813467636635e-05,
154
+ "loss": 0.0008,
155
  "step": 7160
156
  },
 
 
 
 
 
 
157
  {
158
  "epoch": 2.1,
159
+ "learning_rate": 9.942485854629112e-05,
160
+ "loss": 0.0006,
161
  "step": 7518
162
  },
 
 
 
 
 
 
163
  {
164
  "epoch": 2.2,
165
+ "learning_rate": 9.886837032891874e-05,
166
+ "loss": 0.0007,
167
  "step": 7876
168
  },
169
  {
170
+ "epoch": 2.3,
171
+ "learning_rate": 9.831188211154637e-05,
172
+ "loss": 0.0006,
 
 
 
 
 
 
173
  "step": 8234
174
  },
175
  {
176
+ "epoch": 2.4,
177
+ "learning_rate": 9.775539389417398e-05,
178
+ "loss": 0.0006,
 
 
 
 
 
 
179
  "step": 8592
180
  },
181
  {
182
+ "epoch": 2.5,
183
+ "learning_rate": 9.71989056768016e-05,
184
+ "loss": 0.0007,
 
 
 
 
 
 
185
  "step": 8950
186
  },
187
  {
188
+ "epoch": 2.6,
189
+ "learning_rate": 9.664241745942921e-05,
190
+ "loss": 0.0007,
 
 
 
 
 
 
191
  "step": 9308
192
  },
193
  {
194
+ "epoch": 2.7,
195
+ "learning_rate": 9.608592924205684e-05,
196
+ "loss": 0.0006,
 
 
 
 
 
 
197
  "step": 9666
198
  },
199
  {
200
+ "epoch": 2.8,
201
+ "learning_rate": 9.552944102468445e-05,
202
+ "loss": 0.0007,
 
 
 
 
 
 
203
  "step": 10024
204
  },
205
  {
206
+ "epoch": 2.9,
207
+ "learning_rate": 9.497295280731208e-05,
208
+ "loss": 0.0007,
 
 
 
 
 
 
209
  "step": 10382
210
  },
 
 
 
 
 
 
211
  {
212
  "epoch": 3.0,
213
+ "eval_loss": 6.182289325806778e-06,
214
  "eval_max_distance": 0,
215
  "eval_mean_distance": 0,
216
+ "eval_runtime": 2.5755,
217
+ "eval_samples_per_second": 19.414,
218
+ "eval_steps_per_second": 0.777,
219
+ "step": 10722
220
  },
221
  {
222
  "epoch": 3.01,
223
+ "learning_rate": 9.441646458993969e-05,
 
 
 
 
 
 
224
  "loss": 0.0008,
225
+ "step": 10740
226
  },
227
  {
228
  "epoch": 3.11,
229
+ "learning_rate": 9.385997637256732e-05,
230
+ "loss": 0.0006,
231
  "step": 11098
232
  },
 
 
 
 
 
 
233
  {
234
  "epoch": 3.21,
235
+ "learning_rate": 9.330348815519493e-05,
236
+ "loss": 0.0006,
237
  "step": 11456
238
  },
 
 
 
 
 
 
239
  {
240
  "epoch": 3.31,
241
+ "learning_rate": 9.274699993782255e-05,
242
+ "loss": 0.0006,
243
  "step": 11814
244
  },
 
 
 
 
 
 
245
  {
246
  "epoch": 3.41,
247
+ "learning_rate": 9.219051172045016e-05,
248
+ "loss": 0.0006,
249
  "step": 12172
250
  },
 
 
 
 
 
 
251
  {
252
  "epoch": 3.51,
253
+ "learning_rate": 9.163402350307779e-05,
254
+ "loss": 0.0005,
255
  "step": 12530
256
  },
 
 
 
 
 
 
257
  {
258
  "epoch": 3.61,
259
+ "learning_rate": 9.10775352857054e-05,
260
+ "loss": 0.0005,
261
  "step": 12888
262
  },
 
 
 
 
 
 
263
  {
264
  "epoch": 3.71,
265
+ "learning_rate": 9.052104706833303e-05,
266
+ "loss": 0.0007,
267
  "step": 13246
268
  },
 
 
 
 
 
 
269
  {
270
  "epoch": 3.81,
271
+ "learning_rate": 8.996455885096065e-05,
272
+ "loss": 0.0006,
273
  "step": 13604
274
  },
 
 
 
 
 
 
275
  {
276
  "epoch": 3.91,
277
+ "learning_rate": 8.940807063358827e-05,
278
+ "loss": 0.0008,
279
  "step": 13962
280
  },
 
 
 
 
 
 
281
  {
282
  "epoch": 4.0,
283
+ "eval_loss": 2.7662836146191694e-06,
284
  "eval_max_distance": 0,
285
  "eval_mean_distance": 0,
286
+ "eval_runtime": 2.5596,
287
+ "eval_samples_per_second": 19.535,
288
+ "eval_steps_per_second": 0.781,
289
+ "step": 14296
290
  },
291
  {
292
  "epoch": 4.01,
293
+ "learning_rate": 8.885158241621589e-05,
294
+ "loss": 0.0007,
295
  "step": 14320
296
  },
 
 
 
 
 
 
297
  {
298
  "epoch": 4.11,
299
+ "learning_rate": 8.82950941988435e-05,
300
+ "loss": 0.0005,
301
  "step": 14678
302
  },
 
 
 
 
 
 
303
  {
304
  "epoch": 4.21,
305
+ "learning_rate": 8.773860598147112e-05,
306
+ "loss": 0.0006,
307
  "step": 15036
308
  },
 
 
 
 
 
 
309
  {
310
  "epoch": 4.31,
311
+ "learning_rate": 8.718211776409875e-05,
312
+ "loss": 0.0006,
313
  "step": 15394
314
  },
 
 
 
 
 
 
315
  {
316
  "epoch": 4.41,
317
+ "learning_rate": 8.662562954672636e-05,
318
+ "loss": 0.0007,
319
  "step": 15752
320
  },
 
 
 
 
 
 
321
  {
322
  "epoch": 4.51,
323
+ "learning_rate": 8.606914132935397e-05,
324
+ "loss": 0.0006,
325
  "step": 16110
326
  },
 
 
 
 
 
 
327
  {
328
  "epoch": 4.61,
329
+ "learning_rate": 8.55126531119816e-05,
 
 
 
 
 
 
330
  "loss": 0.0006,
331
+ "step": 16468
332
  },
333
  {
334
  "epoch": 4.71,
335
+ "learning_rate": 8.495616489460922e-05,
336
+ "loss": 0.0006,
337
  "step": 16826
338
  },
 
 
 
 
 
 
339
  {
340
  "epoch": 4.81,
341
+ "learning_rate": 8.439967667723684e-05,
342
+ "loss": 0.0007,
343
  "step": 17184
344
  },
 
 
 
 
 
 
345
  {
346
  "epoch": 4.91,
347
+ "learning_rate": 8.384318845986446e-05,
348
+ "loss": 0.0006,
349
  "step": 17542
350
  },
 
 
 
 
 
 
351
  {
352
  "epoch": 5.0,
353
+ "eval_loss": 9.952082109521143e-07,
354
  "eval_max_distance": 0,
355
  "eval_mean_distance": 0,
356
+ "eval_runtime": 2.5707,
357
+ "eval_samples_per_second": 19.45,
358
+ "eval_steps_per_second": 0.778,
359
+ "step": 17870
360
  },
361
  {
362
  "epoch": 5.01,
363
+ "learning_rate": 8.328670024249207e-05,
364
+ "loss": 0.0006,
365
  "step": 17900
366
  },
 
 
 
 
 
 
367
  {
368
  "epoch": 5.11,
369
+ "learning_rate": 8.273021202511969e-05,
370
  "loss": 0.0007,
371
  "step": 18258
372
  },
 
 
 
 
 
 
373
  {
374
  "epoch": 5.21,
375
+ "learning_rate": 8.217372380774731e-05,
376
+ "loss": 0.0006,
377
  "step": 18616
378
  },
 
 
 
 
 
 
379
  {
380
  "epoch": 5.31,
381
+ "learning_rate": 8.161723559037493e-05,
382
  "loss": 0.0006,
383
  "step": 18974
384
  },
 
 
 
 
 
 
385
  {
386
  "epoch": 5.41,
387
+ "learning_rate": 8.106074737300256e-05,
388
+ "loss": 0.0006,
389
  "step": 19332
390
  },
 
 
 
 
 
 
391
  {
392
  "epoch": 5.51,
393
+ "learning_rate": 8.050425915563017e-05,
394
+ "loss": 0.0005,
395
  "step": 19690
396
  },
 
 
 
 
 
 
397
  {
398
  "epoch": 5.61,
399
+ "learning_rate": 7.99477709382578e-05,
400
+ "loss": 0.0005,
401
  "step": 20048
402
  },
 
 
 
 
 
 
403
  {
404
  "epoch": 5.71,
405
+ "learning_rate": 7.939128272088541e-05,
406
+ "loss": 0.0005,
407
  "step": 20406
408
  },
 
 
 
 
 
 
409
  {
410
  "epoch": 5.81,
411
+ "learning_rate": 7.883479450351303e-05,
412
+ "loss": 0.0005,
413
  "step": 20764
414
  },
 
 
 
 
 
 
415
  {
416
  "epoch": 5.91,
417
+ "learning_rate": 7.827830628614064e-05,
418
+ "loss": 0.0006,
419
  "step": 21122
420
  },
 
 
 
 
 
 
421
  {
422
  "epoch": 6.0,
423
+ "eval_loss": 4.1270132555837336e-07,
424
  "eval_max_distance": 0,
425
  "eval_mean_distance": 0,
426
+ "eval_runtime": 2.5607,
427
+ "eval_samples_per_second": 19.526,
428
+ "eval_steps_per_second": 0.781,
429
+ "step": 21444
430
  },
431
  {
432
  "epoch": 6.01,
433
+ "learning_rate": 7.772181806876827e-05,
434
+ "loss": 0.0005,
435
  "step": 21480
436
  },
 
 
 
 
 
 
437
  {
438
  "epoch": 6.11,
439
+ "learning_rate": 7.716532985139588e-05,
440
  "loss": 0.0005,
441
  "step": 21838
442
  },
 
 
 
 
 
 
443
  {
444
  "epoch": 6.21,
445
+ "learning_rate": 7.660884163402351e-05,
446
  "loss": 0.0006,
447
  "step": 22196
448
  },
 
 
 
 
 
 
449
  {
450
  "epoch": 6.31,
451
+ "learning_rate": 7.605235341665112e-05,
452
+ "loss": 0.0007,
453
  "step": 22554
454
  },
 
 
 
 
 
 
455
  {
456
  "epoch": 6.41,
457
+ "learning_rate": 7.549586519927875e-05,
458
+ "loss": 0.0004,
459
  "step": 22912
460
  },
 
 
 
 
 
 
461
  {
462
  "epoch": 6.51,
463
+ "learning_rate": 7.493937698190637e-05,
464
+ "loss": 0.0005,
465
  "step": 23270
466
  },
 
 
 
 
 
 
467
  {
468
  "epoch": 6.61,
469
+ "learning_rate": 7.438288876453398e-05,
470
  "loss": 0.0007,
471
  "step": 23628
472
  },
473
  {
474
+ "epoch": 6.71,
475
+ "learning_rate": 7.38264005471616e-05,
476
+ "loss": 0.0004,
 
 
 
 
 
 
477
  "step": 23986
478
  },
479
  {
480
+ "epoch": 6.81,
481
+ "learning_rate": 7.326991232978922e-05,
482
+ "loss": 0.0005,
483
+ "step": 24344
484
  },
485
  {
486
+ "epoch": 6.91,
487
+ "learning_rate": 7.271342411241684e-05,
488
+ "loss": 0.0005,
489
+ "step": 24702
490
  },
491
  {
492
+ "epoch": 7.0,
493
+ "eval_loss": 3.9943967067301855e-07,
494
+ "eval_max_distance": 0,
495
+ "eval_mean_distance": 0,
496
+ "eval_runtime": 2.5544,
497
+ "eval_samples_per_second": 19.574,
498
+ "eval_steps_per_second": 0.783,
499
+ "step": 25018
500
+ },
501
+ {
502
+ "epoch": 7.01,
503
+ "learning_rate": 7.215693589504446e-05,
504
+ "loss": 0.0005,
505
+ "step": 25060
506
+ },
507
+ {
508
+ "epoch": 7.11,
509
+ "learning_rate": 7.160044767767208e-05,
510
+ "loss": 0.0005,
511
+ "step": 25418
512
+ },
513
+ {
514
+ "epoch": 7.21,
515
+ "learning_rate": 7.10439594602997e-05,
516
+ "loss": 0.0005,
517
+ "step": 25776
518
  },
519
  {
520
+ "epoch": 7.31,
521
+ "learning_rate": 7.048747124292732e-05,
522
+ "loss": 0.0005,
523
+ "step": 26134
524
+ },
525
+ {
526
+ "epoch": 7.41,
527
+ "learning_rate": 6.993098302555494e-05,
528
+ "loss": 0.0005,
529
+ "step": 26492
530
+ },
531
+ {
532
+ "epoch": 7.51,
533
+ "learning_rate": 6.937449480818255e-05,
534
+ "loss": 0.0004,
535
+ "step": 26850
536
+ },
537
+ {
538
+ "epoch": 7.61,
539
+ "learning_rate": 6.881800659081018e-05,
540
+ "loss": 0.0005,
541
+ "step": 27208
542
+ },
543
+ {
544
+ "epoch": 7.71,
545
+ "learning_rate": 6.826151837343779e-05,
546
  "loss": 0.0006,
547
+ "step": 27566
548
  },
549
  {
550
+ "epoch": 7.81,
551
+ "learning_rate": 6.770503015606542e-05,
552
+ "loss": 0.0004,
553
+ "step": 27924
554
  },
555
  {
556
+ "epoch": 7.91,
557
+ "learning_rate": 6.714854193869303e-05,
558
+ "loss": 0.0005,
559
+ "step": 28282
560
+ },
561
+ {
562
+ "epoch": 8.0,
563
+ "eval_loss": 2.2114301145848003e-07,
564
  "eval_max_distance": 0,
565
  "eval_mean_distance": 0,
566
+ "eval_runtime": 2.5528,
567
+ "eval_samples_per_second": 19.587,
568
+ "eval_steps_per_second": 0.783,
569
+ "step": 28592
570
  },
571
  {
572
+ "epoch": 8.01,
573
+ "learning_rate": 6.659205372132066e-05,
574
+ "loss": 0.0004,
575
+ "step": 28640
576
  },
577
  {
578
+ "epoch": 8.11,
579
+ "learning_rate": 6.603556550394828e-05,
580
+ "loss": 0.0005,
581
+ "step": 28998
582
  },
583
  {
584
+ "epoch": 8.21,
585
+ "learning_rate": 6.547907728657589e-05,
586
+ "loss": 0.0004,
587
+ "step": 29356
588
  },
589
  {
590
+ "epoch": 8.31,
591
+ "learning_rate": 6.49225890692035e-05,
592
+ "loss": 0.0005,
593
+ "step": 29714
594
  },
595
  {
596
+ "epoch": 8.41,
597
+ "learning_rate": 6.436610085183112e-05,
598
+ "loss": 0.0004,
599
+ "step": 30072
600
  },
601
  {
602
+ "epoch": 8.51,
603
+ "learning_rate": 6.380961263445875e-05,
604
+ "loss": 0.0005,
605
+ "step": 30430
606
  },
607
  {
608
+ "epoch": 8.61,
609
+ "learning_rate": 6.325312441708636e-05,
610
  "loss": 0.0006,
611
+ "step": 30788
612
  },
613
  {
614
+ "epoch": 8.71,
615
+ "learning_rate": 6.269663619971399e-05,
616
+ "loss": 0.0005,
617
+ "step": 31146
618
  },
619
  {
620
+ "epoch": 8.81,
621
+ "learning_rate": 6.21401479823416e-05,
622
  "loss": 0.0005,
623
+ "step": 31504
624
  },
625
  {
626
+ "epoch": 8.91,
627
+ "learning_rate": 6.158365976496923e-05,
628
+ "loss": 0.0005,
629
+ "step": 31862
630
  },
631
  {
632
+ "epoch": 9.0,
633
+ "eval_loss": 1.2389272114887717e-06,
634
+ "eval_max_distance": 0,
635
+ "eval_mean_distance": 0,
636
+ "eval_runtime": 2.565,
637
+ "eval_samples_per_second": 19.493,
638
+ "eval_steps_per_second": 0.78,
639
+ "step": 32166
640
  },
641
  {
642
+ "epoch": 9.02,
643
+ "learning_rate": 6.102717154759684e-05,
644
+ "loss": 0.0004,
645
+ "step": 32220
646
  },
647
  {
648
+ "epoch": 9.12,
649
+ "learning_rate": 6.0470683330224465e-05,
650
  "loss": 0.0005,
651
+ "step": 32578
652
  },
653
  {
654
+ "epoch": 9.22,
655
+ "learning_rate": 5.991419511285208e-05,
656
  "loss": 0.0004,
657
+ "step": 32936
658
  },
659
  {
660
+ "epoch": 9.32,
661
+ "learning_rate": 5.93577068954797e-05,
662
+ "loss": 0.0004,
663
+ "step": 33294
664
  },
665
  {
666
+ "epoch": 9.42,
667
+ "learning_rate": 5.8801218678107315e-05,
668
+ "loss": 0.0005,
669
+ "step": 33652
670
  },
671
  {
672
+ "epoch": 9.52,
673
+ "learning_rate": 5.824473046073494e-05,
674
+ "loss": 0.0004,
675
+ "step": 34010
676
  },
677
  {
678
+ "epoch": 9.62,
679
+ "learning_rate": 5.7688242243362557e-05,
680
+ "loss": 0.0005,
681
+ "step": 34368
682
  },
683
  {
684
+ "epoch": 9.72,
685
+ "learning_rate": 5.713175402599018e-05,
686
+ "loss": 0.0005,
687
+ "step": 34726
688
  },
689
  {
690
+ "epoch": 9.82,
691
+ "learning_rate": 5.657526580861779e-05,
692
+ "loss": 0.0005,
693
+ "step": 35084
694
  },
695
  {
696
+ "epoch": 9.92,
697
+ "learning_rate": 5.601877759124542e-05,
698
+ "loss": 0.0004,
699
+ "step": 35442
700
+ },
701
+ {
702
+ "epoch": 10.0,
703
+ "eval_loss": 5.33479237674328e-07,
704
  "eval_max_distance": 0,
705
  "eval_mean_distance": 0,
706
+ "eval_runtime": 2.5624,
707
+ "eval_samples_per_second": 19.513,
708
+ "eval_steps_per_second": 0.781,
709
+ "step": 35740
710
  },
711
  {
712
+ "epoch": 10.02,
713
+ "learning_rate": 5.5462289373873034e-05,
714
+ "loss": 0.0005,
715
+ "step": 35800
716
  },
717
  {
718
+ "epoch": 10.12,
719
+ "learning_rate": 5.4905801156500655e-05,
720
+ "loss": 0.0004,
721
+ "step": 36158
722
  },
723
  {
724
+ "epoch": 10.22,
725
+ "learning_rate": 5.434931293912827e-05,
726
+ "loss": 0.0005,
727
+ "step": 36516
728
  },
729
  {
730
+ "epoch": 10.32,
731
+ "learning_rate": 5.37928247217559e-05,
732
  "loss": 0.0005,
733
+ "step": 36874
734
  },
735
  {
736
+ "epoch": 10.42,
737
+ "learning_rate": 5.323633650438351e-05,
738
+ "loss": 0.0005,
739
+ "step": 37232
740
  },
741
  {
742
+ "epoch": 10.52,
743
+ "learning_rate": 5.267984828701113e-05,
744
+ "loss": 0.0004,
745
+ "step": 37590
746
  },
747
  {
748
+ "epoch": 10.62,
749
+ "learning_rate": 5.2123360069638746e-05,
750
+ "loss": 0.0005,
751
+ "step": 37948
752
  },
753
  {
754
+ "epoch": 10.72,
755
+ "learning_rate": 5.1566871852266374e-05,
756
+ "loss": 0.0004,
757
+ "step": 38306
758
  },
759
  {
760
+ "epoch": 10.82,
761
+ "learning_rate": 5.101038363489399e-05,
762
  "loss": 0.0006,
763
+ "step": 38664
764
  },
765
  {
766
+ "epoch": 10.92,
767
+ "learning_rate": 5.0453895417521616e-05,
768
  "loss": 0.0005,
769
+ "step": 39022
770
  },
771
  {
772
+ "epoch": 11.0,
773
+ "eval_loss": 3.300738171674311e-07,
774
+ "eval_max_distance": 0,
775
+ "eval_mean_distance": 0,
776
+ "eval_runtime": 2.5519,
777
+ "eval_samples_per_second": 19.593,
778
+ "eval_steps_per_second": 0.784,
779
+ "step": 39314
780
  },
781
  {
782
+ "epoch": 11.02,
783
+ "learning_rate": 4.9897407200149224e-05,
784
  "loss": 0.0004,
785
+ "step": 39380
786
  },
787
  {
788
+ "epoch": 11.12,
789
+ "learning_rate": 4.9340918982776845e-05,
790
+ "loss": 0.0004,
791
+ "step": 39738
792
  },
793
  {
794
+ "epoch": 11.22,
795
+ "learning_rate": 4.8784430765404466e-05,
796
+ "loss": 0.0005,
797
+ "step": 40096
798
  },
799
  {
800
+ "epoch": 11.32,
801
+ "learning_rate": 4.8227942548032087e-05,
802
+ "loss": 0.0004,
803
+ "step": 40454
804
  },
805
  {
806
+ "epoch": 11.42,
807
+ "learning_rate": 4.76714543306597e-05,
808
  "loss": 0.0005,
809
+ "step": 40812
810
  },
811
  {
812
+ "epoch": 11.52,
813
+ "learning_rate": 4.711496611328732e-05,
814
+ "loss": 0.0004,
815
+ "step": 41170
816
  },
817
  {
818
+ "epoch": 11.62,
819
+ "learning_rate": 4.655847789591494e-05,
820
+ "loss": 0.0004,
821
+ "step": 41528
822
  },
823
  {
824
+ "epoch": 11.72,
825
+ "learning_rate": 4.6001989678542564e-05,
826
  "loss": 0.0005,
827
+ "step": 41886
828
  },
829
  {
830
+ "epoch": 11.82,
831
+ "learning_rate": 4.544550146117018e-05,
832
+ "loss": 0.0004,
833
+ "step": 42244
834
  },
835
  {
836
+ "epoch": 11.92,
837
+ "learning_rate": 4.48890132437978e-05,
838
+ "loss": 0.0004,
839
+ "step": 42602
840
+ },
841
+ {
842
+ "epoch": 12.0,
843
+ "eval_loss": 3.465572717686882e-07,
844
  "eval_max_distance": 0,
845
  "eval_mean_distance": 0,
846
+ "eval_runtime": 2.5595,
847
+ "eval_samples_per_second": 19.535,
848
+ "eval_steps_per_second": 0.781,
849
+ "step": 42888
850
  },
851
  {
852
+ "epoch": 12.02,
853
+ "learning_rate": 4.433252502642542e-05,
854
  "loss": 0.0005,
855
+ "step": 42960
856
  },
857
  {
858
+ "epoch": 12.12,
859
+ "learning_rate": 4.377603680905304e-05,
860
+ "loss": 0.0004,
861
+ "step": 43318
862
  },
863
  {
864
+ "epoch": 12.22,
865
+ "learning_rate": 4.3219548591680655e-05,
866
  "loss": 0.0004,
867
+ "step": 43676
868
  },
869
  {
870
+ "epoch": 12.32,
871
+ "learning_rate": 4.2663060374308276e-05,
872
+ "loss": 0.0004,
873
+ "step": 44034
874
  },
875
  {
876
+ "epoch": 12.42,
877
+ "learning_rate": 4.21065721569359e-05,
878
+ "loss": 0.0004,
879
+ "step": 44392
880
  },
881
  {
882
+ "epoch": 12.52,
883
+ "learning_rate": 4.155008393956352e-05,
884
+ "loss": 0.0004,
885
+ "step": 44750
886
  },
887
  {
888
+ "epoch": 12.62,
889
+ "learning_rate": 4.099359572219114e-05,
890
+ "loss": 0.0003,
891
+ "step": 45108
892
  },
893
  {
894
+ "epoch": 12.72,
895
+ "learning_rate": 4.0437107504818753e-05,
896
+ "loss": 0.0004,
897
+ "step": 45466
898
  },
899
  {
900
+ "epoch": 12.82,
901
+ "learning_rate": 3.9880619287446375e-05,
902
+ "loss": 0.0004,
903
+ "step": 45824
904
  },
905
  {
906
+ "epoch": 12.92,
907
+ "learning_rate": 3.9324131070073996e-05,
908
+ "loss": 0.0004,
909
+ "step": 46182
910
  },
911
  {
912
+ "epoch": 13.0,
913
+ "eval_loss": 2.838061732290953e-07,
914
+ "eval_max_distance": 0,
915
+ "eval_mean_distance": 0,
916
+ "eval_runtime": 2.5426,
917
+ "eval_samples_per_second": 19.665,
918
+ "eval_steps_per_second": 0.787,
919
+ "step": 46462
920
+ },
921
+ {
922
+ "epoch": 13.02,
923
+ "learning_rate": 3.8767642852701617e-05,
924
  "loss": 0.0005,
925
+ "step": 46540
926
  },
927
  {
928
+ "epoch": 13.12,
929
+ "learning_rate": 3.821115463532923e-05,
930
+ "loss": 0.0004,
931
+ "step": 46898
932
  },
933
  {
934
+ "epoch": 13.22,
935
+ "learning_rate": 3.765466641795685e-05,
936
+ "loss": 0.0004,
937
+ "step": 47256
938
+ },
939
+ {
940
+ "epoch": 13.32,
941
+ "learning_rate": 3.709817820058447e-05,
942
+ "loss": 0.0004,
943
+ "step": 47614
944
  },
945
  {
946
+ "epoch": 13.42,
947
+ "learning_rate": 3.6541689983212094e-05,
948
+ "loss": 0.0003,
949
+ "step": 47972
950
+ },
951
+ {
952
+ "epoch": 13.52,
953
+ "learning_rate": 3.598520176583971e-05,
954
  "loss": 0.0005,
955
+ "step": 48330
956
  },
957
  {
958
+ "epoch": 13.62,
959
+ "learning_rate": 3.542871354846733e-05,
960
+ "loss": 0.0004,
961
+ "step": 48688
962
+ },
963
+ {
964
+ "epoch": 13.72,
965
+ "learning_rate": 3.487222533109495e-05,
966
+ "loss": 0.0004,
967
+ "step": 49046
968
+ },
969
+ {
970
+ "epoch": 13.82,
971
+ "learning_rate": 3.431573711372257e-05,
972
+ "loss": 0.0004,
973
+ "step": 49404
974
+ },
975
+ {
976
+ "epoch": 13.92,
977
+ "learning_rate": 3.3759248896350185e-05,
978
+ "loss": 0.0004,
979
+ "step": 49762
980
+ },
981
+ {
982
+ "epoch": 14.0,
983
+ "eval_loss": 2.3388430747672828e-07,
984
+ "eval_max_distance": 0,
985
+ "eval_mean_distance": 0,
986
+ "eval_runtime": 2.5618,
987
+ "eval_samples_per_second": 19.517,
988
+ "eval_steps_per_second": 0.781,
989
+ "step": 50036
990
+ },
991
+ {
992
+ "epoch": 14.02,
993
+ "learning_rate": 3.3202760678977806e-05,
994
+ "loss": 0.0004,
995
+ "step": 50120
996
+ },
997
+ {
998
+ "epoch": 14.12,
999
+ "learning_rate": 3.264627246160543e-05,
1000
+ "loss": 0.0004,
1001
+ "step": 50478
1002
+ },
1003
+ {
1004
+ "epoch": 14.22,
1005
+ "learning_rate": 3.208978424423304e-05,
1006
  "loss": 0.0005,
1007
+ "step": 50836
1008
  },
1009
  {
1010
+ "epoch": 14.32,
1011
+ "learning_rate": 3.153329602686066e-05,
1012
+ "loss": 0.0003,
1013
+ "step": 51194
1014
+ },
1015
+ {
1016
+ "epoch": 14.42,
1017
+ "learning_rate": 3.097680780948828e-05,
1018
+ "loss": 0.0004,
1019
+ "step": 51552
1020
+ },
1021
+ {
1022
+ "epoch": 14.52,
1023
+ "learning_rate": 3.0420319592115898e-05,
1024
+ "loss": 0.0004,
1025
+ "step": 51910
1026
+ },
1027
+ {
1028
+ "epoch": 14.62,
1029
+ "learning_rate": 2.986383137474352e-05,
1030
+ "loss": 0.0004,
1031
+ "step": 52268
1032
+ },
1033
+ {
1034
+ "epoch": 14.72,
1035
+ "learning_rate": 2.9307343157371136e-05,
1036
+ "loss": 0.0003,
1037
+ "step": 52626
1038
+ },
1039
+ {
1040
+ "epoch": 14.82,
1041
+ "learning_rate": 2.8750854939998757e-05,
1042
+ "loss": 0.0004,
1043
+ "step": 52984
1044
+ },
1045
+ {
1046
+ "epoch": 14.93,
1047
+ "learning_rate": 2.8194366722626375e-05,
1048
+ "loss": 0.0004,
1049
+ "step": 53342
1050
+ },
1051
+ {
1052
+ "epoch": 15.0,
1053
+ "eval_loss": 3.5529723163563176e-07,
1054
+ "eval_max_distance": 0,
1055
+ "eval_mean_distance": 0,
1056
+ "eval_runtime": 2.5603,
1057
+ "eval_samples_per_second": 19.529,
1058
+ "eval_steps_per_second": 0.781,
1059
+ "step": 53610
1060
+ },
1061
+ {
1062
+ "epoch": 15.03,
1063
+ "learning_rate": 2.7637878505253996e-05,
1064
+ "loss": 0.0004,
1065
+ "step": 53700
1066
+ },
1067
+ {
1068
+ "epoch": 15.13,
1069
+ "learning_rate": 2.7081390287881614e-05,
1070
+ "loss": 0.0004,
1071
+ "step": 54058
1072
+ },
1073
+ {
1074
+ "epoch": 15.23,
1075
+ "learning_rate": 2.6524902070509235e-05,
1076
+ "loss": 0.0003,
1077
+ "step": 54416
1078
+ },
1079
+ {
1080
+ "epoch": 15.33,
1081
+ "learning_rate": 2.5968413853136852e-05,
1082
+ "loss": 0.0003,
1083
+ "step": 54774
1084
+ },
1085
+ {
1086
+ "epoch": 15.43,
1087
+ "learning_rate": 2.5411925635764473e-05,
1088
+ "loss": 0.0003,
1089
+ "step": 55132
1090
+ },
1091
+ {
1092
+ "epoch": 15.53,
1093
+ "learning_rate": 2.485543741839209e-05,
1094
+ "loss": 0.0003,
1095
+ "step": 55490
1096
+ },
1097
+ {
1098
+ "epoch": 15.63,
1099
+ "learning_rate": 2.4298949201019712e-05,
1100
+ "loss": 0.0004,
1101
+ "step": 55848
1102
+ },
1103
+ {
1104
+ "epoch": 15.73,
1105
+ "learning_rate": 2.374246098364733e-05,
1106
+ "loss": 0.0004,
1107
+ "step": 56206
1108
+ },
1109
+ {
1110
+ "epoch": 15.83,
1111
+ "learning_rate": 2.318597276627495e-05,
1112
  "loss": 0.0005,
1113
+ "step": 56564
1114
  },
1115
  {
1116
+ "epoch": 15.93,
1117
+ "learning_rate": 2.2629484548902568e-05,
1118
+ "loss": 0.0004,
1119
+ "step": 56922
1120
  },
1121
  {
1122
+ "epoch": 16.0,
1123
+ "eval_loss": 1.6842723482568545e-07,
1124
+ "eval_max_distance": 0,
1125
+ "eval_mean_distance": 0,
1126
+ "eval_runtime": 2.5459,
1127
+ "eval_samples_per_second": 19.64,
1128
+ "eval_steps_per_second": 0.786,
1129
+ "step": 57184
1130
  },
1131
  {
1132
+ "epoch": 16.03,
1133
+ "learning_rate": 2.207299633153019e-05,
1134
+ "loss": 0.0004,
1135
+ "step": 57280
1136
  },
1137
  {
1138
+ "epoch": 16.13,
1139
+ "learning_rate": 2.1516508114157807e-05,
1140
+ "loss": 0.0004,
1141
+ "step": 57638
1142
  },
1143
  {
1144
+ "epoch": 16.23,
1145
+ "learning_rate": 2.0960019896785428e-05,
1146
+ "loss": 0.0003,
1147
+ "step": 57996
1148
+ },
1149
+ {
1150
+ "epoch": 16.33,
1151
+ "learning_rate": 2.0403531679413045e-05,
1152
+ "loss": 0.0004,
1153
+ "step": 58354
1154
+ },
1155
+ {
1156
+ "epoch": 16.43,
1157
+ "learning_rate": 1.9847043462040666e-05,
1158
+ "loss": 0.0003,
1159
+ "step": 58712
1160
+ },
1161
+ {
1162
+ "epoch": 16.53,
1163
+ "learning_rate": 1.9290555244668284e-05,
1164
+ "loss": 0.0003,
1165
+ "step": 59070
1166
+ },
1167
+ {
1168
+ "epoch": 16.63,
1169
+ "learning_rate": 1.8734067027295905e-05,
1170
+ "loss": 0.0003,
1171
+ "step": 59428
1172
+ },
1173
+ {
1174
+ "epoch": 16.73,
1175
+ "learning_rate": 1.8177578809923522e-05,
1176
+ "loss": 0.0004,
1177
+ "step": 59786
1178
+ },
1179
+ {
1180
+ "epoch": 16.83,
1181
+ "learning_rate": 1.7621090592551143e-05,
1182
+ "loss": 0.0003,
1183
+ "step": 60144
1184
+ },
1185
+ {
1186
+ "epoch": 16.93,
1187
+ "learning_rate": 1.706460237517876e-05,
1188
+ "loss": 0.0003,
1189
+ "step": 60502
1190
+ },
1191
+ {
1192
+ "epoch": 17.0,
1193
+ "eval_loss": 1.6692392534878309e-07,
1194
  "eval_max_distance": 0,
1195
  "eval_mean_distance": 0,
1196
+ "eval_runtime": 2.572,
1197
+ "eval_samples_per_second": 19.44,
1198
+ "eval_steps_per_second": 0.778,
1199
+ "step": 60758
1200
  },
1201
  {
1202
+ "epoch": 17.03,
1203
+ "learning_rate": 1.6508114157806382e-05,
1204
+ "loss": 0.0004,
1205
+ "step": 60860
1206
+ },
1207
+ {
1208
+ "epoch": 17.13,
1209
+ "learning_rate": 1.5951625940434e-05,
1210
+ "loss": 0.0003,
1211
+ "step": 61218
1212
+ },
1213
+ {
1214
+ "epoch": 17.23,
1215
+ "learning_rate": 1.5395137723061617e-05,
1216
+ "loss": 0.0003,
1217
+ "step": 61576
1218
+ },
1219
+ {
1220
+ "epoch": 17.33,
1221
+ "learning_rate": 1.4838649505689237e-05,
1222
+ "loss": 0.0004,
1223
+ "step": 61934
1224
+ },
1225
+ {
1226
+ "epoch": 17.43,
1227
+ "learning_rate": 1.4282161288316856e-05,
1228
+ "loss": 0.0003,
1229
+ "step": 62292
1230
+ },
1231
+ {
1232
+ "epoch": 17.53,
1233
+ "learning_rate": 1.3725673070944475e-05,
1234
+ "loss": 0.0004,
1235
+ "step": 62650
1236
+ },
1237
+ {
1238
+ "epoch": 17.63,
1239
+ "learning_rate": 1.3169184853572095e-05,
1240
+ "loss": 0.0003,
1241
+ "step": 63008
1242
+ },
1243
+ {
1244
+ "epoch": 17.73,
1245
+ "learning_rate": 1.2612696636199714e-05,
1246
+ "loss": 0.0003,
1247
+ "step": 63366
1248
+ },
1249
+ {
1250
+ "epoch": 17.83,
1251
+ "learning_rate": 1.2056208418827333e-05,
1252
+ "loss": 0.0004,
1253
+ "step": 63724
1254
+ },
1255
+ {
1256
+ "epoch": 17.93,
1257
+ "learning_rate": 1.1499720201454953e-05,
1258
+ "loss": 0.0003,
1259
+ "step": 64082
1260
+ },
1261
+ {
1262
+ "epoch": 18.0,
1263
+ "eval_loss": 1.2040422348036373e-07,
1264
+ "eval_max_distance": 0,
1265
+ "eval_mean_distance": 0,
1266
+ "eval_runtime": 2.5484,
1267
+ "eval_samples_per_second": 19.62,
1268
+ "eval_steps_per_second": 0.785,
1269
+ "step": 64332
1270
+ },
1271
+ {
1272
+ "epoch": 18.03,
1273
+ "learning_rate": 1.0943231984082572e-05,
1274
+ "loss": 0.0004,
1275
+ "step": 64440
1276
+ },
1277
+ {
1278
+ "epoch": 18.13,
1279
+ "learning_rate": 1.0386743766710191e-05,
1280
+ "loss": 0.0004,
1281
+ "step": 64798
1282
+ },
1283
+ {
1284
+ "epoch": 18.23,
1285
+ "learning_rate": 9.83025554933781e-06,
1286
+ "loss": 0.0004,
1287
+ "step": 65156
1288
+ },
1289
+ {
1290
+ "epoch": 18.33,
1291
+ "learning_rate": 9.27376733196543e-06,
1292
+ "loss": 0.0003,
1293
+ "step": 65514
1294
+ },
1295
+ {
1296
+ "epoch": 18.43,
1297
+ "learning_rate": 8.717279114593049e-06,
1298
+ "loss": 0.0004,
1299
+ "step": 65872
1300
+ },
1301
+ {
1302
+ "epoch": 18.53,
1303
+ "learning_rate": 8.160790897220668e-06,
1304
+ "loss": 0.0003,
1305
+ "step": 66230
1306
+ },
1307
+ {
1308
+ "epoch": 18.63,
1309
+ "learning_rate": 7.604302679848288e-06,
1310
+ "loss": 0.0003,
1311
+ "step": 66588
1312
+ },
1313
+ {
1314
+ "epoch": 18.73,
1315
+ "learning_rate": 7.047814462475906e-06,
1316
+ "loss": 0.0004,
1317
+ "step": 66946
1318
+ },
1319
+ {
1320
+ "epoch": 18.83,
1321
+ "learning_rate": 6.4913262451035254e-06,
1322
+ "loss": 0.0004,
1323
+ "step": 67304
1324
+ },
1325
+ {
1326
+ "epoch": 18.93,
1327
+ "learning_rate": 5.934838027731145e-06,
1328
+ "loss": 0.0003,
1329
+ "step": 67662
1330
+ },
1331
+ {
1332
+ "epoch": 19.0,
1333
+ "eval_loss": 1.5021944932414044e-07,
1334
+ "eval_max_distance": 0,
1335
+ "eval_mean_distance": 0,
1336
+ "eval_runtime": 2.5611,
1337
+ "eval_samples_per_second": 19.523,
1338
+ "eval_steps_per_second": 0.781,
1339
+ "step": 67906
1340
+ },
1341
+ {
1342
+ "epoch": 19.03,
1343
+ "learning_rate": 5.378349810358764e-06,
1344
+ "loss": 0.0004,
1345
+ "step": 68020
1346
+ },
1347
+ {
1348
+ "epoch": 19.13,
1349
+ "learning_rate": 4.821861592986383e-06,
1350
+ "loss": 0.0003,
1351
+ "step": 68378
1352
+ },
1353
+ {
1354
+ "epoch": 19.23,
1355
+ "learning_rate": 4.265373375614003e-06,
1356
+ "loss": 0.0003,
1357
+ "step": 68736
1358
+ },
1359
+ {
1360
+ "epoch": 19.33,
1361
+ "learning_rate": 3.708885158241622e-06,
1362
+ "loss": 0.0004,
1363
+ "step": 69094
1364
+ },
1365
+ {
1366
+ "epoch": 19.43,
1367
+ "learning_rate": 3.152396940869241e-06,
1368
+ "loss": 0.0003,
1369
+ "step": 69452
1370
+ },
1371
+ {
1372
+ "epoch": 19.53,
1373
+ "learning_rate": 2.59590872349686e-06,
1374
+ "loss": 0.0004,
1375
+ "step": 69810
1376
+ },
1377
+ {
1378
+ "epoch": 19.63,
1379
+ "learning_rate": 2.0394205061244795e-06,
1380
+ "loss": 0.0004,
1381
+ "step": 70168
1382
+ },
1383
+ {
1384
+ "epoch": 19.73,
1385
+ "learning_rate": 1.4829322887520986e-06,
1386
+ "loss": 0.0005,
1387
+ "step": 70526
1388
+ },
1389
+ {
1390
+ "epoch": 19.83,
1391
+ "learning_rate": 9.264440713797177e-07,
1392
+ "loss": 0.0003,
1393
+ "step": 70884
1394
+ },
1395
+ {
1396
+ "epoch": 19.93,
1397
+ "learning_rate": 3.6995585400733695e-07,
1398
+ "loss": 0.0004,
1399
+ "step": 71242
1400
+ },
1401
+ {
1402
+ "epoch": 20.0,
1403
+ "eval_loss": 1.4148230320643052e-07,
1404
+ "eval_max_distance": 0,
1405
+ "eval_mean_distance": 0,
1406
+ "eval_runtime": 2.545,
1407
+ "eval_samples_per_second": 19.646,
1408
+ "eval_steps_per_second": 0.786,
1409
+ "step": 71480
1410
+ },
1411
+ {
1412
+ "epoch": 20.0,
1413
+ "step": 71480,
1414
+ "total_flos": 4.280579565218611e+16,
1415
+ "train_loss": 0.0004841739220773227,
1416
+ "train_runtime": 6809.3357,
1417
+ "train_samples_per_second": 314.85,
1418
+ "train_steps_per_second": 10.497
1419
  }
1420
  ],
1421
+ "logging_steps": 358,
1422
+ "max_steps": 71480,
1423
+ "num_train_epochs": 20,
1424
+ "save_steps": 715,
1425
+ "total_flos": 4.280579565218611e+16,
1426
  "trial_name": null,
1427
  "trial_params": null
1428
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:68bef9cf8ef55bfd4fe65bd1c9417e82a2b93577caa732329b9bbd2c854e5ba1
3
  size 4091
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4fad6d5ebb012fd5dfec0383c6b518298f290367ffb6e59deb09b8d7588a4e59
3
  size 4091