alexue4 committed on
Commit
6d4fe45
1 Parent(s): 8168262

End of training

Browse files
Files changed (4) hide show
  1. README.md +7 -17
  2. pytorch_model.bin +1 -1
  3. trainer_state.json +835 -935
  4. training_args.bin +1 -1
README.md CHANGED
@@ -15,7 +15,7 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  This model is a fine-tuned version of [alexue4/text-normalization-ru-new](https://huggingface.co/alexue4/text-normalization-ru-new) on the None dataset.
17
  It achieves the following results on the evaluation set:
18
- - Loss: 0.0366
19
  - Mean Distance: 0
20
  - Max Distance: 8
21
 
@@ -43,27 +43,17 @@ The following hyperparameters were used during training:
43
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
44
  - lr_scheduler_type: linear
45
  - lr_scheduler_warmup_ratio: 0.1
46
- - num_epochs: 15
47
 
48
  ### Training results
49
 
50
  | Training Loss | Epoch | Step | Validation Loss | Mean Distance | Max Distance |
51
  |:-------------:|:-----:|:------:|:---------------:|:-------------:|:------------:|
52
- | 0.0052 | 1.0 | 22916 | 0.0271 | 0 | 9 |
53
- | 0.0051 | 2.0 | 45832 | 0.0261 | 0 | 8 |
54
- | 0.0043 | 3.0 | 68748 | 0.0313 | 0 | 8 |
55
- | 0.0041 | 4.0 | 91664 | 0.0278 | 0 | 10 |
56
- | 0.0037 | 5.0 | 114580 | 0.0280 | 0 | 8 |
57
- | 0.0032 | 6.0 | 137496 | 0.0288 | 0 | 8 |
58
- | 0.003 | 7.0 | 160412 | 0.0308 | 0 | 8 |
59
- | 0.0025 | 8.0 | 183328 | 0.0305 | 0 | 8 |
60
- | 0.0025 | 9.0 | 206244 | 0.0303 | 0 | 8 |
61
- | 0.0023 | 10.0 | 229160 | 0.0341 | 0 | 8 |
62
- | 0.0022 | 11.0 | 252076 | 0.0329 | 0 | 8 |
63
- | 0.0019 | 12.0 | 274992 | 0.0336 | 0 | 8 |
64
- | 0.002 | 13.0 | 297908 | 0.0358 | 0 | 8 |
65
- | 0.0018 | 14.0 | 320824 | 0.0355 | 0 | 8 |
66
- | 0.0019 | 15.0 | 343740 | 0.0366 | 0 | 8 |
67
 
68
 
69
  ### Framework versions
 
15
 
16
  This model is a fine-tuned version of [alexue4/text-normalization-ru-new](https://huggingface.co/alexue4/text-normalization-ru-new) on the None dataset.
17
  It achieves the following results on the evaluation set:
18
+ - Loss: 0.0279
19
  - Mean Distance: 0
20
  - Max Distance: 8
21
 
 
43
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
44
  - lr_scheduler_type: linear
45
  - lr_scheduler_warmup_ratio: 0.1
46
+ - num_epochs: 5
47
 
48
  ### Training results
49
 
50
  | Training Loss | Epoch | Step | Validation Loss | Mean Distance | Max Distance |
51
  |:-------------:|:-----:|:------:|:---------------:|:-------------:|:------------:|
52
+ | 0.0024 | 1.0 | 22994 | 0.0264 | 0 | 8 |
53
+ | 0.0022 | 2.0 | 45988 | 0.0259 | 0 | 8 |
54
+ | 0.0019 | 3.0 | 68982 | 0.0292 | 0 | 8 |
55
+ | 0.0016 | 4.0 | 91976 | 0.0281 | 0 | 8 |
56
+ | 0.0016 | 5.0 | 114970 | 0.0279 | 0 | 8 |
 
 
 
 
 
 
 
 
 
 
57
 
58
 
59
  ### Framework versions
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f02dda551ed1f056d9fed08e40df3447ff8597cf88883f5f1ca2067d54133a61
3
  size 258643461
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02a57c7242c971b094f9debfa498d92a23843f114d3e18a09c850885d9c933eb
3
  size 258643461
trainer_state.json CHANGED
@@ -1,1378 +1,1278 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 15.0,
5
  "eval_steps": 500,
6
- "global_step": 343740,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
- "learning_rate": 2.909175539652063e-09,
14
- "loss": 0.0085,
15
  "step": 1
16
  },
17
  {
18
- "epoch": 0.08,
19
- "learning_rate": 5.000872752661896e-06,
20
- "loss": 0.0066,
21
- "step": 1719
22
- },
23
- {
24
- "epoch": 0.15,
25
- "learning_rate": 1.0001745505323792e-05,
26
- "loss": 0.0061,
27
- "step": 3438
28
- },
29
- {
30
- "epoch": 0.23,
31
- "learning_rate": 1.5002618257985687e-05,
32
- "loss": 0.0059,
33
- "step": 5157
34
  },
35
  {
36
- "epoch": 0.3,
37
- "learning_rate": 2.0003491010647585e-05,
38
- "loss": 0.0056,
39
- "step": 6876
40
  },
41
  {
42
- "epoch": 0.38,
43
- "learning_rate": 2.500436376330948e-05,
44
- "loss": 0.0052,
45
- "step": 8595
46
  },
47
  {
48
- "epoch": 0.45,
49
- "learning_rate": 3.0005236515971374e-05,
50
- "loss": 0.0052,
51
- "step": 10314
52
  },
53
  {
54
- "epoch": 0.53,
55
- "learning_rate": 3.500610926863327e-05,
56
- "loss": 0.0049,
57
- "step": 12033
58
  },
59
  {
60
- "epoch": 0.6,
61
- "learning_rate": 4.000698202129517e-05,
62
- "loss": 0.0049,
63
- "step": 13752
64
  },
65
  {
66
- "epoch": 0.68,
67
- "learning_rate": 4.5007854773957064e-05,
68
- "loss": 0.0052,
69
- "step": 15471
70
  },
71
  {
72
- "epoch": 0.75,
73
- "learning_rate": 5.000872752661896e-05,
74
- "loss": 0.0051,
75
- "step": 17190
76
  },
77
  {
78
- "epoch": 0.83,
79
- "learning_rate": 5.500960027928086e-05,
80
- "loss": 0.0052,
81
- "step": 18909
82
  },
83
  {
84
- "epoch": 0.9,
85
- "learning_rate": 6.001047303194275e-05,
86
- "loss": 0.0048,
87
- "step": 20628
88
  },
89
  {
90
- "epoch": 0.98,
91
- "learning_rate": 6.501134578460465e-05,
92
- "loss": 0.0052,
93
- "step": 22347
94
  },
95
  {
96
- "epoch": 1.0,
97
- "eval_loss": 0.027106985449790955,
98
- "eval_max_distance": 9,
99
- "eval_mean_distance": 0,
100
- "eval_runtime": 14.4629,
101
- "eval_samples_per_second": 17.355,
102
- "eval_steps_per_second": 1.175,
103
- "step": 22916
104
  },
105
  {
106
- "epoch": 1.05,
107
- "learning_rate": 7.001221853726654e-05,
108
- "loss": 0.0045,
109
- "step": 24066
110
  },
111
  {
112
- "epoch": 1.13,
113
- "learning_rate": 7.501309128992844e-05,
114
- "loss": 0.0044,
115
- "step": 25785
116
  },
117
  {
118
- "epoch": 1.2,
119
- "learning_rate": 8.001396404259034e-05,
120
- "loss": 0.0048,
121
- "step": 27504
122
  },
123
  {
124
- "epoch": 1.28,
125
- "learning_rate": 8.501483679525223e-05,
126
- "loss": 0.0045,
127
- "step": 29223
128
  },
129
  {
130
- "epoch": 1.35,
131
- "learning_rate": 9.001570954791413e-05,
132
- "loss": 0.0044,
133
- "step": 30942
134
  },
135
  {
136
- "epoch": 1.43,
137
- "learning_rate": 9.501658230057602e-05,
138
- "loss": 0.0047,
139
- "step": 32661
140
  },
141
  {
142
- "epoch": 1.5,
143
- "learning_rate": 9.999806054964024e-05,
144
- "loss": 0.005,
145
- "step": 34380
146
  },
147
  {
148
- "epoch": 1.58,
149
- "learning_rate": 9.944240802156669e-05,
150
- "loss": 0.0049,
151
- "step": 36099
152
  },
153
  {
154
- "epoch": 1.65,
155
- "learning_rate": 9.888675549349314e-05,
156
- "loss": 0.0053,
157
- "step": 37818
158
  },
159
  {
160
- "epoch": 1.73,
161
- "learning_rate": 9.83311029654196e-05,
162
- "loss": 0.0047,
163
- "step": 39537
164
  },
165
  {
166
- "epoch": 1.8,
167
- "learning_rate": 9.777545043734606e-05,
168
- "loss": 0.005,
169
- "step": 41256
170
  },
171
  {
172
- "epoch": 1.88,
173
- "learning_rate": 9.721979790927251e-05,
174
- "loss": 0.0048,
175
- "step": 42975
176
  },
177
  {
178
- "epoch": 1.95,
179
- "learning_rate": 9.666414538119898e-05,
180
- "loss": 0.0051,
181
- "step": 44694
182
  },
183
  {
184
- "epoch": 2.0,
185
- "eval_loss": 0.026126669719815254,
186
- "eval_max_distance": 8,
187
- "eval_mean_distance": 0,
188
- "eval_runtime": 13.5883,
189
- "eval_samples_per_second": 18.472,
190
- "eval_steps_per_second": 1.251,
191
- "step": 45832
192
  },
193
  {
194
- "epoch": 2.03,
195
- "learning_rate": 9.610849285312543e-05,
196
- "loss": 0.0047,
197
- "step": 46413
198
  },
199
  {
200
- "epoch": 2.1,
201
- "learning_rate": 9.555284032505189e-05,
202
- "loss": 0.004,
203
- "step": 48132
204
  },
205
  {
206
- "epoch": 2.18,
207
- "learning_rate": 9.499718779697834e-05,
208
- "loss": 0.004,
209
- "step": 49851
210
  },
211
  {
212
- "epoch": 2.25,
213
- "learning_rate": 9.44415352689048e-05,
214
- "loss": 0.0042,
215
- "step": 51570
216
  },
217
  {
218
- "epoch": 2.33,
219
- "learning_rate": 9.388588274083125e-05,
220
- "loss": 0.004,
221
- "step": 53289
222
  },
223
  {
224
- "epoch": 2.4,
225
- "learning_rate": 9.333023021275771e-05,
226
- "loss": 0.0043,
227
- "step": 55008
228
  },
229
  {
230
- "epoch": 2.48,
231
- "learning_rate": 9.277457768468416e-05,
232
- "loss": 0.0042,
233
- "step": 56727
234
  },
235
  {
236
- "epoch": 2.55,
237
- "learning_rate": 9.221892515661063e-05,
238
- "loss": 0.004,
239
- "step": 58446
240
  },
241
  {
242
- "epoch": 2.63,
243
- "learning_rate": 9.166327262853708e-05,
244
- "loss": 0.0045,
245
- "step": 60165
246
  },
247
  {
248
- "epoch": 2.7,
249
- "learning_rate": 9.110762010046352e-05,
250
- "loss": 0.0044,
251
- "step": 61884
252
  },
253
  {
254
- "epoch": 2.78,
255
- "learning_rate": 9.055196757238999e-05,
256
- "loss": 0.0044,
257
- "step": 63603
258
  },
259
  {
260
- "epoch": 2.85,
261
- "learning_rate": 8.999631504431645e-05,
262
- "loss": 0.0044,
263
- "step": 65322
264
  },
265
  {
266
- "epoch": 2.93,
267
- "learning_rate": 8.94406625162429e-05,
268
- "loss": 0.0043,
269
- "step": 67041
270
  },
271
  {
272
- "epoch": 3.0,
273
- "eval_loss": 0.03130079433321953,
274
  "eval_max_distance": 8,
275
  "eval_mean_distance": 0,
276
- "eval_runtime": 13.4308,
277
- "eval_samples_per_second": 18.688,
278
- "eval_steps_per_second": 1.266,
279
- "step": 68748
280
- },
281
- {
282
- "epoch": 3.0,
283
- "learning_rate": 8.888500998816935e-05,
284
- "loss": 0.0046,
285
- "step": 68760
286
  },
287
  {
288
- "epoch": 3.08,
289
- "learning_rate": 8.832935746009581e-05,
290
- "loss": 0.0036,
291
- "step": 70479
292
- },
293
- {
294
- "epoch": 3.15,
295
- "learning_rate": 8.777370493202228e-05,
296
- "loss": 0.0036,
297
- "step": 72198
298
  },
299
  {
300
- "epoch": 3.23,
301
- "learning_rate": 8.721805240394872e-05,
302
- "loss": 0.0038,
303
- "step": 73917
304
  },
305
  {
306
- "epoch": 3.3,
307
- "learning_rate": 8.666239987587517e-05,
308
- "loss": 0.0036,
309
- "step": 75636
310
  },
311
  {
312
- "epoch": 3.38,
313
- "learning_rate": 8.610674734780164e-05,
314
- "loss": 0.0038,
315
- "step": 77355
316
  },
317
  {
318
- "epoch": 3.45,
319
- "learning_rate": 8.55510948197281e-05,
320
- "loss": 0.0038,
321
- "step": 79074
322
  },
323
  {
324
- "epoch": 3.53,
325
- "learning_rate": 8.499544229165455e-05,
326
- "loss": 0.0038,
327
- "step": 80793
328
  },
329
  {
330
- "epoch": 3.6,
331
- "learning_rate": 8.4439789763581e-05,
332
- "loss": 0.004,
333
- "step": 82512
334
  },
335
  {
336
- "epoch": 3.68,
337
- "learning_rate": 8.388413723550746e-05,
338
- "loss": 0.0037,
339
- "step": 84231
340
  },
341
  {
342
- "epoch": 3.75,
343
- "learning_rate": 8.332848470743392e-05,
344
- "loss": 0.0038,
345
- "step": 85950
346
  },
347
  {
348
- "epoch": 3.83,
349
- "learning_rate": 8.277283217936037e-05,
350
- "loss": 0.0039,
351
- "step": 87669
352
  },
353
  {
354
- "epoch": 3.9,
355
- "learning_rate": 8.221717965128682e-05,
356
- "loss": 0.0039,
357
- "step": 89388
358
  },
359
  {
360
- "epoch": 3.98,
361
- "learning_rate": 8.166152712321329e-05,
362
- "loss": 0.0041,
363
- "step": 91107
364
  },
365
  {
366
- "epoch": 4.0,
367
- "eval_loss": 0.02780107595026493,
368
- "eval_max_distance": 10,
369
- "eval_mean_distance": 0,
370
- "eval_runtime": 13.351,
371
- "eval_samples_per_second": 18.8,
372
- "eval_steps_per_second": 1.273,
373
- "step": 91664
374
  },
375
  {
376
- "epoch": 4.05,
377
- "learning_rate": 8.110587459513974e-05,
378
- "loss": 0.0037,
379
- "step": 92826
380
  },
381
  {
382
- "epoch": 4.13,
383
- "learning_rate": 8.05502220670662e-05,
384
- "loss": 0.0032,
385
- "step": 94545
386
  },
387
  {
388
- "epoch": 4.2,
389
- "learning_rate": 7.999456953899266e-05,
390
- "loss": 0.0034,
391
- "step": 96264
392
  },
393
  {
394
- "epoch": 4.28,
395
- "learning_rate": 7.943891701091911e-05,
396
- "loss": 0.0034,
397
- "step": 97983
398
  },
399
  {
400
- "epoch": 4.35,
401
- "learning_rate": 7.888326448284556e-05,
402
- "loss": 0.0035,
403
- "step": 99702
404
  },
405
  {
406
- "epoch": 4.43,
407
- "learning_rate": 7.832761195477202e-05,
408
- "loss": 0.0034,
409
- "step": 101421
410
  },
411
  {
412
- "epoch": 4.5,
413
- "learning_rate": 7.777195942669849e-05,
414
- "loss": 0.0036,
415
- "step": 103140
416
  },
417
  {
418
- "epoch": 4.58,
419
- "learning_rate": 7.721630689862494e-05,
420
- "loss": 0.0035,
421
- "step": 104859
422
  },
423
  {
424
- "epoch": 4.65,
425
- "learning_rate": 7.666065437055139e-05,
426
- "loss": 0.0034,
427
- "step": 106578
428
  },
429
  {
430
- "epoch": 4.73,
431
- "learning_rate": 7.610500184247783e-05,
432
- "loss": 0.0034,
433
- "step": 108297
434
  },
435
  {
436
- "epoch": 4.8,
437
- "learning_rate": 7.554934931440431e-05,
438
- "loss": 0.0036,
439
- "step": 110016
440
  },
441
  {
442
- "epoch": 4.88,
443
- "learning_rate": 7.499369678633076e-05,
444
- "loss": 0.0034,
445
- "step": 111735
446
  },
447
  {
448
- "epoch": 4.95,
449
- "learning_rate": 7.443804425825721e-05,
450
- "loss": 0.0037,
451
- "step": 113454
452
  },
453
  {
454
- "epoch": 5.0,
455
- "eval_loss": 0.028013188391923904,
456
- "eval_max_distance": 8,
457
- "eval_mean_distance": 0,
458
- "eval_runtime": 13.4584,
459
- "eval_samples_per_second": 18.65,
460
- "eval_steps_per_second": 1.263,
461
- "step": 114580
462
  },
463
  {
464
- "epoch": 5.03,
465
- "learning_rate": 7.388239173018366e-05,
466
- "loss": 0.0033,
467
- "step": 115173
468
  },
469
  {
470
- "epoch": 5.1,
471
- "learning_rate": 7.332673920211012e-05,
472
- "loss": 0.0031,
473
- "step": 116892
474
  },
475
  {
476
- "epoch": 5.18,
477
- "learning_rate": 7.277108667403659e-05,
478
- "loss": 0.0031,
479
- "step": 118611
480
  },
481
  {
482
- "epoch": 5.25,
483
- "learning_rate": 7.221543414596303e-05,
484
- "loss": 0.0032,
485
- "step": 120330
486
  },
487
  {
488
- "epoch": 5.33,
489
- "learning_rate": 7.16597816178895e-05,
490
- "loss": 0.0031,
491
- "step": 122049
492
  },
493
  {
494
- "epoch": 5.4,
495
- "learning_rate": 7.110412908981595e-05,
496
- "loss": 0.0032,
497
- "step": 123768
498
  },
499
  {
500
- "epoch": 5.48,
501
- "learning_rate": 7.054847656174241e-05,
502
- "loss": 0.0031,
503
- "step": 125487
504
  },
505
  {
506
- "epoch": 5.55,
507
- "learning_rate": 6.999282403366886e-05,
508
- "loss": 0.0032,
509
- "step": 127206
510
  },
511
  {
512
- "epoch": 5.63,
513
- "learning_rate": 6.943717150559532e-05,
514
- "loss": 0.0032,
515
- "step": 128925
516
  },
517
  {
518
- "epoch": 5.7,
519
- "learning_rate": 6.888151897752177e-05,
520
- "loss": 0.0032,
521
- "step": 130644
522
  },
523
  {
524
- "epoch": 5.78,
525
- "learning_rate": 6.832586644944823e-05,
526
- "loss": 0.0031,
527
- "step": 132363
528
  },
529
  {
530
- "epoch": 5.85,
531
- "learning_rate": 6.777021392137468e-05,
532
- "loss": 0.0031,
533
- "step": 134082
534
  },
535
  {
536
- "epoch": 5.93,
537
- "learning_rate": 6.721456139330115e-05,
538
- "loss": 0.0032,
539
- "step": 135801
540
  },
541
  {
542
- "epoch": 6.0,
543
- "eval_loss": 0.028835317119956017,
544
  "eval_max_distance": 8,
545
  "eval_mean_distance": 0,
546
- "eval_runtime": 13.4137,
547
- "eval_samples_per_second": 18.712,
548
- "eval_steps_per_second": 1.267,
549
- "step": 137496
550
- },
551
- {
552
- "epoch": 6.0,
553
- "learning_rate": 6.66589088652276e-05,
554
- "loss": 0.0033,
555
- "step": 137520
556
- },
557
- {
558
- "epoch": 6.08,
559
- "learning_rate": 6.610325633715405e-05,
560
- "loss": 0.0028,
561
- "step": 139239
562
  },
563
  {
564
- "epoch": 6.15,
565
- "learning_rate": 6.554760380908051e-05,
566
- "loss": 0.0026,
567
- "step": 140958
568
  },
569
  {
570
- "epoch": 6.23,
571
- "learning_rate": 6.499195128100697e-05,
572
- "loss": 0.0027,
573
- "step": 142677
574
  },
575
  {
576
- "epoch": 6.3,
577
- "learning_rate": 6.443629875293342e-05,
578
- "loss": 0.0029,
579
- "step": 144396
580
  },
581
  {
582
- "epoch": 6.38,
583
- "learning_rate": 6.388064622485987e-05,
584
- "loss": 0.0029,
585
- "step": 146115
586
  },
587
  {
588
- "epoch": 6.45,
589
- "learning_rate": 6.332499369678633e-05,
590
- "loss": 0.0029,
591
- "step": 147834
592
  },
593
  {
594
- "epoch": 6.53,
595
- "learning_rate": 6.27693411687128e-05,
596
- "loss": 0.0028,
597
- "step": 149553
598
  },
599
  {
600
- "epoch": 6.6,
601
- "learning_rate": 6.221368864063925e-05,
602
- "loss": 0.0029,
603
- "step": 151272
604
  },
605
  {
606
- "epoch": 6.68,
607
- "learning_rate": 6.16580361125657e-05,
608
- "loss": 0.0029,
609
- "step": 152991
610
  },
611
  {
612
- "epoch": 6.75,
613
- "learning_rate": 6.110238358449216e-05,
614
- "loss": 0.0029,
615
- "step": 154710
616
  },
617
  {
618
- "epoch": 6.83,
619
- "learning_rate": 6.0546731056418614e-05,
620
- "loss": 0.0028,
621
- "step": 156429
622
  },
623
  {
624
- "epoch": 6.9,
625
- "learning_rate": 5.999107852834507e-05,
626
- "loss": 0.0029,
627
- "step": 158148
628
  },
629
  {
630
- "epoch": 6.98,
631
- "learning_rate": 5.943542600027152e-05,
632
- "loss": 0.003,
633
- "step": 159867
634
  },
635
  {
636
- "epoch": 7.0,
637
- "eval_loss": 0.030847659334540367,
638
- "eval_max_distance": 8,
639
- "eval_mean_distance": 0,
640
- "eval_runtime": 13.4895,
641
- "eval_samples_per_second": 18.607,
642
- "eval_steps_per_second": 1.26,
643
- "step": 160412
644
  },
645
  {
646
- "epoch": 7.05,
647
- "learning_rate": 5.887977347219798e-05,
648
- "loss": 0.0027,
649
- "step": 161586
650
  },
651
  {
652
- "epoch": 7.13,
653
- "learning_rate": 5.832412094412444e-05,
654
- "loss": 0.0025,
655
- "step": 163305
656
  },
657
  {
658
- "epoch": 7.2,
659
- "learning_rate": 5.7768468416050895e-05,
660
- "loss": 0.0026,
661
- "step": 165024
662
  },
663
  {
664
- "epoch": 7.28,
665
- "learning_rate": 5.7212815887977344e-05,
666
- "loss": 0.0027,
667
- "step": 166743
668
  },
669
  {
670
- "epoch": 7.35,
671
- "learning_rate": 5.665716335990381e-05,
672
- "loss": 0.0025,
673
- "step": 168462
674
  },
675
  {
676
- "epoch": 7.43,
677
- "learning_rate": 5.610151083183026e-05,
678
- "loss": 0.0026,
679
- "step": 170181
680
  },
681
  {
682
- "epoch": 7.5,
683
- "learning_rate": 5.554585830375671e-05,
684
- "loss": 0.0026,
685
- "step": 171900
686
  },
687
  {
688
- "epoch": 7.58,
689
- "learning_rate": 5.499020577568318e-05,
690
- "loss": 0.0026,
691
- "step": 173619
692
  },
693
  {
694
- "epoch": 7.65,
695
- "learning_rate": 5.443455324760963e-05,
696
- "loss": 0.0026,
697
- "step": 175338
698
  },
699
  {
700
- "epoch": 7.73,
701
- "learning_rate": 5.387890071953609e-05,
702
- "loss": 0.0025,
703
- "step": 177057
704
  },
705
  {
706
- "epoch": 7.8,
707
- "learning_rate": 5.332324819146254e-05,
708
- "loss": 0.0027,
709
- "step": 178776
710
  },
711
  {
712
- "epoch": 7.88,
713
- "learning_rate": 5.2767595663389e-05,
714
- "loss": 0.0028,
715
- "step": 180495
716
  },
717
  {
718
- "epoch": 7.95,
719
- "learning_rate": 5.2211943135315456e-05,
720
- "loss": 0.0025,
721
- "step": 182214
722
  },
723
  {
724
- "epoch": 8.0,
725
- "eval_loss": 0.03048335202038288,
726
- "eval_max_distance": 8,
727
- "eval_mean_distance": 0,
728
- "eval_runtime": 13.5077,
729
- "eval_samples_per_second": 18.582,
730
- "eval_steps_per_second": 1.259,
731
- "step": 183328
732
  },
733
  {
734
- "epoch": 8.03,
735
- "learning_rate": 5.1656290607241906e-05,
736
- "loss": 0.0026,
737
- "step": 183933
738
  },
739
  {
740
- "epoch": 8.1,
741
- "learning_rate": 5.110063807916836e-05,
742
- "loss": 0.0023,
743
- "step": 185652
744
  },
745
  {
746
- "epoch": 8.18,
747
- "learning_rate": 5.0544985551094825e-05,
748
- "loss": 0.0024,
749
- "step": 187371
750
  },
751
  {
752
- "epoch": 8.25,
753
- "learning_rate": 4.998933302302128e-05,
754
- "loss": 0.0023,
755
- "step": 189090
756
  },
757
  {
758
- "epoch": 8.33,
759
- "learning_rate": 4.943368049494773e-05,
760
- "loss": 0.0024,
761
- "step": 190809
762
  },
763
  {
764
- "epoch": 8.4,
765
- "learning_rate": 4.887802796687419e-05,
766
- "loss": 0.0024,
767
- "step": 192528
768
  },
769
  {
770
- "epoch": 8.48,
771
- "learning_rate": 4.832237543880065e-05,
772
- "loss": 0.0025,
773
- "step": 194247
774
  },
775
  {
776
- "epoch": 8.55,
777
- "learning_rate": 4.77667229107271e-05,
778
- "loss": 0.0024,
779
- "step": 195966
780
  },
781
  {
782
- "epoch": 8.63,
783
- "learning_rate": 4.721107038265356e-05,
784
- "loss": 0.0026,
785
- "step": 197685
786
  },
787
  {
788
- "epoch": 8.7,
789
- "learning_rate": 4.665541785458001e-05,
790
- "loss": 0.0023,
791
- "step": 199404
792
  },
793
  {
794
- "epoch": 8.78,
795
- "learning_rate": 4.6099765326506474e-05,
796
- "loss": 0.0024,
797
- "step": 201123
798
  },
799
  {
800
- "epoch": 8.85,
801
- "learning_rate": 4.5544112798432924e-05,
802
- "loss": 0.0024,
803
- "step": 202842
804
  },
805
  {
806
- "epoch": 8.93,
807
- "learning_rate": 4.4988460270359386e-05,
808
- "loss": 0.0025,
809
- "step": 204561
810
  },
811
  {
812
- "epoch": 9.0,
813
- "eval_loss": 0.030335595831274986,
814
  "eval_max_distance": 8,
815
  "eval_mean_distance": 0,
816
- "eval_runtime": 13.4109,
817
- "eval_samples_per_second": 18.716,
818
- "eval_steps_per_second": 1.268,
819
- "step": 206244
820
  },
821
  {
822
- "epoch": 9.0,
823
- "learning_rate": 4.4432807742285836e-05,
824
- "loss": 0.0026,
825
- "step": 206280
826
- },
827
- {
828
- "epoch": 9.08,
829
- "learning_rate": 4.38771552142123e-05,
830
  "loss": 0.0021,
831
- "step": 207999
832
  },
833
  {
834
- "epoch": 9.15,
835
- "learning_rate": 4.332150268613875e-05,
836
- "loss": 0.0022,
837
- "step": 209718
838
- },
839
- {
840
- "epoch": 9.23,
841
- "learning_rate": 4.2765850158065204e-05,
842
- "loss": 0.0022,
843
- "step": 211437
844
- },
845
- {
846
- "epoch": 9.3,
847
- "learning_rate": 4.221019762999166e-05,
848
- "loss": 0.0023,
849
- "step": 213156
850
  },
851
  {
852
- "epoch": 9.38,
853
- "learning_rate": 4.165454510191812e-05,
854
- "loss": 0.0023,
855
- "step": 214875
856
  },
857
  {
858
- "epoch": 9.45,
859
- "learning_rate": 4.109889257384457e-05,
860
- "loss": 0.0023,
861
- "step": 216594
862
  },
863
  {
864
- "epoch": 9.53,
865
- "learning_rate": 4.054324004577103e-05,
866
- "loss": 0.0023,
867
- "step": 218313
868
  },
869
  {
870
- "epoch": 9.6,
871
- "learning_rate": 3.998758751769749e-05,
872
- "loss": 0.0024,
873
- "step": 220032
874
  },
875
  {
876
- "epoch": 9.68,
877
- "learning_rate": 3.943193498962394e-05,
878
- "loss": 0.0024,
879
- "step": 221751
880
  },
881
  {
882
- "epoch": 9.75,
883
- "learning_rate": 3.88762824615504e-05,
884
- "loss": 0.0022,
885
- "step": 223470
886
  },
887
  {
888
- "epoch": 9.83,
889
- "learning_rate": 3.8320629933476854e-05,
890
- "loss": 0.0023,
891
- "step": 225189
892
  },
893
  {
894
- "epoch": 9.9,
895
- "learning_rate": 3.776497740540331e-05,
896
- "loss": 0.0024,
897
- "step": 226908
898
  },
899
  {
900
- "epoch": 9.98,
901
- "learning_rate": 3.7209324877329766e-05,
902
- "loss": 0.0023,
903
- "step": 228627
904
  },
905
  {
906
- "epoch": 10.0,
907
- "eval_loss": 0.034065987914800644,
908
- "eval_max_distance": 8,
909
- "eval_mean_distance": 0,
910
- "eval_runtime": 13.4726,
911
- "eval_samples_per_second": 18.63,
912
- "eval_steps_per_second": 1.262,
913
- "step": 229160
914
  },
915
  {
916
- "epoch": 10.05,
917
- "learning_rate": 3.665367234925622e-05,
918
- "loss": 0.0021,
919
- "step": 230346
920
  },
921
  {
922
- "epoch": 10.13,
923
- "learning_rate": 3.609801982118268e-05,
924
- "loss": 0.0021,
925
- "step": 232065
926
  },
927
  {
928
- "epoch": 10.2,
929
- "learning_rate": 3.5542367293109135e-05,
930
- "loss": 0.0021,
931
- "step": 233784
932
  },
933
  {
934
- "epoch": 10.28,
935
- "learning_rate": 3.498671476503559e-05,
936
- "loss": 0.0022,
937
- "step": 235503
938
  },
939
  {
940
- "epoch": 10.35,
941
- "learning_rate": 3.443106223696205e-05,
942
- "loss": 0.0021,
943
- "step": 237222
944
  },
945
  {
946
- "epoch": 10.43,
947
- "learning_rate": 3.38754097088885e-05,
948
- "loss": 0.0022,
949
- "step": 238941
950
  },
951
  {
952
- "epoch": 10.5,
953
- "learning_rate": 3.331975718081496e-05,
954
- "loss": 0.0021,
955
- "step": 240660
956
  },
957
  {
958
- "epoch": 10.58,
959
- "learning_rate": 3.276410465274141e-05,
960
- "loss": 0.0021,
961
- "step": 242379
962
  },
963
  {
964
- "epoch": 10.65,
965
- "learning_rate": 3.220845212466787e-05,
966
- "loss": 0.0021,
967
- "step": 244098
968
  },
969
  {
970
- "epoch": 10.73,
971
- "learning_rate": 3.165279959659433e-05,
972
- "loss": 0.0022,
973
- "step": 245817
974
  },
975
  {
976
- "epoch": 10.8,
977
- "learning_rate": 3.1097147068520784e-05,
978
- "loss": 0.0022,
979
- "step": 247536
980
  },
981
  {
982
- "epoch": 10.88,
983
- "learning_rate": 3.054149454044724e-05,
984
  "loss": 0.002,
985
- "step": 249255
986
  },
987
  {
988
- "epoch": 10.95,
989
- "learning_rate": 2.9985842012373693e-05,
990
- "loss": 0.0022,
991
- "step": 250974
992
  },
993
  {
994
- "epoch": 11.0,
995
- "eval_loss": 0.03288768604397774,
996
- "eval_max_distance": 8,
997
- "eval_mean_distance": 0,
998
- "eval_runtime": 13.3832,
999
- "eval_samples_per_second": 18.755,
1000
- "eval_steps_per_second": 1.27,
1001
- "step": 252076
1002
  },
1003
  {
1004
- "epoch": 11.03,
1005
- "learning_rate": 2.9430189484300152e-05,
1006
- "loss": 0.0022,
1007
- "step": 252693
1008
  },
1009
  {
1010
- "epoch": 11.1,
1011
- "learning_rate": 2.8874536956226605e-05,
1012
- "loss": 0.002,
1013
- "step": 254412
1014
  },
1015
  {
1016
- "epoch": 11.18,
1017
- "learning_rate": 2.8318884428153065e-05,
1018
- "loss": 0.002,
1019
- "step": 256131
1020
  },
1021
  {
1022
- "epoch": 11.25,
1023
- "learning_rate": 2.7763231900079517e-05,
1024
- "loss": 0.002,
1025
- "step": 257850
1026
  },
1027
  {
1028
- "epoch": 11.33,
1029
- "learning_rate": 2.7207579372005977e-05,
1030
- "loss": 0.0021,
1031
- "step": 259569
1032
  },
1033
  {
1034
- "epoch": 11.4,
1035
- "learning_rate": 2.665192684393243e-05,
1036
  "loss": 0.0019,
1037
- "step": 261288
1038
  },
1039
  {
1040
- "epoch": 11.48,
1041
- "learning_rate": 2.6096274315858886e-05,
1042
  "loss": 0.002,
1043
- "step": 263007
1044
  },
1045
  {
1046
- "epoch": 11.55,
1047
- "learning_rate": 2.5540621787785342e-05,
1048
- "loss": 0.002,
1049
- "step": 264726
1050
  },
1051
  {
1052
- "epoch": 11.63,
1053
- "learning_rate": 2.4984969259711798e-05,
1054
- "loss": 0.002,
1055
- "step": 266445
1056
  },
1057
  {
1058
- "epoch": 11.7,
1059
- "learning_rate": 2.4429316731638254e-05,
1060
- "loss": 0.0019,
1061
- "step": 268164
1062
  },
1063
  {
1064
- "epoch": 11.78,
1065
- "learning_rate": 2.387366420356471e-05,
1066
- "loss": 0.0021,
1067
- "step": 269883
1068
  },
1069
  {
1070
- "epoch": 11.85,
1071
- "learning_rate": 2.3318011675491167e-05,
1072
- "loss": 0.0021,
1073
- "step": 271602
1074
  },
1075
  {
1076
- "epoch": 11.93,
1077
- "learning_rate": 2.2762359147417623e-05,
1078
  "loss": 0.0019,
1079
- "step": 273321
 
 
 
 
 
 
1080
  },
1081
  {
1082
- "epoch": 12.0,
1083
- "eval_loss": 0.03355114161968231,
1084
  "eval_max_distance": 8,
1085
  "eval_mean_distance": 0,
1086
- "eval_runtime": 13.4567,
1087
- "eval_samples_per_second": 18.652,
1088
- "eval_steps_per_second": 1.263,
1089
- "step": 274992
1090
  },
1091
  {
1092
- "epoch": 12.0,
1093
- "learning_rate": 2.220670661934408e-05,
1094
- "loss": 0.0021,
1095
- "step": 275040
1096
  },
1097
  {
1098
- "epoch": 12.08,
1099
- "learning_rate": 2.1651054091270535e-05,
1100
- "loss": 0.002,
1101
- "step": 276759
1102
  },
1103
  {
1104
- "epoch": 12.15,
1105
- "learning_rate": 2.109540156319699e-05,
1106
- "loss": 0.002,
1107
- "step": 278478
1108
  },
1109
  {
1110
- "epoch": 12.23,
1111
- "learning_rate": 2.0539749035123444e-05,
1112
- "loss": 0.0018,
1113
- "step": 280197
1114
  },
1115
  {
1116
- "epoch": 12.3,
1117
- "learning_rate": 1.9984096507049904e-05,
1118
- "loss": 0.002,
1119
- "step": 281916
1120
  },
1121
  {
1122
- "epoch": 12.38,
1123
- "learning_rate": 1.942844397897636e-05,
1124
- "loss": 0.0019,
1125
- "step": 283635
1126
  },
1127
  {
1128
- "epoch": 12.45,
1129
- "learning_rate": 1.8872791450902816e-05,
1130
- "loss": 0.0018,
1131
- "step": 285354
1132
  },
1133
  {
1134
- "epoch": 12.53,
1135
- "learning_rate": 1.8317138922829272e-05,
1136
- "loss": 0.0019,
1137
- "step": 287073
1138
  },
1139
  {
1140
- "epoch": 12.6,
1141
- "learning_rate": 1.776148639475573e-05,
1142
- "loss": 0.0018,
1143
- "step": 288792
1144
  },
1145
  {
1146
- "epoch": 12.68,
1147
- "learning_rate": 1.7205833866682185e-05,
1148
- "loss": 0.0019,
1149
- "step": 290511
1150
  },
1151
  {
1152
- "epoch": 12.75,
1153
- "learning_rate": 1.665018133860864e-05,
1154
- "loss": 0.0019,
1155
- "step": 292230
1156
  },
1157
  {
1158
- "epoch": 12.83,
1159
- "learning_rate": 1.6094528810535094e-05,
1160
- "loss": 0.0021,
1161
- "step": 293949
1162
  },
1163
  {
1164
- "epoch": 12.9,
1165
- "learning_rate": 1.553887628246155e-05,
1166
- "loss": 0.0019,
1167
- "step": 295668
1168
  },
1169
  {
1170
- "epoch": 12.98,
1171
- "learning_rate": 1.4983223754388006e-05,
1172
- "loss": 0.002,
1173
- "step": 297387
1174
  },
1175
  {
1176
- "epoch": 13.0,
1177
- "eval_loss": 0.035788267850875854,
1178
- "eval_max_distance": 8,
1179
- "eval_mean_distance": 0,
1180
- "eval_runtime": 13.4958,
1181
- "eval_samples_per_second": 18.598,
1182
- "eval_steps_per_second": 1.26,
1183
- "step": 297908
1184
  },
1185
  {
1186
- "epoch": 13.05,
1187
- "learning_rate": 1.4427571226314462e-05,
1188
  "loss": 0.0019,
1189
- "step": 299106
1190
- },
1191
- {
1192
- "epoch": 13.13,
1193
- "learning_rate": 1.3871918698240918e-05,
1194
- "loss": 0.0018,
1195
- "step": 300825
1196
  },
1197
  {
1198
- "epoch": 13.2,
1199
- "learning_rate": 1.3316266170167374e-05,
1200
- "loss": 0.0018,
1201
- "step": 302544
1202
- },
1203
- {
1204
- "epoch": 13.28,
1205
- "learning_rate": 1.276061364209383e-05,
1206
- "loss": 0.0018,
1207
- "step": 304263
1208
  },
1209
  {
1210
- "epoch": 13.35,
1211
- "learning_rate": 1.2204961114020287e-05,
1212
- "loss": 0.0018,
1213
- "step": 305982
1214
  },
1215
  {
1216
- "epoch": 13.43,
1217
- "learning_rate": 1.1649308585946743e-05,
1218
- "loss": 0.0018,
1219
- "step": 307701
1220
  },
1221
  {
1222
- "epoch": 13.5,
1223
- "learning_rate": 1.1093656057873199e-05,
1224
- "loss": 0.0019,
1225
- "step": 309420
1226
  },
1227
  {
1228
- "epoch": 13.58,
1229
- "learning_rate": 1.0538003529799655e-05,
1230
- "loss": 0.0018,
1231
- "step": 311139
1232
  },
1233
  {
1234
- "epoch": 13.65,
1235
- "learning_rate": 9.982351001726111e-06,
1236
  "loss": 0.0017,
1237
- "step": 312858
1238
  },
1239
  {
1240
- "epoch": 13.73,
1241
- "learning_rate": 9.426698473652567e-06,
1242
- "loss": 0.0018,
1243
- "step": 314577
1244
  },
1245
  {
1246
- "epoch": 13.8,
1247
- "learning_rate": 8.871045945579024e-06,
1248
- "loss": 0.002,
1249
- "step": 316296
1250
  },
1251
  {
1252
- "epoch": 13.88,
1253
- "learning_rate": 8.31539341750548e-06,
1254
- "loss": 0.0018,
1255
- "step": 318015
1256
  },
1257
  {
1258
- "epoch": 13.95,
1259
- "learning_rate": 7.759740889431934e-06,
1260
- "loss": 0.0018,
1261
- "step": 319734
1262
  },
1263
  {
1264
- "epoch": 14.0,
1265
- "eval_loss": 0.03550655022263527,
1266
- "eval_max_distance": 8,
1267
- "eval_mean_distance": 0,
1268
- "eval_runtime": 13.4713,
1269
- "eval_samples_per_second": 18.632,
1270
- "eval_steps_per_second": 1.262,
1271
- "step": 320824
1272
  },
1273
  {
1274
- "epoch": 14.03,
1275
- "learning_rate": 7.204088361358391e-06,
1276
  "loss": 0.0017,
1277
- "step": 321453
1278
  },
1279
  {
1280
- "epoch": 14.1,
1281
- "learning_rate": 6.648435833284847e-06,
1282
- "loss": 0.0018,
1283
- "step": 323172
1284
  },
1285
  {
1286
- "epoch": 14.18,
1287
- "learning_rate": 6.092783305211304e-06,
1288
- "loss": 0.0018,
1289
- "step": 324891
1290
  },
1291
  {
1292
- "epoch": 14.25,
1293
- "learning_rate": 5.537130777137759e-06,
1294
  "loss": 0.0017,
1295
- "step": 326610
1296
  },
1297
  {
1298
- "epoch": 14.33,
1299
- "learning_rate": 4.981478249064216e-06,
1300
  "loss": 0.0018,
1301
- "step": 328329
1302
  },
1303
  {
1304
- "epoch": 14.4,
1305
- "learning_rate": 4.425825720990671e-06,
1306
- "loss": 0.0018,
1307
- "step": 330048
1308
- },
1309
- {
1310
- "epoch": 14.48,
1311
- "learning_rate": 3.8701731929171274e-06,
1312
- "loss": 0.0018,
1313
- "step": 331767
1314
  },
1315
  {
1316
- "epoch": 14.55,
1317
- "learning_rate": 3.3145206648435836e-06,
1318
- "loss": 0.0019,
1319
- "step": 333486
1320
  },
1321
  {
1322
- "epoch": 14.63,
1323
- "learning_rate": 2.7588681367700398e-06,
1324
- "loss": 0.0017,
1325
- "step": 335205
1326
  },
1327
  {
1328
- "epoch": 14.7,
1329
- "learning_rate": 2.2032156086964955e-06,
1330
  "loss": 0.0018,
1331
- "step": 336924
1332
  },
1333
  {
1334
- "epoch": 14.78,
1335
- "learning_rate": 1.6475630806229517e-06,
1336
  "loss": 0.0017,
1337
- "step": 338643
1338
  },
1339
  {
1340
- "epoch": 14.85,
1341
- "learning_rate": 1.0919105525494076e-06,
1342
  "loss": 0.0018,
1343
- "step": 340362
1344
  },
1345
  {
1346
- "epoch": 14.93,
1347
- "learning_rate": 5.362580244758636e-07,
1348
- "loss": 0.0019,
1349
- "step": 342081
1350
  },
1351
  {
1352
- "epoch": 15.0,
1353
- "eval_loss": 0.03661360964179039,
 
 
 
 
 
 
1354
  "eval_max_distance": 8,
1355
  "eval_mean_distance": 0,
1356
- "eval_runtime": 13.3536,
1357
- "eval_samples_per_second": 18.796,
1358
- "eval_steps_per_second": 1.273,
1359
- "step": 343740
1360
- },
1361
- {
1362
- "epoch": 15.0,
1363
- "step": 343740,
1364
- "total_flos": 8.727792619277722e+16,
1365
- "train_loss": 0.0029792904397642345,
1366
- "train_runtime": 24306.5697,
1367
- "train_samples_per_second": 212.119,
1368
- "train_steps_per_second": 14.142
1369
  }
1370
  ],
1371
- "logging_steps": 1719,
1372
- "max_steps": 343740,
1373
- "num_train_epochs": 15,
1374
- "save_steps": 3438,
1375
- "total_flos": 8.727792619277722e+16,
1376
  "trial_name": null,
1377
  "trial_params": null
1378
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 5.0,
5
  "eval_steps": 500,
6
+ "global_step": 114970,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
+ "learning_rate": 8.697921196833957e-09,
14
+ "loss": 0.0,
15
  "step": 1
16
  },
17
  {
18
+ "epoch": 0.03,
19
+ "learning_rate": 5.001304688179525e-06,
20
+ "loss": 0.0024,
21
+ "step": 575
 
 
 
 
 
 
 
 
 
 
 
 
22
  },
23
  {
24
+ "epoch": 0.05,
25
+ "learning_rate": 1.000260937635905e-05,
26
+ "loss": 0.0025,
27
+ "step": 1150
28
  },
29
  {
30
+ "epoch": 0.08,
31
+ "learning_rate": 1.5003914064538576e-05,
32
+ "loss": 0.0021,
33
+ "step": 1725
34
  },
35
  {
36
+ "epoch": 0.1,
37
+ "learning_rate": 2.00052187527181e-05,
38
+ "loss": 0.002,
39
+ "step": 2300
40
  },
41
  {
42
+ "epoch": 0.13,
43
+ "learning_rate": 2.500652344089763e-05,
44
+ "loss": 0.0024,
45
+ "step": 2875
46
  },
47
  {
48
+ "epoch": 0.15,
49
+ "learning_rate": 3.0007828129077153e-05,
50
+ "loss": 0.0021,
51
+ "step": 3450
52
  },
53
  {
54
+ "epoch": 0.18,
55
+ "learning_rate": 3.500913281725668e-05,
56
+ "loss": 0.0024,
57
+ "step": 4025
58
  },
59
  {
60
+ "epoch": 0.2,
61
+ "learning_rate": 4.00104375054362e-05,
62
+ "loss": 0.0021,
63
+ "step": 4600
64
  },
65
  {
66
+ "epoch": 0.23,
67
+ "learning_rate": 4.501174219361573e-05,
68
+ "loss": 0.0021,
69
+ "step": 5175
70
  },
71
  {
72
+ "epoch": 0.25,
73
+ "learning_rate": 5.001304688179526e-05,
74
+ "loss": 0.0021,
75
+ "step": 5750
76
  },
77
  {
78
+ "epoch": 0.28,
79
+ "learning_rate": 5.501435156997478e-05,
80
+ "loss": 0.0022,
81
+ "step": 6325
82
  },
83
  {
84
+ "epoch": 0.3,
85
+ "learning_rate": 6.0015656258154306e-05,
86
+ "loss": 0.002,
87
+ "step": 6900
 
 
 
 
88
  },
89
  {
90
+ "epoch": 0.33,
91
+ "learning_rate": 6.501696094633383e-05,
92
+ "loss": 0.0021,
93
+ "step": 7475
94
  },
95
  {
96
+ "epoch": 0.35,
97
+ "learning_rate": 7.001826563451336e-05,
98
+ "loss": 0.0018,
99
+ "step": 8050
100
  },
101
  {
102
+ "epoch": 0.38,
103
+ "learning_rate": 7.501957032269288e-05,
104
+ "loss": 0.0021,
105
+ "step": 8625
106
  },
107
  {
108
+ "epoch": 0.4,
109
+ "learning_rate": 8.00208750108724e-05,
110
+ "loss": 0.0019,
111
+ "step": 9200
112
  },
113
  {
114
+ "epoch": 0.43,
115
+ "learning_rate": 8.502217969905193e-05,
116
+ "loss": 0.0023,
117
+ "step": 9775
118
  },
119
  {
120
+ "epoch": 0.45,
121
+ "learning_rate": 9.002348438723146e-05,
122
+ "loss": 0.0021,
123
+ "step": 10350
124
  },
125
  {
126
+ "epoch": 0.48,
127
+ "learning_rate": 9.502478907541099e-05,
128
+ "loss": 0.0021,
129
+ "step": 10925
130
  },
131
  {
132
+ "epoch": 0.5,
133
+ "learning_rate": 9.99971006929344e-05,
134
+ "loss": 0.0022,
135
+ "step": 11500
136
  },
137
  {
138
+ "epoch": 0.53,
139
+ "learning_rate": 9.944140017202556e-05,
140
+ "loss": 0.0025,
141
+ "step": 12075
142
  },
143
  {
144
+ "epoch": 0.55,
145
+ "learning_rate": 9.888569965111673e-05,
146
+ "loss": 0.0022,
147
+ "step": 12650
148
  },
149
  {
150
+ "epoch": 0.58,
151
+ "learning_rate": 9.832999913020789e-05,
152
+ "loss": 0.0022,
153
+ "step": 13225
154
  },
155
  {
156
+ "epoch": 0.6,
157
+ "learning_rate": 9.777429860929905e-05,
158
+ "loss": 0.002,
159
+ "step": 13800
160
  },
161
  {
162
+ "epoch": 0.63,
163
+ "learning_rate": 9.721859808839022e-05,
164
+ "loss": 0.0021,
165
+ "step": 14375
166
  },
167
  {
168
+ "epoch": 0.65,
169
+ "learning_rate": 9.666289756748138e-05,
170
+ "loss": 0.0023,
171
+ "step": 14950
 
 
 
 
172
  },
173
  {
174
+ "epoch": 0.68,
175
+ "learning_rate": 9.610719704657253e-05,
176
+ "loss": 0.0023,
177
+ "step": 15525
178
  },
179
  {
180
+ "epoch": 0.7,
181
+ "learning_rate": 9.55514965256637e-05,
182
+ "loss": 0.0023,
183
+ "step": 16100
184
  },
185
  {
186
+ "epoch": 0.73,
187
+ "learning_rate": 9.499579600475486e-05,
188
+ "loss": 0.002,
189
+ "step": 16675
190
  },
191
  {
192
+ "epoch": 0.75,
193
+ "learning_rate": 9.444009548384603e-05,
194
+ "loss": 0.0025,
195
+ "step": 17250
196
  },
197
  {
198
+ "epoch": 0.78,
199
+ "learning_rate": 9.388439496293719e-05,
200
+ "loss": 0.0023,
201
+ "step": 17825
202
  },
203
  {
204
+ "epoch": 0.8,
205
+ "learning_rate": 9.332869444202836e-05,
206
+ "loss": 0.0026,
207
+ "step": 18400
208
  },
209
  {
210
+ "epoch": 0.83,
211
+ "learning_rate": 9.277299392111952e-05,
212
+ "loss": 0.0025,
213
+ "step": 18975
214
  },
215
  {
216
+ "epoch": 0.85,
217
+ "learning_rate": 9.221729340021069e-05,
218
+ "loss": 0.0027,
219
+ "step": 19550
220
  },
221
  {
222
+ "epoch": 0.88,
223
+ "learning_rate": 9.166159287930185e-05,
224
+ "loss": 0.0024,
225
+ "step": 20125
226
  },
227
  {
228
+ "epoch": 0.9,
229
+ "learning_rate": 9.110589235839302e-05,
230
+ "loss": 0.0022,
231
+ "step": 20700
232
  },
233
  {
234
+ "epoch": 0.93,
235
+ "learning_rate": 9.055019183748418e-05,
236
+ "loss": 0.0026,
237
+ "step": 21275
238
  },
239
  {
240
+ "epoch": 0.95,
241
+ "learning_rate": 8.999449131657535e-05,
242
+ "loss": 0.0025,
243
+ "step": 21850
244
  },
245
  {
246
+ "epoch": 0.98,
247
+ "learning_rate": 8.94387907956665e-05,
248
+ "loss": 0.0024,
249
+ "step": 22425
250
  },
251
  {
252
+ "epoch": 1.0,
253
+ "eval_loss": 0.02637363225221634,
254
  "eval_max_distance": 8,
255
  "eval_mean_distance": 0,
256
+ "eval_runtime": 23.0018,
257
+ "eval_samples_per_second": 11.217,
258
+ "eval_steps_per_second": 0.783,
259
+ "step": 22994
 
 
 
 
 
 
260
  },
261
  {
262
+ "epoch": 1.0,
263
+ "learning_rate": 8.888309027475768e-05,
264
+ "loss": 0.0026,
265
+ "step": 23000
 
 
 
 
 
 
266
  },
267
  {
268
+ "epoch": 1.03,
269
+ "learning_rate": 8.832738975384883e-05,
270
+ "loss": 0.0019,
271
+ "step": 23575
272
  },
273
  {
274
+ "epoch": 1.05,
275
+ "learning_rate": 8.777168923294e-05,
276
+ "loss": 0.002,
277
+ "step": 24150
278
  },
279
  {
280
+ "epoch": 1.08,
281
+ "learning_rate": 8.721598871203116e-05,
282
+ "loss": 0.002,
283
+ "step": 24725
284
  },
285
  {
286
+ "epoch": 1.1,
287
+ "learning_rate": 8.666028819112233e-05,
288
+ "loss": 0.0021,
289
+ "step": 25300
290
  },
291
  {
292
+ "epoch": 1.13,
293
+ "learning_rate": 8.610458767021349e-05,
294
+ "loss": 0.0022,
295
+ "step": 25875
296
  },
297
  {
298
+ "epoch": 1.15,
299
+ "learning_rate": 8.554888714930466e-05,
300
+ "loss": 0.0022,
301
+ "step": 26450
302
  },
303
  {
304
+ "epoch": 1.18,
305
+ "learning_rate": 8.499318662839582e-05,
306
+ "loss": 0.0023,
307
+ "step": 27025
308
  },
309
  {
310
+ "epoch": 1.2,
311
+ "learning_rate": 8.443748610748699e-05,
312
+ "loss": 0.0023,
313
+ "step": 27600
314
  },
315
  {
316
+ "epoch": 1.23,
317
+ "learning_rate": 8.388178558657815e-05,
318
+ "loss": 0.002,
319
+ "step": 28175
320
  },
321
  {
322
+ "epoch": 1.25,
323
+ "learning_rate": 8.33260850656693e-05,
324
+ "loss": 0.0022,
325
+ "step": 28750
326
  },
327
  {
328
+ "epoch": 1.28,
329
+ "learning_rate": 8.277038454476048e-05,
330
+ "loss": 0.0022,
331
+ "step": 29325
332
  },
333
  {
334
+ "epoch": 1.3,
335
+ "learning_rate": 8.221468402385163e-05,
336
+ "loss": 0.002,
337
+ "step": 29900
 
 
 
 
338
  },
339
  {
340
+ "epoch": 1.33,
341
+ "learning_rate": 8.165898350294279e-05,
342
+ "loss": 0.0023,
343
+ "step": 30475
344
  },
345
  {
346
+ "epoch": 1.35,
347
+ "learning_rate": 8.110328298203396e-05,
348
+ "loss": 0.0026,
349
+ "step": 31050
350
  },
351
  {
352
+ "epoch": 1.38,
353
+ "learning_rate": 8.054758246112512e-05,
354
+ "loss": 0.0022,
355
+ "step": 31625
356
  },
357
  {
358
+ "epoch": 1.4,
359
+ "learning_rate": 7.999188194021629e-05,
360
+ "loss": 0.0023,
361
+ "step": 32200
362
  },
363
  {
364
+ "epoch": 1.43,
365
+ "learning_rate": 7.943618141930745e-05,
366
+ "loss": 0.0023,
367
+ "step": 32775
368
  },
369
  {
370
+ "epoch": 1.45,
371
+ "learning_rate": 7.888048089839862e-05,
372
+ "loss": 0.0024,
373
+ "step": 33350
374
  },
375
  {
376
+ "epoch": 1.48,
377
+ "learning_rate": 7.832478037748978e-05,
378
+ "loss": 0.0024,
379
+ "step": 33925
380
  },
381
  {
382
+ "epoch": 1.5,
383
+ "learning_rate": 7.776907985658095e-05,
384
+ "loss": 0.0025,
385
+ "step": 34500
386
  },
387
  {
388
+ "epoch": 1.53,
389
+ "learning_rate": 7.721337933567211e-05,
390
+ "loss": 0.0023,
391
+ "step": 35075
392
  },
393
  {
394
+ "epoch": 1.55,
395
+ "learning_rate": 7.665767881476328e-05,
396
+ "loss": 0.0021,
397
+ "step": 35650
398
  },
399
  {
400
+ "epoch": 1.58,
401
+ "learning_rate": 7.610197829385444e-05,
402
+ "loss": 0.0022,
403
+ "step": 36225
404
  },
405
  {
406
+ "epoch": 1.6,
407
+ "learning_rate": 7.554627777294561e-05,
408
+ "loss": 0.002,
409
+ "step": 36800
410
  },
411
  {
412
+ "epoch": 1.63,
413
+ "learning_rate": 7.499057725203676e-05,
414
+ "loss": 0.0021,
415
+ "step": 37375
416
  },
417
  {
418
+ "epoch": 1.65,
419
+ "learning_rate": 7.443487673112794e-05,
420
+ "loss": 0.0024,
421
+ "step": 37950
 
 
 
 
422
  },
423
  {
424
+ "epoch": 1.68,
425
+ "learning_rate": 7.38791762102191e-05,
426
+ "loss": 0.0024,
427
+ "step": 38525
428
  },
429
  {
430
+ "epoch": 1.7,
431
+ "learning_rate": 7.332347568931026e-05,
432
+ "loss": 0.0022,
433
+ "step": 39100
434
  },
435
  {
436
+ "epoch": 1.73,
437
+ "learning_rate": 7.276777516840142e-05,
438
+ "loss": 0.0024,
439
+ "step": 39675
440
  },
441
  {
442
+ "epoch": 1.75,
443
+ "learning_rate": 7.221207464749259e-05,
444
+ "loss": 0.0023,
445
+ "step": 40250
446
  },
447
  {
448
+ "epoch": 1.78,
449
+ "learning_rate": 7.165637412658375e-05,
450
+ "loss": 0.0023,
451
+ "step": 40825
452
  },
453
  {
454
+ "epoch": 1.8,
455
+ "learning_rate": 7.110067360567492e-05,
456
+ "loss": 0.0022,
457
+ "step": 41400
458
  },
459
  {
460
+ "epoch": 1.83,
461
+ "learning_rate": 7.054497308476608e-05,
462
+ "loss": 0.0023,
463
+ "step": 41975
464
  },
465
  {
466
+ "epoch": 1.85,
467
+ "learning_rate": 6.998927256385725e-05,
468
+ "loss": 0.0023,
469
+ "step": 42550
470
  },
471
  {
472
+ "epoch": 1.88,
473
+ "learning_rate": 6.943357204294841e-05,
474
+ "loss": 0.0023,
475
+ "step": 43125
476
  },
477
  {
478
+ "epoch": 1.9,
479
+ "learning_rate": 6.887787152203957e-05,
480
+ "loss": 0.0024,
481
+ "step": 43700
482
  },
483
  {
484
+ "epoch": 1.93,
485
+ "learning_rate": 6.832217100113074e-05,
486
+ "loss": 0.0024,
487
+ "step": 44275
488
  },
489
  {
490
+ "epoch": 1.95,
491
+ "learning_rate": 6.77664704802219e-05,
492
+ "loss": 0.0022,
493
+ "step": 44850
494
  },
495
  {
496
+ "epoch": 1.98,
497
+ "learning_rate": 6.721076995931305e-05,
498
+ "loss": 0.0022,
499
+ "step": 45425
500
  },
501
  {
502
+ "epoch": 2.0,
503
+ "eval_loss": 0.025934860110282898,
504
  "eval_max_distance": 8,
505
  "eval_mean_distance": 0,
506
+ "eval_runtime": 21.6745,
507
+ "eval_samples_per_second": 11.903,
508
+ "eval_steps_per_second": 0.83,
509
+ "step": 45988
 
 
 
 
 
 
 
 
 
 
 
 
510
  },
511
  {
512
+ "epoch": 2.0,
513
+ "learning_rate": 6.665506943840422e-05,
514
+ "loss": 0.0024,
515
+ "step": 46000
516
  },
517
  {
518
+ "epoch": 2.03,
519
+ "learning_rate": 6.609936891749538e-05,
520
+ "loss": 0.0019,
521
+ "step": 46575
522
  },
523
  {
524
+ "epoch": 2.05,
525
+ "learning_rate": 6.554366839658655e-05,
526
+ "loss": 0.0021,
527
+ "step": 47150
528
  },
529
  {
530
+ "epoch": 2.08,
531
+ "learning_rate": 6.498796787567771e-05,
532
+ "loss": 0.0019,
533
+ "step": 47725
534
  },
535
  {
536
+ "epoch": 2.1,
537
+ "learning_rate": 6.443226735476888e-05,
538
+ "loss": 0.0021,
539
+ "step": 48300
540
  },
541
  {
542
+ "epoch": 2.13,
543
+ "learning_rate": 6.387656683386004e-05,
544
+ "loss": 0.0018,
545
+ "step": 48875
546
  },
547
  {
548
+ "epoch": 2.15,
549
+ "learning_rate": 6.332086631295121e-05,
550
+ "loss": 0.002,
551
+ "step": 49450
552
  },
553
  {
554
+ "epoch": 2.18,
555
+ "learning_rate": 6.276516579204237e-05,
556
+ "loss": 0.0021,
557
+ "step": 50025
558
  },
559
  {
560
+ "epoch": 2.2,
561
+ "learning_rate": 6.220946527113354e-05,
562
+ "loss": 0.002,
563
+ "step": 50600
564
  },
565
  {
566
+ "epoch": 2.23,
567
+ "learning_rate": 6.16537647502247e-05,
568
+ "loss": 0.0019,
569
+ "step": 51175
570
  },
571
  {
572
+ "epoch": 2.25,
573
+ "learning_rate": 6.109806422931587e-05,
574
+ "loss": 0.002,
575
+ "step": 51750
576
  },
577
  {
578
+ "epoch": 2.28,
579
+ "learning_rate": 6.0542363708407024e-05,
580
+ "loss": 0.0021,
581
+ "step": 52325
582
  },
583
  {
584
+ "epoch": 2.3,
585
+ "learning_rate": 5.9986663187498195e-05,
586
+ "loss": 0.0021,
587
+ "step": 52900
 
 
 
 
588
  },
589
  {
590
+ "epoch": 2.33,
591
+ "learning_rate": 5.943096266658935e-05,
592
+ "loss": 0.0019,
593
+ "step": 53475
594
  },
595
  {
596
+ "epoch": 2.35,
597
+ "learning_rate": 5.8875262145680524e-05,
598
+ "loss": 0.0018,
599
+ "step": 54050
600
  },
601
  {
602
+ "epoch": 2.38,
603
+ "learning_rate": 5.831956162477168e-05,
604
+ "loss": 0.0021,
605
+ "step": 54625
606
  },
607
  {
608
+ "epoch": 2.4,
609
+ "learning_rate": 5.7763861103862846e-05,
610
+ "loss": 0.0021,
611
+ "step": 55200
612
  },
613
  {
614
+ "epoch": 2.43,
615
+ "learning_rate": 5.720816058295401e-05,
616
+ "loss": 0.0019,
617
+ "step": 55775
618
  },
619
  {
620
+ "epoch": 2.45,
621
+ "learning_rate": 5.6652460062045174e-05,
622
+ "loss": 0.002,
623
+ "step": 56350
624
  },
625
  {
626
+ "epoch": 2.48,
627
+ "learning_rate": 5.609675954113633e-05,
628
+ "loss": 0.0022,
629
+ "step": 56925
630
  },
631
  {
632
+ "epoch": 2.5,
633
+ "learning_rate": 5.55410590202275e-05,
634
+ "loss": 0.0018,
635
+ "step": 57500
636
  },
637
  {
638
+ "epoch": 2.53,
639
+ "learning_rate": 5.498535849931866e-05,
640
+ "loss": 0.0023,
641
+ "step": 58075
642
  },
643
  {
644
+ "epoch": 2.55,
645
+ "learning_rate": 5.442965797840983e-05,
646
+ "loss": 0.0021,
647
+ "step": 58650
648
  },
649
  {
650
+ "epoch": 2.58,
651
+ "learning_rate": 5.387395745750099e-05,
652
+ "loss": 0.002,
653
+ "step": 59225
654
  },
655
  {
656
+ "epoch": 2.6,
657
+ "learning_rate": 5.331825693659216e-05,
658
+ "loss": 0.0019,
659
+ "step": 59800
660
  },
661
  {
662
+ "epoch": 2.63,
663
+ "learning_rate": 5.276255641568332e-05,
664
+ "loss": 0.0021,
665
+ "step": 60375
666
  },
667
  {
668
+ "epoch": 2.65,
669
+ "learning_rate": 5.220685589477449e-05,
670
+ "loss": 0.0022,
671
+ "step": 60950
 
 
 
 
672
  },
673
  {
674
+ "epoch": 2.68,
675
+ "learning_rate": 5.1651155373865647e-05,
676
+ "loss": 0.0017,
677
+ "step": 61525
678
  },
679
  {
680
+ "epoch": 2.7,
681
+ "learning_rate": 5.109545485295682e-05,
682
+ "loss": 0.0019,
683
+ "step": 62100
684
  },
685
  {
686
+ "epoch": 2.73,
687
+ "learning_rate": 5.0539754332047975e-05,
688
+ "loss": 0.002,
689
+ "step": 62675
690
  },
691
  {
692
+ "epoch": 2.75,
693
+ "learning_rate": 4.998405381113914e-05,
694
+ "loss": 0.0019,
695
+ "step": 63250
696
  },
697
  {
698
+ "epoch": 2.78,
699
+ "learning_rate": 4.9428353290230304e-05,
700
+ "loss": 0.0022,
701
+ "step": 63825
702
  },
703
  {
704
+ "epoch": 2.8,
705
+ "learning_rate": 4.887265276932147e-05,
706
+ "loss": 0.0021,
707
+ "step": 64400
708
  },
709
  {
710
+ "epoch": 2.83,
711
+ "learning_rate": 4.831695224841263e-05,
712
+ "loss": 0.0023,
713
+ "step": 64975
714
  },
715
  {
716
+ "epoch": 2.85,
717
+ "learning_rate": 4.77612517275038e-05,
718
+ "loss": 0.002,
719
+ "step": 65550
720
  },
721
  {
722
+ "epoch": 2.88,
723
+ "learning_rate": 4.720555120659496e-05,
724
+ "loss": 0.002,
725
+ "step": 66125
726
  },
727
  {
728
+ "epoch": 2.9,
729
+ "learning_rate": 4.6649850685686126e-05,
730
+ "loss": 0.0021,
731
+ "step": 66700
732
  },
733
  {
734
+ "epoch": 2.93,
735
+ "learning_rate": 4.609415016477729e-05,
736
+ "loss": 0.0021,
737
+ "step": 67275
738
  },
739
  {
740
+ "epoch": 2.95,
741
+ "learning_rate": 4.5538449643868454e-05,
742
+ "loss": 0.002,
743
+ "step": 67850
744
  },
745
  {
746
+ "epoch": 2.98,
747
+ "learning_rate": 4.498274912295962e-05,
748
+ "loss": 0.0019,
749
+ "step": 68425
750
  },
751
  {
752
+ "epoch": 3.0,
753
+ "eval_loss": 0.029171258211135864,
754
  "eval_max_distance": 8,
755
  "eval_mean_distance": 0,
756
+ "eval_runtime": 20.5415,
757
+ "eval_samples_per_second": 12.56,
758
+ "eval_steps_per_second": 0.876,
759
+ "step": 68982
760
  },
761
  {
762
+ "epoch": 3.0,
763
+ "learning_rate": 4.442704860205078e-05,
 
 
 
 
 
 
764
  "loss": 0.0021,
765
+ "step": 69000
766
  },
767
  {
768
+ "epoch": 3.03,
769
+ "learning_rate": 4.387134808114195e-05,
770
+ "loss": 0.0017,
771
+ "step": 69575
 
 
 
 
 
 
 
 
 
 
 
 
772
  },
773
  {
774
+ "epoch": 3.05,
775
+ "learning_rate": 4.3315647560233105e-05,
776
+ "loss": 0.002,
777
+ "step": 70150
778
  },
779
  {
780
+ "epoch": 3.08,
781
+ "learning_rate": 4.275994703932427e-05,
782
+ "loss": 0.0016,
783
+ "step": 70725
784
  },
785
  {
786
+ "epoch": 3.1,
787
+ "learning_rate": 4.2204246518415434e-05,
788
+ "loss": 0.0018,
789
+ "step": 71300
790
  },
791
  {
792
+ "epoch": 3.13,
793
+ "learning_rate": 4.16485459975066e-05,
794
+ "loss": 0.0018,
795
+ "step": 71875
796
  },
797
  {
798
+ "epoch": 3.15,
799
+ "learning_rate": 4.109284547659776e-05,
800
+ "loss": 0.0018,
801
+ "step": 72450
802
  },
803
  {
804
+ "epoch": 3.18,
805
+ "learning_rate": 4.0537144955688927e-05,
806
+ "loss": 0.0018,
807
+ "step": 73025
808
  },
809
  {
810
+ "epoch": 3.2,
811
+ "learning_rate": 3.998144443478009e-05,
812
+ "loss": 0.002,
813
+ "step": 73600
814
  },
815
  {
816
+ "epoch": 3.23,
817
+ "learning_rate": 3.9425743913871255e-05,
818
+ "loss": 0.002,
819
+ "step": 74175
820
  },
821
  {
822
+ "epoch": 3.25,
823
+ "learning_rate": 3.887004339296242e-05,
824
+ "loss": 0.0019,
825
+ "step": 74750
826
  },
827
  {
828
+ "epoch": 3.28,
829
+ "learning_rate": 3.8314342872053584e-05,
830
+ "loss": 0.0018,
831
+ "step": 75325
 
 
 
 
832
  },
833
  {
834
+ "epoch": 3.3,
835
+ "learning_rate": 3.775864235114475e-05,
836
+ "loss": 0.0018,
837
+ "step": 75900
838
  },
839
  {
840
+ "epoch": 3.33,
841
+ "learning_rate": 3.720294183023591e-05,
842
+ "loss": 0.0017,
843
+ "step": 76475
844
  },
845
  {
846
+ "epoch": 3.35,
847
+ "learning_rate": 3.664724130932708e-05,
848
+ "loss": 0.0017,
849
+ "step": 77050
850
  },
851
  {
852
+ "epoch": 3.38,
853
+ "learning_rate": 3.6091540788418234e-05,
854
+ "loss": 0.0018,
855
+ "step": 77625
856
  },
857
  {
858
+ "epoch": 3.4,
859
+ "learning_rate": 3.55358402675094e-05,
860
+ "loss": 0.0018,
861
+ "step": 78200
862
  },
863
  {
864
+ "epoch": 3.43,
865
+ "learning_rate": 3.498013974660056e-05,
866
+ "loss": 0.0016,
867
+ "step": 78775
868
  },
869
  {
870
+ "epoch": 3.45,
871
+ "learning_rate": 3.442443922569173e-05,
872
+ "loss": 0.0016,
873
+ "step": 79350
874
  },
875
  {
876
+ "epoch": 3.48,
877
+ "learning_rate": 3.386873870478289e-05,
878
+ "loss": 0.0018,
879
+ "step": 79925
880
  },
881
  {
882
+ "epoch": 3.5,
883
+ "learning_rate": 3.3313038183874056e-05,
884
+ "loss": 0.0017,
885
+ "step": 80500
886
  },
887
  {
888
+ "epoch": 3.53,
889
+ "learning_rate": 3.275733766296522e-05,
890
+ "loss": 0.0017,
891
+ "step": 81075
892
  },
893
  {
894
+ "epoch": 3.55,
895
+ "learning_rate": 3.2201637142056385e-05,
896
+ "loss": 0.0016,
897
+ "step": 81650
898
  },
899
  {
900
+ "epoch": 3.58,
901
+ "learning_rate": 3.164593662114755e-05,
902
  "loss": 0.002,
903
+ "step": 82225
904
  },
905
  {
906
+ "epoch": 3.6,
907
+ "learning_rate": 3.1090236100238714e-05,
908
+ "loss": 0.0018,
909
+ "step": 82800
910
  },
911
  {
912
+ "epoch": 3.63,
913
+ "learning_rate": 3.053453557932988e-05,
914
+ "loss": 0.0016,
915
+ "step": 83375
 
 
 
 
916
  },
917
  {
918
+ "epoch": 3.65,
919
+ "learning_rate": 2.997883505842104e-05,
920
+ "loss": 0.0017,
921
+ "step": 83950
922
  },
923
  {
924
+ "epoch": 3.68,
925
+ "learning_rate": 2.9423134537512203e-05,
926
+ "loss": 0.0019,
927
+ "step": 84525
928
  },
929
  {
930
+ "epoch": 3.7,
931
+ "learning_rate": 2.8867434016603368e-05,
932
+ "loss": 0.0018,
933
+ "step": 85100
934
  },
935
  {
936
+ "epoch": 3.73,
937
+ "learning_rate": 2.8311733495694532e-05,
938
+ "loss": 0.0017,
939
+ "step": 85675
940
  },
941
  {
942
+ "epoch": 3.75,
943
+ "learning_rate": 2.7756032974785696e-05,
944
+ "loss": 0.0017,
945
+ "step": 86250
946
  },
947
  {
948
+ "epoch": 3.78,
949
+ "learning_rate": 2.720033245387686e-05,
950
  "loss": 0.0019,
951
+ "step": 86825
952
  },
953
  {
954
+ "epoch": 3.8,
955
+ "learning_rate": 2.664463193296802e-05,
956
  "loss": 0.002,
957
+ "step": 87400
958
  },
959
  {
960
+ "epoch": 3.83,
961
+ "learning_rate": 2.6088931412059186e-05,
962
+ "loss": 0.0019,
963
+ "step": 87975
964
  },
965
  {
966
+ "epoch": 3.85,
967
+ "learning_rate": 2.553323089115035e-05,
968
+ "loss": 0.0017,
969
+ "step": 88550
970
  },
971
  {
972
+ "epoch": 3.88,
973
+ "learning_rate": 2.4977530370241514e-05,
974
+ "loss": 0.0018,
975
+ "step": 89125
976
  },
977
  {
978
+ "epoch": 3.9,
979
+ "learning_rate": 2.442182984933268e-05,
980
+ "loss": 0.0018,
981
+ "step": 89700
982
  },
983
  {
984
+ "epoch": 3.93,
985
+ "learning_rate": 2.3866129328423843e-05,
986
+ "loss": 0.0018,
987
+ "step": 90275
988
  },
989
  {
990
+ "epoch": 3.95,
991
+ "learning_rate": 2.3310428807515004e-05,
992
  "loss": 0.0019,
993
+ "step": 90850
994
+ },
995
+ {
996
+ "epoch": 3.98,
997
+ "learning_rate": 2.275472828660617e-05,
998
+ "loss": 0.0016,
999
+ "step": 91425
1000
  },
1001
  {
1002
+ "epoch": 4.0,
1003
+ "eval_loss": 0.02807791158556938,
1004
  "eval_max_distance": 8,
1005
  "eval_mean_distance": 0,
1006
+ "eval_runtime": 20.9058,
1007
+ "eval_samples_per_second": 12.341,
1008
+ "eval_steps_per_second": 0.861,
1009
+ "step": 91976
1010
  },
1011
  {
1012
+ "epoch": 4.0,
1013
+ "learning_rate": 2.2199027765697333e-05,
1014
+ "loss": 0.0019,
1015
+ "step": 92000
1016
  },
1017
  {
1018
+ "epoch": 4.03,
1019
+ "learning_rate": 2.1643327244788497e-05,
1020
+ "loss": 0.0015,
1021
+ "step": 92575
1022
  },
1023
  {
1024
+ "epoch": 4.05,
1025
+ "learning_rate": 2.108762672387966e-05,
1026
+ "loss": 0.0016,
1027
+ "step": 93150
1028
  },
1029
  {
1030
+ "epoch": 4.08,
1031
+ "learning_rate": 2.0531926202970826e-05,
1032
+ "loss": 0.0015,
1033
+ "step": 93725
1034
  },
1035
  {
1036
+ "epoch": 4.1,
1037
+ "learning_rate": 1.997622568206199e-05,
1038
+ "loss": 0.0017,
1039
+ "step": 94300
1040
  },
1041
  {
1042
+ "epoch": 4.13,
1043
+ "learning_rate": 1.942052516115315e-05,
1044
+ "loss": 0.0016,
1045
+ "step": 94875
1046
  },
1047
  {
1048
+ "epoch": 4.15,
1049
+ "learning_rate": 1.8864824640244315e-05,
1050
+ "loss": 0.0017,
1051
+ "step": 95450
1052
  },
1053
  {
1054
+ "epoch": 4.18,
1055
+ "learning_rate": 1.830912411933548e-05,
1056
+ "loss": 0.0017,
1057
+ "step": 96025
1058
  },
1059
  {
1060
+ "epoch": 4.2,
1061
+ "learning_rate": 1.7753423598426644e-05,
1062
+ "loss": 0.0015,
1063
+ "step": 96600
1064
  },
1065
  {
1066
+ "epoch": 4.23,
1067
+ "learning_rate": 1.719772307751781e-05,
1068
+ "loss": 0.0018,
1069
+ "step": 97175
1070
  },
1071
  {
1072
+ "epoch": 4.25,
1073
+ "learning_rate": 1.6642022556608973e-05,
1074
+ "loss": 0.0018,
1075
+ "step": 97750
1076
  },
1077
  {
1078
+ "epoch": 4.28,
1079
+ "learning_rate": 1.6086322035700134e-05,
1080
+ "loss": 0.0014,
1081
+ "step": 98325
1082
  },
1083
  {
1084
+ "epoch": 4.3,
1085
+ "learning_rate": 1.5530621514791298e-05,
1086
+ "loss": 0.0016,
1087
+ "step": 98900
1088
  },
1089
  {
1090
+ "epoch": 4.33,
1091
+ "learning_rate": 1.4974920993882462e-05,
1092
+ "loss": 0.0018,
1093
+ "step": 99475
1094
  },
1095
  {
1096
+ "epoch": 4.35,
1097
+ "learning_rate": 1.4419220472973627e-05,
1098
+ "loss": 0.0015,
1099
+ "step": 100050
 
 
 
 
1100
  },
1101
  {
1102
+ "epoch": 4.38,
1103
+ "learning_rate": 1.3863519952064791e-05,
1104
  "loss": 0.0019,
1105
+ "step": 100625
 
 
 
 
 
 
1106
  },
1107
  {
1108
+ "epoch": 4.4,
1109
+ "learning_rate": 1.3307819431155954e-05,
1110
+ "loss": 0.0016,
1111
+ "step": 101200
 
 
 
 
 
 
1112
  },
1113
  {
1114
+ "epoch": 4.43,
1115
+ "learning_rate": 1.2752118910247118e-05,
1116
+ "loss": 0.0015,
1117
+ "step": 101775
1118
  },
1119
  {
1120
+ "epoch": 4.45,
1121
+ "learning_rate": 1.2196418389338282e-05,
1122
+ "loss": 0.0017,
1123
+ "step": 102350
1124
  },
1125
  {
1126
+ "epoch": 4.48,
1127
+ "learning_rate": 1.1640717868429447e-05,
1128
+ "loss": 0.0016,
1129
+ "step": 102925
1130
  },
1131
  {
1132
+ "epoch": 4.5,
1133
+ "learning_rate": 1.108501734752061e-05,
1134
+ "loss": 0.0015,
1135
+ "step": 103500
1136
  },
1137
  {
1138
+ "epoch": 4.53,
1139
+ "learning_rate": 1.0529316826611774e-05,
1140
  "loss": 0.0017,
1141
+ "step": 104075
1142
  },
1143
  {
1144
+ "epoch": 4.55,
1145
+ "learning_rate": 9.973616305702938e-06,
1146
+ "loss": 0.0016,
1147
+ "step": 104650
1148
  },
1149
  {
1150
+ "epoch": 4.58,
1151
+ "learning_rate": 9.4179157847941e-06,
1152
+ "loss": 0.0016,
1153
+ "step": 105225
1154
  },
1155
  {
1156
+ "epoch": 4.6,
1157
+ "learning_rate": 8.862215263885265e-06,
1158
+ "loss": 0.0017,
1159
+ "step": 105800
1160
  },
1161
  {
1162
+ "epoch": 4.63,
1163
+ "learning_rate": 8.30651474297643e-06,
1164
+ "loss": 0.0015,
1165
+ "step": 106375
1166
  },
1167
  {
1168
+ "epoch": 4.65,
1169
+ "learning_rate": 7.750814222067592e-06,
1170
+ "loss": 0.0014,
1171
+ "step": 106950
 
 
 
 
1172
  },
1173
  {
1174
+ "epoch": 4.68,
1175
+ "learning_rate": 7.195113701158756e-06,
1176
  "loss": 0.0017,
1177
+ "step": 107525
1178
  },
1179
  {
1180
+ "epoch": 4.7,
1181
+ "learning_rate": 6.639413180249921e-06,
1182
+ "loss": 0.0017,
1183
+ "step": 108100
1184
  },
1185
  {
1186
+ "epoch": 4.73,
1187
+ "learning_rate": 6.083712659341084e-06,
1188
+ "loss": 0.0016,
1189
+ "step": 108675
1190
  },
1191
  {
1192
+ "epoch": 4.75,
1193
+ "learning_rate": 5.528012138432248e-06,
1194
  "loss": 0.0017,
1195
+ "step": 109250
1196
  },
1197
  {
1198
+ "epoch": 4.78,
1199
+ "learning_rate": 4.972311617523412e-06,
1200
  "loss": 0.0018,
1201
+ "step": 109825
1202
  },
1203
  {
1204
+ "epoch": 4.8,
1205
+ "learning_rate": 4.4166110966145756e-06,
1206
+ "loss": 0.0015,
1207
+ "step": 110400
 
 
 
 
 
 
1208
  },
1209
  {
1210
+ "epoch": 4.83,
1211
+ "learning_rate": 3.86091057570574e-06,
1212
+ "loss": 0.0017,
1213
+ "step": 110975
1214
  },
1215
  {
1216
+ "epoch": 4.85,
1217
+ "learning_rate": 3.3052100547969034e-06,
1218
+ "loss": 0.0014,
1219
+ "step": 111550
1220
  },
1221
  {
1222
+ "epoch": 4.88,
1223
+ "learning_rate": 2.7495095338880677e-06,
1224
  "loss": 0.0018,
1225
+ "step": 112125
1226
  },
1227
  {
1228
+ "epoch": 4.9,
1229
+ "learning_rate": 2.1938090129792312e-06,
1230
  "loss": 0.0017,
1231
+ "step": 112700
1232
  },
1233
  {
1234
+ "epoch": 4.93,
1235
+ "learning_rate": 1.6381084920703951e-06,
1236
  "loss": 0.0018,
1237
+ "step": 113275
1238
  },
1239
  {
1240
+ "epoch": 4.95,
1241
+ "learning_rate": 1.082407971161559e-06,
1242
+ "loss": 0.0015,
1243
+ "step": 113850
1244
  },
1245
  {
1246
+ "epoch": 4.98,
1247
+ "learning_rate": 5.26707450252723e-07,
1248
+ "loss": 0.0016,
1249
+ "step": 114425
1250
+ },
1251
+ {
1252
+ "epoch": 5.0,
1253
+ "eval_loss": 0.027882983908057213,
1254
  "eval_max_distance": 8,
1255
  "eval_mean_distance": 0,
1256
+ "eval_runtime": 20.563,
1257
+ "eval_samples_per_second": 12.547,
1258
+ "eval_steps_per_second": 0.875,
1259
+ "step": 114970
1260
+ },
1261
+ {
1262
+ "epoch": 5.0,
1263
+ "step": 114970,
1264
+ "total_flos": 2.9139726999711744e+16,
1265
+ "train_loss": 0.0019824859863435676,
1266
+ "train_runtime": 8494.2801,
1267
+ "train_samples_per_second": 203.021,
1268
+ "train_steps_per_second": 13.535
1269
  }
1270
  ],
1271
+ "logging_steps": 575,
1272
+ "max_steps": 114970,
1273
+ "num_train_epochs": 5,
1274
+ "save_steps": 1150,
1275
+ "total_flos": 2.9139726999711744e+16,
1276
  "trial_name": null,
1277
  "trial_params": null
1278
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:19cad5fdc011eae68aae9d7cd252dcf011f18199df3fd5c6b107c8e3cbed177f
3
  size 4091
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8318e24b1ced526ec88f5a701462bec50052cffbe6f8dcc3d2adf56c581c256
3
  size 4091