CocoRoF commited on
Commit
b32154b
·
verified ·
1 Parent(s): 577fcde

Training in progress, step 9521, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:34fe23e2546f350bdc3ea7cd098359c61876cfd9860bbc4e904fff96718928df
3
  size 368988278
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b0f8d65cedaf44e4ac5bad3713ac7c3f605e9a0d2835b275497af127aebea82
3
  size 368988278
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cc0d1fd974fead158de0eadb70e7f57c959f5cfaef326177d457bb3324066005
3
  size 1107079290
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e7067080a90c90e2510ef46294c006830456dcdecf0678da97a2d600c67f826
3
  size 1107079290
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:069d595c560369c6f0dc5e92d7d7a49b75f77981476650e93e924eb0ecc848f4
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:178f30be8493d5268a297ee3466f58f6af4b6a964eca4182cdc4fc0db2805cb6
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.7876923076923077,
5
  "eval_steps": 2500,
6
- "global_step": 7500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -5281,6 +5281,1420 @@
5281
  "eval_samples_per_second": 1527.965,
5282
  "eval_steps_per_second": 47.752,
5283
  "step": 7500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5284
  }
5285
  ],
5286
  "logging_steps": 10,
@@ -5295,12 +6709,12 @@
5295
  "should_evaluate": false,
5296
  "should_log": false,
5297
  "should_save": true,
5298
- "should_training_stop": false
5299
  },
5300
  "attributes": {}
5301
  }
5302
  },
5303
- "total_flos": 2.0708374333095936e+19,
5304
  "train_batch_size": 4,
5305
  "trial_name": null,
5306
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9999491282051282,
5
  "eval_steps": 2500,
6
+ "global_step": 9521,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
5281
  "eval_samples_per_second": 1527.965,
5282
  "eval_steps_per_second": 47.752,
5283
  "step": 7500
5284
+ },
5285
+ {
5286
+ "epoch": 0.788742564102564,
5287
+ "grad_norm": 78.0625,
5288
+ "learning_rate": 9.98459487179487e-07,
5289
+ "loss": 132.7844,
5290
+ "step": 7510
5291
+ },
5292
+ {
5293
+ "epoch": 0.7897928205128205,
5294
+ "grad_norm": 76.1875,
5295
+ "learning_rate": 9.984574358974358e-07,
5296
+ "loss": 132.6412,
5297
+ "step": 7520
5298
+ },
5299
+ {
5300
+ "epoch": 0.7908430769230769,
5301
+ "grad_norm": 79.4375,
5302
+ "learning_rate": 9.984553846153847e-07,
5303
+ "loss": 133.3601,
5304
+ "step": 7530
5305
+ },
5306
+ {
5307
+ "epoch": 0.7918933333333333,
5308
+ "grad_norm": 74.9375,
5309
+ "learning_rate": 9.984533333333332e-07,
5310
+ "loss": 133.7195,
5311
+ "step": 7540
5312
+ },
5313
+ {
5314
+ "epoch": 0.7929435897435897,
5315
+ "grad_norm": 76.1875,
5316
+ "learning_rate": 9.98451282051282e-07,
5317
+ "loss": 133.1729,
5318
+ "step": 7550
5319
+ },
5320
+ {
5321
+ "epoch": 0.7939938461538462,
5322
+ "grad_norm": 79.125,
5323
+ "learning_rate": 9.984492307692308e-07,
5324
+ "loss": 133.8151,
5325
+ "step": 7560
5326
+ },
5327
+ {
5328
+ "epoch": 0.7950441025641025,
5329
+ "grad_norm": 76.8125,
5330
+ "learning_rate": 9.984471794871795e-07,
5331
+ "loss": 134.1982,
5332
+ "step": 7570
5333
+ },
5334
+ {
5335
+ "epoch": 0.796094358974359,
5336
+ "grad_norm": 80.0,
5337
+ "learning_rate": 9.984451282051281e-07,
5338
+ "loss": 133.2489,
5339
+ "step": 7580
5340
+ },
5341
+ {
5342
+ "epoch": 0.7971446153846153,
5343
+ "grad_norm": 75.0,
5344
+ "learning_rate": 9.984430769230768e-07,
5345
+ "loss": 134.1791,
5346
+ "step": 7590
5347
+ },
5348
+ {
5349
+ "epoch": 0.7981948717948718,
5350
+ "grad_norm": 77.375,
5351
+ "learning_rate": 9.984410256410255e-07,
5352
+ "loss": 134.0118,
5353
+ "step": 7600
5354
+ },
5355
+ {
5356
+ "epoch": 0.7992451282051282,
5357
+ "grad_norm": 82.3125,
5358
+ "learning_rate": 9.984389743589742e-07,
5359
+ "loss": 132.9163,
5360
+ "step": 7610
5361
+ },
5362
+ {
5363
+ "epoch": 0.8002953846153846,
5364
+ "grad_norm": 78.25,
5365
+ "learning_rate": 9.984369230769231e-07,
5366
+ "loss": 133.4907,
5367
+ "step": 7620
5368
+ },
5369
+ {
5370
+ "epoch": 0.801345641025641,
5371
+ "grad_norm": 72.375,
5372
+ "learning_rate": 9.984348717948718e-07,
5373
+ "loss": 134.2763,
5374
+ "step": 7630
5375
+ },
5376
+ {
5377
+ "epoch": 0.8023958974358975,
5378
+ "grad_norm": 77.0625,
5379
+ "learning_rate": 9.984328205128205e-07,
5380
+ "loss": 133.4557,
5381
+ "step": 7640
5382
+ },
5383
+ {
5384
+ "epoch": 0.8034461538461538,
5385
+ "grad_norm": 75.4375,
5386
+ "learning_rate": 9.984307692307692e-07,
5387
+ "loss": 133.6842,
5388
+ "step": 7650
5389
+ },
5390
+ {
5391
+ "epoch": 0.8044964102564103,
5392
+ "grad_norm": 74.75,
5393
+ "learning_rate": 9.98428717948718e-07,
5394
+ "loss": 133.6221,
5395
+ "step": 7660
5396
+ },
5397
+ {
5398
+ "epoch": 0.8055466666666666,
5399
+ "grad_norm": 74.1875,
5400
+ "learning_rate": 9.984266666666666e-07,
5401
+ "loss": 133.9818,
5402
+ "step": 7670
5403
+ },
5404
+ {
5405
+ "epoch": 0.8065969230769231,
5406
+ "grad_norm": 80.9375,
5407
+ "learning_rate": 9.984246153846153e-07,
5408
+ "loss": 134.001,
5409
+ "step": 7680
5410
+ },
5411
+ {
5412
+ "epoch": 0.8076471794871795,
5413
+ "grad_norm": 73.4375,
5414
+ "learning_rate": 9.98422564102564e-07,
5415
+ "loss": 134.0589,
5416
+ "step": 7690
5417
+ },
5418
+ {
5419
+ "epoch": 0.8086974358974359,
5420
+ "grad_norm": 74.3125,
5421
+ "learning_rate": 9.984205128205127e-07,
5422
+ "loss": 133.7496,
5423
+ "step": 7700
5424
+ },
5425
+ {
5426
+ "epoch": 0.8097476923076923,
5427
+ "grad_norm": 74.3125,
5428
+ "learning_rate": 9.984184615384616e-07,
5429
+ "loss": 133.3738,
5430
+ "step": 7710
5431
+ },
5432
+ {
5433
+ "epoch": 0.8107979487179487,
5434
+ "grad_norm": 72.0,
5435
+ "learning_rate": 9.984164102564103e-07,
5436
+ "loss": 133.1667,
5437
+ "step": 7720
5438
+ },
5439
+ {
5440
+ "epoch": 0.8118482051282051,
5441
+ "grad_norm": 75.875,
5442
+ "learning_rate": 9.98414358974359e-07,
5443
+ "loss": 132.9567,
5444
+ "step": 7730
5445
+ },
5446
+ {
5447
+ "epoch": 0.8128984615384616,
5448
+ "grad_norm": 73.3125,
5449
+ "learning_rate": 9.984123076923077e-07,
5450
+ "loss": 134.7816,
5451
+ "step": 7740
5452
+ },
5453
+ {
5454
+ "epoch": 0.8139487179487179,
5455
+ "grad_norm": 75.875,
5456
+ "learning_rate": 9.984102564102564e-07,
5457
+ "loss": 133.8593,
5458
+ "step": 7750
5459
+ },
5460
+ {
5461
+ "epoch": 0.8149989743589744,
5462
+ "grad_norm": 78.5625,
5463
+ "learning_rate": 9.98408205128205e-07,
5464
+ "loss": 133.8173,
5465
+ "step": 7760
5466
+ },
5467
+ {
5468
+ "epoch": 0.8160492307692307,
5469
+ "grad_norm": 72.25,
5470
+ "learning_rate": 9.984061538461538e-07,
5471
+ "loss": 133.5827,
5472
+ "step": 7770
5473
+ },
5474
+ {
5475
+ "epoch": 0.8170994871794872,
5476
+ "grad_norm": 77.6875,
5477
+ "learning_rate": 9.984041025641027e-07,
5478
+ "loss": 132.9576,
5479
+ "step": 7780
5480
+ },
5481
+ {
5482
+ "epoch": 0.8181497435897436,
5483
+ "grad_norm": 71.6875,
5484
+ "learning_rate": 9.984020512820511e-07,
5485
+ "loss": 133.3698,
5486
+ "step": 7790
5487
+ },
5488
+ {
5489
+ "epoch": 0.8192,
5490
+ "grad_norm": 72.8125,
5491
+ "learning_rate": 9.983999999999998e-07,
5492
+ "loss": 133.9683,
5493
+ "step": 7800
5494
+ },
5495
+ {
5496
+ "epoch": 0.8202502564102564,
5497
+ "grad_norm": 73.25,
5498
+ "learning_rate": 9.983979487179487e-07,
5499
+ "loss": 133.8763,
5500
+ "step": 7810
5501
+ },
5502
+ {
5503
+ "epoch": 0.8213005128205129,
5504
+ "grad_norm": 71.875,
5505
+ "learning_rate": 9.983958974358974e-07,
5506
+ "loss": 132.3762,
5507
+ "step": 7820
5508
+ },
5509
+ {
5510
+ "epoch": 0.8223507692307692,
5511
+ "grad_norm": 77.3125,
5512
+ "learning_rate": 9.983938461538461e-07,
5513
+ "loss": 133.6236,
5514
+ "step": 7830
5515
+ },
5516
+ {
5517
+ "epoch": 0.8234010256410257,
5518
+ "grad_norm": 83.8125,
5519
+ "learning_rate": 9.983917948717948e-07,
5520
+ "loss": 132.3988,
5521
+ "step": 7840
5522
+ },
5523
+ {
5524
+ "epoch": 0.824451282051282,
5525
+ "grad_norm": 73.375,
5526
+ "learning_rate": 9.983897435897435e-07,
5527
+ "loss": 133.4139,
5528
+ "step": 7850
5529
+ },
5530
+ {
5531
+ "epoch": 0.8255015384615385,
5532
+ "grad_norm": 79.3125,
5533
+ "learning_rate": 9.983876923076922e-07,
5534
+ "loss": 134.3546,
5535
+ "step": 7860
5536
+ },
5537
+ {
5538
+ "epoch": 0.8265517948717949,
5539
+ "grad_norm": 78.4375,
5540
+ "learning_rate": 9.98385641025641e-07,
5541
+ "loss": 134.4989,
5542
+ "step": 7870
5543
+ },
5544
+ {
5545
+ "epoch": 0.8276020512820513,
5546
+ "grad_norm": 78.0,
5547
+ "learning_rate": 9.983835897435898e-07,
5548
+ "loss": 133.5446,
5549
+ "step": 7880
5550
+ },
5551
+ {
5552
+ "epoch": 0.8286523076923077,
5553
+ "grad_norm": 77.5,
5554
+ "learning_rate": 9.983815384615383e-07,
5555
+ "loss": 133.1525,
5556
+ "step": 7890
5557
+ },
5558
+ {
5559
+ "epoch": 0.8297025641025642,
5560
+ "grad_norm": 84.1875,
5561
+ "learning_rate": 9.983794871794872e-07,
5562
+ "loss": 133.6593,
5563
+ "step": 7900
5564
+ },
5565
+ {
5566
+ "epoch": 0.8307528205128205,
5567
+ "grad_norm": 74.1875,
5568
+ "learning_rate": 9.983774358974359e-07,
5569
+ "loss": 133.9224,
5570
+ "step": 7910
5571
+ },
5572
+ {
5573
+ "epoch": 0.831803076923077,
5574
+ "grad_norm": 76.0,
5575
+ "learning_rate": 9.983753846153846e-07,
5576
+ "loss": 134.1779,
5577
+ "step": 7920
5578
+ },
5579
+ {
5580
+ "epoch": 0.8328533333333333,
5581
+ "grad_norm": 73.8125,
5582
+ "learning_rate": 9.983733333333333e-07,
5583
+ "loss": 135.2825,
5584
+ "step": 7930
5585
+ },
5586
+ {
5587
+ "epoch": 0.8339035897435897,
5588
+ "grad_norm": 72.875,
5589
+ "learning_rate": 9.98371282051282e-07,
5590
+ "loss": 133.2882,
5591
+ "step": 7940
5592
+ },
5593
+ {
5594
+ "epoch": 0.8349538461538462,
5595
+ "grad_norm": 72.4375,
5596
+ "learning_rate": 9.983692307692307e-07,
5597
+ "loss": 134.7268,
5598
+ "step": 7950
5599
+ },
5600
+ {
5601
+ "epoch": 0.8360041025641025,
5602
+ "grad_norm": 78.8125,
5603
+ "learning_rate": 9.983671794871794e-07,
5604
+ "loss": 133.8474,
5605
+ "step": 7960
5606
+ },
5607
+ {
5608
+ "epoch": 0.837054358974359,
5609
+ "grad_norm": 74.5,
5610
+ "learning_rate": 9.983651282051283e-07,
5611
+ "loss": 131.9032,
5612
+ "step": 7970
5613
+ },
5614
+ {
5615
+ "epoch": 0.8381046153846153,
5616
+ "grad_norm": 77.5625,
5617
+ "learning_rate": 9.98363076923077e-07,
5618
+ "loss": 134.0065,
5619
+ "step": 7980
5620
+ },
5621
+ {
5622
+ "epoch": 0.8391548717948718,
5623
+ "grad_norm": 78.25,
5624
+ "learning_rate": 9.983610256410257e-07,
5625
+ "loss": 133.8622,
5626
+ "step": 7990
5627
+ },
5628
+ {
5629
+ "epoch": 0.8402051282051282,
5630
+ "grad_norm": 76.3125,
5631
+ "learning_rate": 9.983589743589743e-07,
5632
+ "loss": 133.5396,
5633
+ "step": 8000
5634
+ },
5635
+ {
5636
+ "epoch": 0.8412553846153846,
5637
+ "grad_norm": 83.25,
5638
+ "learning_rate": 9.98356923076923e-07,
5639
+ "loss": 134.0301,
5640
+ "step": 8010
5641
+ },
5642
+ {
5643
+ "epoch": 0.842305641025641,
5644
+ "grad_norm": 77.5625,
5645
+ "learning_rate": 9.983548717948717e-07,
5646
+ "loss": 133.8874,
5647
+ "step": 8020
5648
+ },
5649
+ {
5650
+ "epoch": 0.8433558974358975,
5651
+ "grad_norm": 76.25,
5652
+ "learning_rate": 9.983528205128204e-07,
5653
+ "loss": 133.7713,
5654
+ "step": 8030
5655
+ },
5656
+ {
5657
+ "epoch": 0.8444061538461538,
5658
+ "grad_norm": 85.25,
5659
+ "learning_rate": 9.983507692307691e-07,
5660
+ "loss": 132.7225,
5661
+ "step": 8040
5662
+ },
5663
+ {
5664
+ "epoch": 0.8454564102564103,
5665
+ "grad_norm": 77.9375,
5666
+ "learning_rate": 9.983487179487178e-07,
5667
+ "loss": 132.4973,
5668
+ "step": 8050
5669
+ },
5670
+ {
5671
+ "epoch": 0.8465066666666666,
5672
+ "grad_norm": 77.5,
5673
+ "learning_rate": 9.983466666666667e-07,
5674
+ "loss": 134.7827,
5675
+ "step": 8060
5676
+ },
5677
+ {
5678
+ "epoch": 0.8475569230769231,
5679
+ "grad_norm": 78.9375,
5680
+ "learning_rate": 9.983446153846154e-07,
5681
+ "loss": 134.2993,
5682
+ "step": 8070
5683
+ },
5684
+ {
5685
+ "epoch": 0.8486071794871795,
5686
+ "grad_norm": 75.875,
5687
+ "learning_rate": 9.983425641025641e-07,
5688
+ "loss": 132.517,
5689
+ "step": 8080
5690
+ },
5691
+ {
5692
+ "epoch": 0.8496574358974359,
5693
+ "grad_norm": 72.8125,
5694
+ "learning_rate": 9.983405128205128e-07,
5695
+ "loss": 132.6084,
5696
+ "step": 8090
5697
+ },
5698
+ {
5699
+ "epoch": 0.8507076923076923,
5700
+ "grad_norm": 75.1875,
5701
+ "learning_rate": 9.983384615384615e-07,
5702
+ "loss": 133.5499,
5703
+ "step": 8100
5704
+ },
5705
+ {
5706
+ "epoch": 0.8517579487179487,
5707
+ "grad_norm": 76.0,
5708
+ "learning_rate": 9.983364102564102e-07,
5709
+ "loss": 131.2289,
5710
+ "step": 8110
5711
+ },
5712
+ {
5713
+ "epoch": 0.8528082051282051,
5714
+ "grad_norm": 74.625,
5715
+ "learning_rate": 9.983343589743589e-07,
5716
+ "loss": 133.3548,
5717
+ "step": 8120
5718
+ },
5719
+ {
5720
+ "epoch": 0.8538584615384616,
5721
+ "grad_norm": 80.9375,
5722
+ "learning_rate": 9.983323076923078e-07,
5723
+ "loss": 132.2297,
5724
+ "step": 8130
5725
+ },
5726
+ {
5727
+ "epoch": 0.8549087179487179,
5728
+ "grad_norm": 73.125,
5729
+ "learning_rate": 9.983302564102563e-07,
5730
+ "loss": 133.2755,
5731
+ "step": 8140
5732
+ },
5733
+ {
5734
+ "epoch": 0.8559589743589744,
5735
+ "grad_norm": 75.3125,
5736
+ "learning_rate": 9.98328205128205e-07,
5737
+ "loss": 134.9403,
5738
+ "step": 8150
5739
+ },
5740
+ {
5741
+ "epoch": 0.8570092307692307,
5742
+ "grad_norm": 73.75,
5743
+ "learning_rate": 9.983261538461539e-07,
5744
+ "loss": 133.4854,
5745
+ "step": 8160
5746
+ },
5747
+ {
5748
+ "epoch": 0.8580594871794872,
5749
+ "grad_norm": 75.4375,
5750
+ "learning_rate": 9.983241025641026e-07,
5751
+ "loss": 133.9221,
5752
+ "step": 8170
5753
+ },
5754
+ {
5755
+ "epoch": 0.8591097435897436,
5756
+ "grad_norm": 73.0625,
5757
+ "learning_rate": 9.983220512820513e-07,
5758
+ "loss": 133.0202,
5759
+ "step": 8180
5760
+ },
5761
+ {
5762
+ "epoch": 0.86016,
5763
+ "grad_norm": 77.8125,
5764
+ "learning_rate": 9.9832e-07,
5765
+ "loss": 133.9612,
5766
+ "step": 8190
5767
+ },
5768
+ {
5769
+ "epoch": 0.8612102564102564,
5770
+ "grad_norm": 74.75,
5771
+ "learning_rate": 9.983179487179486e-07,
5772
+ "loss": 132.866,
5773
+ "step": 8200
5774
+ },
5775
+ {
5776
+ "epoch": 0.8622605128205129,
5777
+ "grad_norm": 78.6875,
5778
+ "learning_rate": 9.983158974358973e-07,
5779
+ "loss": 131.9501,
5780
+ "step": 8210
5781
+ },
5782
+ {
5783
+ "epoch": 0.8633107692307692,
5784
+ "grad_norm": 72.5,
5785
+ "learning_rate": 9.98313846153846e-07,
5786
+ "loss": 133.986,
5787
+ "step": 8220
5788
+ },
5789
+ {
5790
+ "epoch": 0.8643610256410257,
5791
+ "grad_norm": 77.5625,
5792
+ "learning_rate": 9.98311794871795e-07,
5793
+ "loss": 134.4183,
5794
+ "step": 8230
5795
+ },
5796
+ {
5797
+ "epoch": 0.865411282051282,
5798
+ "grad_norm": 79.1875,
5799
+ "learning_rate": 9.983097435897434e-07,
5800
+ "loss": 133.7371,
5801
+ "step": 8240
5802
+ },
5803
+ {
5804
+ "epoch": 0.8664615384615385,
5805
+ "grad_norm": 83.9375,
5806
+ "learning_rate": 9.983076923076923e-07,
5807
+ "loss": 131.5742,
5808
+ "step": 8250
5809
+ },
5810
+ {
5811
+ "epoch": 0.8675117948717949,
5812
+ "grad_norm": 78.125,
5813
+ "learning_rate": 9.98305641025641e-07,
5814
+ "loss": 133.3313,
5815
+ "step": 8260
5816
+ },
5817
+ {
5818
+ "epoch": 0.8685620512820513,
5819
+ "grad_norm": 81.25,
5820
+ "learning_rate": 9.983035897435897e-07,
5821
+ "loss": 133.0588,
5822
+ "step": 8270
5823
+ },
5824
+ {
5825
+ "epoch": 0.8696123076923077,
5826
+ "grad_norm": 74.875,
5827
+ "learning_rate": 9.983015384615384e-07,
5828
+ "loss": 132.9553,
5829
+ "step": 8280
5830
+ },
5831
+ {
5832
+ "epoch": 0.8706625641025642,
5833
+ "grad_norm": 76.25,
5834
+ "learning_rate": 9.982994871794871e-07,
5835
+ "loss": 132.9017,
5836
+ "step": 8290
5837
+ },
5838
+ {
5839
+ "epoch": 0.8717128205128205,
5840
+ "grad_norm": 80.25,
5841
+ "learning_rate": 9.982974358974358e-07,
5842
+ "loss": 132.7887,
5843
+ "step": 8300
5844
+ },
5845
+ {
5846
+ "epoch": 0.872763076923077,
5847
+ "grad_norm": 82.5,
5848
+ "learning_rate": 9.982953846153845e-07,
5849
+ "loss": 131.9362,
5850
+ "step": 8310
5851
+ },
5852
+ {
5853
+ "epoch": 0.8738133333333333,
5854
+ "grad_norm": 74.6875,
5855
+ "learning_rate": 9.982933333333334e-07,
5856
+ "loss": 133.4346,
5857
+ "step": 8320
5858
+ },
5859
+ {
5860
+ "epoch": 0.8748635897435898,
5861
+ "grad_norm": 74.4375,
5862
+ "learning_rate": 9.98291282051282e-07,
5863
+ "loss": 133.0578,
5864
+ "step": 8330
5865
+ },
5866
+ {
5867
+ "epoch": 0.8759138461538462,
5868
+ "grad_norm": 74.75,
5869
+ "learning_rate": 9.982892307692308e-07,
5870
+ "loss": 133.309,
5871
+ "step": 8340
5872
+ },
5873
+ {
5874
+ "epoch": 0.8769641025641025,
5875
+ "grad_norm": 78.125,
5876
+ "learning_rate": 9.982871794871795e-07,
5877
+ "loss": 134.0469,
5878
+ "step": 8350
5879
+ },
5880
+ {
5881
+ "epoch": 0.878014358974359,
5882
+ "grad_norm": 82.125,
5883
+ "learning_rate": 9.982851282051282e-07,
5884
+ "loss": 131.7594,
5885
+ "step": 8360
5886
+ },
5887
+ {
5888
+ "epoch": 0.8790646153846153,
5889
+ "grad_norm": 80.9375,
5890
+ "learning_rate": 9.982830769230769e-07,
5891
+ "loss": 134.0729,
5892
+ "step": 8370
5893
+ },
5894
+ {
5895
+ "epoch": 0.8801148717948718,
5896
+ "grad_norm": 80.75,
5897
+ "learning_rate": 9.982810256410256e-07,
5898
+ "loss": 133.4013,
5899
+ "step": 8380
5900
+ },
5901
+ {
5902
+ "epoch": 0.8811651282051282,
5903
+ "grad_norm": 74.6875,
5904
+ "learning_rate": 9.982789743589743e-07,
5905
+ "loss": 132.9633,
5906
+ "step": 8390
5907
+ },
5908
+ {
5909
+ "epoch": 0.8822153846153846,
5910
+ "grad_norm": 74.3125,
5911
+ "learning_rate": 9.98276923076923e-07,
5912
+ "loss": 133.8356,
5913
+ "step": 8400
5914
+ },
5915
+ {
5916
+ "epoch": 0.883265641025641,
5917
+ "grad_norm": 75.0,
5918
+ "learning_rate": 9.982748717948719e-07,
5919
+ "loss": 133.2425,
5920
+ "step": 8410
5921
+ },
5922
+ {
5923
+ "epoch": 0.8843158974358974,
5924
+ "grad_norm": 73.25,
5925
+ "learning_rate": 9.982728205128206e-07,
5926
+ "loss": 132.7144,
5927
+ "step": 8420
5928
+ },
5929
+ {
5930
+ "epoch": 0.8853661538461538,
5931
+ "grad_norm": 78.625,
5932
+ "learning_rate": 9.98270769230769e-07,
5933
+ "loss": 134.2052,
5934
+ "step": 8430
5935
+ },
5936
+ {
5937
+ "epoch": 0.8864164102564103,
5938
+ "grad_norm": 71.0,
5939
+ "learning_rate": 9.98268717948718e-07,
5940
+ "loss": 133.7353,
5941
+ "step": 8440
5942
+ },
5943
+ {
5944
+ "epoch": 0.8874666666666666,
5945
+ "grad_norm": 77.3125,
5946
+ "learning_rate": 9.982666666666666e-07,
5947
+ "loss": 134.3944,
5948
+ "step": 8450
5949
+ },
5950
+ {
5951
+ "epoch": 0.8885169230769231,
5952
+ "grad_norm": 79.5,
5953
+ "learning_rate": 9.982646153846153e-07,
5954
+ "loss": 134.6475,
5955
+ "step": 8460
5956
+ },
5957
+ {
5958
+ "epoch": 0.8895671794871794,
5959
+ "grad_norm": 78.375,
5960
+ "learning_rate": 9.98262564102564e-07,
5961
+ "loss": 133.3185,
5962
+ "step": 8470
5963
+ },
5964
+ {
5965
+ "epoch": 0.8906174358974359,
5966
+ "grad_norm": 83.3125,
5967
+ "learning_rate": 9.98260512820513e-07,
5968
+ "loss": 133.362,
5969
+ "step": 8480
5970
+ },
5971
+ {
5972
+ "epoch": 0.8916676923076923,
5973
+ "grad_norm": 75.875,
5974
+ "learning_rate": 9.982584615384614e-07,
5975
+ "loss": 132.8726,
5976
+ "step": 8490
5977
+ },
5978
+ {
5979
+ "epoch": 0.8927179487179487,
5980
+ "grad_norm": 73.5625,
5981
+ "learning_rate": 9.9825641025641e-07,
5982
+ "loss": 132.8631,
5983
+ "step": 8500
5984
+ },
5985
+ {
5986
+ "epoch": 0.8937682051282051,
5987
+ "grad_norm": 73.4375,
5988
+ "learning_rate": 9.98254358974359e-07,
5989
+ "loss": 132.5936,
5990
+ "step": 8510
5991
+ },
5992
+ {
5993
+ "epoch": 0.8948184615384616,
5994
+ "grad_norm": 80.75,
5995
+ "learning_rate": 9.982523076923077e-07,
5996
+ "loss": 133.115,
5997
+ "step": 8520
5998
+ },
5999
+ {
6000
+ "epoch": 0.8958687179487179,
6001
+ "grad_norm": 78.25,
6002
+ "learning_rate": 9.982502564102564e-07,
6003
+ "loss": 132.0712,
6004
+ "step": 8530
6005
+ },
6006
+ {
6007
+ "epoch": 0.8969189743589744,
6008
+ "grad_norm": 71.3125,
6009
+ "learning_rate": 9.98248205128205e-07,
6010
+ "loss": 134.2035,
6011
+ "step": 8540
6012
+ },
6013
+ {
6014
+ "epoch": 0.8979692307692307,
6015
+ "grad_norm": 75.8125,
6016
+ "learning_rate": 9.982461538461538e-07,
6017
+ "loss": 134.253,
6018
+ "step": 8550
6019
+ },
6020
+ {
6021
+ "epoch": 0.8990194871794872,
6022
+ "grad_norm": 76.125,
6023
+ "learning_rate": 9.982441025641025e-07,
6024
+ "loss": 132.0401,
6025
+ "step": 8560
6026
+ },
6027
+ {
6028
+ "epoch": 0.9000697435897436,
6029
+ "grad_norm": 77.625,
6030
+ "learning_rate": 9.982420512820512e-07,
6031
+ "loss": 133.6438,
6032
+ "step": 8570
6033
+ },
6034
+ {
6035
+ "epoch": 0.90112,
6036
+ "grad_norm": 74.8125,
6037
+ "learning_rate": 9.9824e-07,
6038
+ "loss": 134.649,
6039
+ "step": 8580
6040
+ },
6041
+ {
6042
+ "epoch": 0.9021702564102564,
6043
+ "grad_norm": 78.8125,
6044
+ "learning_rate": 9.982379487179486e-07,
6045
+ "loss": 133.015,
6046
+ "step": 8590
6047
+ },
6048
+ {
6049
+ "epoch": 0.9032205128205129,
6050
+ "grad_norm": 79.5,
6051
+ "learning_rate": 9.982358974358975e-07,
6052
+ "loss": 133.7255,
6053
+ "step": 8600
6054
+ },
6055
+ {
6056
+ "epoch": 0.9042707692307692,
6057
+ "grad_norm": 76.375,
6058
+ "learning_rate": 9.982338461538462e-07,
6059
+ "loss": 133.1744,
6060
+ "step": 8610
6061
+ },
6062
+ {
6063
+ "epoch": 0.9053210256410257,
6064
+ "grad_norm": 76.8125,
6065
+ "learning_rate": 9.982317948717949e-07,
6066
+ "loss": 133.6467,
6067
+ "step": 8620
6068
+ },
6069
+ {
6070
+ "epoch": 0.906371282051282,
6071
+ "grad_norm": 75.5,
6072
+ "learning_rate": 9.982297435897435e-07,
6073
+ "loss": 133.2113,
6074
+ "step": 8630
6075
+ },
6076
+ {
6077
+ "epoch": 0.9074215384615385,
6078
+ "grad_norm": 90.875,
6079
+ "learning_rate": 9.982276923076922e-07,
6080
+ "loss": 134.016,
6081
+ "step": 8640
6082
+ },
6083
+ {
6084
+ "epoch": 0.9084717948717949,
6085
+ "grad_norm": 75.625,
6086
+ "learning_rate": 9.98225641025641e-07,
6087
+ "loss": 132.7446,
6088
+ "step": 8650
6089
+ },
6090
+ {
6091
+ "epoch": 0.9095220512820513,
6092
+ "grad_norm": 83.25,
6093
+ "learning_rate": 9.982235897435896e-07,
6094
+ "loss": 134.9909,
6095
+ "step": 8660
6096
+ },
6097
+ {
6098
+ "epoch": 0.9105723076923077,
6099
+ "grad_norm": 84.3125,
6100
+ "learning_rate": 9.982215384615385e-07,
6101
+ "loss": 133.9167,
6102
+ "step": 8670
6103
+ },
6104
+ {
6105
+ "epoch": 0.9116225641025641,
6106
+ "grad_norm": 77.375,
6107
+ "learning_rate": 9.98219487179487e-07,
6108
+ "loss": 133.2151,
6109
+ "step": 8680
6110
+ },
6111
+ {
6112
+ "epoch": 0.9126728205128205,
6113
+ "grad_norm": 74.375,
6114
+ "learning_rate": 9.98217435897436e-07,
6115
+ "loss": 132.7321,
6116
+ "step": 8690
6117
+ },
6118
+ {
6119
+ "epoch": 0.913723076923077,
6120
+ "grad_norm": 76.25,
6121
+ "learning_rate": 9.982153846153846e-07,
6122
+ "loss": 133.1595,
6123
+ "step": 8700
6124
+ },
6125
+ {
6126
+ "epoch": 0.9147733333333333,
6127
+ "grad_norm": 74.125,
6128
+ "learning_rate": 9.982133333333333e-07,
6129
+ "loss": 133.1293,
6130
+ "step": 8710
6131
+ },
6132
+ {
6133
+ "epoch": 0.9158235897435898,
6134
+ "grad_norm": 75.5625,
6135
+ "learning_rate": 9.98211282051282e-07,
6136
+ "loss": 133.3949,
6137
+ "step": 8720
6138
+ },
6139
+ {
6140
+ "epoch": 0.9168738461538462,
6141
+ "grad_norm": 77.0,
6142
+ "learning_rate": 9.982092307692307e-07,
6143
+ "loss": 133.7157,
6144
+ "step": 8730
6145
+ },
6146
+ {
6147
+ "epoch": 0.9179241025641026,
6148
+ "grad_norm": 69.6875,
6149
+ "learning_rate": 9.982071794871794e-07,
6150
+ "loss": 131.9628,
6151
+ "step": 8740
6152
+ },
6153
+ {
6154
+ "epoch": 0.918974358974359,
6155
+ "grad_norm": 76.75,
6156
+ "learning_rate": 9.98205128205128e-07,
6157
+ "loss": 133.9219,
6158
+ "step": 8750
6159
+ },
6160
+ {
6161
+ "epoch": 0.9200246153846153,
6162
+ "grad_norm": 77.0,
6163
+ "learning_rate": 9.98203076923077e-07,
6164
+ "loss": 133.5647,
6165
+ "step": 8760
6166
+ },
6167
+ {
6168
+ "epoch": 0.9210748717948718,
6169
+ "grad_norm": 73.3125,
6170
+ "learning_rate": 9.982010256410257e-07,
6171
+ "loss": 133.8773,
6172
+ "step": 8770
6173
+ },
6174
+ {
6175
+ "epoch": 0.9221251282051282,
6176
+ "grad_norm": 81.875,
6177
+ "learning_rate": 9.981989743589742e-07,
6178
+ "loss": 133.6441,
6179
+ "step": 8780
6180
+ },
6181
+ {
6182
+ "epoch": 0.9231753846153846,
6183
+ "grad_norm": 77.375,
6184
+ "learning_rate": 9.98196923076923e-07,
6185
+ "loss": 134.1967,
6186
+ "step": 8790
6187
+ },
6188
+ {
6189
+ "epoch": 0.924225641025641,
6190
+ "grad_norm": 75.75,
6191
+ "learning_rate": 9.981948717948718e-07,
6192
+ "loss": 133.6943,
6193
+ "step": 8800
6194
+ },
6195
+ {
6196
+ "epoch": 0.9252758974358974,
6197
+ "grad_norm": 74.5,
6198
+ "learning_rate": 9.981928205128205e-07,
6199
+ "loss": 131.9684,
6200
+ "step": 8810
6201
+ },
6202
+ {
6203
+ "epoch": 0.9263261538461538,
6204
+ "grad_norm": 77.9375,
6205
+ "learning_rate": 9.981907692307692e-07,
6206
+ "loss": 132.8939,
6207
+ "step": 8820
6208
+ },
6209
+ {
6210
+ "epoch": 0.9273764102564103,
6211
+ "grad_norm": 82.5625,
6212
+ "learning_rate": 9.981887179487178e-07,
6213
+ "loss": 132.9447,
6214
+ "step": 8830
6215
+ },
6216
+ {
6217
+ "epoch": 0.9284266666666666,
6218
+ "grad_norm": 78.5625,
6219
+ "learning_rate": 9.981866666666665e-07,
6220
+ "loss": 133.4321,
6221
+ "step": 8840
6222
+ },
6223
+ {
6224
+ "epoch": 0.9294769230769231,
6225
+ "grad_norm": 78.875,
6226
+ "learning_rate": 9.981846153846152e-07,
6227
+ "loss": 133.6094,
6228
+ "step": 8850
6229
+ },
6230
+ {
6231
+ "epoch": 0.9305271794871794,
6232
+ "grad_norm": 71.25,
6233
+ "learning_rate": 9.981825641025641e-07,
6234
+ "loss": 132.4788,
6235
+ "step": 8860
6236
+ },
6237
+ {
6238
+ "epoch": 0.9315774358974359,
6239
+ "grad_norm": 73.9375,
6240
+ "learning_rate": 9.981805128205128e-07,
6241
+ "loss": 132.1331,
6242
+ "step": 8870
6243
+ },
6244
+ {
6245
+ "epoch": 0.9326276923076923,
6246
+ "grad_norm": 74.6875,
6247
+ "learning_rate": 9.981784615384615e-07,
6248
+ "loss": 132.9293,
6249
+ "step": 8880
6250
+ },
6251
+ {
6252
+ "epoch": 0.9336779487179487,
6253
+ "grad_norm": 78.0625,
6254
+ "learning_rate": 9.981764102564102e-07,
6255
+ "loss": 134.1451,
6256
+ "step": 8890
6257
+ },
6258
+ {
6259
+ "epoch": 0.9347282051282051,
6260
+ "grad_norm": 77.75,
6261
+ "learning_rate": 9.98174358974359e-07,
6262
+ "loss": 132.7121,
6263
+ "step": 8900
6264
+ },
6265
+ {
6266
+ "epoch": 0.9357784615384616,
6267
+ "grad_norm": 72.75,
6268
+ "learning_rate": 9.981723076923076e-07,
6269
+ "loss": 132.0682,
6270
+ "step": 8910
6271
+ },
6272
+ {
6273
+ "epoch": 0.9368287179487179,
6274
+ "grad_norm": 78.75,
6275
+ "learning_rate": 9.981702564102563e-07,
6276
+ "loss": 132.9574,
6277
+ "step": 8920
6278
+ },
6279
+ {
6280
+ "epoch": 0.9378789743589744,
6281
+ "grad_norm": 81.625,
6282
+ "learning_rate": 9.98168205128205e-07,
6283
+ "loss": 133.9366,
6284
+ "step": 8930
6285
+ },
6286
+ {
6287
+ "epoch": 0.9389292307692307,
6288
+ "grad_norm": 74.125,
6289
+ "learning_rate": 9.981661538461537e-07,
6290
+ "loss": 134.1782,
6291
+ "step": 8940
6292
+ },
6293
+ {
6294
+ "epoch": 0.9399794871794872,
6295
+ "grad_norm": 80.4375,
6296
+ "learning_rate": 9.981641025641026e-07,
6297
+ "loss": 134.4583,
6298
+ "step": 8950
6299
+ },
6300
+ {
6301
+ "epoch": 0.9410297435897436,
6302
+ "grad_norm": 79.8125,
6303
+ "learning_rate": 9.981620512820513e-07,
6304
+ "loss": 134.201,
6305
+ "step": 8960
6306
+ },
6307
+ {
6308
+ "epoch": 0.94208,
6309
+ "grad_norm": 77.0,
6310
+ "learning_rate": 9.9816e-07,
6311
+ "loss": 133.9063,
6312
+ "step": 8970
6313
+ },
6314
+ {
6315
+ "epoch": 0.9431302564102564,
6316
+ "grad_norm": 74.8125,
6317
+ "learning_rate": 9.981579487179487e-07,
6318
+ "loss": 133.6804,
6319
+ "step": 8980
6320
+ },
6321
+ {
6322
+ "epoch": 0.9441805128205129,
6323
+ "grad_norm": 73.375,
6324
+ "learning_rate": 9.981558974358974e-07,
6325
+ "loss": 133.0501,
6326
+ "step": 8990
6327
+ },
6328
+ {
6329
+ "epoch": 0.9452307692307692,
6330
+ "grad_norm": 71.5,
6331
+ "learning_rate": 9.98153846153846e-07,
6332
+ "loss": 133.0835,
6333
+ "step": 9000
6334
+ },
6335
+ {
6336
+ "epoch": 0.9462810256410257,
6337
+ "grad_norm": 76.5625,
6338
+ "learning_rate": 9.981517948717948e-07,
6339
+ "loss": 133.7177,
6340
+ "step": 9010
6341
+ },
6342
+ {
6343
+ "epoch": 0.947331282051282,
6344
+ "grad_norm": 70.625,
6345
+ "learning_rate": 9.981497435897437e-07,
6346
+ "loss": 132.7033,
6347
+ "step": 9020
6348
+ },
6349
+ {
6350
+ "epoch": 0.9483815384615385,
6351
+ "grad_norm": 81.25,
6352
+ "learning_rate": 9.981476923076921e-07,
6353
+ "loss": 132.286,
6354
+ "step": 9030
6355
+ },
6356
+ {
6357
+ "epoch": 0.9494317948717949,
6358
+ "grad_norm": 82.125,
6359
+ "learning_rate": 9.98145641025641e-07,
6360
+ "loss": 132.4269,
6361
+ "step": 9040
6362
+ },
6363
+ {
6364
+ "epoch": 0.9504820512820513,
6365
+ "grad_norm": 76.875,
6366
+ "learning_rate": 9.981435897435897e-07,
6367
+ "loss": 132.6721,
6368
+ "step": 9050
6369
+ },
6370
+ {
6371
+ "epoch": 0.9515323076923077,
6372
+ "grad_norm": 72.5,
6373
+ "learning_rate": 9.981415384615384e-07,
6374
+ "loss": 132.3392,
6375
+ "step": 9060
6376
+ },
6377
+ {
6378
+ "epoch": 0.9525825641025641,
6379
+ "grad_norm": 82.25,
6380
+ "learning_rate": 9.981394871794871e-07,
6381
+ "loss": 133.6953,
6382
+ "step": 9070
6383
+ },
6384
+ {
6385
+ "epoch": 0.9536328205128205,
6386
+ "grad_norm": 81.0625,
6387
+ "learning_rate": 9.981374358974358e-07,
6388
+ "loss": 133.0798,
6389
+ "step": 9080
6390
+ },
6391
+ {
6392
+ "epoch": 0.954683076923077,
6393
+ "grad_norm": 75.0625,
6394
+ "learning_rate": 9.981353846153845e-07,
6395
+ "loss": 134.3506,
6396
+ "step": 9090
6397
+ },
6398
+ {
6399
+ "epoch": 0.9557333333333333,
6400
+ "grad_norm": 82.4375,
6401
+ "learning_rate": 9.981333333333332e-07,
6402
+ "loss": 135.1331,
6403
+ "step": 9100
6404
+ },
6405
+ {
6406
+ "epoch": 0.9567835897435898,
6407
+ "grad_norm": 76.375,
6408
+ "learning_rate": 9.981312820512821e-07,
6409
+ "loss": 131.6688,
6410
+ "step": 9110
6411
+ },
6412
+ {
6413
+ "epoch": 0.9578338461538461,
6414
+ "grad_norm": 72.25,
6415
+ "learning_rate": 9.981292307692308e-07,
6416
+ "loss": 134.2536,
6417
+ "step": 9120
6418
+ },
6419
+ {
6420
+ "epoch": 0.9588841025641026,
6421
+ "grad_norm": 75.375,
6422
+ "learning_rate": 9.981271794871793e-07,
6423
+ "loss": 132.993,
6424
+ "step": 9130
6425
+ },
6426
+ {
6427
+ "epoch": 0.959934358974359,
6428
+ "grad_norm": 79.625,
6429
+ "learning_rate": 9.981251282051282e-07,
6430
+ "loss": 132.7532,
6431
+ "step": 9140
6432
+ },
6433
+ {
6434
+ "epoch": 0.9609846153846154,
6435
+ "grad_norm": 72.8125,
6436
+ "learning_rate": 9.98123076923077e-07,
6437
+ "loss": 134.5466,
6438
+ "step": 9150
6439
+ },
6440
+ {
6441
+ "epoch": 0.9620348717948718,
6442
+ "grad_norm": 73.625,
6443
+ "learning_rate": 9.981210256410256e-07,
6444
+ "loss": 133.4606,
6445
+ "step": 9160
6446
+ },
6447
+ {
6448
+ "epoch": 0.9630851282051283,
6449
+ "grad_norm": 69.25,
6450
+ "learning_rate": 9.981189743589743e-07,
6451
+ "loss": 132.9202,
6452
+ "step": 9170
6453
+ },
6454
+ {
6455
+ "epoch": 0.9641353846153846,
6456
+ "grad_norm": 77.625,
6457
+ "learning_rate": 9.98116923076923e-07,
6458
+ "loss": 133.554,
6459
+ "step": 9180
6460
+ },
6461
+ {
6462
+ "epoch": 0.965185641025641,
6463
+ "grad_norm": 72.0,
6464
+ "learning_rate": 9.981148717948717e-07,
6465
+ "loss": 133.4544,
6466
+ "step": 9190
6467
+ },
6468
+ {
6469
+ "epoch": 0.9662358974358974,
6470
+ "grad_norm": 72.75,
6471
+ "learning_rate": 9.981128205128204e-07,
6472
+ "loss": 132.8837,
6473
+ "step": 9200
6474
+ },
6475
+ {
6476
+ "epoch": 0.9672861538461538,
6477
+ "grad_norm": 75.1875,
6478
+ "learning_rate": 9.981107692307693e-07,
6479
+ "loss": 133.7232,
6480
+ "step": 9210
6481
+ },
6482
+ {
6483
+ "epoch": 0.9683364102564103,
6484
+ "grad_norm": 69.6875,
6485
+ "learning_rate": 9.98108717948718e-07,
6486
+ "loss": 132.7222,
6487
+ "step": 9220
6488
+ },
6489
+ {
6490
+ "epoch": 0.9693866666666666,
6491
+ "grad_norm": 76.3125,
6492
+ "learning_rate": 9.981066666666667e-07,
6493
+ "loss": 134.062,
6494
+ "step": 9230
6495
+ },
6496
+ {
6497
+ "epoch": 0.9704369230769231,
6498
+ "grad_norm": 73.8125,
6499
+ "learning_rate": 9.981046153846154e-07,
6500
+ "loss": 133.7961,
6501
+ "step": 9240
6502
+ },
6503
+ {
6504
+ "epoch": 0.9714871794871794,
6505
+ "grad_norm": 76.25,
6506
+ "learning_rate": 9.98102564102564e-07,
6507
+ "loss": 133.53,
6508
+ "step": 9250
6509
+ },
6510
+ {
6511
+ "epoch": 0.9725374358974359,
6512
+ "grad_norm": 76.8125,
6513
+ "learning_rate": 9.981005128205127e-07,
6514
+ "loss": 134.952,
6515
+ "step": 9260
6516
+ },
6517
+ {
6518
+ "epoch": 0.9735876923076923,
6519
+ "grad_norm": 74.1875,
6520
+ "learning_rate": 9.980984615384614e-07,
6521
+ "loss": 132.9375,
6522
+ "step": 9270
6523
+ },
6524
+ {
6525
+ "epoch": 0.9746379487179487,
6526
+ "grad_norm": 87.25,
6527
+ "learning_rate": 9.980964102564101e-07,
6528
+ "loss": 132.9781,
6529
+ "step": 9280
6530
+ },
6531
+ {
6532
+ "epoch": 0.9756882051282051,
6533
+ "grad_norm": 74.4375,
6534
+ "learning_rate": 9.980943589743588e-07,
6535
+ "loss": 132.2021,
6536
+ "step": 9290
6537
+ },
6538
+ {
6539
+ "epoch": 0.9767384615384616,
6540
+ "grad_norm": 73.0,
6541
+ "learning_rate": 9.980923076923077e-07,
6542
+ "loss": 132.5939,
6543
+ "step": 9300
6544
+ },
6545
+ {
6546
+ "epoch": 0.9777887179487179,
6547
+ "grad_norm": 77.5,
6548
+ "learning_rate": 9.980902564102564e-07,
6549
+ "loss": 133.525,
6550
+ "step": 9310
6551
+ },
6552
+ {
6553
+ "epoch": 0.9788389743589744,
6554
+ "grad_norm": 79.1875,
6555
+ "learning_rate": 9.980882051282051e-07,
6556
+ "loss": 132.8209,
6557
+ "step": 9320
6558
+ },
6559
+ {
6560
+ "epoch": 0.9798892307692307,
6561
+ "grad_norm": 76.9375,
6562
+ "learning_rate": 9.980861538461538e-07,
6563
+ "loss": 133.3684,
6564
+ "step": 9330
6565
+ },
6566
+ {
6567
+ "epoch": 0.9809394871794872,
6568
+ "grad_norm": 71.9375,
6569
+ "learning_rate": 9.980841025641025e-07,
6570
+ "loss": 130.963,
6571
+ "step": 9340
6572
+ },
6573
+ {
6574
+ "epoch": 0.9819897435897436,
6575
+ "grad_norm": 79.9375,
6576
+ "learning_rate": 9.980820512820512e-07,
6577
+ "loss": 133.7593,
6578
+ "step": 9350
6579
+ },
6580
+ {
6581
+ "epoch": 0.98304,
6582
+ "grad_norm": 77.125,
6583
+ "learning_rate": 9.9808e-07,
6584
+ "loss": 132.379,
6585
+ "step": 9360
6586
+ },
6587
+ {
6588
+ "epoch": 0.9840902564102564,
6589
+ "grad_norm": 76.75,
6590
+ "learning_rate": 9.980779487179488e-07,
6591
+ "loss": 131.4839,
6592
+ "step": 9370
6593
+ },
6594
+ {
6595
+ "epoch": 0.9851405128205128,
6596
+ "grad_norm": 73.8125,
6597
+ "learning_rate": 9.980758974358973e-07,
6598
+ "loss": 133.2282,
6599
+ "step": 9380
6600
+ },
6601
+ {
6602
+ "epoch": 0.9861907692307692,
6603
+ "grad_norm": 73.9375,
6604
+ "learning_rate": 9.980738461538462e-07,
6605
+ "loss": 132.6678,
6606
+ "step": 9390
6607
+ },
6608
+ {
6609
+ "epoch": 0.9872410256410257,
6610
+ "grad_norm": 76.625,
6611
+ "learning_rate": 9.980717948717949e-07,
6612
+ "loss": 132.3676,
6613
+ "step": 9400
6614
+ },
6615
+ {
6616
+ "epoch": 0.988291282051282,
6617
+ "grad_norm": 73.125,
6618
+ "learning_rate": 9.980697435897436e-07,
6619
+ "loss": 132.7415,
6620
+ "step": 9410
6621
+ },
6622
+ {
6623
+ "epoch": 0.9893415384615385,
6624
+ "grad_norm": 74.75,
6625
+ "learning_rate": 9.980676923076923e-07,
6626
+ "loss": 134.5323,
6627
+ "step": 9420
6628
+ },
6629
+ {
6630
+ "epoch": 0.9903917948717949,
6631
+ "grad_norm": 75.75,
6632
+ "learning_rate": 9.98065641025641e-07,
6633
+ "loss": 132.3584,
6634
+ "step": 9430
6635
+ },
6636
+ {
6637
+ "epoch": 0.9914420512820513,
6638
+ "grad_norm": 77.5625,
6639
+ "learning_rate": 9.980635897435897e-07,
6640
+ "loss": 132.1099,
6641
+ "step": 9440
6642
+ },
6643
+ {
6644
+ "epoch": 0.9924923076923077,
6645
+ "grad_norm": 75.5625,
6646
+ "learning_rate": 9.980615384615384e-07,
6647
+ "loss": 132.6088,
6648
+ "step": 9450
6649
+ },
6650
+ {
6651
+ "epoch": 0.9935425641025641,
6652
+ "grad_norm": 77.75,
6653
+ "learning_rate": 9.980594871794873e-07,
6654
+ "loss": 132.0042,
6655
+ "step": 9460
6656
+ },
6657
+ {
6658
+ "epoch": 0.9945928205128205,
6659
+ "grad_norm": 76.6875,
6660
+ "learning_rate": 9.98057435897436e-07,
6661
+ "loss": 132.9037,
6662
+ "step": 9470
6663
+ },
6664
+ {
6665
+ "epoch": 0.995643076923077,
6666
+ "grad_norm": 75.5,
6667
+ "learning_rate": 9.980553846153844e-07,
6668
+ "loss": 133.0483,
6669
+ "step": 9480
6670
+ },
6671
+ {
6672
+ "epoch": 0.9966933333333333,
6673
+ "grad_norm": 74.8125,
6674
+ "learning_rate": 9.980533333333333e-07,
6675
+ "loss": 134.3751,
6676
+ "step": 9490
6677
+ },
6678
+ {
6679
+ "epoch": 0.9977435897435898,
6680
+ "grad_norm": 74.625,
6681
+ "learning_rate": 9.98051282051282e-07,
6682
+ "loss": 132.6699,
6683
+ "step": 9500
6684
+ },
6685
+ {
6686
+ "epoch": 0.9987938461538461,
6687
+ "grad_norm": 79.875,
6688
+ "learning_rate": 9.980492307692307e-07,
6689
+ "loss": 132.3946,
6690
+ "step": 9510
6691
+ },
6692
+ {
6693
+ "epoch": 0.9998441025641026,
6694
+ "grad_norm": 73.5625,
6695
+ "learning_rate": 9.980471794871794e-07,
6696
+ "loss": 133.7389,
6697
+ "step": 9520
6698
  }
6699
  ],
6700
  "logging_steps": 10,
 
6709
  "should_evaluate": false,
6710
  "should_log": false,
6711
  "should_save": true,
6712
+ "should_training_stop": true
6713
  },
6714
  "attributes": {}
6715
  }
6716
  },
6717
+ "total_flos": 2.6288590936506106e+19,
6718
  "train_batch_size": 4,
6719
  "trial_name": null,
6720
  "trial_params": null