CocoRoF committed on
Commit 6a71d0e · verified · 1 Parent(s): b2134eb

Training in progress, step 7500, checkpoint

last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:42712b8432626151cf72eeab30053b6dafed3680bdd8988d79ee3efa3a4048c9
+oid sha256:34fe23e2546f350bdc3ea7cd098359c61876cfd9860bbc4e904fff96718928df
 size 368988278
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d8d141fd55c17099b7962cac740c952a7eb00bd3b3569916c9f826e27fdfc91e
+oid sha256:cc0d1fd974fead158de0eadb70e7f57c959f5cfaef326177d457bb3324066005
 size 1107079290
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:74386f26f36ed67f56395205881e5db2d0c28ffcbeed50dd95b28771d2dac588
+oid sha256:0c461c9d337dfc684e9352ec72bfa344e2f5d377f7cfc4475de9acae294dca89
 size 15984
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:41c88f9de084200454883a13c3717941ea3fd433e2f8735507fc30611f9c5501
+oid sha256:fae392ec6232cbf9da21d6ed12bc8247d0d24e7f3a3606acd23be00f3e8bbfc5
 size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:965b00d4cb4710ebab57c8787b9925bb3f77b8eeba94a186ec4bc1c2f326ef3f
+oid sha256:cbf3e7ca9991a58b0b16574a3c653483c551c270aa05aba06c162ea593f7b0f2
 size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d5dc374b8b9a4c45c950f9d136feab85a767081fa59f0c7d68ed3a62060c4949
+oid sha256:c695bebf6bcb75cbe26378bfe0ab7e2a33c49f713b9d6e4d10632b24322977e7
 size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5c7c212fb779217f1edac0baf44f67b608eefc1e0e4e3f5a9dd7eb557032c1bc
+oid sha256:d5ebb13c71265c5464c9aa9bb9b66f07764d73befe6cd63a2aaf8e781bf0a374
 size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:86e1effd626ce1e95dd68a0c8089fe19218f2b24dfe9e45ed2cab1c0ebc10ba1
+oid sha256:12cc6e245e189be568c8dfd43a4dd8f04bb3dbd9f17f41458107935d2c2a6a9d
 size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:799cc83f60dfc1c4243cfd6403592112414a2eec494e6832f10221c96ff62c20
+oid sha256:36086646e9a8f76fea69f8a227112e83bb63524964ccdfb82f4cdad88b90e5e4
 size 15984
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:586777c398770c3255d3a1f48c7fef44ea9d89117c627c9ea490e16bfd9a49ba
+oid sha256:9b44153bacf860d0ca6ce4c6b9380a199feab8a72ca613e6745bfb671b02c4e4
 size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8018e97d4715d80d56763a43ddd4b2fc926329646141e0110ce049f80a136dea
+oid sha256:069d595c560369c6f0dc5e92d7d7a49b75f77981476650e93e924eb0ecc848f4
 size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.5251282051282051,
+  "epoch": 0.7876923076923077,
   "eval_steps": 2500,
-  "global_step": 5000,
+  "global_step": 7500,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -3523,6 +3523,1764 @@
   "eval_samples_per_second": 1547.288,
   "eval_steps_per_second": 48.356,
   "step": 5000
3526
+ },
3527
+ {
3528
+ "epoch": 0.5261784615384615,
3529
+ "grad_norm": 77.9375,
3530
+ "learning_rate": 9.989723076923077e-07,
3531
+ "loss": 135.7908,
3532
+ "step": 5010
3533
+ },
3534
+ {
3535
+ "epoch": 0.527228717948718,
3536
+ "grad_norm": 76.875,
3537
+ "learning_rate": 9.989702564102564e-07,
3538
+ "loss": 133.9965,
3539
+ "step": 5020
3540
+ },
3541
+ {
3542
+ "epoch": 0.5282789743589743,
3543
+ "grad_norm": 76.1875,
3544
+ "learning_rate": 9.98968205128205e-07,
3545
+ "loss": 136.395,
3546
+ "step": 5030
3547
+ },
3548
+ {
3549
+ "epoch": 0.5293292307692308,
3550
+ "grad_norm": 80.4375,
3551
+ "learning_rate": 9.989661538461538e-07,
3552
+ "loss": 137.1353,
3553
+ "step": 5040
3554
+ },
3555
+ {
3556
+ "epoch": 0.5303794871794871,
3557
+ "grad_norm": 79.6875,
3558
+ "learning_rate": 9.989641025641027e-07,
3559
+ "loss": 135.522,
3560
+ "step": 5050
3561
+ },
3562
+ {
3563
+ "epoch": 0.5314297435897436,
3564
+ "grad_norm": 79.125,
3565
+ "learning_rate": 9.989620512820511e-07,
3566
+ "loss": 135.3318,
3567
+ "step": 5060
3568
+ },
3569
+ {
3570
+ "epoch": 0.53248,
3571
+ "grad_norm": 79.4375,
3572
+ "learning_rate": 9.989599999999998e-07,
3573
+ "loss": 133.8391,
3574
+ "step": 5070
3575
+ },
3576
+ {
3577
+ "epoch": 0.5335302564102564,
3578
+ "grad_norm": 80.8125,
3579
+ "learning_rate": 9.989579487179487e-07,
3580
+ "loss": 134.4684,
3581
+ "step": 5080
3582
+ },
3583
+ {
3584
+ "epoch": 0.5345805128205128,
3585
+ "grad_norm": 73.375,
3586
+ "learning_rate": 9.989558974358974e-07,
3587
+ "loss": 135.565,
3588
+ "step": 5090
3589
+ },
3590
+ {
3591
+ "epoch": 0.5356307692307692,
3592
+ "grad_norm": 77.0,
3593
+ "learning_rate": 9.989538461538461e-07,
3594
+ "loss": 134.9499,
3595
+ "step": 5100
3596
+ },
3597
+ {
3598
+ "epoch": 0.5366810256410256,
3599
+ "grad_norm": 72.875,
3600
+ "learning_rate": 9.989517948717948e-07,
3601
+ "loss": 135.5963,
3602
+ "step": 5110
3603
+ },
3604
+ {
3605
+ "epoch": 0.5377312820512821,
3606
+ "grad_norm": 80.4375,
3607
+ "learning_rate": 9.989497435897435e-07,
3608
+ "loss": 134.9717,
3609
+ "step": 5120
3610
+ },
3611
+ {
3612
+ "epoch": 0.5387815384615384,
3613
+ "grad_norm": 76.9375,
3614
+ "learning_rate": 9.989476923076922e-07,
3615
+ "loss": 133.5198,
3616
+ "step": 5130
3617
+ },
3618
+ {
3619
+ "epoch": 0.5398317948717949,
3620
+ "grad_norm": 79.125,
3621
+ "learning_rate": 9.98945641025641e-07,
3622
+ "loss": 134.2271,
3623
+ "step": 5140
3624
+ },
3625
+ {
3626
+ "epoch": 0.5408820512820512,
3627
+ "grad_norm": 78.4375,
3628
+ "learning_rate": 9.989435897435898e-07,
3629
+ "loss": 135.2163,
3630
+ "step": 5150
3631
+ },
3632
+ {
3633
+ "epoch": 0.5419323076923077,
3634
+ "grad_norm": 82.75,
3635
+ "learning_rate": 9.989415384615383e-07,
3636
+ "loss": 134.3486,
3637
+ "step": 5160
3638
+ },
3639
+ {
3640
+ "epoch": 0.5429825641025641,
3641
+ "grad_norm": 75.5625,
3642
+ "learning_rate": 9.989394871794872e-07,
3643
+ "loss": 134.6154,
3644
+ "step": 5170
3645
+ },
3646
+ {
3647
+ "epoch": 0.5440328205128205,
3648
+ "grad_norm": 76.8125,
3649
+ "learning_rate": 9.989374358974359e-07,
3650
+ "loss": 134.4962,
3651
+ "step": 5180
3652
+ },
3653
+ {
3654
+ "epoch": 0.5450830769230769,
3655
+ "grad_norm": 76.3125,
3656
+ "learning_rate": 9.989353846153846e-07,
3657
+ "loss": 134.8321,
3658
+ "step": 5190
3659
+ },
3660
+ {
3661
+ "epoch": 0.5461333333333334,
3662
+ "grad_norm": 76.125,
3663
+ "learning_rate": 9.989333333333333e-07,
3664
+ "loss": 134.5848,
3665
+ "step": 5200
3666
+ },
3667
+ {
3668
+ "epoch": 0.5471835897435897,
3669
+ "grad_norm": 82.9375,
3670
+ "learning_rate": 9.98931282051282e-07,
3671
+ "loss": 135.8918,
3672
+ "step": 5210
3673
+ },
3674
+ {
3675
+ "epoch": 0.5482338461538462,
3676
+ "grad_norm": 79.75,
3677
+ "learning_rate": 9.989292307692307e-07,
3678
+ "loss": 134.9434,
3679
+ "step": 5220
3680
+ },
3681
+ {
3682
+ "epoch": 0.5492841025641025,
3683
+ "grad_norm": 79.0625,
3684
+ "learning_rate": 9.989271794871794e-07,
3685
+ "loss": 136.0459,
3686
+ "step": 5230
3687
+ },
3688
+ {
3689
+ "epoch": 0.550334358974359,
3690
+ "grad_norm": 80.5625,
3691
+ "learning_rate": 9.989251282051283e-07,
3692
+ "loss": 134.0239,
3693
+ "step": 5240
3694
+ },
3695
+ {
3696
+ "epoch": 0.5513846153846154,
3697
+ "grad_norm": 82.5625,
3698
+ "learning_rate": 9.98923076923077e-07,
3699
+ "loss": 135.3833,
3700
+ "step": 5250
3701
+ },
3702
+ {
3703
+ "epoch": 0.5524348717948718,
3704
+ "grad_norm": 79.125,
3705
+ "learning_rate": 9.989210256410257e-07,
3706
+ "loss": 135.0751,
3707
+ "step": 5260
3708
+ },
3709
+ {
3710
+ "epoch": 0.5534851282051282,
3711
+ "grad_norm": 91.125,
3712
+ "learning_rate": 9.989189743589743e-07,
3713
+ "loss": 134.6919,
3714
+ "step": 5270
3715
+ },
3716
+ {
3717
+ "epoch": 0.5545353846153847,
3718
+ "grad_norm": 79.375,
3719
+ "learning_rate": 9.98916923076923e-07,
3720
+ "loss": 134.0143,
3721
+ "step": 5280
3722
+ },
3723
+ {
3724
+ "epoch": 0.555585641025641,
3725
+ "grad_norm": 79.125,
3726
+ "learning_rate": 9.989148717948717e-07,
3727
+ "loss": 134.374,
3728
+ "step": 5290
3729
+ },
3730
+ {
3731
+ "epoch": 0.5566358974358975,
3732
+ "grad_norm": 77.75,
3733
+ "learning_rate": 9.989128205128204e-07,
3734
+ "loss": 134.8326,
3735
+ "step": 5300
3736
+ },
3737
+ {
3738
+ "epoch": 0.5576861538461538,
3739
+ "grad_norm": 79.25,
3740
+ "learning_rate": 9.989107692307691e-07,
3741
+ "loss": 134.68,
3742
+ "step": 5310
3743
+ },
3744
+ {
3745
+ "epoch": 0.5587364102564103,
3746
+ "grad_norm": 82.1875,
3747
+ "learning_rate": 9.989087179487178e-07,
3748
+ "loss": 134.3108,
3749
+ "step": 5320
3750
+ },
3751
+ {
3752
+ "epoch": 0.5597866666666667,
3753
+ "grad_norm": 74.0,
3754
+ "learning_rate": 9.989066666666667e-07,
3755
+ "loss": 134.6612,
3756
+ "step": 5330
3757
+ },
3758
+ {
3759
+ "epoch": 0.5608369230769231,
3760
+ "grad_norm": 80.0,
3761
+ "learning_rate": 9.989046153846154e-07,
3762
+ "loss": 135.2691,
3763
+ "step": 5340
3764
+ },
3765
+ {
3766
+ "epoch": 0.5618871794871795,
3767
+ "grad_norm": 73.875,
3768
+ "learning_rate": 9.989025641025641e-07,
3769
+ "loss": 134.7519,
3770
+ "step": 5350
3771
+ },
3772
+ {
3773
+ "epoch": 0.562937435897436,
3774
+ "grad_norm": 75.3125,
3775
+ "learning_rate": 9.989005128205128e-07,
3776
+ "loss": 134.7679,
3777
+ "step": 5360
3778
+ },
3779
+ {
3780
+ "epoch": 0.5639876923076923,
3781
+ "grad_norm": 82.3125,
3782
+ "learning_rate": 9.988984615384615e-07,
3783
+ "loss": 135.0176,
3784
+ "step": 5370
3785
+ },
3786
+ {
3787
+ "epoch": 0.5650379487179488,
3788
+ "grad_norm": 78.4375,
3789
+ "learning_rate": 9.988964102564102e-07,
3790
+ "loss": 135.313,
3791
+ "step": 5380
3792
+ },
3793
+ {
3794
+ "epoch": 0.5660882051282051,
3795
+ "grad_norm": 74.5,
3796
+ "learning_rate": 9.988943589743589e-07,
3797
+ "loss": 134.0978,
3798
+ "step": 5390
3799
+ },
3800
+ {
3801
+ "epoch": 0.5671384615384616,
3802
+ "grad_norm": 71.375,
3803
+ "learning_rate": 9.988923076923078e-07,
3804
+ "loss": 134.1742,
3805
+ "step": 5400
3806
+ },
3807
+ {
3808
+ "epoch": 0.568188717948718,
3809
+ "grad_norm": 74.75,
3810
+ "learning_rate": 9.988902564102563e-07,
3811
+ "loss": 134.1676,
3812
+ "step": 5410
3813
+ },
3814
+ {
3815
+ "epoch": 0.5692389743589744,
3816
+ "grad_norm": 80.0,
3817
+ "learning_rate": 9.98888205128205e-07,
3818
+ "loss": 136.1296,
3819
+ "step": 5420
3820
+ },
3821
+ {
3822
+ "epoch": 0.5702892307692308,
3823
+ "grad_norm": 72.25,
3824
+ "learning_rate": 9.988861538461539e-07,
3825
+ "loss": 135.4083,
3826
+ "step": 5430
3827
+ },
3828
+ {
3829
+ "epoch": 0.5713394871794871,
3830
+ "grad_norm": 74.4375,
3831
+ "learning_rate": 9.988841025641026e-07,
3832
+ "loss": 136.1724,
3833
+ "step": 5440
3834
+ },
3835
+ {
3836
+ "epoch": 0.5723897435897436,
3837
+ "grad_norm": 80.25,
3838
+ "learning_rate": 9.988820512820513e-07,
3839
+ "loss": 134.1989,
3840
+ "step": 5450
3841
+ },
3842
+ {
3843
+ "epoch": 0.57344,
3844
+ "grad_norm": 83.0,
3845
+ "learning_rate": 9.9888e-07,
3846
+ "loss": 134.7482,
3847
+ "step": 5460
3848
+ },
3849
+ {
3850
+ "epoch": 0.5744902564102564,
3851
+ "grad_norm": 75.75,
3852
+ "learning_rate": 9.988779487179486e-07,
3853
+ "loss": 134.8441,
3854
+ "step": 5470
3855
+ },
3856
+ {
3857
+ "epoch": 0.5755405128205128,
3858
+ "grad_norm": 73.6875,
3859
+ "learning_rate": 9.988758974358973e-07,
3860
+ "loss": 133.2782,
3861
+ "step": 5480
3862
+ },
3863
+ {
3864
+ "epoch": 0.5765907692307692,
3865
+ "grad_norm": 76.5625,
3866
+ "learning_rate": 9.98873846153846e-07,
3867
+ "loss": 132.6737,
3868
+ "step": 5490
3869
+ },
3870
+ {
3871
+ "epoch": 0.5776410256410256,
3872
+ "grad_norm": 76.875,
3873
+ "learning_rate": 9.98871794871795e-07,
3874
+ "loss": 133.4861,
3875
+ "step": 5500
3876
+ },
3877
+ {
3878
+ "epoch": 0.5786912820512821,
3879
+ "grad_norm": 77.625,
3880
+ "learning_rate": 9.988697435897434e-07,
3881
+ "loss": 135.5047,
3882
+ "step": 5510
3883
+ },
3884
+ {
3885
+ "epoch": 0.5797415384615384,
3886
+ "grad_norm": 76.4375,
3887
+ "learning_rate": 9.988676923076923e-07,
3888
+ "loss": 135.0375,
3889
+ "step": 5520
3890
+ },
3891
+ {
3892
+ "epoch": 0.5807917948717949,
3893
+ "grad_norm": 72.0,
3894
+ "learning_rate": 9.98865641025641e-07,
3895
+ "loss": 135.1532,
3896
+ "step": 5530
3897
+ },
3898
+ {
3899
+ "epoch": 0.5818420512820512,
3900
+ "grad_norm": 76.8125,
3901
+ "learning_rate": 9.988635897435897e-07,
3902
+ "loss": 133.7135,
3903
+ "step": 5540
3904
+ },
3905
+ {
3906
+ "epoch": 0.5828923076923077,
3907
+ "grad_norm": 77.625,
3908
+ "learning_rate": 9.988615384615384e-07,
3909
+ "loss": 134.7512,
3910
+ "step": 5550
3911
+ },
3912
+ {
3913
+ "epoch": 0.5839425641025641,
3914
+ "grad_norm": 76.8125,
3915
+ "learning_rate": 9.988594871794871e-07,
3916
+ "loss": 133.1693,
3917
+ "step": 5560
3918
+ },
3919
+ {
3920
+ "epoch": 0.5849928205128205,
3921
+ "grad_norm": 73.8125,
3922
+ "learning_rate": 9.988574358974358e-07,
3923
+ "loss": 134.1779,
3924
+ "step": 5570
3925
+ },
3926
+ {
3927
+ "epoch": 0.5860430769230769,
3928
+ "grad_norm": 73.4375,
3929
+ "learning_rate": 9.988553846153845e-07,
3930
+ "loss": 134.0802,
3931
+ "step": 5580
3932
+ },
3933
+ {
3934
+ "epoch": 0.5870933333333334,
3935
+ "grad_norm": 80.25,
3936
+ "learning_rate": 9.988533333333334e-07,
3937
+ "loss": 133.4755,
3938
+ "step": 5590
3939
+ },
3940
+ {
3941
+ "epoch": 0.5881435897435897,
3942
+ "grad_norm": 75.5,
3943
+ "learning_rate": 9.98851282051282e-07,
3944
+ "loss": 133.0621,
3945
+ "step": 5600
3946
+ },
3947
+ {
3948
+ "epoch": 0.5891938461538462,
3949
+ "grad_norm": 76.75,
3950
+ "learning_rate": 9.988492307692308e-07,
3951
+ "loss": 134.8084,
3952
+ "step": 5610
3953
+ },
3954
+ {
3955
+ "epoch": 0.5902441025641025,
3956
+ "grad_norm": 80.5,
3957
+ "learning_rate": 9.988471794871795e-07,
3958
+ "loss": 134.011,
3959
+ "step": 5620
3960
+ },
3961
+ {
3962
+ "epoch": 0.591294358974359,
3963
+ "grad_norm": 79.25,
3964
+ "learning_rate": 9.988451282051282e-07,
3965
+ "loss": 134.1596,
3966
+ "step": 5630
3967
+ },
3968
+ {
3969
+ "epoch": 0.5923446153846154,
3970
+ "grad_norm": 79.1875,
3971
+ "learning_rate": 9.988430769230769e-07,
3972
+ "loss": 133.8143,
3973
+ "step": 5640
3974
+ },
3975
+ {
3976
+ "epoch": 0.5933948717948718,
3977
+ "grad_norm": 78.1875,
3978
+ "learning_rate": 9.988410256410256e-07,
3979
+ "loss": 132.8301,
3980
+ "step": 5650
3981
+ },
3982
+ {
3983
+ "epoch": 0.5944451282051282,
3984
+ "grad_norm": 81.4375,
3985
+ "learning_rate": 9.988389743589743e-07,
3986
+ "loss": 134.9465,
3987
+ "step": 5660
3988
+ },
3989
+ {
3990
+ "epoch": 0.5954953846153846,
3991
+ "grad_norm": 75.8125,
3992
+ "learning_rate": 9.98836923076923e-07,
3993
+ "loss": 133.7755,
3994
+ "step": 5670
3995
+ },
3996
+ {
3997
+ "epoch": 0.596545641025641,
3998
+ "grad_norm": 77.375,
3999
+ "learning_rate": 9.988348717948719e-07,
4000
+ "loss": 135.0217,
4001
+ "step": 5680
4002
+ },
4003
+ {
4004
+ "epoch": 0.5975958974358975,
4005
+ "grad_norm": 78.625,
4006
+ "learning_rate": 9.988328205128206e-07,
4007
+ "loss": 133.1542,
4008
+ "step": 5690
4009
+ },
4010
+ {
4011
+ "epoch": 0.5986461538461538,
4012
+ "grad_norm": 75.9375,
4013
+ "learning_rate": 9.988307692307692e-07,
4014
+ "loss": 133.7185,
4015
+ "step": 5700
4016
+ },
4017
+ {
4018
+ "epoch": 0.5996964102564103,
4019
+ "grad_norm": 76.625,
4020
+ "learning_rate": 9.98828717948718e-07,
4021
+ "loss": 134.098,
4022
+ "step": 5710
4023
+ },
4024
+ {
4025
+ "epoch": 0.6007466666666667,
4026
+ "grad_norm": 80.0,
4027
+ "learning_rate": 9.988266666666666e-07,
4028
+ "loss": 135.2549,
4029
+ "step": 5720
4030
+ },
4031
+ {
4032
+ "epoch": 0.6017969230769231,
4033
+ "grad_norm": 74.375,
4034
+ "learning_rate": 9.988246153846153e-07,
4035
+ "loss": 133.9132,
4036
+ "step": 5730
4037
+ },
4038
+ {
4039
+ "epoch": 0.6028471794871795,
4040
+ "grad_norm": 72.75,
4041
+ "learning_rate": 9.98822564102564e-07,
4042
+ "loss": 133.688,
4043
+ "step": 5740
4044
+ },
4045
+ {
4046
+ "epoch": 0.6038974358974359,
4047
+ "grad_norm": 74.4375,
4048
+ "learning_rate": 9.98820512820513e-07,
4049
+ "loss": 134.2703,
4050
+ "step": 5750
4051
+ },
4052
+ {
4053
+ "epoch": 0.6049476923076923,
4054
+ "grad_norm": 84.625,
4055
+ "learning_rate": 9.988184615384614e-07,
4056
+ "loss": 136.125,
4057
+ "step": 5760
4058
+ },
4059
+ {
4060
+ "epoch": 0.6059979487179488,
4061
+ "grad_norm": 76.8125,
4062
+ "learning_rate": 9.9881641025641e-07,
4063
+ "loss": 134.1665,
4064
+ "step": 5770
4065
+ },
4066
+ {
4067
+ "epoch": 0.6070482051282051,
4068
+ "grad_norm": 76.625,
4069
+ "learning_rate": 9.98814358974359e-07,
4070
+ "loss": 134.8555,
4071
+ "step": 5780
4072
+ },
4073
+ {
4074
+ "epoch": 0.6080984615384616,
4075
+ "grad_norm": 76.625,
4076
+ "learning_rate": 9.988123076923077e-07,
4077
+ "loss": 133.6014,
4078
+ "step": 5790
4079
+ },
4080
+ {
4081
+ "epoch": 0.6091487179487179,
4082
+ "grad_norm": 74.25,
4083
+ "learning_rate": 9.988102564102564e-07,
4084
+ "loss": 133.9827,
4085
+ "step": 5800
4086
+ },
4087
+ {
4088
+ "epoch": 0.6101989743589744,
4089
+ "grad_norm": 77.3125,
4090
+ "learning_rate": 9.98808205128205e-07,
4091
+ "loss": 133.2115,
4092
+ "step": 5810
4093
+ },
4094
+ {
4095
+ "epoch": 0.6112492307692308,
4096
+ "grad_norm": 73.9375,
4097
+ "learning_rate": 9.988061538461538e-07,
4098
+ "loss": 133.0385,
4099
+ "step": 5820
4100
+ },
4101
+ {
4102
+ "epoch": 0.6122994871794872,
4103
+ "grad_norm": 79.625,
4104
+ "learning_rate": 9.988041025641025e-07,
4105
+ "loss": 132.0571,
4106
+ "step": 5830
4107
+ },
4108
+ {
4109
+ "epoch": 0.6133497435897436,
4110
+ "grad_norm": 73.9375,
4111
+ "learning_rate": 9.988020512820512e-07,
4112
+ "loss": 133.794,
4113
+ "step": 5840
4114
+ },
4115
+ {
4116
+ "epoch": 0.6144,
4117
+ "grad_norm": 79.125,
4118
+ "learning_rate": 9.988e-07,
4119
+ "loss": 132.8695,
4120
+ "step": 5850
4121
+ },
4122
+ {
4123
+ "epoch": 0.6154502564102564,
4124
+ "grad_norm": 77.25,
4125
+ "learning_rate": 9.987979487179486e-07,
4126
+ "loss": 133.7304,
4127
+ "step": 5860
4128
+ },
4129
+ {
4130
+ "epoch": 0.6165005128205128,
4131
+ "grad_norm": 79.5,
4132
+ "learning_rate": 9.987958974358975e-07,
4133
+ "loss": 133.9512,
4134
+ "step": 5870
4135
+ },
4136
+ {
4137
+ "epoch": 0.6175507692307692,
4138
+ "grad_norm": 75.9375,
4139
+ "learning_rate": 9.987938461538462e-07,
4140
+ "loss": 135.2929,
4141
+ "step": 5880
4142
+ },
4143
+ {
4144
+ "epoch": 0.6186010256410256,
4145
+ "grad_norm": 83.3125,
4146
+ "learning_rate": 9.987917948717949e-07,
4147
+ "loss": 135.2723,
4148
+ "step": 5890
4149
+ },
4150
+ {
4151
+ "epoch": 0.6196512820512821,
4152
+ "grad_norm": 74.0625,
4153
+ "learning_rate": 9.987897435897435e-07,
4154
+ "loss": 135.1879,
4155
+ "step": 5900
4156
+ },
4157
+ {
4158
+ "epoch": 0.6207015384615384,
4159
+ "grad_norm": 76.75,
4160
+ "learning_rate": 9.987876923076922e-07,
4161
+ "loss": 134.8009,
4162
+ "step": 5910
4163
+ },
4164
+ {
4165
+ "epoch": 0.6217517948717949,
4166
+ "grad_norm": 82.5,
4167
+ "learning_rate": 9.98785641025641e-07,
4168
+ "loss": 135.7849,
4169
+ "step": 5920
4170
+ },
4171
+ {
4172
+ "epoch": 0.6228020512820512,
4173
+ "grad_norm": 74.4375,
4174
+ "learning_rate": 9.987835897435896e-07,
4175
+ "loss": 134.5044,
4176
+ "step": 5930
4177
+ },
4178
+ {
4179
+ "epoch": 0.6238523076923077,
4180
+ "grad_norm": 80.0,
4181
+ "learning_rate": 9.987815384615385e-07,
4182
+ "loss": 133.4429,
4183
+ "step": 5940
4184
+ },
4185
+ {
4186
+ "epoch": 0.6249025641025641,
4187
+ "grad_norm": 85.625,
4188
+ "learning_rate": 9.987794871794872e-07,
4189
+ "loss": 133.1182,
4190
+ "step": 5950
4191
+ },
4192
+ {
4193
+ "epoch": 0.6259528205128205,
4194
+ "grad_norm": 77.0625,
4195
+ "learning_rate": 9.98777435897436e-07,
4196
+ "loss": 133.511,
4197
+ "step": 5960
4198
+ },
4199
+ {
4200
+ "epoch": 0.6270030769230769,
4201
+ "grad_norm": 80.125,
4202
+ "learning_rate": 9.987753846153846e-07,
4203
+ "loss": 134.8845,
4204
+ "step": 5970
4205
+ },
4206
+ {
4207
+ "epoch": 0.6280533333333334,
4208
+ "grad_norm": 69.9375,
4209
+ "learning_rate": 9.987733333333333e-07,
4210
+ "loss": 133.8227,
4211
+ "step": 5980
4212
+ },
4213
+ {
4214
+ "epoch": 0.6291035897435897,
4215
+ "grad_norm": 75.5625,
4216
+ "learning_rate": 9.98771282051282e-07,
4217
+ "loss": 134.4892,
4218
+ "step": 5990
4219
+ },
4220
+ {
4221
+ "epoch": 0.6301538461538462,
4222
+ "grad_norm": 72.125,
4223
+ "learning_rate": 9.987692307692307e-07,
4224
+ "loss": 135.4582,
4225
+ "step": 6000
4226
+ },
4227
+ {
4228
+ "epoch": 0.6312041025641025,
4229
+ "grad_norm": 75.625,
4230
+ "learning_rate": 9.987671794871794e-07,
4231
+ "loss": 135.2013,
4232
+ "step": 6010
4233
+ },
4234
+ {
4235
+ "epoch": 0.632254358974359,
4236
+ "grad_norm": 74.0625,
4237
+ "learning_rate": 9.98765128205128e-07,
4238
+ "loss": 134.8731,
4239
+ "step": 6020
4240
+ },
4241
+ {
4242
+ "epoch": 0.6333046153846154,
4243
+ "grad_norm": 82.6875,
4244
+ "learning_rate": 9.98763076923077e-07,
4245
+ "loss": 135.0607,
4246
+ "step": 6030
4247
+ },
4248
+ {
4249
+ "epoch": 0.6343548717948718,
4250
+ "grad_norm": 79.75,
4251
+ "learning_rate": 9.987610256410257e-07,
4252
+ "loss": 134.0427,
4253
+ "step": 6040
4254
+ },
4255
+ {
4256
+ "epoch": 0.6354051282051282,
4257
+ "grad_norm": 75.4375,
4258
+ "learning_rate": 9.987589743589742e-07,
4259
+ "loss": 133.6469,
4260
+ "step": 6050
4261
+ },
4262
+ {
4263
+ "epoch": 0.6364553846153846,
4264
+ "grad_norm": 77.6875,
4265
+ "learning_rate": 9.98756923076923e-07,
4266
+ "loss": 133.951,
4267
+ "step": 6060
4268
+ },
4269
+ {
4270
+ "epoch": 0.637505641025641,
4271
+ "grad_norm": 74.1875,
4272
+ "learning_rate": 9.987548717948718e-07,
4273
+ "loss": 134.3955,
4274
+ "step": 6070
4275
+ },
4276
+ {
4277
+ "epoch": 0.6385558974358975,
4278
+ "grad_norm": 80.3125,
4279
+ "learning_rate": 9.987528205128205e-07,
4280
+ "loss": 133.7433,
4281
+ "step": 6080
4282
+ },
4283
+ {
4284
+ "epoch": 0.6396061538461538,
4285
+ "grad_norm": 75.8125,
4286
+ "learning_rate": 9.987507692307692e-07,
4287
+ "loss": 132.4955,
4288
+ "step": 6090
4289
+ },
4290
+ {
4291
+ "epoch": 0.6406564102564103,
4292
+ "grad_norm": 80.125,
4293
+ "learning_rate": 9.98748717948718e-07,
4294
+ "loss": 133.3074,
4295
+ "step": 6100
4296
+ },
4297
+ {
4298
+ "epoch": 0.6417066666666666,
4299
+ "grad_norm": 75.75,
4300
+ "learning_rate": 9.987466666666665e-07,
4301
+ "loss": 134.3473,
4302
+ "step": 6110
4303
+ },
4304
+ {
4305
+ "epoch": 0.6427569230769231,
4306
+ "grad_norm": 77.625,
4307
+ "learning_rate": 9.987446153846152e-07,
4308
+ "loss": 134.2209,
4309
+ "step": 6120
4310
+ },
4311
+ {
4312
+ "epoch": 0.6438071794871795,
4313
+ "grad_norm": 75.375,
4314
+ "learning_rate": 9.987425641025641e-07,
4315
+ "loss": 136.051,
4316
+ "step": 6130
4317
+ },
4318
+ {
4319
+ "epoch": 0.6448574358974359,
4320
+ "grad_norm": 78.625,
4321
+ "learning_rate": 9.987405128205128e-07,
4322
+ "loss": 133.6452,
4323
+ "step": 6140
4324
+ },
4325
+ {
4326
+ "epoch": 0.6459076923076923,
4327
+ "grad_norm": 72.875,
4328
+ "learning_rate": 9.987384615384615e-07,
4329
+ "loss": 135.9148,
4330
+ "step": 6150
4331
+ },
4332
+ {
4333
+ "epoch": 0.6469579487179488,
4334
+ "grad_norm": 79.9375,
4335
+ "learning_rate": 9.987364102564102e-07,
4336
+ "loss": 133.9573,
4337
+ "step": 6160
4338
+ },
4339
+ {
4340
+ "epoch": 0.6480082051282051,
4341
+ "grad_norm": 77.375,
4342
+ "learning_rate": 9.98734358974359e-07,
4343
+ "loss": 134.187,
4344
+ "step": 6170
4345
+ },
4346
+ {
4347
+ "epoch": 0.6490584615384616,
4348
+ "grad_norm": 77.6875,
4349
+ "learning_rate": 9.987323076923076e-07,
4350
+ "loss": 135.4981,
4351
+ "step": 6180
4352
+ },
4353
+ {
4354
+ "epoch": 0.6501087179487179,
4355
+ "grad_norm": 76.375,
4356
+ "learning_rate": 9.987302564102563e-07,
4357
+ "loss": 134.0596,
4358
+ "step": 6190
4359
+ },
4360
+ {
4361
+ "epoch": 0.6511589743589744,
4362
+ "grad_norm": 68.75,
4363
+ "learning_rate": 9.98728205128205e-07,
4364
+ "loss": 133.4401,
4365
+ "step": 6200
4366
+ },
4367
+ {
4368
+ "epoch": 0.6522092307692308,
4369
+ "grad_norm": 88.0,
4370
+ "learning_rate": 9.987261538461537e-07,
4371
+ "loss": 133.9623,
4372
+ "step": 6210
4373
+ },
4374
+ {
4375
+ "epoch": 0.6532594871794872,
4376
+ "grad_norm": 73.125,
4377
+ "learning_rate": 9.987241025641026e-07,
4378
+ "loss": 134.9918,
4379
+ "step": 6220
4380
+ },
4381
+ {
4382
+ "epoch": 0.6543097435897436,
4383
+ "grad_norm": 79.6875,
4384
+ "learning_rate": 9.987220512820513e-07,
4385
+ "loss": 134.2293,
4386
+ "step": 6230
4387
+ },
4388
+ {
4389
+ "epoch": 0.65536,
4390
+ "grad_norm": 79.125,
4391
+ "learning_rate": 9.9872e-07,
4392
+ "loss": 134.9116,
4393
+ "step": 6240
4394
+ },
4395
+ {
4396
+ "epoch": 0.6564102564102564,
4397
+ "grad_norm": 74.0625,
4398
+ "learning_rate": 9.987179487179487e-07,
4399
+ "loss": 133.4889,
4400
+ "step": 6250
4401
+ },
4402
+ {
4403
+ "epoch": 0.6574605128205128,
4404
+ "grad_norm": 74.0,
4405
+ "learning_rate": 9.987158974358974e-07,
4406
+ "loss": 133.3706,
4407
+ "step": 6260
4408
+ },
4409
+ {
4410
+ "epoch": 0.6585107692307692,
4411
+ "grad_norm": 73.5625,
4412
+ "learning_rate": 9.98713846153846e-07,
4413
+ "loss": 134.6084,
4414
+ "step": 6270
4415
+ },
4416
+ {
4417
+ "epoch": 0.6595610256410256,
4418
+ "grad_norm": 76.8125,
4419
+ "learning_rate": 9.987117948717948e-07,
4420
+ "loss": 133.7456,
4421
+ "step": 6280
4422
+ },
4423
+ {
4424
+ "epoch": 0.6606112820512821,
4425
+ "grad_norm": 77.625,
4426
+ "learning_rate": 9.987097435897437e-07,
4427
+ "loss": 133.8633,
4428
+ "step": 6290
4429
+ },
4430
+ {
4431
+ "epoch": 0.6616615384615384,
4432
+ "grad_norm": 76.625,
4433
+ "learning_rate": 9.987076923076921e-07,
4434
+ "loss": 133.3514,
4435
+ "step": 6300
4436
+ },
4437
+ {
4438
+ "epoch": 0.6627117948717949,
4439
+ "grad_norm": 78.6875,
4440
+ "learning_rate": 9.98705641025641e-07,
4441
+ "loss": 133.0192,
4442
+ "step": 6310
4443
+ },
4444
+ {
4445
+ "epoch": 0.6637620512820512,
4446
+ "grad_norm": 76.375,
4447
+ "learning_rate": 9.987035897435897e-07,
4448
+ "loss": 135.2725,
4449
+ "step": 6320
4450
+ },
4451
+ {
4452
+ "epoch": 0.6648123076923077,
4453
+ "grad_norm": 75.25,
4454
+ "learning_rate": 9.987015384615384e-07,
4455
+ "loss": 133.7453,
4456
+ "step": 6330
4457
+ },
4458
+ {
4459
+ "epoch": 0.6658625641025641,
4460
+ "grad_norm": 76.0625,
4461
+ "learning_rate": 9.986994871794871e-07,
4462
+ "loss": 134.4162,
4463
+ "step": 6340
4464
+ },
4465
+ {
4466
+ "epoch": 0.6669128205128205,
4467
+ "grad_norm": 77.3125,
4468
+ "learning_rate": 9.986974358974358e-07,
4469
+ "loss": 133.5715,
4470
+ "step": 6350
4471
+ },
4472
+ {
4473
+ "epoch": 0.6679630769230769,
4474
+ "grad_norm": 79.375,
4475
+ "learning_rate": 9.986953846153845e-07,
4476
+ "loss": 133.915,
4477
+ "step": 6360
4478
+ },
4479
+ {
4480
+ "epoch": 0.6690133333333333,
4481
+ "grad_norm": 76.0,
4482
+ "learning_rate": 9.986933333333332e-07,
4483
+ "loss": 134.2913,
4484
+ "step": 6370
4485
+ },
4486
+ {
4487
+ "epoch": 0.6700635897435897,
4488
+ "grad_norm": 80.0,
4489
+ "learning_rate": 9.986912820512821e-07,
4490
+ "loss": 135.7478,
4491
+ "step": 6380
4492
+ },
4493
+ {
4494
+ "epoch": 0.6711138461538462,
4495
+ "grad_norm": 81.0625,
4496
+ "learning_rate": 9.986892307692308e-07,
4497
+ "loss": 135.2542,
4498
+ "step": 6390
4499
+ },
4500
+ {
4501
+ "epoch": 0.6721641025641025,
4502
+ "grad_norm": 73.3125,
4503
+ "learning_rate": 9.986871794871793e-07,
4504
+ "loss": 133.8043,
4505
+ "step": 6400
4506
+ },
4507
+ {
4508
+ "epoch": 0.673214358974359,
4509
+ "grad_norm": 74.9375,
4510
+ "learning_rate": 9.986851282051282e-07,
4511
+ "loss": 132.8474,
4512
+ "step": 6410
4513
+ },
4514
+ {
4515
+ "epoch": 0.6742646153846154,
4516
+ "grad_norm": 78.5625,
4517
+ "learning_rate": 9.98683076923077e-07,
4518
+ "loss": 133.1841,
4519
+ "step": 6420
4520
+ },
4521
+ {
4522
+ "epoch": 0.6753148717948718,
4523
+ "grad_norm": 81.8125,
4524
+ "learning_rate": 9.986810256410256e-07,
4525
+ "loss": 132.1895,
4526
+ "step": 6430
4527
+ },
4528
+ {
4529
+ "epoch": 0.6763651282051282,
4530
+ "grad_norm": 79.6875,
4531
+ "learning_rate": 9.986789743589743e-07,
4532
+ "loss": 133.3178,
4533
+ "step": 6440
4534
+ },
4535
+ {
4536
+ "epoch": 0.6774153846153846,
4537
+ "grad_norm": 77.9375,
4538
+ "learning_rate": 9.98676923076923e-07,
4539
+ "loss": 134.5587,
4540
+ "step": 6450
4541
+ },
4542
+ {
4543
+ "epoch": 0.678465641025641,
4544
+ "grad_norm": 76.75,
4545
+ "learning_rate": 9.986748717948717e-07,
4546
+ "loss": 133.5905,
4547
+ "step": 6460
4548
+ },
4549
+ {
4550
+ "epoch": 0.6795158974358975,
4551
+ "grad_norm": 74.0,
4552
+ "learning_rate": 9.986728205128204e-07,
4553
+ "loss": 133.0102,
4554
+ "step": 6470
4555
+ },
4556
+ {
4557
+ "epoch": 0.6805661538461538,
4558
+ "grad_norm": 85.0,
4559
+ "learning_rate": 9.986707692307693e-07,
4560
+ "loss": 135.4808,
4561
+ "step": 6480
4562
+ },
4563
+ {
4564
+ "epoch": 0.6816164102564103,
4565
+ "grad_norm": 82.0625,
4566
+ "learning_rate": 9.98668717948718e-07,
4567
+ "loss": 133.2434,
4568
+ "step": 6490
4569
+ },
4570
+ {
4571
+ "epoch": 0.6826666666666666,
4572
+ "grad_norm": 73.1875,
4573
+ "learning_rate": 9.986666666666667e-07,
4574
+ "loss": 134.3713,
4575
+ "step": 6500
4576
+ },
4577
+ {
4578
+ "epoch": 0.6837169230769231,
4579
+ "grad_norm": 75.9375,
4580
+ "learning_rate": 9.986646153846154e-07,
4581
+ "loss": 134.012,
4582
+ "step": 6510
4583
+ },
4584
+ {
4585
+ "epoch": 0.6847671794871795,
4586
+ "grad_norm": 77.5,
4587
+ "learning_rate": 9.98662564102564e-07,
4588
+ "loss": 134.6316,
4589
+ "step": 6520
4590
+ },
4591
+ {
4592
+ "epoch": 0.6858174358974359,
4593
+ "grad_norm": 79.9375,
4594
+ "learning_rate": 9.986605128205127e-07,
4595
+ "loss": 134.4724,
4596
+ "step": 6530
4597
+ },
4598
+ {
4599
+ "epoch": 0.6868676923076923,
4600
+ "grad_norm": 73.0,
4601
+ "learning_rate": 9.986584615384614e-07,
4602
+ "loss": 133.8849,
4603
+ "step": 6540
4604
+ },
4605
+ {
4606
+ "epoch": 0.6879179487179488,
4607
+ "grad_norm": 74.3125,
4608
+ "learning_rate": 9.986564102564101e-07,
4609
+ "loss": 135.4998,
4610
+ "step": 6550
4611
+ },
4612
+ {
4613
+ "epoch": 0.6889682051282051,
4614
+ "grad_norm": 82.0,
4615
+ "learning_rate": 9.986543589743588e-07,
4616
+ "loss": 134.5724,
4617
+ "step": 6560
4618
+ },
4619
+ {
4620
+ "epoch": 0.6900184615384616,
4621
+ "grad_norm": 77.625,
4622
+ "learning_rate": 9.986523076923077e-07,
4623
+ "loss": 134.0957,
4624
+ "step": 6570
4625
+ },
4626
+ {
4627
+ "epoch": 0.6910687179487179,
4628
+ "grad_norm": 73.5,
4629
+ "learning_rate": 9.986502564102564e-07,
4630
+ "loss": 133.3827,
4631
+ "step": 6580
4632
+ },
4633
+ {
4634
+ "epoch": 0.6921189743589744,
4635
+ "grad_norm": 73.625,
4636
+ "learning_rate": 9.986482051282051e-07,
4637
+ "loss": 134.7083,
4638
+ "step": 6590
4639
+ },
4640
+ {
4641
+ "epoch": 0.6931692307692308,
4642
+ "grad_norm": 81.1875,
4643
+ "learning_rate": 9.986461538461538e-07,
4644
+ "loss": 134.4949,
4645
+ "step": 6600
4646
+ },
4647
+ {
4648
+ "epoch": 0.6942194871794872,
4649
+ "grad_norm": 74.8125,
4650
+ "learning_rate": 9.986441025641025e-07,
4651
+ "loss": 133.9215,
4652
+ "step": 6610
4653
+ },
4654
+ {
4655
+ "epoch": 0.6952697435897436,
4656
+ "grad_norm": 76.5,
4657
+ "learning_rate": 9.986420512820512e-07,
4658
+ "loss": 134.7131,
4659
+ "step": 6620
4660
+ },
4661
+ {
4662
+ "epoch": 0.69632,
4663
+ "grad_norm": 74.1875,
4664
+ "learning_rate": 9.9864e-07,
4665
+ "loss": 132.6291,
4666
+ "step": 6630
4667
+ },
4668
+ {
4669
+ "epoch": 0.6973702564102564,
4670
+ "grad_norm": 76.375,
4671
+ "learning_rate": 9.986379487179488e-07,
4672
+ "loss": 133.4663,
4673
+ "step": 6640
4674
+ },
4675
+ {
4676
+ "epoch": 0.6984205128205129,
4677
+ "grad_norm": 76.6875,
4678
+ "learning_rate": 9.986358974358973e-07,
4679
+ "loss": 133.2238,
4680
+ "step": 6650
4681
+ },
4682
+ {
4683
+ "epoch": 0.6994707692307692,
4684
+ "grad_norm": 75.4375,
4685
+ "learning_rate": 9.986338461538462e-07,
4686
+ "loss": 135.6267,
4687
+ "step": 6660
4688
+ },
4689
+ {
4690
+ "epoch": 0.7005210256410257,
4691
+ "grad_norm": 76.5625,
4692
+ "learning_rate": 9.986317948717949e-07,
4693
+ "loss": 133.9723,
4694
+ "step": 6670
4695
+ },
4696
+ {
4697
+ "epoch": 0.701571282051282,
4698
+ "grad_norm": 75.5,
4699
+ "learning_rate": 9.986297435897436e-07,
4700
+ "loss": 132.1382,
4701
+ "step": 6680
4702
+ },
4703
+ {
4704
+ "epoch": 0.7026215384615384,
4705
+ "grad_norm": 75.375,
4706
+ "learning_rate": 9.986276923076923e-07,
4707
+ "loss": 132.8579,
4708
+ "step": 6690
4709
+ },
4710
+ {
4711
+ "epoch": 0.7036717948717949,
4712
+ "grad_norm": 76.3125,
4713
+ "learning_rate": 9.98625641025641e-07,
4714
+ "loss": 133.7719,
4715
+ "step": 6700
4716
+ },
4717
+ {
4718
+ "epoch": 0.7047220512820512,
4719
+ "grad_norm": 81.0625,
4720
+ "learning_rate": 9.986235897435897e-07,
4721
+ "loss": 135.3735,
4722
+ "step": 6710
4723
+ },
4724
+ {
4725
+ "epoch": 0.7057723076923077,
4726
+ "grad_norm": 75.75,
4727
+ "learning_rate": 9.986215384615384e-07,
4728
+ "loss": 132.8083,
4729
+ "step": 6720
4730
+ },
4731
+ {
4732
+ "epoch": 0.706822564102564,
4733
+ "grad_norm": 74.8125,
4734
+ "learning_rate": 9.986194871794873e-07,
4735
+ "loss": 133.2499,
4736
+ "step": 6730
4737
+ },
4738
+ {
4739
+ "epoch": 0.7078728205128205,
4740
+ "grad_norm": 74.375,
4741
+ "learning_rate": 9.98617435897436e-07,
4742
+ "loss": 133.904,
4743
+ "step": 6740
4744
+ },
4745
+ {
4746
+ "epoch": 0.7089230769230769,
4747
+ "grad_norm": 71.25,
4748
+ "learning_rate": 9.986153846153844e-07,
4749
+ "loss": 134.7248,
4750
+ "step": 6750
4751
+ },
4752
+ {
4753
+ "epoch": 0.7099733333333333,
4754
+ "grad_norm": 78.4375,
4755
+ "learning_rate": 9.986133333333333e-07,
4756
+ "loss": 134.4564,
4757
+ "step": 6760
4758
+ },
4759
+ {
4760
+ "epoch": 0.7110235897435897,
4761
+ "grad_norm": 87.3125,
4762
+ "learning_rate": 9.98611282051282e-07,
4763
+ "loss": 132.3151,
4764
+ "step": 6770
4765
+ },
4766
+ {
4767
+ "epoch": 0.7120738461538462,
4768
+ "grad_norm": 73.0625,
4769
+ "learning_rate": 9.986092307692307e-07,
4770
+ "loss": 133.7481,
4771
+ "step": 6780
4772
+ },
4773
+ {
4774
+ "epoch": 0.7131241025641025,
4775
+ "grad_norm": 73.0,
4776
+ "learning_rate": 9.986071794871794e-07,
4777
+ "loss": 133.2306,
4778
+ "step": 6790
4779
+ },
4780
+ {
4781
+ "epoch": 0.714174358974359,
4782
+ "grad_norm": 74.6875,
4783
+ "learning_rate": 9.986051282051281e-07,
4784
+ "loss": 133.5443,
4785
+ "step": 6800
4786
+ },
4787
+ {
4788
+ "epoch": 0.7152246153846153,
4789
+ "grad_norm": 73.4375,
4790
+ "learning_rate": 9.986030769230768e-07,
4791
+ "loss": 133.3841,
4792
+ "step": 6810
4793
+ },
4794
+ {
4795
+ "epoch": 0.7162748717948718,
4796
+ "grad_norm": 73.9375,
4797
+ "learning_rate": 9.986010256410255e-07,
4798
+ "loss": 135.036,
4799
+ "step": 6820
4800
+ },
4801
+ {
4802
+ "epoch": 0.7173251282051282,
4803
+ "grad_norm": 83.9375,
4804
+ "learning_rate": 9.985989743589744e-07,
4805
+ "loss": 134.1337,
4806
+ "step": 6830
4807
+ },
4808
+ {
4809
+ "epoch": 0.7183753846153846,
4810
+ "grad_norm": 78.1875,
4811
+ "learning_rate": 9.98596923076923e-07,
4812
+ "loss": 133.3089,
4813
+ "step": 6840
4814
+ },
4815
+ {
4816
+ "epoch": 0.719425641025641,
4817
+ "grad_norm": 76.875,
4818
+ "learning_rate": 9.985948717948718e-07,
4819
+ "loss": 133.6665,
4820
+ "step": 6850
4821
+ },
4822
+ {
4823
+ "epoch": 0.7204758974358975,
4824
+ "grad_norm": 74.125,
4825
+ "learning_rate": 9.985928205128205e-07,
4826
+ "loss": 133.7026,
4827
+ "step": 6860
4828
+ },
4829
+ {
4830
+ "epoch": 0.7215261538461538,
4831
+ "grad_norm": 78.4375,
4832
+ "learning_rate": 9.985907692307692e-07,
4833
+ "loss": 133.7467,
4834
+ "step": 6870
4835
+ },
4836
+ {
4837
+ "epoch": 0.7225764102564103,
4838
+ "grad_norm": 73.1875,
4839
+ "learning_rate": 9.985887179487179e-07,
4840
+ "loss": 133.6408,
4841
+ "step": 6880
4842
+ },
4843
+ {
4844
+ "epoch": 0.7236266666666666,
4845
+ "grad_norm": 77.0625,
4846
+ "learning_rate": 9.985866666666666e-07,
4847
+ "loss": 132.6118,
4848
+ "step": 6890
4849
+ },
4850
+ {
4851
+ "epoch": 0.7246769230769231,
4852
+ "grad_norm": 76.0625,
4853
+ "learning_rate": 9.985846153846153e-07,
4854
+ "loss": 132.8741,
4855
+ "step": 6900
4856
+ },
4857
+ {
4858
+ "epoch": 0.7257271794871795,
4859
+ "grad_norm": 73.5625,
4860
+ "learning_rate": 9.98582564102564e-07,
4861
+ "loss": 134.1753,
4862
+ "step": 6910
4863
+ },
4864
+ {
4865
+ "epoch": 0.7267774358974359,
4866
+ "grad_norm": 76.875,
4867
+ "learning_rate": 9.985805128205129e-07,
4868
+ "loss": 132.842,
4869
+ "step": 6920
4870
+ },
4871
+ {
4872
+ "epoch": 0.7278276923076923,
4873
+ "grad_norm": 70.0,
4874
+ "learning_rate": 9.985784615384616e-07,
4875
+ "loss": 134.3394,
4876
+ "step": 6930
4877
+ },
4878
+ {
4879
+ "epoch": 0.7288779487179488,
4880
+ "grad_norm": 74.0625,
4881
+ "learning_rate": 9.985764102564103e-07,
4882
+ "loss": 135.2059,
4883
+ "step": 6940
4884
+ },
4885
+ {
4886
+ "epoch": 0.7299282051282051,
4887
+ "grad_norm": 78.625,
4888
+ "learning_rate": 9.98574358974359e-07,
4889
+ "loss": 134.0309,
4890
+ "step": 6950
4891
+ },
4892
+ {
4893
+ "epoch": 0.7309784615384616,
4894
+ "grad_norm": 77.75,
4895
+ "learning_rate": 9.985723076923076e-07,
4896
+ "loss": 134.4309,
4897
+ "step": 6960
4898
+ },
4899
+ {
4900
+ "epoch": 0.7320287179487179,
4901
+ "grad_norm": 74.6875,
4902
+ "learning_rate": 9.985702564102563e-07,
4903
+ "loss": 134.0655,
4904
+ "step": 6970
4905
+ },
4906
+ {
4907
+ "epoch": 0.7330789743589744,
4908
+ "grad_norm": 71.1875,
4909
+ "learning_rate": 9.98568205128205e-07,
4910
+ "loss": 134.798,
4911
+ "step": 6980
4912
+ },
4913
+ {
4914
+ "epoch": 0.7341292307692308,
4915
+ "grad_norm": 80.25,
4916
+ "learning_rate": 9.98566153846154e-07,
4917
+ "loss": 133.176,
4918
+ "step": 6990
4919
+ },
4920
+ {
4921
+ "epoch": 0.7351794871794872,
4922
+ "grad_norm": 77.3125,
4923
+ "learning_rate": 9.985641025641024e-07,
4924
+ "loss": 133.7738,
4925
+ "step": 7000
4926
+ },
4927
+ {
4928
+ "epoch": 0.7362297435897436,
4929
+ "grad_norm": 73.4375,
4930
+ "learning_rate": 9.985620512820513e-07,
4931
+ "loss": 133.5852,
4932
+ "step": 7010
4933
+ },
4934
+ {
4935
+ "epoch": 0.73728,
4936
+ "grad_norm": 82.125,
4937
+ "learning_rate": 9.9856e-07,
4938
+ "loss": 133.2783,
4939
+ "step": 7020
4940
+ },
4941
+ {
4942
+ "epoch": 0.7383302564102564,
4943
+ "grad_norm": 78.0625,
4944
+ "learning_rate": 9.985579487179487e-07,
4945
+ "loss": 133.2677,
4946
+ "step": 7030
4947
+ },
4948
+ {
4949
+ "epoch": 0.7393805128205129,
4950
+ "grad_norm": 78.5,
4951
+ "learning_rate": 9.985558974358974e-07,
4952
+ "loss": 133.9121,
4953
+ "step": 7040
4954
+ },
4955
+ {
4956
+ "epoch": 0.7404307692307692,
4957
+ "grad_norm": 78.875,
4958
+ "learning_rate": 9.98553846153846e-07,
4959
+ "loss": 133.3367,
4960
+ "step": 7050
4961
+ },
4962
+ {
4963
+ "epoch": 0.7414810256410257,
4964
+ "grad_norm": 77.8125,
4965
+ "learning_rate": 9.985517948717948e-07,
4966
+ "loss": 132.6513,
4967
+ "step": 7060
4968
+ },
4969
+ {
4970
+ "epoch": 0.742531282051282,
4971
+ "grad_norm": 73.0,
4972
+ "learning_rate": 9.985497435897435e-07,
4973
+ "loss": 135.524,
4974
+ "step": 7070
4975
+ },
4976
+ {
4977
+ "epoch": 0.7435815384615385,
4978
+ "grad_norm": 74.6875,
4979
+ "learning_rate": 9.985476923076924e-07,
4980
+ "loss": 132.8569,
4981
+ "step": 7080
4982
+ },
4983
+ {
4984
+ "epoch": 0.7446317948717949,
4985
+ "grad_norm": 71.25,
4986
+ "learning_rate": 9.98545641025641e-07,
4987
+ "loss": 132.9906,
4988
+ "step": 7090
4989
+ },
4990
+ {
4991
+ "epoch": 0.7456820512820512,
4992
+ "grad_norm": 75.4375,
4993
+ "learning_rate": 9.985435897435896e-07,
4994
+ "loss": 134.611,
4995
+ "step": 7100
4996
+ },
4997
+ {
4998
+ "epoch": 0.7467323076923077,
4999
+ "grad_norm": 76.0625,
5000
+ "learning_rate": 9.985415384615385e-07,
5001
+ "loss": 135.1841,
5002
+ "step": 7110
5003
+ },
5004
+ {
5005
+ "epoch": 0.747782564102564,
5006
+ "grad_norm": 80.25,
5007
+ "learning_rate": 9.985394871794872e-07,
5008
+ "loss": 132.8851,
5009
+ "step": 7120
5010
+ },
5011
+ {
5012
+ "epoch": 0.7488328205128205,
5013
+ "grad_norm": 77.875,
5014
+ "learning_rate": 9.985374358974359e-07,
5015
+ "loss": 132.9889,
5016
+ "step": 7130
5017
+ },
5018
+ {
5019
+ "epoch": 0.7498830769230769,
5020
+ "grad_norm": 74.375,
5021
+ "learning_rate": 9.985353846153846e-07,
5022
+ "loss": 134.75,
5023
+ "step": 7140
5024
+ },
5025
+ {
5026
+ "epoch": 0.7509333333333333,
5027
+ "grad_norm": 77.6875,
5028
+ "learning_rate": 9.985333333333332e-07,
5029
+ "loss": 131.8883,
5030
+ "step": 7150
5031
+ },
5032
+ {
5033
+ "epoch": 0.7519835897435897,
5034
+ "grad_norm": 71.5,
5035
+ "learning_rate": 9.98531282051282e-07,
5036
+ "loss": 132.9914,
5037
+ "step": 7160
5038
+ },
5039
+ {
5040
+ "epoch": 0.7530338461538462,
5041
+ "grad_norm": 77.4375,
5042
+ "learning_rate": 9.985292307692306e-07,
5043
+ "loss": 133.2398,
5044
+ "step": 7170
5045
+ },
5046
+ {
5047
+ "epoch": 0.7540841025641025,
5048
+ "grad_norm": 79.1875,
5049
+ "learning_rate": 9.985271794871795e-07,
5050
+ "loss": 133.4467,
5051
+ "step": 7180
5052
+ },
5053
+ {
5054
+ "epoch": 0.755134358974359,
5055
+ "grad_norm": 79.375,
5056
+ "learning_rate": 9.985251282051282e-07,
5057
+ "loss": 134.8063,
5058
+ "step": 7190
5059
+ },
5060
+ {
5061
+ "epoch": 0.7561846153846153,
5062
+ "grad_norm": 75.3125,
5063
+ "learning_rate": 9.98523076923077e-07,
5064
+ "loss": 133.1698,
5065
+ "step": 7200
5066
+ },
5067
+ {
5068
+ "epoch": 0.7572348717948718,
5069
+ "grad_norm": 73.5625,
5070
+ "learning_rate": 9.985210256410256e-07,
5071
+ "loss": 133.0102,
5072
+ "step": 7210
5073
+ },
5074
+ {
5075
+ "epoch": 0.7582851282051282,
5076
+ "grad_norm": 71.75,
5077
+ "learning_rate": 9.985189743589743e-07,
5078
+ "loss": 134.511,
5079
+ "step": 7220
5080
+ },
5081
+ {
5082
+ "epoch": 0.7593353846153846,
5083
+ "grad_norm": 85.5,
5084
+ "learning_rate": 9.98516923076923e-07,
5085
+ "loss": 134.3958,
5086
+ "step": 7230
5087
+ },
5088
+ {
5089
+ "epoch": 0.760385641025641,
5090
+ "grad_norm": 87.5625,
5091
+ "learning_rate": 9.985148717948717e-07,
5092
+ "loss": 134.2493,
5093
+ "step": 7240
5094
+ },
5095
+ {
5096
+ "epoch": 0.7614358974358975,
5097
+ "grad_norm": 82.25,
5098
+ "learning_rate": 9.985128205128204e-07,
5099
+ "loss": 133.0641,
5100
+ "step": 7250
5101
+ },
5102
+ {
5103
+ "epoch": 0.7624861538461538,
5104
+ "grad_norm": 82.25,
5105
+ "learning_rate": 9.98510769230769e-07,
5106
+ "loss": 134.3485,
5107
+ "step": 7260
5108
+ },
5109
+ {
5110
+ "epoch": 0.7635364102564103,
5111
+ "grad_norm": 79.1875,
5112
+ "learning_rate": 9.98508717948718e-07,
5113
+ "loss": 134.2047,
5114
+ "step": 7270
5115
+ },
5116
+ {
5117
+ "epoch": 0.7645866666666666,
5118
+ "grad_norm": 84.0625,
5119
+ "learning_rate": 9.985066666666667e-07,
5120
+ "loss": 133.9074,
5121
+ "step": 7280
5122
+ },
5123
+ {
5124
+ "epoch": 0.7656369230769231,
5125
+ "grad_norm": 75.875,
5126
+ "learning_rate": 9.985046153846154e-07,
5127
+ "loss": 132.4027,
5128
+ "step": 7290
5129
+ },
5130
+ {
5131
+ "epoch": 0.7666871794871795,
5132
+ "grad_norm": 79.6875,
5133
+ "learning_rate": 9.98502564102564e-07,
5134
+ "loss": 132.4265,
5135
+ "step": 7300
5136
+ },
5137
+ {
5138
+ "epoch": 0.7677374358974359,
5139
+ "grad_norm": 75.6875,
5140
+ "learning_rate": 9.985005128205128e-07,
5141
+ "loss": 132.4591,
5142
+ "step": 7310
5143
+ },
5144
+ {
5145
+ "epoch": 0.7687876923076923,
5146
+ "grad_norm": 70.75,
5147
+ "learning_rate": 9.984984615384615e-07,
5148
+ "loss": 133.4718,
5149
+ "step": 7320
5150
+ },
5151
+ {
5152
+ "epoch": 0.7698379487179488,
5153
+ "grad_norm": 74.3125,
5154
+ "learning_rate": 9.984964102564102e-07,
5155
+ "loss": 132.798,
5156
+ "step": 7330
5157
+ },
5158
+ {
5159
+ "epoch": 0.7708882051282051,
5160
+ "grad_norm": 76.4375,
5161
+ "learning_rate": 9.98494358974359e-07,
5162
+ "loss": 133.6705,
5163
+ "step": 7340
5164
+ },
5165
+ {
5166
+ "epoch": 0.7719384615384616,
5167
+ "grad_norm": 81.125,
5168
+ "learning_rate": 9.984923076923076e-07,
5169
+ "loss": 132.7228,
5170
+ "step": 7350
5171
+ },
5172
+ {
5173
+ "epoch": 0.7729887179487179,
5174
+ "grad_norm": 75.0,
5175
+ "learning_rate": 9.984902564102565e-07,
5176
+ "loss": 133.0821,
5177
+ "step": 7360
5178
+ },
5179
+ {
5180
+ "epoch": 0.7740389743589744,
5181
+ "grad_norm": 80.625,
5182
+ "learning_rate": 9.984882051282051e-07,
5183
+ "loss": 132.362,
5184
+ "step": 7370
5185
+ },
5186
+ {
5187
+ "epoch": 0.7750892307692308,
5188
+ "grad_norm": 74.6875,
5189
+ "learning_rate": 9.984861538461538e-07,
5190
+ "loss": 133.2498,
5191
+ "step": 7380
5192
+ },
5193
+ {
5194
+ "epoch": 0.7761394871794872,
5195
+ "grad_norm": 78.5,
5196
+ "learning_rate": 9.984841025641025e-07,
5197
+ "loss": 134.545,
5198
+ "step": 7390
5199
+ },
5200
+ {
5201
+ "epoch": 0.7771897435897436,
5202
+ "grad_norm": 82.625,
5203
+ "learning_rate": 9.984820512820512e-07,
5204
+ "loss": 135.3461,
5205
+ "step": 7400
5206
+ },
5207
+ {
5208
+ "epoch": 0.77824,
5209
+ "grad_norm": 80.8125,
5210
+ "learning_rate": 9.9848e-07,
5211
+ "loss": 133.5678,
5212
+ "step": 7410
5213
+ },
5214
+ {
5215
+ "epoch": 0.7792902564102564,
5216
+ "grad_norm": 77.875,
5217
+ "learning_rate": 9.984779487179486e-07,
5218
+ "loss": 133.22,
5219
+ "step": 7420
5220
+ },
5221
+ {
5222
+ "epoch": 0.7803405128205129,
5223
+ "grad_norm": 81.3125,
5224
+ "learning_rate": 9.984758974358975e-07,
5225
+ "loss": 133.2538,
5226
+ "step": 7430
5227
+ },
5228
+ {
5229
+ "epoch": 0.7813907692307692,
5230
+ "grad_norm": 76.125,
5231
+ "learning_rate": 9.98473846153846e-07,
5232
+ "loss": 134.0912,
5233
+ "step": 7440
5234
+ },
5235
+ {
5236
+ "epoch": 0.7824410256410257,
5237
+ "grad_norm": 77.375,
5238
+ "learning_rate": 9.984717948717947e-07,
5239
+ "loss": 133.252,
5240
+ "step": 7450
5241
+ },
5242
+ {
5243
+ "epoch": 0.783491282051282,
5244
+ "grad_norm": 82.0,
5245
+ "learning_rate": 9.984697435897436e-07,
5246
+ "loss": 131.4886,
5247
+ "step": 7460
5248
+ },
5249
+ {
5250
+ "epoch": 0.7845415384615385,
5251
+ "grad_norm": 73.375,
5252
+ "learning_rate": 9.984676923076923e-07,
5253
+ "loss": 133.4359,
5254
+ "step": 7470
5255
+ },
5256
+ {
5257
+ "epoch": 0.7855917948717949,
5258
+ "grad_norm": 74.375,
5259
+ "learning_rate": 9.98465641025641e-07,
5260
+ "loss": 134.0868,
5261
+ "step": 7480
5262
+ },
5263
+ {
5264
+ "epoch": 0.7866420512820513,
5265
+ "grad_norm": 83.4375,
5266
+ "learning_rate": 9.984635897435897e-07,
5267
+ "loss": 132.8313,
5268
+ "step": 7490
5269
+ },
5270
+ {
5271
+ "epoch": 0.7876923076923077,
5272
+ "grad_norm": 76.9375,
5273
+ "learning_rate": 9.984615384615384e-07,
5274
+ "loss": 134.246,
5275
+ "step": 7500
5276
+ },
5277
+ {
5278
+ "epoch": 0.7876923076923077,
5279
+ "eval_loss": 2.086611270904541,
5280
+ "eval_runtime": 327.2333,
5281
+ "eval_samples_per_second": 1527.965,
5282
+ "eval_steps_per_second": 47.752,
5283
+ "step": 7500
   }
 ],
 "logging_steps": 10,
@@ -3542,7 +5300,7 @@
   "attributes": {}
   }
 },
-"total_flos": 1.3805582888730624e+19,
+"total_flos": 2.0708374333095936e+19,
 "train_batch_size": 4,
 "trial_name": null,
 "trial_params": null