mtzig committed on
Commit ad5f102 · verified · 1 Parent(s): bb99c55

Training in progress, step 2000, checkpoint

last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:48f0b842ee73338c5196631f87772bba6f5edf4b3ae89cae7bbfc7f309e0857a
+oid sha256:97e5f90b02b18eee0439efcd1e11c562003887a0a8341c65f3c61afc97e6ce91
 size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3172121c5181db7f36c5f20a872297dfe5b4f0ae30a7959ec1c6216d04d0d1cc
+oid sha256:fa33fe523d912fae3cb37eeb6b60af785266354c6c31911ecc4617df910b0be2
 size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:88a60cedca48e5fc8740b4f1f705f978c0560a4e6385b3969f4dac4afed261e8
+oid sha256:fe1bbd6e31aea6726660dc7dac9d7b7b788a128231286077750bd3b7ceeb5a97
 size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:eb17705c60e3748d16ff6a5ed77b771e13f629bc8439632ab64d0f641cc2332a
+oid sha256:7cefe39a14475612351b6fccd8db8eec85a931549215bf24bafd93144edce8a5
 size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:22c799f3fc1e686a2648fd9a88df8f0e9f27001631c96224ad9df9e896a5d223
+oid sha256:57079b1ad6dfda7f50c73be4cc9a2461ca37b66b4a9e6186c57fa89a2fbb32dc
 size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:900d0bd1d3c3bcd0dad9c4909629cf63a5d624cabc1257f001e2d9077a9e9e53
+oid sha256:b90c40fdfe265353374604f556a9c76615bc263d7688eb1dc6fa1733158babe8
 size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e86c07f1298f4667edef5c54e67b1e608e33a7d17ed5a2972f6c419f38e6ca94
+oid sha256:834bf46634f0752bdb674694ee8a0f7d157d699667caf2b5dc77591f5ada58ec
 size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:85791137bbe5abdeb01422c95c0695f38d7b465390cfce57a8908907a93aa9c3
+oid sha256:20a45e516450ff75e5f30798a9fd5c55d60506aebd3e02c1c8b581ae0fd8ecb1
 size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8eeab6b7e925d9ac0af1499c6158c2bd3d2fa709063a35e8908c75fc9a3bf66e
+oid sha256:f258b75154d2aee1a76c51ba8c53eb9ff1afc1684f65be22d906efc966e2f31d
 size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d56b19c89e54575da49ba3691c2d1cd4239936a6e7cdd184f280c64e52c90fc2
+oid sha256:1a2cd1c1229272286316fc487e083e3c0dbb26b851fd444bc5cfa3906d05744d
 size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:660fb9882f614217e98ebdc720c67d1f69f90546870acb0d060c2c463fa269c7
+oid sha256:54bd2f8ba2fbed41edcaf0b31a7cc52ace7dc5e888e79b744825e45b024f9c0c
 size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:bc5612b1d5a8804a93743e626503af0c5c4b4134be7747f86c470f7d404097de
+oid sha256:20daa4d654ef46df708f18dbbf7bc707be5815cfc90479bf1752f4b1f5183f51
 size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1fbd4dd1b12705ad122adddd6e7db3dc1baec5f8063c359269d322c1f0027ee1
+oid sha256:d0c332a71d8fb512346f2df9841021fb4baac7da78dd4eb8a3c1b75157d59e96
 size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.9006873666745674,
+  "epoch": 0.9480919649205973,
   "eval_steps": 20,
-  "global_step": 1900,
+  "global_step": 2000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -14459,6 +14459,766 @@
   "eval_samples_per_second": 5.48,
   "eval_steps_per_second": 0.182,
   "step": 1900
14462
+ },
14463
+ {
14464
+ "epoch": 0.9011614126570278,
14465
+ "grad_norm": 4.734807014465332,
14466
+ "learning_rate": 5.868268248715292e-07,
14467
+ "loss": 0.2087,
14468
+ "step": 1901
14469
+ },
14470
+ {
14471
+ "epoch": 0.901635458639488,
14472
+ "grad_norm": 3.426779270172119,
14473
+ "learning_rate": 5.812530021526541e-07,
14474
+ "loss": 0.1254,
14475
+ "step": 1902
14476
+ },
14477
+ {
14478
+ "epoch": 0.9021095046219483,
14479
+ "grad_norm": 6.730406761169434,
14480
+ "learning_rate": 5.7570498422569e-07,
14481
+ "loss": 0.1027,
14482
+ "step": 1903
14483
+ },
14484
+ {
14485
+ "epoch": 0.9025835506044086,
14486
+ "grad_norm": 7.400092601776123,
14487
+ "learning_rate": 5.701827862906894e-07,
14488
+ "loss": 0.2117,
14489
+ "step": 1904
14490
+ },
14491
+ {
14492
+ "epoch": 0.9030575965868689,
14493
+ "grad_norm": 5.896395206451416,
14494
+ "learning_rate": 5.646864234769644e-07,
14495
+ "loss": 0.2147,
14496
+ "step": 1905
14497
+ },
14498
+ {
14499
+ "epoch": 0.9035316425693293,
14500
+ "grad_norm": 4.465495586395264,
14501
+ "learning_rate": 5.592159108430472e-07,
14502
+ "loss": 0.0945,
14503
+ "step": 1906
14504
+ },
14505
+ {
14506
+ "epoch": 0.9040056885517895,
14507
+ "grad_norm": 3.8033761978149414,
14508
+ "learning_rate": 5.537712633766479e-07,
14509
+ "loss": 0.0876,
14510
+ "step": 1907
14511
+ },
14512
+ {
14513
+ "epoch": 0.9044797345342498,
14514
+ "grad_norm": 4.422494888305664,
14515
+ "learning_rate": 5.483524959946097e-07,
14516
+ "loss": 0.0863,
14517
+ "step": 1908
14518
+ },
14519
+ {
14520
+ "epoch": 0.9049537805167102,
14521
+ "grad_norm": 5.122066974639893,
14522
+ "learning_rate": 5.429596235428746e-07,
14523
+ "loss": 0.1666,
14524
+ "step": 1909
14525
+ },
14526
+ {
14527
+ "epoch": 0.9054278264991704,
14528
+ "grad_norm": 4.508274078369141,
14529
+ "learning_rate": 5.375926607964399e-07,
14530
+ "loss": 0.1302,
14531
+ "step": 1910
14532
+ },
14533
+ {
14534
+ "epoch": 0.9059018724816307,
14535
+ "grad_norm": 5.495452880859375,
14536
+ "learning_rate": 5.322516224593143e-07,
14537
+ "loss": 0.1386,
14538
+ "step": 1911
14539
+ },
14540
+ {
14541
+ "epoch": 0.906375918464091,
14542
+ "grad_norm": 4.296015739440918,
14543
+ "learning_rate": 5.269365231644851e-07,
14544
+ "loss": 0.1947,
14545
+ "step": 1912
14546
+ },
14547
+ {
14548
+ "epoch": 0.9068499644465513,
14549
+ "grad_norm": 5.446202278137207,
14550
+ "learning_rate": 5.216473774738706e-07,
14551
+ "loss": 0.2568,
14552
+ "step": 1913
14553
+ },
14554
+ {
14555
+ "epoch": 0.9073240104290116,
14556
+ "grad_norm": 3.5428926944732666,
14557
+ "learning_rate": 5.163841998782837e-07,
14558
+ "loss": 0.0723,
14559
+ "step": 1914
14560
+ },
14561
+ {
14562
+ "epoch": 0.9077980564114719,
14563
+ "grad_norm": 3.931621789932251,
14564
+ "learning_rate": 5.111470047973932e-07,
14565
+ "loss": 0.1773,
14566
+ "step": 1915
14567
+ },
14568
+ {
14569
+ "epoch": 0.9082721023939322,
14570
+ "grad_norm": 2.9896233081817627,
14571
+ "learning_rate": 5.059358065796816e-07,
14572
+ "loss": 0.1289,
14573
+ "step": 1916
14574
+ },
14575
+ {
14576
+ "epoch": 0.9087461483763926,
14577
+ "grad_norm": 4.83162784576416,
14578
+ "learning_rate": 5.007506195024059e-07,
14579
+ "loss": 0.1292,
14580
+ "step": 1917
14581
+ },
14582
+ {
14583
+ "epoch": 0.9092201943588528,
14584
+ "grad_norm": 3.8033645153045654,
14585
+ "learning_rate": 4.955914577715615e-07,
14586
+ "loss": 0.1018,
14587
+ "step": 1918
14588
+ },
14589
+ {
14590
+ "epoch": 0.9096942403413131,
14591
+ "grad_norm": 3.928222417831421,
14592
+ "learning_rate": 4.904583355218429e-07,
14593
+ "loss": 0.1198,
14594
+ "step": 1919
14595
+ },
14596
+ {
14597
+ "epoch": 0.9101682863237734,
14598
+ "grad_norm": 5.3848557472229,
14599
+ "learning_rate": 4.853512668166005e-07,
14600
+ "loss": 0.0762,
14601
+ "step": 1920
14602
+ },
14603
+ {
14604
+ "epoch": 0.9101682863237734,
14605
+ "eval_accuracy": 0.9935587761674718,
14606
+ "eval_f1": 0.9272727272727272,
14607
+ "eval_loss": 0.012816701084375381,
14608
+ "eval_precision": 0.8793103448275862,
14609
+ "eval_recall": 0.9807692307692307,
14610
+ "eval_runtime": 50.4456,
14611
+ "eval_samples_per_second": 5.372,
14612
+ "eval_steps_per_second": 0.178,
14613
+ "step": 1920
14614
+ },
14615
+ {
14616
+ "epoch": 0.9106423323062337,
14617
+ "grad_norm": 5.235629558563232,
14618
+ "learning_rate": 4.802702656478053e-07,
14619
+ "loss": 0.1394,
14620
+ "step": 1921
14621
+ },
14622
+ {
14623
+ "epoch": 0.911116378288694,
14624
+ "grad_norm": 6.737102031707764,
14625
+ "learning_rate": 4.752153459360143e-07,
14626
+ "loss": 0.1753,
14627
+ "step": 1922
14628
+ },
14629
+ {
14630
+ "epoch": 0.9115904242711543,
14631
+ "grad_norm": 6.279690265655518,
14632
+ "learning_rate": 4.701865215303236e-07,
14633
+ "loss": 0.1381,
14634
+ "step": 1923
14635
+ },
14636
+ {
14637
+ "epoch": 0.9120644702536146,
14638
+ "grad_norm": 5.9221086502075195,
14639
+ "learning_rate": 4.6518380620833694e-07,
14640
+ "loss": 0.1984,
14641
+ "step": 1924
14642
+ },
14643
+ {
14644
+ "epoch": 0.9125385162360748,
14645
+ "grad_norm": 3.1503348350524902,
14646
+ "learning_rate": 4.602072136761282e-07,
14647
+ "loss": 0.1374,
14648
+ "step": 1925
14649
+ },
14650
+ {
14651
+ "epoch": 0.9130125622185352,
14652
+ "grad_norm": 5.046225070953369,
14653
+ "learning_rate": 4.5525675756819987e-07,
14654
+ "loss": 0.1439,
14655
+ "step": 1926
14656
+ },
14657
+ {
14658
+ "epoch": 0.9134866082009955,
14659
+ "grad_norm": 4.287316799163818,
14660
+ "learning_rate": 4.503324514474483e-07,
14661
+ "loss": 0.1769,
14662
+ "step": 1927
14663
+ },
14664
+ {
14665
+ "epoch": 0.9139606541834558,
14666
+ "grad_norm": 3.769968032836914,
14667
+ "learning_rate": 4.4543430880512604e-07,
14668
+ "loss": 0.1381,
14669
+ "step": 1928
14670
+ },
14671
+ {
14672
+ "epoch": 0.9144347001659161,
14673
+ "grad_norm": 3.910022020339966,
14674
+ "learning_rate": 4.4056234306080415e-07,
14675
+ "loss": 0.142,
14676
+ "step": 1929
14677
+ },
14678
+ {
14679
+ "epoch": 0.9149087461483764,
14680
+ "grad_norm": 2.3405351638793945,
14681
+ "learning_rate": 4.357165675623376e-07,
14682
+ "loss": 0.1014,
14683
+ "step": 1930
14684
+ },
14685
+ {
14686
+ "epoch": 0.9153827921308367,
14687
+ "grad_norm": 5.19395637512207,
14688
+ "learning_rate": 4.3089699558582776e-07,
14689
+ "loss": 0.1192,
14690
+ "step": 1931
14691
+ },
14692
+ {
14693
+ "epoch": 0.915856838113297,
14694
+ "grad_norm": 4.177900314331055,
14695
+ "learning_rate": 4.261036403355823e-07,
14696
+ "loss": 0.1327,
14697
+ "step": 1932
14698
+ },
14699
+ {
14700
+ "epoch": 0.9163308840957572,
14701
+ "grad_norm": 8.132122039794922,
14702
+ "learning_rate": 4.2133651494408513e-07,
14703
+ "loss": 0.1815,
14704
+ "step": 1933
14705
+ },
14706
+ {
14707
+ "epoch": 0.9168049300782176,
14708
+ "grad_norm": 4.363158702850342,
14709
+ "learning_rate": 4.165956324719556e-07,
14710
+ "loss": 0.1009,
14711
+ "step": 1934
14712
+ },
14713
+ {
14714
+ "epoch": 0.9172789760606779,
14715
+ "grad_norm": 2.370462417602539,
14716
+ "learning_rate": 4.1188100590791704e-07,
14717
+ "loss": 0.0722,
14718
+ "step": 1935
14719
+ },
14720
+ {
14721
+ "epoch": 0.9177530220431381,
14722
+ "grad_norm": 3.8629467487335205,
14723
+ "learning_rate": 4.0719264816875713e-07,
14724
+ "loss": 0.1657,
14725
+ "step": 1936
14726
+ },
14727
+ {
14728
+ "epoch": 0.9182270680255985,
14729
+ "grad_norm": 5.270659923553467,
14730
+ "learning_rate": 4.0253057209929556e-07,
14731
+ "loss": 0.1906,
14732
+ "step": 1937
14733
+ },
14734
+ {
14735
+ "epoch": 0.9187011140080588,
14736
+ "grad_norm": 3.739020824432373,
14737
+ "learning_rate": 3.9789479047234293e-07,
14738
+ "loss": 0.1354,
14739
+ "step": 1938
14740
+ },
14741
+ {
14742
+ "epoch": 0.9191751599905191,
14743
+ "grad_norm": 3.877326011657715,
14744
+ "learning_rate": 3.9328531598867517e-07,
14745
+ "loss": 0.1159,
14746
+ "step": 1939
14747
+ },
14748
+ {
14749
+ "epoch": 0.9196492059729794,
14750
+ "grad_norm": 3.6506576538085938,
14751
+ "learning_rate": 3.887021612769937e-07,
14752
+ "loss": 0.1372,
14753
+ "step": 1940
14754
+ },
14755
+ {
14756
+ "epoch": 0.9196492059729794,
14757
+ "eval_accuracy": 0.9935587761674718,
14758
+ "eval_f1": 0.9272727272727272,
14759
+ "eval_loss": 0.012422804720699787,
14760
+ "eval_precision": 0.8793103448275862,
14761
+ "eval_recall": 0.9807692307692307,
14762
+ "eval_runtime": 49.6153,
14763
+ "eval_samples_per_second": 5.462,
14764
+ "eval_steps_per_second": 0.181,
14765
+ "step": 1940
14766
+ },
14767
+ {
14768
+ "epoch": 0.9201232519554396,
14769
+ "grad_norm": 4.246330261230469,
14770
+ "learning_rate": 3.841453388938876e-07,
14771
+ "loss": 0.1756,
14772
+ "step": 1941
14773
+ },
14774
+ {
14775
+ "epoch": 0.9205972979379,
14776
+ "grad_norm": 4.453822135925293,
14777
+ "learning_rate": 3.7961486132380487e-07,
14778
+ "loss": 0.0994,
14779
+ "step": 1942
14780
+ },
14781
+ {
14782
+ "epoch": 0.9210713439203603,
14783
+ "grad_norm": 3.8408455848693848,
14784
+ "learning_rate": 3.7511074097901557e-07,
14785
+ "loss": 0.1043,
14786
+ "step": 1943
14787
+ },
14788
+ {
14789
+ "epoch": 0.9215453899028205,
14790
+ "grad_norm": 4.067150592803955,
14791
+ "learning_rate": 3.7063299019957867e-07,
14792
+ "loss": 0.1134,
14793
+ "step": 1944
14794
+ },
14795
+ {
14796
+ "epoch": 0.9220194358852809,
14797
+ "grad_norm": 6.159415245056152,
14798
+ "learning_rate": 3.661816212533076e-07,
14799
+ "loss": 0.1361,
14800
+ "step": 1945
14801
+ },
14802
+ {
14803
+ "epoch": 0.9224934818677412,
14804
+ "grad_norm": 2.458495616912842,
14805
+ "learning_rate": 3.617566463357336e-07,
14806
+ "loss": 0.0948,
14807
+ "step": 1946
14808
+ },
14809
+ {
14810
+ "epoch": 0.9229675278502014,
14811
+ "grad_norm": 6.143227577209473,
14812
+ "learning_rate": 3.5735807757008354e-07,
14813
+ "loss": 0.2176,
14814
+ "step": 1947
14815
+ },
14816
+ {
14817
+ "epoch": 0.9234415738326618,
14818
+ "grad_norm": 3.9085354804992676,
14819
+ "learning_rate": 3.529859270072289e-07,
14820
+ "loss": 0.1214,
14821
+ "step": 1948
14822
+ },
14823
+ {
14824
+ "epoch": 0.923915619815122,
14825
+ "grad_norm": 4.7195963859558105,
14826
+ "learning_rate": 3.4864020662566775e-07,
14827
+ "loss": 0.1125,
14828
+ "step": 1949
14829
+ },
14830
+ {
14831
+ "epoch": 0.9243896657975824,
14832
+ "grad_norm": 4.297183990478516,
14833
+ "learning_rate": 3.443209283314863e-07,
14834
+ "loss": 0.1678,
14835
+ "step": 1950
14836
+ },
14837
+ {
14838
+ "epoch": 0.9248637117800427,
14839
+ "grad_norm": 2.1935582160949707,
14840
+ "learning_rate": 3.4002810395832753e-07,
14841
+ "loss": 0.0998,
14842
+ "step": 1951
14843
+ },
14844
+ {
14845
+ "epoch": 0.9253377577625029,
14846
+ "grad_norm": 4.309812068939209,
14847
+ "learning_rate": 3.357617452673545e-07,
14848
+ "loss": 0.0861,
14849
+ "step": 1952
14850
+ },
14851
+ {
14852
+ "epoch": 0.9258118037449633,
14853
+ "grad_norm": 5.1253743171691895,
14854
+ "learning_rate": 3.3152186394722506e-07,
14855
+ "loss": 0.1119,
14856
+ "step": 1953
14857
+ },
14858
+ {
14859
+ "epoch": 0.9262858497274236,
14860
+ "grad_norm": 4.127727031707764,
14861
+ "learning_rate": 3.27308471614054e-07,
14862
+ "loss": 0.1562,
14863
+ "step": 1954
14864
+ },
14865
+ {
14866
+ "epoch": 0.9267598957098838,
14867
+ "grad_norm": 3.679004430770874,
14868
+ "learning_rate": 3.2312157981138626e-07,
14869
+ "loss": 0.1136,
14870
+ "step": 1955
14871
+ },
14872
+ {
14873
+ "epoch": 0.9272339416923442,
14874
+ "grad_norm": 6.002187728881836,
14875
+ "learning_rate": 3.189612000101594e-07,
14876
+ "loss": 0.1292,
14877
+ "step": 1956
14878
+ },
14879
+ {
14880
+ "epoch": 0.9277079876748044,
14881
+ "grad_norm": 4.653674125671387,
14882
+ "learning_rate": 3.148273436086757e-07,
14883
+ "loss": 0.1301,
14884
+ "step": 1957
14885
+ },
14886
+ {
14887
+ "epoch": 0.9281820336572647,
14888
+ "grad_norm": 3.5616414546966553,
14889
+ "learning_rate": 3.107200219325746e-07,
14890
+ "loss": 0.1325,
14891
+ "step": 1958
14892
+ },
14893
+ {
14894
+ "epoch": 0.9286560796397251,
14895
+ "grad_norm": 3.704566240310669,
14896
+ "learning_rate": 3.0663924623479337e-07,
14897
+ "loss": 0.1194,
14898
+ "step": 1959
14899
+ },
14900
+ {
14901
+ "epoch": 0.9291301256221853,
14902
+ "grad_norm": 3.3392817974090576,
14903
+ "learning_rate": 3.0258502769553996e-07,
14904
+ "loss": 0.0837,
14905
+ "step": 1960
14906
+ },
14907
+ {
14908
+ "epoch": 0.9291301256221853,
14909
+ "eval_accuracy": 0.9935587761674718,
14910
+ "eval_f1": 0.9272727272727272,
14911
+ "eval_loss": 0.012340452522039413,
14912
+ "eval_precision": 0.8793103448275862,
14913
+ "eval_recall": 0.9807692307692307,
14914
+ "eval_runtime": 50.1351,
14915
+ "eval_samples_per_second": 5.405,
14916
+ "eval_steps_per_second": 0.18,
14917
+ "step": 1960
14918
+ },
14919
+ {
14920
+ "epoch": 0.9296041716046457,
14921
+ "grad_norm": 8.908299446105957,
14922
+ "learning_rate": 2.985573774222661e-07,
14923
+ "loss": 0.1625,
14924
+ "step": 1961
14925
+ },
14926
+ {
14927
+ "epoch": 0.930078217587106,
14928
+ "grad_norm": 3.4975991249084473,
14929
+ "learning_rate": 2.945563064496326e-07,
14930
+ "loss": 0.1712,
14931
+ "step": 1962
14932
+ },
14933
+ {
14934
+ "epoch": 0.9305522635695662,
14935
+ "grad_norm": 4.6506147384643555,
14936
+ "learning_rate": 2.905818257394799e-07,
14937
+ "loss": 0.1451,
14938
+ "step": 1963
14939
+ },
14940
+ {
14941
+ "epoch": 0.9310263095520266,
14942
+ "grad_norm": 1.9439915418624878,
14943
+ "learning_rate": 2.8663394618079875e-07,
14944
+ "loss": 0.0572,
14945
+ "step": 1964
14946
+ },
14947
+ {
14948
+ "epoch": 0.9315003555344868,
14949
+ "grad_norm": 4.1189374923706055,
14950
+ "learning_rate": 2.827126785897005e-07,
14951
+ "loss": 0.1361,
14952
+ "step": 1965
14953
+ },
14954
+ {
14955
+ "epoch": 0.9319744015169471,
14956
+ "grad_norm": 6.66880989074707,
14957
+ "learning_rate": 2.78818033709386e-07,
14958
+ "loss": 0.1701,
14959
+ "step": 1966
14960
+ },
14961
+ {
14962
+ "epoch": 0.9324484474994075,
14963
+ "grad_norm": 4.2832794189453125,
14964
+ "learning_rate": 2.7495002221011757e-07,
14965
+ "loss": 0.1376,
14966
+ "step": 1967
14967
+ },
14968
+ {
14969
+ "epoch": 0.9329224934818677,
14970
+ "grad_norm": 3.8820581436157227,
14971
+ "learning_rate": 2.7110865468919057e-07,
14972
+ "loss": 0.1829,
14973
+ "step": 1968
14974
+ },
14975
+ {
14976
+ "epoch": 0.933396539464328,
14977
+ "grad_norm": 4.808830261230469,
14978
+ "learning_rate": 2.672939416708986e-07,
14979
+ "loss": 0.1535,
14980
+ "step": 1969
14981
+ },
14982
+ {
14983
+ "epoch": 0.9338705854467884,
14984
+ "grad_norm": 3.9023189544677734,
14985
+ "learning_rate": 2.635058936065138e-07,
14986
+ "loss": 0.1386,
14987
+ "step": 1970
14988
+ },
14989
+ {
14990
+ "epoch": 0.9343446314292486,
14991
+ "grad_norm": 8.328058242797852,
14992
+ "learning_rate": 2.5974452087425437e-07,
14993
+ "loss": 0.2852,
14994
+ "step": 1971
14995
+ },
14996
+ {
14997
+ "epoch": 0.934818677411709,
14998
+ "grad_norm": 7.914390563964844,
14999
+ "learning_rate": 2.5600983377925046e-07,
15000
+ "loss": 0.1979,
15001
+ "step": 1972
15002
+ },
15003
+ {
15004
+ "epoch": 0.9352927233941692,
15005
+ "grad_norm": 3.548283815383911,
15006
+ "learning_rate": 2.523018425535251e-07,
15007
+ "loss": 0.1297,
15008
+ "step": 1973
15009
+ },
15010
+ {
15011
+ "epoch": 0.9357667693766295,
15012
+ "grad_norm": 6.728952884674072,
15013
+ "learning_rate": 2.486205573559608e-07,
15014
+ "loss": 0.1692,
15015
+ "step": 1974
15016
+ },
15017
+ {
15018
+ "epoch": 0.9362408153590899,
15019
+ "grad_norm": 3.3491721153259277,
15020
+ "learning_rate": 2.4496598827227213e-07,
15021
+ "loss": 0.0886,
15022
+ "step": 1975
15023
+ },
15024
+ {
15025
+ "epoch": 0.9367148613415501,
15026
+ "grad_norm": 5.318295001983643,
15027
+ "learning_rate": 2.413381453149799e-07,
15028
+ "loss": 0.1496,
15029
+ "step": 1976
15030
+ },
15031
+ {
15032
+ "epoch": 0.9371889073240104,
15033
+ "grad_norm": 8.961012840270996,
15034
+ "learning_rate": 2.3773703842338125e-07,
15035
+ "loss": 0.1294,
15036
+ "step": 1977
15037
+ },
15038
+ {
15039
+ "epoch": 0.9376629533064708,
15040
+ "grad_norm": 8.02442741394043,
15041
+ "learning_rate": 2.3416267746352528e-07,
15042
+ "loss": 0.1405,
15043
+ "step": 1978
15044
+ },
15045
+ {
15046
+ "epoch": 0.938136999288931,
15047
+ "grad_norm": 5.480352401733398,
15048
+ "learning_rate": 2.3061507222818303e-07,
15049
+ "loss": 0.1055,
15050
+ "step": 1979
15051
+ },
15052
+ {
15053
+ "epoch": 0.9386110452713913,
15054
+ "grad_norm": 4.235230445861816,
15055
+ "learning_rate": 2.2709423243682416e-07,
15056
+ "loss": 0.1353,
15057
+ "step": 1980
15058
+ },
15059
+ {
15060
+ "epoch": 0.9386110452713913,
15061
+ "eval_accuracy": 0.9935587761674718,
15062
+ "eval_f1": 0.9272727272727272,
15063
+ "eval_loss": 0.01255668792873621,
15064
+ "eval_precision": 0.8793103448275862,
15065
+ "eval_recall": 0.9807692307692307,
15066
+ "eval_runtime": 49.3085,
15067
+ "eval_samples_per_second": 5.496,
15068
+ "eval_steps_per_second": 0.183,
15069
+ "step": 1980
15070
+ },
15071
+ {
15072
+ "epoch": 0.9390850912538516,
15073
+ "grad_norm": 3.984555959701538,
15074
+ "learning_rate": 2.23600167735587e-07,
15075
+ "loss": 0.1236,
15076
+ "step": 1981
15077
+ },
15078
+ {
15079
+ "epoch": 0.9395591372363119,
15080
+ "grad_norm": 5.206995487213135,
15081
+ "learning_rate": 2.2013288769725194e-07,
15082
+ "loss": 0.2124,
15083
+ "step": 1982
15084
+ },
15085
+ {
15086
+ "epoch": 0.9400331832187723,
15087
+ "grad_norm": 4.533375263214111,
15088
+ "learning_rate": 2.166924018212202e-07,
15089
+ "loss": 0.1632,
15090
+ "step": 1983
15091
+ },
15092
+ {
15093
+ "epoch": 0.9405072292012325,
15094
+ "grad_norm": 5.9977641105651855,
15095
+ "learning_rate": 2.132787195334829e-07,
15096
+ "loss": 0.2039,
15097
+ "step": 1984
15098
+ },
15099
+ {
15100
+ "epoch": 0.9409812751836928,
15101
+ "grad_norm": 6.321089744567871,
15102
+ "learning_rate": 2.0989185018659431e-07,
15103
+ "loss": 0.1539,
15104
+ "step": 1985
15105
+ },
15106
+ {
15107
+ "epoch": 0.9414553211661532,
15108
+ "grad_norm": 4.989069938659668,
15109
+ "learning_rate": 2.0653180305965194e-07,
15110
+ "loss": 0.1501,
15111
+ "step": 1986
15112
+ },
15113
+ {
15114
+ "epoch": 0.9419293671486134,
15115
+ "grad_norm": 4.138362407684326,
15116
+ "learning_rate": 2.0319858735826648e-07,
15117
+ "loss": 0.1388,
15118
+ "step": 1987
15119
+ },
15120
+ {
15121
+ "epoch": 0.9424034131310737,
15122
+ "grad_norm": 11.884577751159668,
15123
+ "learning_rate": 1.9989221221453746e-07,
15124
+ "loss": 0.2071,
15125
+ "step": 1988
15126
+ },
15127
+ {
15128
+ "epoch": 0.942877459113534,
15129
+ "grad_norm": 4.623379707336426,
15130
+ "learning_rate": 1.966126866870277e-07,
15131
+ "loss": 0.1529,
15132
+ "step": 1989
15133
+ },
15134
+ {
15135
+ "epoch": 0.9433515050959943,
15136
+ "grad_norm": 5.523632526397705,
15137
+ "learning_rate": 1.9336001976074326e-07,
15138
+ "loss": 0.2024,
15139
+ "step": 1990
15140
+ },
15141
+ {
15142
+ "epoch": 0.9438255510784546,
15143
+ "grad_norm": 4.839412689208984,
15144
+ "learning_rate": 1.9013422034710016e-07,
15145
+ "loss": 0.1426,
15146
+ "step": 1991
15147
+ },
15148
+ {
15149
+ "epoch": 0.9442995970609149,
15150
+ "grad_norm": 9.59015941619873,
15151
+ "learning_rate": 1.869352972839067e-07,
15152
+ "loss": 0.2005,
15153
+ "step": 1992
15154
+ },
15155
+ {
15156
+ "epoch": 0.9447736430433752,
15157
+ "grad_norm": 4.234097957611084,
15158
+ "learning_rate": 1.837632593353389e-07,
15159
+ "loss": 0.1123,
15160
+ "step": 1993
15161
+ },
15162
+ {
15163
+ "epoch": 0.9452476890258356,
15164
+ "grad_norm": 4.442883491516113,
15165
+ "learning_rate": 1.8061811519191287e-07,
15166
+ "loss": 0.1053,
15167
+ "step": 1994
15168
+ },
15169
+ {
15170
+ "epoch": 0.9457217350082958,
15171
+ "grad_norm": 4.088728904724121,
15172
+ "learning_rate": 1.7749987347046471e-07,
15173
+ "loss": 0.0867,
15174
+ "step": 1995
15175
+ },
15176
+ {
15177
+ "epoch": 0.9461957809907561,
15178
+ "grad_norm": 4.195045471191406,
15179
+ "learning_rate": 1.7440854271412288e-07,
15180
+ "loss": 0.159,
15181
+ "step": 1996
15182
+ },
15183
+ {
15184
+ "epoch": 0.9466698269732164,
15185
+ "grad_norm": 4.0102739334106445,
15186
+ "learning_rate": 1.7134413139228812e-07,
15187
+ "loss": 0.1162,
15188
+ "step": 1997
15189
+ },
15190
+ {
15191
+ "epoch": 0.9471438729556767,
15192
+ "grad_norm": 5.108349800109863,
15193
+ "learning_rate": 1.6830664790061124e-07,
15194
+ "loss": 0.1445,
15195
+ "step": 1998
15196
+ },
15197
+ {
15198
+ "epoch": 0.947617918938137,
15199
+ "grad_norm": 6.93289852142334,
15200
+ "learning_rate": 1.6529610056096768e-07,
15201
+ "loss": 0.1204,
15202
+ "step": 1999
15203
+ },
15204
+ {
15205
+ "epoch": 0.9480919649205973,
15206
+ "grad_norm": 8.224555015563965,
15207
+ "learning_rate": 1.6231249762143187e-07,
15208
+ "loss": 0.1914,
15209
+ "step": 2000
15210
+ },
15211
+ {
15212
+ "epoch": 0.9480919649205973,
15213
+ "eval_accuracy": 0.9935587761674718,
15214
+ "eval_f1": 0.9272727272727272,
15215
+ "eval_loss": 0.012721872888505459,
15216
+ "eval_precision": 0.8793103448275862,
15217
+ "eval_recall": 0.9807692307692307,
15218
+ "eval_runtime": 50.4202,
15219
+ "eval_samples_per_second": 5.375,
15220
+ "eval_steps_per_second": 0.179,
15221
+ "step": 2000
   }
  ],
  "logging_steps": 1,
@@ -14478,7 +15238,7 @@
   "attributes": {}
   }
  },
-  "total_flos": 5.0749333019243315e+17,
+  "total_flos": 5.343662741557084e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null