mtzig commited on
Commit
bb99c55
·
verified ·
1 Parent(s): 13d988c

Training in progress, step 1900, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ab77ddbb637a9bc0d36ef5dbbbaa0af341f0bf80b31d6e285eda485f1898eb8f
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48f0b842ee73338c5196631f87772bba6f5edf4b3ae89cae7bbfc7f309e0857a
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:286a3c21980a7a404bde1675a6009d842c88e5d90ac29813f962481a4368f26f
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3172121c5181db7f36c5f20a872297dfe5b4f0ae30a7959ec1c6216d04d0d1cc
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:09f72128a18889f676efa0e2e54c424b47d5e4132cc601aa9074f6f5411b8f94
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88a60cedca48e5fc8740b4f1f705f978c0560a4e6385b3969f4dac4afed261e8
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:041f67a3ce2b2d81077a75116ed983ec1c3d6e3f3611853c338c7ecf44913d9e
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb17705c60e3748d16ff6a5ed77b771e13f629bc8439632ab64d0f641cc2332a
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:851162e189682dac7fc53a1c0d10ceb5145d8569c1a94696d7c715b4c49a67ea
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22c799f3fc1e686a2648fd9a88df8f0e9f27001631c96224ad9df9e896a5d223
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f7d32e1e73d5d548c4bd50868ff94314b76b56f22bf14438e5afcb5d47b865b7
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:900d0bd1d3c3bcd0dad9c4909629cf63a5d624cabc1257f001e2d9077a9e9e53
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9738f30136c7a74ad2e25b79cf200868a8a6622ff78163abd5aa23402612abf6
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e86c07f1298f4667edef5c54e67b1e608e33a7d17ed5a2972f6c419f38e6ca94
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f4a3d3761905544d982ee155e6770c63fdcd8e1d6ad804c9e3fc0b48ef3c557a
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85791137bbe5abdeb01422c95c0695f38d7b465390cfce57a8908907a93aa9c3
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f6294caa602c6fe4743ca0c7205bb0551de153ef41f54789786a229cd626bef4
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8eeab6b7e925d9ac0af1499c6158c2bd3d2fa709063a35e8908c75fc9a3bf66e
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:305f7da916867733708b5e00527298ca628c3162916331e86427a0e6c1d84c36
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d56b19c89e54575da49ba3691c2d1cd4239936a6e7cdd184f280c64e52c90fc2
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:91731ae666b3fff1615f6c83fbbbe5160c401bb673770f4a96920e7df7c75154
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:660fb9882f614217e98ebdc720c67d1f69f90546870acb0d060c2c463fa269c7
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:30c1d4c35a1c4cecfd92a1a7ea971a84bf462bf7dead3baefbde0a5e7b2317cc
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc5612b1d5a8804a93743e626503af0c5c4b4134be7747f86c470f7d404097de
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1735f1ec303b05af874a7e52da143869d5f926b0a117b607b330e92e0e8872be
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1fbd4dd1b12705ad122adddd6e7db3dc1baec5f8063c359269d322c1f0027ee1
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.8532827684285376,
5
  "eval_steps": 20,
6
- "global_step": 1800,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -13699,6 +13699,766 @@
13699
  "eval_samples_per_second": 5.301,
13700
  "eval_steps_per_second": 0.176,
13701
  "step": 1800
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13702
  }
13703
  ],
13704
  "logging_steps": 1,
@@ -13718,7 +14478,7 @@
13718
  "attributes": {}
13719
  }
13720
  },
13721
- "total_flos": 4.8095721145604506e+17,
13722
  "train_batch_size": 8,
13723
  "trial_name": null,
13724
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9006873666745674,
5
  "eval_steps": 20,
6
+ "global_step": 1900,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
13699
  "eval_samples_per_second": 5.301,
13700
  "eval_steps_per_second": 0.176,
13701
  "step": 1800
13702
+ },
13703
+ {
13704
+ "epoch": 0.8537568144109978,
13705
+ "grad_norm": 3.1635870933532715,
13706
+ "learning_rate": 1.2716048081775823e-06,
13707
+ "loss": 0.1749,
13708
+ "step": 1801
13709
+ },
13710
+ {
13711
+ "epoch": 0.8542308603934582,
13712
+ "grad_norm": 8.236771583557129,
13713
+ "learning_rate": 1.2635392116408095e-06,
13714
+ "loss": 0.1951,
13715
+ "step": 1802
13716
+ },
13717
+ {
13718
+ "epoch": 0.8547049063759184,
13719
+ "grad_norm": 7.184986114501953,
13720
+ "learning_rate": 1.2554975506182533e-06,
13721
+ "loss": 0.157,
13722
+ "step": 1803
13723
+ },
13724
+ {
13725
+ "epoch": 0.8551789523583787,
13726
+ "grad_norm": 2.5440175533294678,
13727
+ "learning_rate": 1.247479847141867e-06,
13728
+ "loss": 0.0708,
13729
+ "step": 1804
13730
+ },
13731
+ {
13732
+ "epoch": 0.8556529983408391,
13733
+ "grad_norm": 3.4107348918914795,
13734
+ "learning_rate": 1.2394861231779677e-06,
13735
+ "loss": 0.0968,
13736
+ "step": 1805
13737
+ },
13738
+ {
13739
+ "epoch": 0.8561270443232993,
13740
+ "grad_norm": 8.80566692352295,
13741
+ "learning_rate": 1.2315164006271718e-06,
13742
+ "loss": 0.1692,
13743
+ "step": 1806
13744
+ },
13745
+ {
13746
+ "epoch": 0.8566010903057597,
13747
+ "grad_norm": 5.312666416168213,
13748
+ "learning_rate": 1.2235707013243426e-06,
13749
+ "loss": 0.119,
13750
+ "step": 1807
13751
+ },
13752
+ {
13753
+ "epoch": 0.85707513628822,
13754
+ "grad_norm": 7.94031286239624,
13755
+ "learning_rate": 1.2156490470385207e-06,
13756
+ "loss": 0.1993,
13757
+ "step": 1808
13758
+ },
13759
+ {
13760
+ "epoch": 0.8575491822706802,
13761
+ "grad_norm": 3.85893177986145,
13762
+ "learning_rate": 1.2077514594728778e-06,
13763
+ "loss": 0.1085,
13764
+ "step": 1809
13765
+ },
13766
+ {
13767
+ "epoch": 0.8580232282531406,
13768
+ "grad_norm": 2.8813283443450928,
13769
+ "learning_rate": 1.1998779602646438e-06,
13770
+ "loss": 0.1099,
13771
+ "step": 1810
13772
+ },
13773
+ {
13774
+ "epoch": 0.8584972742356008,
13775
+ "grad_norm": 4.950772762298584,
13776
+ "learning_rate": 1.1920285709850509e-06,
13777
+ "loss": 0.1064,
13778
+ "step": 1811
13779
+ },
13780
+ {
13781
+ "epoch": 0.8589713202180611,
13782
+ "grad_norm": 3.9935288429260254,
13783
+ "learning_rate": 1.184203313139286e-06,
13784
+ "loss": 0.1145,
13785
+ "step": 1812
13786
+ },
13787
+ {
13788
+ "epoch": 0.8594453662005215,
13789
+ "grad_norm": 5.1902360916137695,
13790
+ "learning_rate": 1.1764022081664094e-06,
13791
+ "loss": 0.164,
13792
+ "step": 1813
13793
+ },
13794
+ {
13795
+ "epoch": 0.8599194121829817,
13796
+ "grad_norm": 4.6810150146484375,
13797
+ "learning_rate": 1.1686252774393181e-06,
13798
+ "loss": 0.1272,
13799
+ "step": 1814
13800
+ },
13801
+ {
13802
+ "epoch": 0.860393458165442,
13803
+ "grad_norm": 3.890429735183716,
13804
+ "learning_rate": 1.1608725422646782e-06,
13805
+ "loss": 0.1128,
13806
+ "step": 1815
13807
+ },
13808
+ {
13809
+ "epoch": 0.8608675041479024,
13810
+ "grad_norm": 9.929910659790039,
13811
+ "learning_rate": 1.1531440238828639e-06,
13812
+ "loss": 0.169,
13813
+ "step": 1816
13814
+ },
13815
+ {
13816
+ "epoch": 0.8613415501303626,
13817
+ "grad_norm": 3.39127516746521,
13818
+ "learning_rate": 1.1454397434679022e-06,
13819
+ "loss": 0.0916,
13820
+ "step": 1817
13821
+ },
13822
+ {
13823
+ "epoch": 0.861815596112823,
13824
+ "grad_norm": 3.8935232162475586,
13825
+ "learning_rate": 1.137759722127415e-06,
13826
+ "loss": 0.1236,
13827
+ "step": 1818
13828
+ },
13829
+ {
13830
+ "epoch": 0.8622896420952832,
13831
+ "grad_norm": 4.592057704925537,
13832
+ "learning_rate": 1.1301039809025628e-06,
13833
+ "loss": 0.1573,
13834
+ "step": 1819
13835
+ },
13836
+ {
13837
+ "epoch": 0.8627636880777435,
13838
+ "grad_norm": 3.4906246662139893,
13839
+ "learning_rate": 1.1224725407679814e-06,
13840
+ "loss": 0.0799,
13841
+ "step": 1820
13842
+ },
13843
+ {
13844
+ "epoch": 0.8627636880777435,
13845
+ "eval_accuracy": 0.9943639291465378,
13846
+ "eval_f1": 0.9369369369369369,
13847
+ "eval_loss": 0.014933480881154537,
13848
+ "eval_precision": 0.8813559322033898,
13849
+ "eval_recall": 1.0,
13850
+ "eval_runtime": 49.8899,
13851
+ "eval_samples_per_second": 5.432,
13852
+ "eval_steps_per_second": 0.18,
13853
+ "step": 1820
13854
+ },
13855
+ {
13856
+ "epoch": 0.8632377340602039,
13857
+ "grad_norm": 7.27462911605835,
13858
+ "learning_rate": 1.1148654226317325e-06,
13859
+ "loss": 0.1538,
13860
+ "step": 1821
13861
+ },
13862
+ {
13863
+ "epoch": 0.8637117800426641,
13864
+ "grad_norm": 3.6112170219421387,
13865
+ "learning_rate": 1.1072826473352394e-06,
13866
+ "loss": 0.1337,
13867
+ "step": 1822
13868
+ },
13869
+ {
13870
+ "epoch": 0.8641858260251244,
13871
+ "grad_norm": 4.936607360839844,
13872
+ "learning_rate": 1.0997242356532335e-06,
13873
+ "loss": 0.152,
13874
+ "step": 1823
13875
+ },
13876
+ {
13877
+ "epoch": 0.8646598720075848,
13878
+ "grad_norm": 7.105523109436035,
13879
+ "learning_rate": 1.0921902082936987e-06,
13880
+ "loss": 0.1772,
13881
+ "step": 1824
13882
+ },
13883
+ {
13884
+ "epoch": 0.865133917990045,
13885
+ "grad_norm": 7.956032752990723,
13886
+ "learning_rate": 1.0846805858978038e-06,
13887
+ "loss": 0.1794,
13888
+ "step": 1825
13889
+ },
13890
+ {
13891
+ "epoch": 0.8656079639725053,
13892
+ "grad_norm": 2.654674530029297,
13893
+ "learning_rate": 1.0771953890398679e-06,
13894
+ "loss": 0.1223,
13895
+ "step": 1826
13896
+ },
13897
+ {
13898
+ "epoch": 0.8660820099549656,
13899
+ "grad_norm": 5.592787265777588,
13900
+ "learning_rate": 1.0697346382272822e-06,
13901
+ "loss": 0.1459,
13902
+ "step": 1827
13903
+ },
13904
+ {
13905
+ "epoch": 0.8665560559374259,
13906
+ "grad_norm": 9.735966682434082,
13907
+ "learning_rate": 1.0622983539004628e-06,
13908
+ "loss": 0.176,
13909
+ "step": 1828
13910
+ },
13911
+ {
13912
+ "epoch": 0.8670301019198863,
13913
+ "grad_norm": 7.254702568054199,
13914
+ "learning_rate": 1.054886556432798e-06,
13915
+ "loss": 0.1962,
13916
+ "step": 1829
13917
+ },
13918
+ {
13919
+ "epoch": 0.8675041479023465,
13920
+ "grad_norm": 3.388284206390381,
13921
+ "learning_rate": 1.047499266130585e-06,
13922
+ "loss": 0.1744,
13923
+ "step": 1830
13924
+ },
13925
+ {
13926
+ "epoch": 0.8679781938848068,
13927
+ "grad_norm": 5.419455051422119,
13928
+ "learning_rate": 1.0401365032329812e-06,
13929
+ "loss": 0.2004,
13930
+ "step": 1831
13931
+ },
13932
+ {
13933
+ "epoch": 0.8684522398672672,
13934
+ "grad_norm": 5.385417938232422,
13935
+ "learning_rate": 1.0327982879119425e-06,
13936
+ "loss": 0.2027,
13937
+ "step": 1832
13938
+ },
13939
+ {
13940
+ "epoch": 0.8689262858497274,
13941
+ "grad_norm": 4.17827033996582,
13942
+ "learning_rate": 1.0254846402721764e-06,
13943
+ "loss": 0.1501,
13944
+ "step": 1833
13945
+ },
13946
+ {
13947
+ "epoch": 0.8694003318321877,
13948
+ "grad_norm": 4.2940354347229,
13949
+ "learning_rate": 1.0181955803510724e-06,
13950
+ "loss": 0.1162,
13951
+ "step": 1834
13952
+ },
13953
+ {
13954
+ "epoch": 0.869874377814648,
13955
+ "grad_norm": 8.596222877502441,
13956
+ "learning_rate": 1.010931128118665e-06,
13957
+ "loss": 0.1216,
13958
+ "step": 1835
13959
+ },
13960
+ {
13961
+ "epoch": 0.8703484237971083,
13962
+ "grad_norm": 4.9963884353637695,
13963
+ "learning_rate": 1.0036913034775675e-06,
13964
+ "loss": 0.1779,
13965
+ "step": 1836
13966
+ },
13967
+ {
13968
+ "epoch": 0.8708224697795686,
13969
+ "grad_norm": 4.238993167877197,
13970
+ "learning_rate": 9.964761262629196e-07,
13971
+ "loss": 0.1237,
13972
+ "step": 1837
13973
+ },
13974
+ {
13975
+ "epoch": 0.8712965157620289,
13976
+ "grad_norm": 8.45755672454834,
13977
+ "learning_rate": 9.892856162423348e-07,
13978
+ "loss": 0.1578,
13979
+ "step": 1838
13980
+ },
13981
+ {
13982
+ "epoch": 0.8717705617444892,
13983
+ "grad_norm": 7.35408353805542,
13984
+ "learning_rate": 9.821197931158455e-07,
13985
+ "loss": 0.2077,
13986
+ "step": 1839
13987
+ },
13988
+ {
13989
+ "epoch": 0.8722446077269496,
13990
+ "grad_norm": 4.194153785705566,
13991
+ "learning_rate": 9.749786765158464e-07,
13992
+ "loss": 0.1294,
13993
+ "step": 1840
13994
+ },
13995
+ {
13996
+ "epoch": 0.8722446077269496,
13997
+ "eval_accuracy": 0.9943639291465378,
13998
+ "eval_f1": 0.9369369369369369,
13999
+ "eval_loss": 0.012980014085769653,
14000
+ "eval_precision": 0.8813559322033898,
14001
+ "eval_recall": 1.0,
14002
+ "eval_runtime": 49.6245,
14003
+ "eval_samples_per_second": 5.461,
14004
+ "eval_steps_per_second": 0.181,
14005
+ "step": 1840
14006
+ },
14007
+ {
14008
+ "epoch": 0.8727186537094098,
14009
+ "grad_norm": 8.562105178833008,
14010
+ "learning_rate": 9.678622860070474e-07,
14011
+ "loss": 0.2731,
14012
+ "step": 1841
14013
+ },
14014
+ {
14015
+ "epoch": 0.8731926996918701,
14016
+ "grad_norm": 7.327461242675781,
14017
+ "learning_rate": 9.607706410864083e-07,
14018
+ "loss": 0.1846,
14019
+ "step": 1842
14020
+ },
14021
+ {
14022
+ "epoch": 0.8736667456743304,
14023
+ "grad_norm": 5.737156867980957,
14024
+ "learning_rate": 9.537037611831047e-07,
14025
+ "loss": 0.2219,
14026
+ "step": 1843
14027
+ },
14028
+ {
14029
+ "epoch": 0.8741407916567907,
14030
+ "grad_norm": 3.665459156036377,
14031
+ "learning_rate": 9.466616656584493e-07,
14032
+ "loss": 0.1163,
14033
+ "step": 1844
14034
+ },
14035
+ {
14036
+ "epoch": 0.874614837639251,
14037
+ "grad_norm": 5.575207710266113,
14038
+ "learning_rate": 9.396443738058614e-07,
14039
+ "loss": 0.1411,
14040
+ "step": 1845
14041
+ },
14042
+ {
14043
+ "epoch": 0.8750888836217113,
14044
+ "grad_norm": 2.1095454692840576,
14045
+ "learning_rate": 9.32651904850801e-07,
14046
+ "loss": 0.0826,
14047
+ "step": 1846
14048
+ },
14049
+ {
14050
+ "epoch": 0.8755629296041716,
14051
+ "grad_norm": 8.68192195892334,
14052
+ "learning_rate": 9.256842779507236e-07,
14053
+ "loss": 0.1324,
14054
+ "step": 1847
14055
+ },
14056
+ {
14057
+ "epoch": 0.8760369755866318,
14058
+ "grad_norm": 7.812302112579346,
14059
+ "learning_rate": 9.187415121950194e-07,
14060
+ "loss": 0.2442,
14061
+ "step": 1848
14062
+ },
14063
+ {
14064
+ "epoch": 0.8765110215690922,
14065
+ "grad_norm": 3.16363787651062,
14066
+ "learning_rate": 9.118236266049707e-07,
14067
+ "loss": 0.1255,
14068
+ "step": 1849
14069
+ },
14070
+ {
14071
+ "epoch": 0.8769850675515525,
14072
+ "grad_norm": 5.470139503479004,
14073
+ "learning_rate": 9.049306401336922e-07,
14074
+ "loss": 0.1974,
14075
+ "step": 1850
14076
+ },
14077
+ {
14078
+ "epoch": 0.8774591135340128,
14079
+ "grad_norm": 3.268472194671631,
14080
+ "learning_rate": 8.980625716660829e-07,
14081
+ "loss": 0.0863,
14082
+ "step": 1851
14083
+ },
14084
+ {
14085
+ "epoch": 0.8779331595164731,
14086
+ "grad_norm": 2.927609920501709,
14087
+ "learning_rate": 8.912194400187712e-07,
14088
+ "loss": 0.0827,
14089
+ "step": 1852
14090
+ },
14091
+ {
14092
+ "epoch": 0.8784072054989334,
14093
+ "grad_norm": 6.857902526855469,
14094
+ "learning_rate": 8.84401263940069e-07,
14095
+ "loss": 0.1555,
14096
+ "step": 1853
14097
+ },
14098
+ {
14099
+ "epoch": 0.8788812514813937,
14100
+ "grad_norm": 4.798774719238281,
14101
+ "learning_rate": 8.776080621099159e-07,
14102
+ "loss": 0.1973,
14103
+ "step": 1854
14104
+ },
14105
+ {
14106
+ "epoch": 0.879355297463854,
14107
+ "grad_norm": 4.6252946853637695,
14108
+ "learning_rate": 8.708398531398233e-07,
14109
+ "loss": 0.1612,
14110
+ "step": 1855
14111
+ },
14112
+ {
14113
+ "epoch": 0.8798293434463142,
14114
+ "grad_norm": 4.394217491149902,
14115
+ "learning_rate": 8.640966555728369e-07,
14116
+ "loss": 0.1261,
14117
+ "step": 1856
14118
+ },
14119
+ {
14120
+ "epoch": 0.8803033894287746,
14121
+ "grad_norm": 6.826826095581055,
14122
+ "learning_rate": 8.573784878834734e-07,
14123
+ "loss": 0.1587,
14124
+ "step": 1857
14125
+ },
14126
+ {
14127
+ "epoch": 0.8807774354112349,
14128
+ "grad_norm": 10.54698657989502,
14129
+ "learning_rate": 8.506853684776773e-07,
14130
+ "loss": 0.154,
14131
+ "step": 1858
14132
+ },
14133
+ {
14134
+ "epoch": 0.8812514813936951,
14135
+ "grad_norm": 4.272285461425781,
14136
+ "learning_rate": 8.440173156927612e-07,
14137
+ "loss": 0.1157,
14138
+ "step": 1859
14139
+ },
14140
+ {
14141
+ "epoch": 0.8817255273761555,
14142
+ "grad_norm": 5.016007900238037,
14143
+ "learning_rate": 8.373743477973739e-07,
14144
+ "loss": 0.2076,
14145
+ "step": 1860
14146
+ },
14147
+ {
14148
+ "epoch": 0.8817255273761555,
14149
+ "eval_accuracy": 0.9935587761674718,
14150
+ "eval_f1": 0.9272727272727272,
14151
+ "eval_loss": 0.012083540670573711,
14152
+ "eval_precision": 0.8793103448275862,
14153
+ "eval_recall": 0.9807692307692307,
14154
+ "eval_runtime": 49.9345,
14155
+ "eval_samples_per_second": 5.427,
14156
+ "eval_steps_per_second": 0.18,
14157
+ "step": 1860
14158
+ },
14159
+ {
14160
+ "epoch": 0.8821995733586158,
14161
+ "grad_norm": 5.868921756744385,
14162
+ "learning_rate": 8.307564829914272e-07,
14163
+ "loss": 0.168,
14164
+ "step": 1861
14165
+ },
14166
+ {
14167
+ "epoch": 0.8826736193410761,
14168
+ "grad_norm": 8.008037567138672,
14169
+ "learning_rate": 8.241637394060619e-07,
14170
+ "loss": 0.0711,
14171
+ "step": 1862
14172
+ },
14173
+ {
14174
+ "epoch": 0.8831476653235364,
14175
+ "grad_norm": 4.42324686050415,
14176
+ "learning_rate": 8.175961351035943e-07,
14177
+ "loss": 0.0957,
14178
+ "step": 1863
14179
+ },
14180
+ {
14181
+ "epoch": 0.8836217113059966,
14182
+ "grad_norm": 5.00337553024292,
14183
+ "learning_rate": 8.110536880774655e-07,
14184
+ "loss": 0.1814,
14185
+ "step": 1864
14186
+ },
14187
+ {
14188
+ "epoch": 0.884095757288457,
14189
+ "grad_norm": 4.169017791748047,
14190
+ "learning_rate": 8.045364162521884e-07,
14191
+ "loss": 0.112,
14192
+ "step": 1865
14193
+ },
14194
+ {
14195
+ "epoch": 0.8845698032709173,
14196
+ "grad_norm": 8.79692554473877,
14197
+ "learning_rate": 7.98044337483308e-07,
14198
+ "loss": 0.2539,
14199
+ "step": 1866
14200
+ },
14201
+ {
14202
+ "epoch": 0.8850438492533775,
14203
+ "grad_norm": 6.905977725982666,
14204
+ "learning_rate": 7.915774695573452e-07,
14205
+ "loss": 0.1628,
14206
+ "step": 1867
14207
+ },
14208
+ {
14209
+ "epoch": 0.8855178952358379,
14210
+ "grad_norm": 3.759481430053711,
14211
+ "learning_rate": 7.851358301917511e-07,
14212
+ "loss": 0.17,
14213
+ "step": 1868
14214
+ },
14215
+ {
14216
+ "epoch": 0.8859919412182982,
14217
+ "grad_norm": 4.754873275756836,
14218
+ "learning_rate": 7.787194370348549e-07,
14219
+ "loss": 0.2469,
14220
+ "step": 1869
14221
+ },
14222
+ {
14223
+ "epoch": 0.8864659872007584,
14224
+ "grad_norm": 5.5656280517578125,
14225
+ "learning_rate": 7.723283076658217e-07,
14226
+ "loss": 0.1551,
14227
+ "step": 1870
14228
+ },
14229
+ {
14230
+ "epoch": 0.8869400331832188,
14231
+ "grad_norm": 2.3625526428222656,
14232
+ "learning_rate": 7.659624595945969e-07,
14233
+ "loss": 0.0846,
14234
+ "step": 1871
14235
+ },
14236
+ {
14237
+ "epoch": 0.887414079165679,
14238
+ "grad_norm": 10.592917442321777,
14239
+ "learning_rate": 7.596219102618652e-07,
14240
+ "loss": 0.2762,
14241
+ "step": 1872
14242
+ },
14243
+ {
14244
+ "epoch": 0.8878881251481394,
14245
+ "grad_norm": 5.2067952156066895,
14246
+ "learning_rate": 7.533066770389985e-07,
14247
+ "loss": 0.1768,
14248
+ "step": 1873
14249
+ },
14250
+ {
14251
+ "epoch": 0.8883621711305997,
14252
+ "grad_norm": 4.715292930603027,
14253
+ "learning_rate": 7.470167772280091e-07,
14254
+ "loss": 0.1107,
14255
+ "step": 1874
14256
+ },
14257
+ {
14258
+ "epoch": 0.8888362171130599,
14259
+ "grad_norm": 3.512718439102173,
14260
+ "learning_rate": 7.40752228061502e-07,
14261
+ "loss": 0.1145,
14262
+ "step": 1875
14263
+ },
14264
+ {
14265
+ "epoch": 0.8893102630955203,
14266
+ "grad_norm": 3.8536527156829834,
14267
+ "learning_rate": 7.345130467026318e-07,
14268
+ "loss": 0.1473,
14269
+ "step": 1876
14270
+ },
14271
+ {
14272
+ "epoch": 0.8897843090779806,
14273
+ "grad_norm": 3.4637436866760254,
14274
+ "learning_rate": 7.282992502450447e-07,
14275
+ "loss": 0.1661,
14276
+ "step": 1877
14277
+ },
14278
+ {
14279
+ "epoch": 0.8902583550604408,
14280
+ "grad_norm": 2.695815324783325,
14281
+ "learning_rate": 7.221108557128509e-07,
14282
+ "loss": 0.139,
14283
+ "step": 1878
14284
+ },
14285
+ {
14286
+ "epoch": 0.8907324010429012,
14287
+ "grad_norm": 4.534758567810059,
14288
+ "learning_rate": 7.159478800605546e-07,
14289
+ "loss": 0.1425,
14290
+ "step": 1879
14291
+ },
14292
+ {
14293
+ "epoch": 0.8912064470253614,
14294
+ "grad_norm": 7.158409595489502,
14295
+ "learning_rate": 7.098103401730272e-07,
14296
+ "loss": 0.1628,
14297
+ "step": 1880
14298
+ },
14299
+ {
14300
+ "epoch": 0.8912064470253614,
14301
+ "eval_accuracy": 0.9935587761674718,
14302
+ "eval_f1": 0.9272727272727272,
14303
+ "eval_loss": 0.012087295763194561,
14304
+ "eval_precision": 0.8793103448275862,
14305
+ "eval_recall": 0.9807692307692307,
14306
+ "eval_runtime": 49.8216,
14307
+ "eval_samples_per_second": 5.439,
14308
+ "eval_steps_per_second": 0.181,
14309
+ "step": 1880
14310
+ },
14311
+ {
14312
+ "epoch": 0.8916804930078217,
14313
+ "grad_norm": 4.419368267059326,
14314
+ "learning_rate": 7.03698252865449e-07,
14315
+ "loss": 0.0986,
14316
+ "step": 1881
14317
+ },
14318
+ {
14319
+ "epoch": 0.8921545389902821,
14320
+ "grad_norm": 5.9724931716918945,
14321
+ "learning_rate": 6.976116348832684e-07,
14322
+ "loss": 0.2064,
14323
+ "step": 1882
14324
+ },
14325
+ {
14326
+ "epoch": 0.8926285849727423,
14327
+ "grad_norm": 4.130607604980469,
14328
+ "learning_rate": 6.915505029021552e-07,
14329
+ "loss": 0.1445,
14330
+ "step": 1883
14331
+ },
14332
+ {
14333
+ "epoch": 0.8931026309552027,
14334
+ "grad_norm": 4.273713111877441,
14335
+ "learning_rate": 6.855148735279527e-07,
14336
+ "loss": 0.1389,
14337
+ "step": 1884
14338
+ },
14339
+ {
14340
+ "epoch": 0.893576676937663,
14341
+ "grad_norm": 5.399996280670166,
14342
+ "learning_rate": 6.795047632966379e-07,
14343
+ "loss": 0.1461,
14344
+ "step": 1885
14345
+ },
14346
+ {
14347
+ "epoch": 0.8940507229201232,
14348
+ "grad_norm": 6.056548118591309,
14349
+ "learning_rate": 6.735201886742671e-07,
14350
+ "loss": 0.1935,
14351
+ "step": 1886
14352
+ },
14353
+ {
14354
+ "epoch": 0.8945247689025836,
14355
+ "grad_norm": 5.537142276763916,
14356
+ "learning_rate": 6.675611660569403e-07,
14357
+ "loss": 0.1816,
14358
+ "step": 1887
14359
+ },
14360
+ {
14361
+ "epoch": 0.8949988148850438,
14362
+ "grad_norm": 6.469786167144775,
14363
+ "learning_rate": 6.616277117707493e-07,
14364
+ "loss": 0.1772,
14365
+ "step": 1888
14366
+ },
14367
+ {
14368
+ "epoch": 0.8954728608675041,
14369
+ "grad_norm": 4.300382137298584,
14370
+ "learning_rate": 6.55719842071737e-07,
14371
+ "loss": 0.0932,
14372
+ "step": 1889
14373
+ },
14374
+ {
14375
+ "epoch": 0.8959469068499645,
14376
+ "grad_norm": 6.920015335083008,
14377
+ "learning_rate": 6.498375731458529e-07,
14378
+ "loss": 0.208,
14379
+ "step": 1890
14380
+ },
14381
+ {
14382
+ "epoch": 0.8964209528324247,
14383
+ "grad_norm": 5.358169078826904,
14384
+ "learning_rate": 6.439809211089043e-07,
14385
+ "loss": 0.1518,
14386
+ "step": 1891
14387
+ },
14388
+ {
14389
+ "epoch": 0.896894998814885,
14390
+ "grad_norm": 9.420503616333008,
14391
+ "learning_rate": 6.381499020065163e-07,
14392
+ "loss": 0.1817,
14393
+ "step": 1892
14394
+ },
14395
+ {
14396
+ "epoch": 0.8973690447973454,
14397
+ "grad_norm": 5.0321855545043945,
14398
+ "learning_rate": 6.323445318140886e-07,
14399
+ "loss": 0.1786,
14400
+ "step": 1893
14401
+ },
14402
+ {
14403
+ "epoch": 0.8978430907798056,
14404
+ "grad_norm": 4.13561487197876,
14405
+ "learning_rate": 6.265648264367452e-07,
14406
+ "loss": 0.1003,
14407
+ "step": 1894
14408
+ },
14409
+ {
14410
+ "epoch": 0.898317136762266,
14411
+ "grad_norm": 7.733060359954834,
14412
+ "learning_rate": 6.20810801709305e-07,
14413
+ "loss": 0.216,
14414
+ "step": 1895
14415
+ },
14416
+ {
14417
+ "epoch": 0.8987911827447262,
14418
+ "grad_norm": 2.7273457050323486,
14419
+ "learning_rate": 6.15082473396218e-07,
14420
+ "loss": 0.1149,
14421
+ "step": 1896
14422
+ },
14423
+ {
14424
+ "epoch": 0.8992652287271865,
14425
+ "grad_norm": 2.0938057899475098,
14426
+ "learning_rate": 6.093798571915389e-07,
14427
+ "loss": 0.0787,
14428
+ "step": 1897
14429
+ },
14430
+ {
14431
+ "epoch": 0.8997392747096469,
14432
+ "grad_norm": 6.044375896453857,
14433
+ "learning_rate": 6.037029687188767e-07,
14434
+ "loss": 0.1878,
14435
+ "step": 1898
14436
+ },
14437
+ {
14438
+ "epoch": 0.9002133206921071,
14439
+ "grad_norm": 2.365513563156128,
14440
+ "learning_rate": 5.980518235313549e-07,
14441
+ "loss": 0.1065,
14442
+ "step": 1899
14443
+ },
14444
+ {
14445
+ "epoch": 0.9006873666745674,
14446
+ "grad_norm": 4.049135684967041,
14447
+ "learning_rate": 5.924264371115652e-07,
14448
+ "loss": 0.156,
14449
+ "step": 1900
14450
+ },
14451
+ {
14452
+ "epoch": 0.9006873666745674,
14453
+ "eval_accuracy": 0.9935587761674718,
14454
+ "eval_f1": 0.9272727272727272,
14455
+ "eval_loss": 0.012837257236242294,
14456
+ "eval_precision": 0.8793103448275862,
14457
+ "eval_recall": 0.9807692307692307,
14458
+ "eval_runtime": 49.4558,
14459
+ "eval_samples_per_second": 5.48,
14460
+ "eval_steps_per_second": 0.182,
14461
+ "step": 1900
14462
  }
14463
  ],
14464
  "logging_steps": 1,
 
14478
  "attributes": {}
14479
  }
14480
  },
14481
+ "total_flos": 5.0749333019243315e+17,
14482
  "train_batch_size": 8,
14483
  "trial_name": null,
14484
  "trial_params": null