fats-fme committed
Commit 7eff20c
Parent: 74fa25b

Training in progress, step 303, checkpoint

last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:36c48e9709004b33bdbbd28b1e33adaa806cd06a26e1f4d1bf6b5fedd784f297
+ oid sha256:556d16bb6aae79c6f06e3f2b46b8e0433216e37e93f5a8a74d9897d77f66f311
  size 501168482
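Reviewer note: these checkpoint files are stored as Git LFS pointers, so each diff swaps only the recorded oid sha256: digest; the size line is unchanged for every file except trainer_state.json. A minimal sketch (path hypothetical) of recomputing the digest an LFS pointer records, to confirm a downloaded blob matches its pointer:

import hashlib

# Minimal sketch, path hypothetical: stream a file and compute the
# sha256 digest that a Git LFS pointer stores in its "oid" line.
def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        while block := f.read(chunk_size):
            digest.update(block)
    return digest.hexdigest()

# Compare against the "+ oid sha256:..." line in the diff above.
print(sha256_of("last-checkpoint/optimizer.pt"))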
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:fc3893adf12779f0d48b1513b948863c9f74d5b80bf37fd6bb3d143cbfff91ec
+ oid sha256:3254f9e31f58266c38d6de2c365eacf98b89fb35d937a352ff406f7e1ed6eac8
  size 14512
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:ce62f71c8c10d088ccc652695dd03a663736071fc2cab92e5639ffdddbd4c788
+ oid sha256:4f1f81f7017fe3de5487899a5240e530892937427049e1d8f17c3160bd856c94
  size 14512
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:b8b5a959d9459f2fc9422d29e6b89c56e276e4fcd74ca81bc84eb9dc629ceec0
+ oid sha256:0cd637ac2493efabb59c857ff309da18b7f1fa1c1f76d6f3f94628c5048f8d88
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 0.06933779365924124,
+ "epoch": 0.09214627841557059,
  "eval_steps": 76,
- "global_step": 228,
+ "global_step": 303,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -1635,6 +1635,531 @@
  "eval_samples_per_second": 3.941,
  "eval_steps_per_second": 0.986,
  "step": 228
+ },
+ {
+ "epoch": 0.06964190678932564,
+ "grad_norm": NaN,
+ "learning_rate": 3.932929715102863e-05,
+ "loss": 0.0,
+ "step": 229
+ },
+ {
+ "epoch": 0.06994601991941002,
+ "grad_norm": NaN,
+ "learning_rate": 3.834691207696649e-05,
+ "loss": 0.0,
+ "step": 230
+ },
+ {
+ "epoch": 0.07025013304949441,
+ "grad_norm": NaN,
+ "learning_rate": 3.7374033224987084e-05,
+ "loss": 0.0,
+ "step": 231
+ },
+ {
+ "epoch": 0.0705542461795788,
+ "grad_norm": NaN,
+ "learning_rate": 3.6410810602214684e-05,
+ "loss": 0.0,
+ "step": 232
+ },
+ {
+ "epoch": 0.0708583593096632,
+ "grad_norm": NaN,
+ "learning_rate": 3.5457392726890236e-05,
+ "loss": 0.0,
+ "step": 233
+ },
+ {
+ "epoch": 0.07116247243974759,
+ "grad_norm": NaN,
+ "learning_rate": 3.45139266054715e-05,
+ "loss": 0.0,
+ "step": 234
+ },
+ {
+ "epoch": 0.07146658556983197,
+ "grad_norm": NaN,
+ "learning_rate": 3.3580557709966066e-05,
+ "loss": 0.0,
+ "step": 235
+ },
+ {
+ "epoch": 0.07177069869991637,
+ "grad_norm": NaN,
+ "learning_rate": 3.2657429955501394e-05,
+ "loss": 0.0,
+ "step": 236
+ },
+ {
+ "epoch": 0.07207481183000077,
+ "grad_norm": NaN,
+ "learning_rate": 3.174468567813461e-05,
+ "loss": 0.0,
+ "step": 237
+ },
+ {
+ "epoch": 0.07237892496008515,
+ "grad_norm": NaN,
+ "learning_rate": 3.0842465612905837e-05,
+ "loss": 0.0,
+ "step": 238
+ },
+ {
+ "epoch": 0.07268303809016954,
+ "grad_norm": NaN,
+ "learning_rate": 2.9950908872138584e-05,
+ "loss": 0.0,
+ "step": 239
+ },
+ {
+ "epoch": 0.07298715122025394,
+ "grad_norm": NaN,
+ "learning_rate": 2.9070152923989946e-05,
+ "loss": 0.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.07329126435033832,
+ "grad_norm": NaN,
+ "learning_rate": 2.82003335712546e-05,
+ "loss": 0.0,
+ "step": 241
+ },
+ {
+ "epoch": 0.07359537748042272,
+ "grad_norm": NaN,
+ "learning_rate": 2.7341584930425657e-05,
+ "loss": 0.0,
+ "step": 242
+ },
+ {
+ "epoch": 0.07389949061050712,
+ "grad_norm": NaN,
+ "learning_rate": 2.6494039411015193e-05,
+ "loss": 0.0,
+ "step": 243
+ },
+ {
+ "epoch": 0.0742036037405915,
+ "grad_norm": NaN,
+ "learning_rate": 2.5657827695138372e-05,
+ "loss": 0.0,
+ "step": 244
+ },
+ {
+ "epoch": 0.0745077168706759,
+ "grad_norm": NaN,
+ "learning_rate": 2.4833078717363544e-05,
+ "loss": 0.0,
+ "step": 245
+ },
+ {
+ "epoch": 0.07481183000076028,
+ "grad_norm": NaN,
+ "learning_rate": 2.4019919644832023e-05,
+ "loss": 0.0,
+ "step": 246
+ },
+ {
+ "epoch": 0.07511594313084467,
+ "grad_norm": NaN,
+ "learning_rate": 2.3218475857650346e-05,
+ "loss": 0.0,
+ "step": 247
+ },
+ {
+ "epoch": 0.07542005626092907,
+ "grad_norm": NaN,
+ "learning_rate": 2.242887092955801e-05,
+ "loss": 0.0,
+ "step": 248
+ },
+ {
+ "epoch": 0.07572416939101345,
+ "grad_norm": NaN,
+ "learning_rate": 2.1651226608873877e-05,
+ "loss": 0.0,
+ "step": 249
+ },
+ {
+ "epoch": 0.07602828252109785,
+ "grad_norm": NaN,
+ "learning_rate": 2.0885662799723804e-05,
+ "loss": 0.0,
+ "step": 250
+ },
+ {
+ "epoch": 0.07633239565118224,
+ "grad_norm": NaN,
+ "learning_rate": 2.0132297543552757e-05,
+ "loss": 0.0,
+ "step": 251
+ },
+ {
+ "epoch": 0.07663650878126663,
+ "grad_norm": NaN,
+ "learning_rate": 1.939124700092423e-05,
+ "loss": 0.0,
+ "step": 252
+ },
+ {
+ "epoch": 0.07694062191135102,
+ "grad_norm": NaN,
+ "learning_rate": 1.866262543360958e-05,
+ "loss": 0.0,
+ "step": 253
+ },
+ {
+ "epoch": 0.07724473504143542,
+ "grad_norm": NaN,
+ "learning_rate": 1.7946545186970022e-05,
+ "loss": 0.0,
+ "step": 254
+ },
+ {
+ "epoch": 0.0775488481715198,
+ "grad_norm": NaN,
+ "learning_rate": 1.7243116672634262e-05,
+ "loss": 0.0,
+ "step": 255
+ },
+ {
+ "epoch": 0.0778529613016042,
+ "grad_norm": NaN,
+ "learning_rate": 1.6552448351474304e-05,
+ "loss": 0.0,
+ "step": 256
+ },
+ {
+ "epoch": 0.0781570744316886,
+ "grad_norm": NaN,
+ "learning_rate": 1.587464671688187e-05,
+ "loss": 0.0,
+ "step": 257
+ },
+ {
+ "epoch": 0.07846118756177298,
+ "grad_norm": NaN,
+ "learning_rate": 1.520981627834851e-05,
+ "loss": 0.0,
+ "step": 258
+ },
+ {
+ "epoch": 0.07876530069185737,
+ "grad_norm": NaN,
+ "learning_rate": 1.4558059545351143e-05,
+ "loss": 0.0,
+ "step": 259
+ },
+ {
+ "epoch": 0.07906941382194176,
+ "grad_norm": NaN,
+ "learning_rate": 1.3919477011546423e-05,
+ "loss": 0.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.07937352695202615,
+ "grad_norm": NaN,
+ "learning_rate": 1.3294167139275593e-05,
+ "loss": 0.0,
+ "step": 261
+ },
+ {
+ "epoch": 0.07967764008211055,
+ "grad_norm": NaN,
+ "learning_rate": 1.2682226344382796e-05,
+ "loss": 0.0,
+ "step": 262
+ },
+ {
+ "epoch": 0.07998175321219493,
+ "grad_norm": NaN,
+ "learning_rate": 1.208374898134883e-05,
+ "loss": 0.0,
+ "step": 263
+ },
+ {
+ "epoch": 0.08028586634227933,
+ "grad_norm": NaN,
+ "learning_rate": 1.1498827328742623e-05,
+ "loss": 0.0,
+ "step": 264
+ },
+ {
+ "epoch": 0.08058997947236372,
+ "grad_norm": NaN,
+ "learning_rate": 1.0927551574992967e-05,
+ "loss": 0.0,
+ "step": 265
+ },
+ {
+ "epoch": 0.0808940926024481,
+ "grad_norm": NaN,
+ "learning_rate": 1.0370009804482483e-05,
+ "loss": 0.0,
+ "step": 266
+ },
+ {
+ "epoch": 0.0811982057325325,
+ "grad_norm": NaN,
+ "learning_rate": 9.82628798396592e-06,
+ "loss": 0.0,
+ "step": 267
+ },
+ {
+ "epoch": 0.0815023188626169,
+ "grad_norm": NaN,
+ "learning_rate": 9.296469949315156e-06,
+ "loss": 0.0,
+ "step": 268
+ },
+ {
+ "epoch": 0.08180643199270128,
+ "grad_norm": NaN,
+ "learning_rate": 8.780637392592495e-06,
+ "loss": 0.0,
+ "step": 269
+ },
+ {
+ "epoch": 0.08211054512278568,
+ "grad_norm": NaN,
+ "learning_rate": 8.278869849454718e-06,
+ "loss": 0.0,
+ "step": 270
+ },
+ {
+ "epoch": 0.08241465825287007,
+ "grad_norm": NaN,
+ "learning_rate": 7.791244686889588e-06,
+ "loss": 0.0,
+ "step": 271
+ },
+ {
+ "epoch": 0.08271877138295446,
+ "grad_norm": NaN,
+ "learning_rate": 7.317837091286706e-06,
+ "loss": 0.0,
+ "step": 272
+ },
+ {
+ "epoch": 0.08302288451303885,
+ "grad_norm": NaN,
+ "learning_rate": 6.858720056844614e-06,
+ "loss": 0.0,
+ "step": 273
+ },
+ {
+ "epoch": 0.08332699764312324,
+ "grad_norm": NaN,
+ "learning_rate": 6.413964374315851e-06,
+ "loss": 0.0,
+ "step": 274
+ },
+ {
+ "epoch": 0.08363111077320763,
+ "grad_norm": NaN,
+ "learning_rate": 5.983638620091858e-06,
+ "loss": 0.0,
+ "step": 275
+ },
+ {
+ "epoch": 0.08393522390329203,
+ "grad_norm": NaN,
+ "learning_rate": 5.567809145629244e-06,
+ "loss": 0.0,
+ "step": 276
+ },
+ {
+ "epoch": 0.08423933703337641,
+ "grad_norm": NaN,
+ "learning_rate": 5.1665400672190725e-06,
+ "loss": 0.0,
+ "step": 277
+ },
+ {
+ "epoch": 0.0845434501634608,
+ "grad_norm": NaN,
+ "learning_rate": 4.7798932561009865e-06,
+ "loss": 0.0,
+ "step": 278
+ },
+ {
+ "epoch": 0.0848475632935452,
+ "grad_norm": NaN,
+ "learning_rate": 4.407928328923194e-06,
+ "loss": 0.0,
+ "step": 279
+ },
+ {
+ "epoch": 0.08515167642362959,
+ "grad_norm": NaN,
+ "learning_rate": 4.050702638550275e-06,
+ "loss": 0.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.08545578955371398,
+ "grad_norm": NaN,
+ "learning_rate": 3.7082712652200867e-06,
+ "loss": 0.0,
+ "step": 281
+ },
+ {
+ "epoch": 0.08575990268379838,
+ "grad_norm": NaN,
+ "learning_rate": 3.380687008050909e-06,
+ "loss": 0.0,
+ "step": 282
+ },
+ {
+ "epoch": 0.08606401581388276,
+ "grad_norm": NaN,
+ "learning_rate": 3.068000376900515e-06,
+ "loss": 0.0,
+ "step": 283
+ },
+ {
+ "epoch": 0.08636812894396716,
+ "grad_norm": NaN,
+ "learning_rate": 2.770259584577972e-06,
+ "loss": 0.0,
+ "step": 284
+ },
+ {
+ "epoch": 0.08667224207405155,
+ "grad_norm": NaN,
+ "learning_rate": 2.4875105394098654e-06,
+ "loss": 0.0,
+ "step": 285
+ },
+ {
+ "epoch": 0.08697635520413594,
+ "grad_norm": NaN,
+ "learning_rate": 2.219796838161681e-06,
+ "loss": 0.0,
+ "step": 286
+ },
+ {
+ "epoch": 0.08728046833422033,
+ "grad_norm": NaN,
+ "learning_rate": 1.967159759315751e-06,
+ "loss": 0.0,
+ "step": 287
+ },
+ {
+ "epoch": 0.08758458146430473,
+ "grad_norm": NaN,
+ "learning_rate": 1.7296382567064672e-06,
+ "loss": 0.0,
+ "step": 288
+ },
+ {
+ "epoch": 0.08788869459438911,
+ "grad_norm": NaN,
+ "learning_rate": 1.5072689535141072e-06,
+ "loss": 0.0,
+ "step": 289
+ },
+ {
+ "epoch": 0.08819280772447351,
+ "grad_norm": NaN,
+ "learning_rate": 1.3000861366179062e-06,
+ "loss": 0.0,
+ "step": 290
+ },
+ {
+ "epoch": 0.08849692085455789,
+ "grad_norm": NaN,
+ "learning_rate": 1.1081217513094212e-06,
+ "loss": 0.0,
+ "step": 291
+ },
+ {
+ "epoch": 0.08880103398464229,
+ "grad_norm": NaN,
+ "learning_rate": 9.314053963669245e-07,
+ "loss": 0.0,
+ "step": 292
+ },
+ {
+ "epoch": 0.08910514711472668,
+ "grad_norm": NaN,
+ "learning_rate": 7.699643194915784e-07,
+ "loss": 0.0,
+ "step": 293
+ },
+ {
+ "epoch": 0.08940926024481106,
+ "grad_norm": NaN,
+ "learning_rate": 6.238234131061616e-07,
+ "loss": 0.0,
+ "step": 294
+ },
+ {
+ "epoch": 0.08971337337489546,
+ "grad_norm": NaN,
+ "learning_rate": 4.93005210516928e-07,
+ "loss": 0.0,
+ "step": 295
+ },
+ {
+ "epoch": 0.09001748650497986,
+ "grad_norm": NaN,
+ "learning_rate": 3.775298824391982e-07,
+ "loss": 0.0,
+ "step": 296
+ },
+ {
+ "epoch": 0.09032159963506424,
+ "grad_norm": NaN,
+ "learning_rate": 2.774152338873126e-07,
+ "loss": 0.0,
+ "step": 297
+ },
+ {
+ "epoch": 0.09062571276514864,
+ "grad_norm": NaN,
+ "learning_rate": 1.9267670142926187e-07,
+ "loss": 0.0,
+ "step": 298
+ },
+ {
+ "epoch": 0.09092982589523303,
+ "grad_norm": NaN,
+ "learning_rate": 1.2332735080651248e-07,
+ "loss": 0.0,
+ "step": 299
+ },
+ {
+ "epoch": 0.09123393902531741,
+ "grad_norm": NaN,
+ "learning_rate": 6.9377874919474e-08,
+ "loss": 0.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.09153805215540181,
+ "grad_norm": NaN,
+ "learning_rate": 3.0836592178717926e-08,
+ "loss": 0.0,
+ "step": 301
+ },
+ {
+ "epoch": 0.09184216528548621,
+ "grad_norm": NaN,
+ "learning_rate": 7.709445222403577e-09,
+ "loss": 0.0,
+ "step": 302
+ },
+ {
+ "epoch": 0.09214627841557059,
+ "grad_norm": NaN,
+ "learning_rate": 0.0,
+ "loss": 0.0,
+ "step": 303
  }
  ],
  "logging_steps": 1,
@@ -1649,12 +2174,12 @@
  "should_evaluate": false,
  "should_log": false,
  "should_save": true,
- "should_training_stop": false
+ "should_training_stop": true
  },
  "attributes": {}
  }
  },
- "total_flos": 1.5612257832366244e+18,
+ "total_flos": 2.06572946950665e+18,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
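Reviewer note: every appended log entry from step 229 through 303 records "grad_norm": NaN with "loss": 0.0, and the run then halts with "should_training_stop": true, which points to a numerically diverged optimizer state rather than a healthy finish. Before resuming from this checkpoint, it may be worth flagging such steps programmatically; a minimal sketch (checkpoint path assumed, "log_history" being the key Transformers uses for these entries):

import json
import math

# Minimal sketch, path assumed: scan a checkpoint's trainer_state.json
# for log entries whose grad_norm is NaN. Python's json.load maps the
# bare NaN token to float("nan") by default, so no custom parsing is needed.
with open("last-checkpoint/trainer_state.json") as f:
    state = json.load(f)

nan_steps = [
    entry["step"]
    for entry in state.get("log_history", [])
    if isinstance(entry.get("grad_norm"), float) and math.isnan(entry["grad_norm"])
]
print(f"{len(nan_steps)} steps logged NaN grad_norm: {nan_steps[:5]} ...")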