Training in progress, step 12100, checkpoint
Browse files
last-checkpoint/model-00001-of-00002.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4978139416
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:990f79ce5d5fce5b54c543410da49311c8727e5393eaf8de5beb75ddea62f025
|
3 |
size 4978139416
|
last-checkpoint/model-00002-of-00002.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 3659223436
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:348350ce5be165c3b33d0652bff5953348b5181192242ec398df2e8b058bc2bb
|
3 |
size 3659223436
|
last-checkpoint/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 17241500333
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7fda19efc8188b89a824ef6b745bab7a4b2df0fcc62fc3ee12571612ab5443e8
|
3 |
size 17241500333
|
last-checkpoint/rng_state.pth
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ac3b1bd46b3911f03359a3982a0c03f865d3787800599fe7d28e536bbc352b08
|
3 |
+
size 14567
|
last-checkpoint/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 623
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4fca80c682586ea565475c8cb2e3f5097ebcafda0408dbe21093035fc5d9ba92
|
3 |
size 623
|
last-checkpoint/trainer_state.json
CHANGED
@@ -1,9 +1,9 @@
|
|
1 |
{
|
2 |
"best_metric": null,
|
3 |
"best_model_checkpoint": null,
|
4 |
-
"epoch":
|
5 |
"eval_steps": 100,
|
6 |
-
"global_step":
|
7 |
"is_hyper_param_search": false,
|
8 |
"is_local_process_zero": true,
|
9 |
"is_world_process_zero": true,
|
@@ -1847,6 +1847,102 @@
|
|
1847 |
"eval_samples_per_second": 26.401,
|
1848 |
"eval_steps_per_second": 3.301,
|
1849 |
"step": 11500
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1850 |
}
|
1851 |
],
|
1852 |
"logging_steps": 100,
|
@@ -1854,7 +1950,7 @@
|
|
1854 |
"num_input_tokens_seen": 0,
|
1855 |
"num_train_epochs": 30,
|
1856 |
"save_steps": 100,
|
1857 |
-
"total_flos": 1.
|
1858 |
"train_batch_size": 8,
|
1859 |
"trial_name": null,
|
1860 |
"trial_params": null
|
|
|
1 |
{
|
2 |
"best_metric": null,
|
3 |
"best_model_checkpoint": null,
|
4 |
+
"epoch": 5.093664491685961,
|
5 |
"eval_steps": 100,
|
6 |
+
"global_step": 12100,
|
7 |
"is_hyper_param_search": false,
|
8 |
"is_local_process_zero": true,
|
9 |
"is_world_process_zero": true,
|
|
|
1847 |
"eval_samples_per_second": 26.401,
|
1848 |
"eval_steps_per_second": 3.301,
|
1849 |
"step": 11500
|
1850 |
+
},
|
1851 |
+
{
|
1852 |
+
"epoch": 4.88,
|
1853 |
+
"grad_norm": 12.522397994995117,
|
1854 |
+
"learning_rate": 4.216254416961131e-05,
|
1855 |
+
"loss": 1.4244,
|
1856 |
+
"step": 11600
|
1857 |
+
},
|
1858 |
+
{
|
1859 |
+
"epoch": 4.88,
|
1860 |
+
"eval_cer": 0.48068612881780143,
|
1861 |
+
"eval_loss": 2.598745822906494,
|
1862 |
+
"eval_runtime": 385.9814,
|
1863 |
+
"eval_samples_per_second": 24.556,
|
1864 |
+
"eval_steps_per_second": 3.07,
|
1865 |
+
"step": 11600
|
1866 |
+
},
|
1867 |
+
{
|
1868 |
+
"epoch": 4.93,
|
1869 |
+
"grad_norm": 3.1015026569366455,
|
1870 |
+
"learning_rate": 4.209187279151944e-05,
|
1871 |
+
"loss": 3.7378,
|
1872 |
+
"step": 11700
|
1873 |
+
},
|
1874 |
+
{
|
1875 |
+
"epoch": 4.93,
|
1876 |
+
"eval_cer": 0.47458791208791207,
|
1877 |
+
"eval_loss": 2.3908824920654297,
|
1878 |
+
"eval_runtime": 373.2148,
|
1879 |
+
"eval_samples_per_second": 25.396,
|
1880 |
+
"eval_steps_per_second": 3.175,
|
1881 |
+
"step": 11700
|
1882 |
+
},
|
1883 |
+
{
|
1884 |
+
"epoch": 4.97,
|
1885 |
+
"grad_norm": 86.87032318115234,
|
1886 |
+
"learning_rate": 4.2021201413427565e-05,
|
1887 |
+
"loss": 2.8329,
|
1888 |
+
"step": 11800
|
1889 |
+
},
|
1890 |
+
{
|
1891 |
+
"epoch": 4.97,
|
1892 |
+
"eval_cer": 0.4754898126784248,
|
1893 |
+
"eval_loss": 2.441450357437134,
|
1894 |
+
"eval_runtime": 446.0173,
|
1895 |
+
"eval_samples_per_second": 21.25,
|
1896 |
+
"eval_steps_per_second": 2.657,
|
1897 |
+
"step": 11800
|
1898 |
+
},
|
1899 |
+
{
|
1900 |
+
"epoch": 5.01,
|
1901 |
+
"grad_norm": 2.7503468990325928,
|
1902 |
+
"learning_rate": 4.195053003533569e-05,
|
1903 |
+
"loss": 2.4912,
|
1904 |
+
"step": 11900
|
1905 |
+
},
|
1906 |
+
{
|
1907 |
+
"epoch": 5.01,
|
1908 |
+
"eval_cer": 0.488764127331743,
|
1909 |
+
"eval_loss": 1.6247801780700684,
|
1910 |
+
"eval_runtime": 361.3079,
|
1911 |
+
"eval_samples_per_second": 26.232,
|
1912 |
+
"eval_steps_per_second": 3.28,
|
1913 |
+
"step": 11900
|
1914 |
+
},
|
1915 |
+
{
|
1916 |
+
"epoch": 5.05,
|
1917 |
+
"grad_norm": 2.511701822280884,
|
1918 |
+
"learning_rate": 4.187985865724382e-05,
|
1919 |
+
"loss": 2.009,
|
1920 |
+
"step": 12000
|
1921 |
+
},
|
1922 |
+
{
|
1923 |
+
"epoch": 5.05,
|
1924 |
+
"eval_cer": 0.46521938915177347,
|
1925 |
+
"eval_loss": 1.8090691566467285,
|
1926 |
+
"eval_runtime": 401.8599,
|
1927 |
+
"eval_samples_per_second": 23.585,
|
1928 |
+
"eval_steps_per_second": 2.949,
|
1929 |
+
"step": 12000
|
1930 |
+
},
|
1931 |
+
{
|
1932 |
+
"epoch": 5.09,
|
1933 |
+
"grad_norm": 4.231322765350342,
|
1934 |
+
"learning_rate": 4.180918727915194e-05,
|
1935 |
+
"loss": 1.6484,
|
1936 |
+
"step": 12100
|
1937 |
+
},
|
1938 |
+
{
|
1939 |
+
"epoch": 5.09,
|
1940 |
+
"eval_cer": 0.483240291736733,
|
1941 |
+
"eval_loss": 1.89494788646698,
|
1942 |
+
"eval_runtime": 367.7673,
|
1943 |
+
"eval_samples_per_second": 25.772,
|
1944 |
+
"eval_steps_per_second": 3.222,
|
1945 |
+
"step": 12100
|
1946 |
}
|
1947 |
],
|
1948 |
"logging_steps": 100,
|
|
|
1950 |
"num_input_tokens_seen": 0,
|
1951 |
"num_train_epochs": 30,
|
1952 |
"save_steps": 100,
|
1953 |
+
"total_flos": 1.3265676741232484e+20,
|
1954 |
"train_batch_size": 8,
|
1955 |
"trial_name": null,
|
1956 |
"trial_params": null
|