hsethu's picture
Upload folder using huggingface_hub
2abeb7e verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 22486,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 9.991105576803343e-07,
"loss": 3.5693,
"step": 64
},
{
"epoch": 0.01,
"learning_rate": 9.981618192060246e-07,
"loss": 1.3039,
"step": 128
},
{
"epoch": 0.01,
"learning_rate": 9.972130807317145e-07,
"loss": 0.6178,
"step": 192
},
{
"epoch": 0.01,
"learning_rate": 9.962643422574046e-07,
"loss": 0.2217,
"step": 256
},
{
"epoch": 0.01,
"learning_rate": 9.953156037830946e-07,
"loss": 0.135,
"step": 320
},
{
"epoch": 0.02,
"learning_rate": 9.943668653087847e-07,
"loss": 0.2547,
"step": 384
},
{
"epoch": 0.02,
"learning_rate": 9.934181268344748e-07,
"loss": 0.2357,
"step": 448
},
{
"epoch": 0.02,
"learning_rate": 9.92513860476148e-07,
"loss": 0.4032,
"step": 512
},
{
"epoch": 0.03,
"learning_rate": 9.915651220018381e-07,
"loss": 0.2513,
"step": 576
},
{
"epoch": 0.03,
"learning_rate": 9.906163835275282e-07,
"loss": 0.2058,
"step": 640
},
{
"epoch": 0.03,
"learning_rate": 9.89667645053218e-07,
"loss": 0.2276,
"step": 704
},
{
"epoch": 0.03,
"learning_rate": 9.887189065789084e-07,
"loss": 0.1798,
"step": 768
},
{
"epoch": 0.04,
"learning_rate": 9.877701681045983e-07,
"loss": 0.2646,
"step": 832
},
{
"epoch": 0.04,
"learning_rate": 9.868214296302884e-07,
"loss": 0.2434,
"step": 896
},
{
"epoch": 0.04,
"learning_rate": 9.858726911559785e-07,
"loss": 0.3211,
"step": 960
},
{
"epoch": 0.05,
"learning_rate": 9.849239526816686e-07,
"loss": 0.1601,
"step": 1024
},
{
"epoch": 0.05,
"learning_rate": 9.839752142073586e-07,
"loss": 0.2012,
"step": 1088
},
{
"epoch": 0.05,
"learning_rate": 9.830264757330487e-07,
"loss": 0.1635,
"step": 1152
},
{
"epoch": 0.05,
"learning_rate": 9.820777372587388e-07,
"loss": 0.2808,
"step": 1216
},
{
"epoch": 0.06,
"learning_rate": 9.811289987844287e-07,
"loss": 0.1622,
"step": 1280
},
{
"epoch": 0.06,
"learning_rate": 9.801802603101188e-07,
"loss": 0.23,
"step": 1344
},
{
"epoch": 0.06,
"learning_rate": 9.792315218358089e-07,
"loss": 0.1615,
"step": 1408
},
{
"epoch": 0.07,
"learning_rate": 9.78282783361499e-07,
"loss": 0.1424,
"step": 1472
},
{
"epoch": 0.07,
"learning_rate": 9.77334044887189e-07,
"loss": 0.2007,
"step": 1536
},
{
"epoch": 0.07,
"learning_rate": 9.763853064128792e-07,
"loss": 0.3649,
"step": 1600
},
{
"epoch": 0.07,
"learning_rate": 9.75436567938569e-07,
"loss": 0.1149,
"step": 1664
},
{
"epoch": 0.08,
"learning_rate": 9.744878294642591e-07,
"loss": 0.1882,
"step": 1728
},
{
"epoch": 0.08,
"learning_rate": 9.735390909899492e-07,
"loss": 0.2598,
"step": 1792
},
{
"epoch": 0.08,
"learning_rate": 9.725903525156393e-07,
"loss": 0.2198,
"step": 1856
},
{
"epoch": 0.09,
"learning_rate": 9.716416140413294e-07,
"loss": 0.1786,
"step": 1920
},
{
"epoch": 0.09,
"learning_rate": 9.706928755670195e-07,
"loss": 0.2713,
"step": 1984
},
{
"epoch": 0.09,
"learning_rate": 9.697441370927096e-07,
"loss": 0.2076,
"step": 2048
},
{
"epoch": 0.09,
"learning_rate": 9.687953986183995e-07,
"loss": 0.2343,
"step": 2112
},
{
"epoch": 0.1,
"learning_rate": 9.678466601440898e-07,
"loss": 0.3182,
"step": 2176
},
{
"epoch": 0.1,
"learning_rate": 9.668979216697796e-07,
"loss": 0.2102,
"step": 2240
},
{
"epoch": 0.1,
"learning_rate": 9.659491831954697e-07,
"loss": 0.2687,
"step": 2304
},
{
"epoch": 0.11,
"learning_rate": 9.650004447211598e-07,
"loss": 0.2186,
"step": 2368
},
{
"epoch": 0.11,
"learning_rate": 9.6405170624685e-07,
"loss": 0.1295,
"step": 2432
},
{
"epoch": 0.11,
"learning_rate": 9.631029677725398e-07,
"loss": 0.1542,
"step": 2496
},
{
"epoch": 0.11,
"learning_rate": 9.621542292982299e-07,
"loss": 0.1365,
"step": 2560
},
{
"epoch": 0.12,
"learning_rate": 9.6120549082392e-07,
"loss": 0.2106,
"step": 2624
},
{
"epoch": 0.12,
"learning_rate": 9.6025675234961e-07,
"loss": 0.2753,
"step": 2688
},
{
"epoch": 0.12,
"learning_rate": 9.593080138753002e-07,
"loss": 0.2162,
"step": 2752
},
{
"epoch": 0.13,
"learning_rate": 9.583592754009902e-07,
"loss": 0.2186,
"step": 2816
},
{
"epoch": 0.13,
"learning_rate": 9.574105369266803e-07,
"loss": 0.0991,
"step": 2880
},
{
"epoch": 0.13,
"learning_rate": 9.564617984523702e-07,
"loss": 0.1616,
"step": 2944
},
{
"epoch": 0.13,
"learning_rate": 9.555130599780605e-07,
"loss": 0.2029,
"step": 3008
},
{
"epoch": 0.14,
"learning_rate": 9.545643215037504e-07,
"loss": 0.1659,
"step": 3072
},
{
"epoch": 0.14,
"learning_rate": 9.536155830294405e-07,
"loss": 0.1887,
"step": 3136
},
{
"epoch": 0.14,
"learning_rate": 9.526668445551306e-07,
"loss": 0.2626,
"step": 3200
},
{
"epoch": 0.15,
"learning_rate": 9.517181060808206e-07,
"loss": 0.1325,
"step": 3264
},
{
"epoch": 0.15,
"learning_rate": 9.507693676065108e-07,
"loss": 0.2125,
"step": 3328
},
{
"epoch": 0.15,
"learning_rate": 9.498206291322007e-07,
"loss": 0.1926,
"step": 3392
},
{
"epoch": 0.15,
"learning_rate": 9.488718906578907e-07,
"loss": 0.2237,
"step": 3456
},
{
"epoch": 0.16,
"learning_rate": 9.479231521835809e-07,
"loss": 0.1283,
"step": 3520
},
{
"epoch": 0.16,
"learning_rate": 9.469744137092709e-07,
"loss": 0.2052,
"step": 3584
},
{
"epoch": 0.16,
"learning_rate": 9.46025675234961e-07,
"loss": 0.1983,
"step": 3648
},
{
"epoch": 0.17,
"learning_rate": 9.45076936760651e-07,
"loss": 0.1976,
"step": 3712
},
{
"epoch": 0.17,
"learning_rate": 9.441281982863411e-07,
"loss": 0.1999,
"step": 3776
},
{
"epoch": 0.17,
"learning_rate": 9.431794598120312e-07,
"loss": 0.2231,
"step": 3840
},
{
"epoch": 0.17,
"learning_rate": 9.422307213377212e-07,
"loss": 0.2057,
"step": 3904
},
{
"epoch": 0.18,
"learning_rate": 9.412819828634113e-07,
"loss": 0.2376,
"step": 3968
},
{
"epoch": 0.18,
"learning_rate": 9.403332443891013e-07,
"loss": 0.1325,
"step": 4032
},
{
"epoch": 0.18,
"learning_rate": 9.393845059147913e-07,
"loss": 0.2595,
"step": 4096
},
{
"epoch": 0.19,
"learning_rate": 9.384357674404815e-07,
"loss": 0.1817,
"step": 4160
},
{
"epoch": 0.19,
"learning_rate": 9.374870289661715e-07,
"loss": 0.2224,
"step": 4224
},
{
"epoch": 0.19,
"learning_rate": 9.365382904918616e-07,
"loss": 0.1444,
"step": 4288
},
{
"epoch": 0.19,
"learning_rate": 9.355895520175517e-07,
"loss": 0.2695,
"step": 4352
},
{
"epoch": 0.2,
"learning_rate": 9.346408135432417e-07,
"loss": 0.1859,
"step": 4416
},
{
"epoch": 0.2,
"learning_rate": 9.336920750689318e-07,
"loss": 0.1551,
"step": 4480
},
{
"epoch": 0.2,
"learning_rate": 9.327433365946217e-07,
"loss": 0.2093,
"step": 4544
},
{
"epoch": 0.2,
"learning_rate": 9.317945981203118e-07,
"loss": 0.1922,
"step": 4608
},
{
"epoch": 0.21,
"learning_rate": 9.308458596460019e-07,
"loss": 0.2056,
"step": 4672
},
{
"epoch": 0.21,
"learning_rate": 9.298971211716919e-07,
"loss": 0.2087,
"step": 4736
},
{
"epoch": 0.21,
"learning_rate": 9.289483826973821e-07,
"loss": 0.1552,
"step": 4800
},
{
"epoch": 0.22,
"learning_rate": 9.279996442230721e-07,
"loss": 0.1619,
"step": 4864
},
{
"epoch": 0.22,
"learning_rate": 9.270509057487621e-07,
"loss": 0.2634,
"step": 4928
},
{
"epoch": 0.22,
"learning_rate": 9.261021672744523e-07,
"loss": 0.1459,
"step": 4992
},
{
"epoch": 0.22,
"learning_rate": 9.251534288001423e-07,
"loss": 0.225,
"step": 5056
},
{
"epoch": 0.23,
"learning_rate": 9.242046903258323e-07,
"loss": 0.1878,
"step": 5120
},
{
"epoch": 0.23,
"learning_rate": 9.232559518515224e-07,
"loss": 0.1286,
"step": 5184
},
{
"epoch": 0.23,
"learning_rate": 9.223072133772124e-07,
"loss": 0.1059,
"step": 5248
},
{
"epoch": 0.24,
"learning_rate": 9.213584749029025e-07,
"loss": 0.1541,
"step": 5312
},
{
"epoch": 0.24,
"learning_rate": 9.204097364285925e-07,
"loss": 0.2427,
"step": 5376
},
{
"epoch": 0.24,
"learning_rate": 9.194609979542827e-07,
"loss": 0.1368,
"step": 5440
},
{
"epoch": 0.24,
"learning_rate": 9.185122594799727e-07,
"loss": 0.2477,
"step": 5504
},
{
"epoch": 0.25,
"learning_rate": 9.175635210056627e-07,
"loss": 0.1385,
"step": 5568
},
{
"epoch": 0.25,
"learning_rate": 9.166147825313529e-07,
"loss": 0.2537,
"step": 5632
},
{
"epoch": 0.25,
"learning_rate": 9.156660440570428e-07,
"loss": 0.2374,
"step": 5696
},
{
"epoch": 0.26,
"learning_rate": 9.147173055827329e-07,
"loss": 0.1942,
"step": 5760
},
{
"epoch": 0.26,
"learning_rate": 9.13768567108423e-07,
"loss": 0.1416,
"step": 5824
},
{
"epoch": 0.26,
"learning_rate": 9.12819828634113e-07,
"loss": 0.2205,
"step": 5888
},
{
"epoch": 0.26,
"learning_rate": 9.118710901598031e-07,
"loss": 0.1265,
"step": 5952
},
{
"epoch": 0.27,
"learning_rate": 9.109223516854932e-07,
"loss": 0.1972,
"step": 6016
},
{
"epoch": 0.27,
"learning_rate": 9.099736132111833e-07,
"loss": 0.2119,
"step": 6080
},
{
"epoch": 0.27,
"learning_rate": 9.090248747368733e-07,
"loss": 0.1284,
"step": 6144
},
{
"epoch": 0.28,
"learning_rate": 9.080761362625632e-07,
"loss": 0.1665,
"step": 6208
},
{
"epoch": 0.28,
"learning_rate": 9.071273977882534e-07,
"loss": 0.1545,
"step": 6272
},
{
"epoch": 0.28,
"learning_rate": 9.061786593139434e-07,
"loss": 0.1505,
"step": 6336
},
{
"epoch": 0.28,
"learning_rate": 9.052299208396334e-07,
"loss": 0.1871,
"step": 6400
},
{
"epoch": 0.29,
"learning_rate": 9.042811823653236e-07,
"loss": 0.2302,
"step": 6464
},
{
"epoch": 0.29,
"learning_rate": 9.033324438910136e-07,
"loss": 0.1395,
"step": 6528
},
{
"epoch": 0.29,
"learning_rate": 9.023837054167037e-07,
"loss": 0.2195,
"step": 6592
},
{
"epoch": 0.3,
"learning_rate": 9.014349669423938e-07,
"loss": 0.2419,
"step": 6656
},
{
"epoch": 0.3,
"learning_rate": 9.004862284680838e-07,
"loss": 0.2173,
"step": 6720
},
{
"epoch": 0.3,
"learning_rate": 8.995374899937739e-07,
"loss": 0.2522,
"step": 6784
},
{
"epoch": 0.3,
"learning_rate": 8.985887515194639e-07,
"loss": 0.2317,
"step": 6848
},
{
"epoch": 0.31,
"learning_rate": 8.97640013045154e-07,
"loss": 0.1959,
"step": 6912
},
{
"epoch": 0.31,
"learning_rate": 8.96691274570844e-07,
"loss": 0.1889,
"step": 6976
},
{
"epoch": 0.31,
"learning_rate": 8.95742536096534e-07,
"loss": 0.2131,
"step": 7040
},
{
"epoch": 0.32,
"learning_rate": 8.947937976222242e-07,
"loss": 0.1321,
"step": 7104
},
{
"epoch": 0.32,
"learning_rate": 8.938450591479142e-07,
"loss": 0.1679,
"step": 7168
},
{
"epoch": 0.32,
"learning_rate": 8.928963206736043e-07,
"loss": 0.1826,
"step": 7232
},
{
"epoch": 0.32,
"learning_rate": 8.919475821992944e-07,
"loss": 0.2144,
"step": 7296
},
{
"epoch": 0.33,
"learning_rate": 8.909988437249843e-07,
"loss": 0.1614,
"step": 7360
},
{
"epoch": 0.33,
"learning_rate": 8.900501052506744e-07,
"loss": 0.241,
"step": 7424
},
{
"epoch": 0.33,
"learning_rate": 8.891013667763645e-07,
"loss": 0.1825,
"step": 7488
},
{
"epoch": 0.34,
"learning_rate": 8.881526283020546e-07,
"loss": 0.17,
"step": 7552
},
{
"epoch": 0.34,
"learning_rate": 8.872038898277446e-07,
"loss": 0.1562,
"step": 7616
},
{
"epoch": 0.34,
"learning_rate": 8.862551513534347e-07,
"loss": 0.2264,
"step": 7680
},
{
"epoch": 0.34,
"learning_rate": 8.853064128791248e-07,
"loss": 0.1325,
"step": 7744
},
{
"epoch": 0.35,
"learning_rate": 8.843576744048148e-07,
"loss": 0.1601,
"step": 7808
},
{
"epoch": 0.35,
"learning_rate": 8.83408935930505e-07,
"loss": 0.1784,
"step": 7872
},
{
"epoch": 0.35,
"learning_rate": 8.82460197456195e-07,
"loss": 0.1447,
"step": 7936
},
{
"epoch": 0.36,
"learning_rate": 8.815114589818849e-07,
"loss": 0.166,
"step": 8000
},
{
"epoch": 0.36,
"learning_rate": 8.80562720507575e-07,
"loss": 0.1554,
"step": 8064
},
{
"epoch": 0.36,
"learning_rate": 8.796139820332651e-07,
"loss": 0.1097,
"step": 8128
},
{
"epoch": 0.36,
"learning_rate": 8.786652435589552e-07,
"loss": 0.1322,
"step": 8192
},
{
"epoch": 0.37,
"learning_rate": 8.777165050846452e-07,
"loss": 0.262,
"step": 8256
},
{
"epoch": 0.37,
"learning_rate": 8.767677666103353e-07,
"loss": 0.1755,
"step": 8320
},
{
"epoch": 0.37,
"learning_rate": 8.758190281360254e-07,
"loss": 0.1646,
"step": 8384
},
{
"epoch": 0.38,
"learning_rate": 8.748702896617154e-07,
"loss": 0.1481,
"step": 8448
},
{
"epoch": 0.38,
"learning_rate": 8.739215511874055e-07,
"loss": 0.0985,
"step": 8512
},
{
"epoch": 0.38,
"learning_rate": 8.729728127130955e-07,
"loss": 0.1401,
"step": 8576
},
{
"epoch": 0.38,
"learning_rate": 8.720240742387855e-07,
"loss": 0.2057,
"step": 8640
},
{
"epoch": 0.39,
"learning_rate": 8.710753357644757e-07,
"loss": 0.1677,
"step": 8704
},
{
"epoch": 0.39,
"learning_rate": 8.701265972901657e-07,
"loss": 0.1398,
"step": 8768
},
{
"epoch": 0.39,
"learning_rate": 8.691778588158557e-07,
"loss": 0.1665,
"step": 8832
},
{
"epoch": 0.4,
"learning_rate": 8.682439443802069e-07,
"loss": 0.1737,
"step": 8896
},
{
"epoch": 0.4,
"learning_rate": 8.67295205905897e-07,
"loss": 0.1642,
"step": 8960
},
{
"epoch": 0.4,
"learning_rate": 8.66346467431587e-07,
"loss": 0.1487,
"step": 9024
},
{
"epoch": 0.4,
"learning_rate": 8.653977289572771e-07,
"loss": 0.186,
"step": 9088
},
{
"epoch": 0.41,
"learning_rate": 8.644489904829672e-07,
"loss": 0.1902,
"step": 9152
},
{
"epoch": 0.41,
"learning_rate": 8.635002520086571e-07,
"loss": 0.1977,
"step": 9216
},
{
"epoch": 0.41,
"learning_rate": 8.625515135343473e-07,
"loss": 0.1853,
"step": 9280
},
{
"epoch": 0.42,
"learning_rate": 8.616027750600373e-07,
"loss": 0.1156,
"step": 9344
},
{
"epoch": 0.42,
"learning_rate": 8.606540365857273e-07,
"loss": 0.179,
"step": 9408
},
{
"epoch": 0.42,
"learning_rate": 8.597052981114175e-07,
"loss": 0.1978,
"step": 9472
},
{
"epoch": 0.42,
"learning_rate": 8.587565596371075e-07,
"loss": 0.1735,
"step": 9536
},
{
"epoch": 0.43,
"learning_rate": 8.578078211627976e-07,
"loss": 0.1579,
"step": 9600
},
{
"epoch": 0.43,
"learning_rate": 8.568590826884876e-07,
"loss": 0.1444,
"step": 9664
},
{
"epoch": 0.43,
"learning_rate": 8.559103442141777e-07,
"loss": 0.1664,
"step": 9728
},
{
"epoch": 0.44,
"learning_rate": 8.549616057398677e-07,
"loss": 0.1715,
"step": 9792
},
{
"epoch": 0.44,
"learning_rate": 8.540128672655577e-07,
"loss": 0.1189,
"step": 9856
},
{
"epoch": 0.44,
"learning_rate": 8.530641287912479e-07,
"loss": 0.1255,
"step": 9920
},
{
"epoch": 0.44,
"learning_rate": 8.521153903169379e-07,
"loss": 0.2314,
"step": 9984
},
{
"epoch": 0.45,
"learning_rate": 8.511814758812891e-07,
"loss": 0.18,
"step": 10048
},
{
"epoch": 0.45,
"learning_rate": 8.502327374069791e-07,
"loss": 0.1948,
"step": 10112
},
{
"epoch": 0.45,
"learning_rate": 8.492839989326692e-07,
"loss": 0.1763,
"step": 10176
},
{
"epoch": 0.46,
"learning_rate": 8.483352604583593e-07,
"loss": 0.1984,
"step": 10240
},
{
"epoch": 0.46,
"learning_rate": 8.473865219840493e-07,
"loss": 0.169,
"step": 10304
},
{
"epoch": 0.46,
"learning_rate": 8.464377835097394e-07,
"loss": 0.1268,
"step": 10368
},
{
"epoch": 0.46,
"learning_rate": 8.454890450354294e-07,
"loss": 0.2261,
"step": 10432
},
{
"epoch": 0.47,
"learning_rate": 8.445403065611195e-07,
"loss": 0.1751,
"step": 10496
},
{
"epoch": 0.47,
"learning_rate": 8.435915680868095e-07,
"loss": 0.1758,
"step": 10560
},
{
"epoch": 0.47,
"learning_rate": 8.426428296124995e-07,
"loss": 0.1565,
"step": 10624
},
{
"epoch": 0.48,
"learning_rate": 8.416940911381897e-07,
"loss": 0.0861,
"step": 10688
},
{
"epoch": 0.48,
"learning_rate": 8.407453526638797e-07,
"loss": 0.1382,
"step": 10752
},
{
"epoch": 0.48,
"learning_rate": 8.397966141895698e-07,
"loss": 0.1268,
"step": 10816
},
{
"epoch": 0.48,
"learning_rate": 8.388478757152599e-07,
"loss": 0.1418,
"step": 10880
},
{
"epoch": 0.49,
"learning_rate": 8.378991372409498e-07,
"loss": 0.2417,
"step": 10944
},
{
"epoch": 0.49,
"learning_rate": 8.369503987666399e-07,
"loss": 0.1285,
"step": 11008
},
{
"epoch": 0.49,
"learning_rate": 8.3600166029233e-07,
"loss": 0.1719,
"step": 11072
},
{
"epoch": 0.5,
"learning_rate": 8.350529218180201e-07,
"loss": 0.1432,
"step": 11136
},
{
"epoch": 0.5,
"learning_rate": 8.341041833437101e-07,
"loss": 0.2443,
"step": 11200
},
{
"epoch": 0.5,
"learning_rate": 8.331554448694002e-07,
"loss": 0.1348,
"step": 11264
},
{
"epoch": 0.5,
"learning_rate": 8.322067063950903e-07,
"loss": 0.1251,
"step": 11328
},
{
"epoch": 0.51,
"learning_rate": 8.312579679207803e-07,
"loss": 0.156,
"step": 11392
},
{
"epoch": 0.51,
"learning_rate": 8.303092294464703e-07,
"loss": 0.3104,
"step": 11456
},
{
"epoch": 0.51,
"learning_rate": 8.293604909721605e-07,
"loss": 0.1834,
"step": 11520
},
{
"epoch": 0.52,
"learning_rate": 8.284117524978504e-07,
"loss": 0.1312,
"step": 11584
},
{
"epoch": 0.52,
"learning_rate": 8.274630140235405e-07,
"loss": 0.1026,
"step": 11648
},
{
"epoch": 0.52,
"learning_rate": 8.265142755492306e-07,
"loss": 0.1805,
"step": 11712
},
{
"epoch": 0.52,
"learning_rate": 8.255655370749206e-07,
"loss": 0.2334,
"step": 11776
},
{
"epoch": 0.53,
"learning_rate": 8.246167986006107e-07,
"loss": 0.1606,
"step": 11840
},
{
"epoch": 0.53,
"learning_rate": 8.236680601263008e-07,
"loss": 0.1009,
"step": 11904
},
{
"epoch": 0.53,
"learning_rate": 8.227193216519909e-07,
"loss": 0.1337,
"step": 11968
},
{
"epoch": 0.54,
"learning_rate": 8.217705831776809e-07,
"loss": 0.2247,
"step": 12032
},
{
"epoch": 0.54,
"learning_rate": 8.20821844703371e-07,
"loss": 0.163,
"step": 12096
},
{
"epoch": 0.54,
"learning_rate": 8.19873106229061e-07,
"loss": 0.1729,
"step": 12160
},
{
"epoch": 0.54,
"learning_rate": 8.18924367754751e-07,
"loss": 0.2133,
"step": 12224
},
{
"epoch": 0.55,
"learning_rate": 8.179756292804412e-07,
"loss": 0.2887,
"step": 12288
},
{
"epoch": 0.55,
"learning_rate": 8.170268908061312e-07,
"loss": 0.128,
"step": 12352
},
{
"epoch": 0.55,
"learning_rate": 8.160781523318212e-07,
"loss": 0.2019,
"step": 12416
},
{
"epoch": 0.56,
"learning_rate": 8.151294138575113e-07,
"loss": 0.128,
"step": 12480
},
{
"epoch": 0.56,
"learning_rate": 8.141806753832014e-07,
"loss": 0.1888,
"step": 12544
},
{
"epoch": 0.56,
"learning_rate": 8.132319369088915e-07,
"loss": 0.2176,
"step": 12608
},
{
"epoch": 0.56,
"learning_rate": 8.122831984345814e-07,
"loss": 0.1555,
"step": 12672
},
{
"epoch": 0.57,
"learning_rate": 8.113344599602715e-07,
"loss": 0.1948,
"step": 12736
},
{
"epoch": 0.57,
"learning_rate": 8.103857214859616e-07,
"loss": 0.233,
"step": 12800
},
{
"epoch": 0.57,
"learning_rate": 8.094369830116516e-07,
"loss": 0.1574,
"step": 12864
},
{
"epoch": 0.57,
"learning_rate": 8.084882445373418e-07,
"loss": 0.1377,
"step": 12928
},
{
"epoch": 0.58,
"learning_rate": 8.075395060630318e-07,
"loss": 0.1563,
"step": 12992
},
{
"epoch": 0.58,
"learning_rate": 8.065907675887218e-07,
"loss": 0.1345,
"step": 13056
},
{
"epoch": 0.58,
"learning_rate": 8.05642029114412e-07,
"loss": 0.1489,
"step": 13120
},
{
"epoch": 0.59,
"learning_rate": 8.047081146787631e-07,
"loss": 0.2524,
"step": 13184
},
{
"epoch": 0.59,
"learning_rate": 8.037742002431142e-07,
"loss": 0.1534,
"step": 13248
},
{
"epoch": 0.59,
"learning_rate": 8.028254617688043e-07,
"loss": 0.136,
"step": 13312
},
{
"epoch": 0.59,
"learning_rate": 8.018767232944942e-07,
"loss": 0.151,
"step": 13376
},
{
"epoch": 0.6,
"learning_rate": 8.009279848201844e-07,
"loss": 0.1423,
"step": 13440
},
{
"epoch": 0.6,
"learning_rate": 7.999792463458744e-07,
"loss": 0.2284,
"step": 13504
},
{
"epoch": 0.6,
"learning_rate": 7.990305078715644e-07,
"loss": 0.1515,
"step": 13568
},
{
"epoch": 0.61,
"learning_rate": 7.980817693972546e-07,
"loss": 0.1431,
"step": 13632
},
{
"epoch": 0.61,
"learning_rate": 7.971330309229446e-07,
"loss": 0.1759,
"step": 13696
},
{
"epoch": 0.61,
"learning_rate": 7.961842924486347e-07,
"loss": 0.2942,
"step": 13760
},
{
"epoch": 0.61,
"learning_rate": 7.952355539743248e-07,
"loss": 0.1382,
"step": 13824
},
{
"epoch": 0.62,
"learning_rate": 7.942868155000148e-07,
"loss": 0.181,
"step": 13888
},
{
"epoch": 0.62,
"learning_rate": 7.933380770257049e-07,
"loss": 0.2471,
"step": 13952
},
{
"epoch": 0.62,
"learning_rate": 7.923893385513949e-07,
"loss": 0.1487,
"step": 14016
},
{
"epoch": 0.63,
"learning_rate": 7.91440600077085e-07,
"loss": 0.1653,
"step": 14080
},
{
"epoch": 0.63,
"learning_rate": 7.90491861602775e-07,
"loss": 0.193,
"step": 14144
},
{
"epoch": 0.63,
"learning_rate": 7.89543123128465e-07,
"loss": 0.115,
"step": 14208
},
{
"epoch": 0.63,
"learning_rate": 7.885943846541552e-07,
"loss": 0.1413,
"step": 14272
},
{
"epoch": 0.64,
"learning_rate": 7.876456461798452e-07,
"loss": 0.1508,
"step": 14336
},
{
"epoch": 0.64,
"learning_rate": 7.866969077055352e-07,
"loss": 0.1752,
"step": 14400
},
{
"epoch": 0.64,
"learning_rate": 7.857481692312254e-07,
"loss": 0.2432,
"step": 14464
},
{
"epoch": 0.65,
"learning_rate": 7.847994307569153e-07,
"loss": 0.1978,
"step": 14528
},
{
"epoch": 0.65,
"learning_rate": 7.838506922826054e-07,
"loss": 0.1445,
"step": 14592
},
{
"epoch": 0.65,
"learning_rate": 7.829019538082955e-07,
"loss": 0.1484,
"step": 14656
},
{
"epoch": 0.65,
"learning_rate": 7.819532153339855e-07,
"loss": 0.1887,
"step": 14720
},
{
"epoch": 0.66,
"learning_rate": 7.810044768596756e-07,
"loss": 0.216,
"step": 14784
},
{
"epoch": 0.66,
"learning_rate": 7.800557383853657e-07,
"loss": 0.1803,
"step": 14848
},
{
"epoch": 0.66,
"learning_rate": 7.791069999110558e-07,
"loss": 0.1332,
"step": 14912
},
{
"epoch": 0.67,
"learning_rate": 7.781582614367458e-07,
"loss": 0.2439,
"step": 14976
},
{
"epoch": 0.67,
"learning_rate": 7.772095229624358e-07,
"loss": 0.1689,
"step": 15040
},
{
"epoch": 0.67,
"learning_rate": 7.76260784488126e-07,
"loss": 0.1823,
"step": 15104
},
{
"epoch": 0.67,
"learning_rate": 7.753120460138159e-07,
"loss": 0.1905,
"step": 15168
},
{
"epoch": 0.68,
"learning_rate": 7.74363307539506e-07,
"loss": 0.2558,
"step": 15232
},
{
"epoch": 0.68,
"learning_rate": 7.734145690651961e-07,
"loss": 0.1531,
"step": 15296
},
{
"epoch": 0.68,
"learning_rate": 7.724658305908861e-07,
"loss": 0.1849,
"step": 15360
},
{
"epoch": 0.69,
"learning_rate": 7.715170921165762e-07,
"loss": 0.1317,
"step": 15424
},
{
"epoch": 0.69,
"learning_rate": 7.705683536422663e-07,
"loss": 0.1096,
"step": 15488
},
{
"epoch": 0.69,
"learning_rate": 7.696196151679564e-07,
"loss": 0.2193,
"step": 15552
},
{
"epoch": 0.69,
"learning_rate": 7.686708766936464e-07,
"loss": 0.1658,
"step": 15616
},
{
"epoch": 0.7,
"learning_rate": 7.677221382193365e-07,
"loss": 0.1553,
"step": 15680
},
{
"epoch": 0.7,
"learning_rate": 7.667733997450265e-07,
"loss": 0.1772,
"step": 15744
},
{
"epoch": 0.7,
"learning_rate": 7.658246612707165e-07,
"loss": 0.2147,
"step": 15808
},
{
"epoch": 0.71,
"learning_rate": 7.648759227964067e-07,
"loss": 0.1096,
"step": 15872
},
{
"epoch": 0.71,
"learning_rate": 7.639271843220967e-07,
"loss": 0.1613,
"step": 15936
},
{
"epoch": 0.71,
"learning_rate": 7.629784458477867e-07,
"loss": 0.1488,
"step": 16000
},
{
"epoch": 0.71,
"learning_rate": 7.620297073734768e-07,
"loss": 0.2256,
"step": 16064
},
{
"epoch": 0.72,
"learning_rate": 7.610809688991669e-07,
"loss": 0.2512,
"step": 16128
},
{
"epoch": 0.72,
"learning_rate": 7.60132230424857e-07,
"loss": 0.1264,
"step": 16192
},
{
"epoch": 0.72,
"learning_rate": 7.59183491950547e-07,
"loss": 0.1162,
"step": 16256
},
{
"epoch": 0.73,
"learning_rate": 7.58234753476237e-07,
"loss": 0.1401,
"step": 16320
},
{
"epoch": 0.73,
"learning_rate": 7.572860150019271e-07,
"loss": 0.1336,
"step": 16384
},
{
"epoch": 0.73,
"learning_rate": 7.563372765276171e-07,
"loss": 0.1234,
"step": 16448
},
{
"epoch": 0.73,
"learning_rate": 7.553885380533072e-07,
"loss": 0.1195,
"step": 16512
},
{
"epoch": 0.74,
"learning_rate": 7.544397995789973e-07,
"loss": 0.2435,
"step": 16576
},
{
"epoch": 0.74,
"learning_rate": 7.534910611046873e-07,
"loss": 0.1109,
"step": 16640
},
{
"epoch": 0.74,
"learning_rate": 7.525423226303775e-07,
"loss": 0.2088,
"step": 16704
},
{
"epoch": 0.75,
"learning_rate": 7.515935841560675e-07,
"loss": 0.141,
"step": 16768
},
{
"epoch": 0.75,
"learning_rate": 7.506448456817574e-07,
"loss": 0.1428,
"step": 16832
},
{
"epoch": 0.75,
"learning_rate": 7.496961072074475e-07,
"loss": 0.1505,
"step": 16896
},
{
"epoch": 0.75,
"learning_rate": 7.487473687331376e-07,
"loss": 0.2152,
"step": 16960
},
{
"epoch": 0.76,
"learning_rate": 7.477986302588277e-07,
"loss": 0.2008,
"step": 17024
},
{
"epoch": 0.76,
"learning_rate": 7.468498917845177e-07,
"loss": 0.1872,
"step": 17088
},
{
"epoch": 0.76,
"learning_rate": 7.459011533102078e-07,
"loss": 0.1313,
"step": 17152
},
{
"epoch": 0.77,
"learning_rate": 7.449524148358979e-07,
"loss": 0.1099,
"step": 17216
},
{
"epoch": 0.77,
"learning_rate": 7.440036763615879e-07,
"loss": 0.138,
"step": 17280
},
{
"epoch": 0.77,
"learning_rate": 7.430549378872781e-07,
"loss": 0.1871,
"step": 17344
},
{
"epoch": 0.77,
"learning_rate": 7.42106199412968e-07,
"loss": 0.18,
"step": 17408
},
{
"epoch": 0.78,
"learning_rate": 7.41157460938658e-07,
"loss": 0.1337,
"step": 17472
},
{
"epoch": 0.78,
"learning_rate": 7.402087224643482e-07,
"loss": 0.1222,
"step": 17536
},
{
"epoch": 0.78,
"learning_rate": 7.392599839900382e-07,
"loss": 0.1434,
"step": 17600
},
{
"epoch": 0.79,
"learning_rate": 7.383112455157283e-07,
"loss": 0.1538,
"step": 17664
},
{
"epoch": 0.79,
"learning_rate": 7.373625070414183e-07,
"loss": 0.1908,
"step": 17728
},
{
"epoch": 0.79,
"learning_rate": 7.364137685671084e-07,
"loss": 0.1244,
"step": 17792
},
{
"epoch": 0.79,
"learning_rate": 7.354650300927985e-07,
"loss": 0.1593,
"step": 17856
},
{
"epoch": 0.8,
"learning_rate": 7.345162916184885e-07,
"loss": 0.1588,
"step": 17920
},
{
"epoch": 0.8,
"learning_rate": 7.335675531441787e-07,
"loss": 0.1639,
"step": 17984
},
{
"epoch": 0.8,
"learning_rate": 7.326188146698686e-07,
"loss": 0.1431,
"step": 18048
},
{
"epoch": 0.81,
"learning_rate": 7.316700761955586e-07,
"loss": 0.2002,
"step": 18112
},
{
"epoch": 0.81,
"learning_rate": 7.307213377212488e-07,
"loss": 0.1761,
"step": 18176
},
{
"epoch": 0.81,
"learning_rate": 7.297725992469388e-07,
"loss": 0.1597,
"step": 18240
},
{
"epoch": 0.81,
"learning_rate": 7.288238607726288e-07,
"loss": 0.1952,
"step": 18304
},
{
"epoch": 0.82,
"learning_rate": 7.27875122298319e-07,
"loss": 0.1843,
"step": 18368
},
{
"epoch": 0.82,
"learning_rate": 7.26926383824009e-07,
"loss": 0.1032,
"step": 18432
},
{
"epoch": 0.82,
"learning_rate": 7.259776453496991e-07,
"loss": 0.1952,
"step": 18496
},
{
"epoch": 0.83,
"learning_rate": 7.25028906875389e-07,
"loss": 0.193,
"step": 18560
},
{
"epoch": 0.83,
"learning_rate": 7.240801684010791e-07,
"loss": 0.137,
"step": 18624
},
{
"epoch": 0.83,
"learning_rate": 7.231314299267692e-07,
"loss": 0.1992,
"step": 18688
},
{
"epoch": 0.83,
"learning_rate": 7.221826914524592e-07,
"loss": 0.138,
"step": 18752
},
{
"epoch": 0.84,
"learning_rate": 7.212339529781494e-07,
"loss": 0.2263,
"step": 18816
},
{
"epoch": 0.84,
"learning_rate": 7.202852145038394e-07,
"loss": 0.2101,
"step": 18880
},
{
"epoch": 0.84,
"learning_rate": 7.193364760295294e-07,
"loss": 0.1731,
"step": 18944
},
{
"epoch": 0.85,
"learning_rate": 7.183877375552196e-07,
"loss": 0.1523,
"step": 19008
},
{
"epoch": 0.85,
"learning_rate": 7.174389990809096e-07,
"loss": 0.1671,
"step": 19072
},
{
"epoch": 0.85,
"learning_rate": 7.164902606065996e-07,
"loss": 0.1549,
"step": 19136
},
{
"epoch": 0.85,
"learning_rate": 7.155415221322897e-07,
"loss": 0.1346,
"step": 19200
},
{
"epoch": 0.86,
"learning_rate": 7.145927836579797e-07,
"loss": 0.2403,
"step": 19264
},
{
"epoch": 0.86,
"learning_rate": 7.136588692223308e-07,
"loss": 0.1909,
"step": 19328
},
{
"epoch": 0.86,
"learning_rate": 7.12710130748021e-07,
"loss": 0.1801,
"step": 19392
},
{
"epoch": 0.87,
"learning_rate": 7.11761392273711e-07,
"loss": 0.1196,
"step": 19456
},
{
"epoch": 0.87,
"learning_rate": 7.10812653799401e-07,
"loss": 0.0749,
"step": 19520
},
{
"epoch": 0.87,
"learning_rate": 7.098639153250912e-07,
"loss": 0.1386,
"step": 19584
},
{
"epoch": 0.87,
"learning_rate": 7.089151768507812e-07,
"loss": 0.219,
"step": 19648
},
{
"epoch": 0.88,
"learning_rate": 7.079664383764713e-07,
"loss": 0.1572,
"step": 19712
},
{
"epoch": 0.88,
"learning_rate": 7.070176999021614e-07,
"loss": 0.191,
"step": 19776
},
{
"epoch": 0.88,
"learning_rate": 7.060689614278513e-07,
"loss": 0.1882,
"step": 19840
},
{
"epoch": 0.89,
"learning_rate": 7.051202229535414e-07,
"loss": 0.1654,
"step": 19904
},
{
"epoch": 0.89,
"learning_rate": 7.041714844792315e-07,
"loss": 0.1397,
"step": 19968
},
{
"epoch": 0.89,
"learning_rate": 7.032227460049216e-07,
"loss": 0.1948,
"step": 20032
},
{
"epoch": 0.89,
"learning_rate": 7.022740075306116e-07,
"loss": 0.2171,
"step": 20096
},
{
"epoch": 0.9,
"learning_rate": 7.013252690563016e-07,
"loss": 0.2474,
"step": 20160
},
{
"epoch": 0.9,
"learning_rate": 7.003765305819918e-07,
"loss": 0.2014,
"step": 20224
},
{
"epoch": 0.9,
"learning_rate": 6.994277921076818e-07,
"loss": 0.1256,
"step": 20288
},
{
"epoch": 0.91,
"learning_rate": 6.984790536333718e-07,
"loss": 0.1634,
"step": 20352
},
{
"epoch": 0.91,
"learning_rate": 6.975303151590619e-07,
"loss": 0.1672,
"step": 20416
},
{
"epoch": 0.91,
"learning_rate": 6.965815766847519e-07,
"loss": 0.1773,
"step": 20480
},
{
"epoch": 0.91,
"learning_rate": 6.95632838210442e-07,
"loss": 0.1157,
"step": 20544
},
{
"epoch": 0.92,
"learning_rate": 6.946840997361321e-07,
"loss": 0.2241,
"step": 20608
},
{
"epoch": 0.92,
"learning_rate": 6.937353612618222e-07,
"loss": 0.1108,
"step": 20672
},
{
"epoch": 0.92,
"learning_rate": 6.927866227875122e-07,
"loss": 0.1821,
"step": 20736
},
{
"epoch": 0.93,
"learning_rate": 6.918378843132023e-07,
"loss": 0.1459,
"step": 20800
},
{
"epoch": 0.93,
"learning_rate": 6.908891458388924e-07,
"loss": 0.2022,
"step": 20864
},
{
"epoch": 0.93,
"learning_rate": 6.899404073645823e-07,
"loss": 0.1864,
"step": 20928
},
{
"epoch": 0.93,
"learning_rate": 6.889916688902723e-07,
"loss": 0.1436,
"step": 20992
},
{
"epoch": 0.94,
"learning_rate": 6.880429304159625e-07,
"loss": 0.1771,
"step": 21056
},
{
"epoch": 0.94,
"learning_rate": 6.870941919416525e-07,
"loss": 0.1782,
"step": 21120
},
{
"epoch": 0.94,
"learning_rate": 6.861454534673426e-07,
"loss": 0.1754,
"step": 21184
},
{
"epoch": 0.94,
"learning_rate": 6.851967149930327e-07,
"loss": 0.1483,
"step": 21248
},
{
"epoch": 0.95,
"learning_rate": 6.842479765187227e-07,
"loss": 0.1373,
"step": 21312
},
{
"epoch": 0.95,
"learning_rate": 6.833140620830739e-07,
"loss": 0.219,
"step": 21376
},
{
"epoch": 0.95,
"learning_rate": 6.82365323608764e-07,
"loss": 0.1474,
"step": 21440
},
{
"epoch": 0.96,
"learning_rate": 6.81416585134454e-07,
"loss": 0.1713,
"step": 21504
},
{
"epoch": 0.96,
"learning_rate": 6.80467846660144e-07,
"loss": 0.1034,
"step": 21568
},
{
"epoch": 0.96,
"learning_rate": 6.795191081858341e-07,
"loss": 0.185,
"step": 21632
},
{
"epoch": 0.96,
"learning_rate": 6.785703697115241e-07,
"loss": 0.284,
"step": 21696
},
{
"epoch": 0.97,
"learning_rate": 6.776216312372143e-07,
"loss": 0.1953,
"step": 21760
},
{
"epoch": 0.97,
"learning_rate": 6.766728927629043e-07,
"loss": 0.168,
"step": 21824
},
{
"epoch": 0.97,
"learning_rate": 6.757241542885943e-07,
"loss": 0.1852,
"step": 21888
},
{
"epoch": 0.98,
"learning_rate": 6.747754158142845e-07,
"loss": 0.1358,
"step": 21952
},
{
"epoch": 0.98,
"learning_rate": 6.738266773399745e-07,
"loss": 0.1885,
"step": 22016
},
{
"epoch": 0.98,
"learning_rate": 6.728779388656646e-07,
"loss": 0.22,
"step": 22080
},
{
"epoch": 0.98,
"learning_rate": 6.719292003913545e-07,
"loss": 0.214,
"step": 22144
},
{
"epoch": 0.99,
"learning_rate": 6.709804619170446e-07,
"loss": 0.1198,
"step": 22208
},
{
"epoch": 0.99,
"learning_rate": 6.700317234427347e-07,
"loss": 0.1458,
"step": 22272
},
{
"epoch": 0.99,
"learning_rate": 6.690829849684247e-07,
"loss": 0.2405,
"step": 22336
},
{
"epoch": 1.0,
"learning_rate": 6.681342464941149e-07,
"loss": 0.1162,
"step": 22400
},
{
"epoch": 1.0,
"learning_rate": 6.671855080198049e-07,
"loss": 0.1508,
"step": 22464
}
],
"logging_steps": 64,
"max_steps": 67458,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 22486,
"total_flos": 4.77416235737088e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}