{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 10.0, "global_step": 1388, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 0.0004999359657561555, "loss": 0.5112, "step": 10 }, { "epoch": 0.03, "learning_rate": 0.0004997438958276968, "loss": 0.2283, "step": 20 }, { "epoch": 0.04, "learning_rate": 0.0004994238886070454, "loss": 0.2077, "step": 30 }, { "epoch": 0.06, "learning_rate": 0.0004989761080255641, "loss": 0.2174, "step": 40 }, { "epoch": 0.07, "learning_rate": 0.0004984007834695806, "loss": 0.1994, "step": 50 }, { "epoch": 0.09, "learning_rate": 0.0004976982096628782, "loss": 0.2143, "step": 60 }, { "epoch": 0.1, "learning_rate": 0.0004968687465157165, "loss": 0.2168, "step": 70 }, { "epoch": 0.12, "learning_rate": 0.0004959128189404589, "loss": 0.2175, "step": 80 }, { "epoch": 0.13, "learning_rate": 0.000494830916633901, "loss": 0.2134, "step": 90 }, { "epoch": 0.14, "learning_rate": 0.0004936235938264118, "loss": 0.2064, "step": 100 }, { "epoch": 0.16, "learning_rate": 0.0004922914689980156, "loss": 0.2032, "step": 110 }, { "epoch": 0.17, "learning_rate": 0.0004908352245615613, "loss": 0.2017, "step": 120 }, { "epoch": 0.19, "learning_rate": 0.0004892556065131395, "loss": 0.2127, "step": 130 }, { "epoch": 0.2, "learning_rate": 0.0004875534240499285, "loss": 0.2074, "step": 140 }, { "epoch": 0.22, "learning_rate": 0.00048572954915566393, "loss": 0.2013, "step": 150 }, { "epoch": 0.23, "learning_rate": 0.0004837849161539432, "loss": 0.2188, "step": 160 }, { "epoch": 0.24, "learning_rate": 0.00048172052122959707, "loss": 0.1919, "step": 170 }, { "epoch": 0.26, "learning_rate": 0.0004795374219183694, "loss": 0.1966, "step": 180 }, { "epoch": 0.27, "learning_rate": 0.0004772367365651691, "loss": 0.2088, "step": 190 }, { "epoch": 0.29, "learning_rate": 0.0004748196437511716, "loss": 0.2163, "step": 200 }, { "epoch": 0.3, "learning_rate": 0.00047228738169006204, "loss": 0.2013, "step": 210 }, { "epoch": 0.32, "learning_rate": 0.00046964124759373086, "loss": 0.194, "step": 220 }, { "epoch": 0.33, "learning_rate": 0.0004668825970077457, "loss": 0.2002, "step": 230 }, { "epoch": 0.35, "learning_rate": 0.0004640128431169411, "loss": 0.2095, "step": 240 }, { "epoch": 0.36, "learning_rate": 0.0004613362851634204, "loss": 0.1917, "step": 250 }, { "epoch": 0.37, "learning_rate": 0.0004582595315535216, "loss": 0.2054, "step": 260 }, { "epoch": 0.39, "learning_rate": 0.0004550760920106112, "loss": 0.1925, "step": 270 }, { "epoch": 0.4, "learning_rate": 0.00045178759732784093, "loss": 0.2157, "step": 280 }, { "epoch": 0.42, "learning_rate": 0.0004483957321153738, "loss": 0.2246, "step": 290 }, { "epoch": 0.43, "learning_rate": 0.0004449022339374027, "loss": 0.191, "step": 300 }, { "epoch": 0.45, "learning_rate": 0.00044130889242204133, "loss": 0.1808, "step": 310 }, { "epoch": 0.46, "learning_rate": 0.000437617548344544, "loss": 0.2044, "step": 320 }, { "epoch": 0.48, "learning_rate": 0.000433830092684325, "loss": 0.1829, "step": 330 }, { "epoch": 0.49, "learning_rate": 0.00042994846565625865, "loss": 0.2009, "step": 340 }, { "epoch": 0.5, "learning_rate": 0.0004259746557167581, "loss": 0.1963, "step": 350 }, { "epoch": 0.52, "learning_rate": 0.0004219106985451405, "loss": 0.1933, "step": 360 }, { "epoch": 0.53, "learning_rate": 0.00041775867600080184, "loss": 0.1948, "step": 370 }, { "epoch": 0.55, "learning_rate": 0.0004135207150567347, "loss": 0.2036, "step": 380 }, { "epoch": 0.56, "learning_rate": 0.00040919898670993484, "loss": 0.2048, "step": 390 }, { "epoch": 0.58, "learning_rate": 0.0004047957048692567, "loss": 0.1881, "step": 400 }, { "epoch": 0.59, "learning_rate": 0.00040031312522128507, "loss": 0.2143, "step": 410 }, { "epoch": 0.61, "learning_rate": 0.00039575354407480576, "loss": 0.1904, "step": 420 }, { "epoch": 0.62, "learning_rate": 0.00039111929718446654, "loss": 0.1989, "step": 430 }, { "epoch": 0.63, "learning_rate": 0.00038641275855423065, "loss": 0.1971, "step": 440 }, { "epoch": 0.65, "learning_rate": 0.0003816363392212366, "loss": 0.1899, "step": 450 }, { "epoch": 0.66, "learning_rate": 0.00037679248602068657, "loss": 0.2091, "step": 460 }, { "epoch": 0.68, "learning_rate": 0.0003718836803323966, "loss": 0.1943, "step": 470 }, { "epoch": 0.69, "learning_rate": 0.00036691243680964987, "loss": 0.1907, "step": 480 }, { "epoch": 0.71, "learning_rate": 0.0003618813020910061, "loss": 0.1983, "step": 490 }, { "epoch": 0.72, "learning_rate": 0.00035679285349572454, "loss": 0.2036, "step": 500 }, { "epoch": 0.73, "learning_rate": 0.00035164969770347013, "loss": 0.1852, "step": 510 }, { "epoch": 0.75, "learning_rate": 0.0003464544694189799, "loss": 0.1825, "step": 520 }, { "epoch": 0.76, "learning_rate": 0.0003412098300223723, "loss": 0.2118, "step": 530 }, { "epoch": 0.78, "learning_rate": 0.0003359184662057914, "loss": 0.19, "step": 540 }, { "epoch": 0.79, "learning_rate": 0.00033058308859708433, "loss": 0.1877, "step": 550 }, { "epoch": 0.81, "learning_rate": 0.0003252064303712174, "loss": 0.1932, "step": 560 }, { "epoch": 0.82, "learning_rate": 0.000319791245850142, "loss": 0.1983, "step": 570 }, { "epoch": 0.84, "learning_rate": 0.00031434030909182665, "loss": 0.2146, "step": 580 }, { "epoch": 0.85, "learning_rate": 0.00030885641246917994, "loss": 0.1957, "step": 590 }, { "epoch": 0.86, "learning_rate": 0.00030334236523959037, "loss": 0.1933, "step": 600 }, { "epoch": 0.88, "learning_rate": 0.00029780099210581685, "loss": 0.1981, "step": 610 }, { "epoch": 0.89, "learning_rate": 0.0002922351317689671, "loss": 0.2009, "step": 620 }, { "epoch": 0.91, "learning_rate": 0.0002866476354743054, "loss": 0.1849, "step": 630 }, { "epoch": 0.92, "learning_rate": 0.0002810413655506334, "loss": 0.2011, "step": 640 }, { "epoch": 0.94, "learning_rate": 0.00027541919394399395, "loss": 0.1786, "step": 650 }, { "epoch": 0.95, "learning_rate": 0.00026978400074644773, "loss": 0.1929, "step": 660 }, { "epoch": 0.97, "learning_rate": 0.0002641386727206773, "loss": 0.2051, "step": 670 }, { "epoch": 0.98, "learning_rate": 0.0002584861018211739, "loss": 0.1916, "step": 680 }, { "epoch": 0.99, "learning_rate": 0.00025282918371276396, "loss": 0.1748, "step": 690 }, { "epoch": 1.01, "learning_rate": 0.0002471708162872361, "loss": 0.1949, "step": 700 }, { "epoch": 1.02, "learning_rate": 0.00024151389817882616, "loss": 0.1775, "step": 710 }, { "epoch": 1.04, "learning_rate": 0.0002358613272793227, "loss": 0.1885, "step": 720 }, { "epoch": 1.05, "learning_rate": 0.0002302159992535523, "loss": 0.1959, "step": 730 }, { "epoch": 1.07, "learning_rate": 0.00022458080605600617, "loss": 0.1842, "step": 740 }, { "epoch": 1.08, "learning_rate": 0.00021895863444936663, "loss": 0.1943, "step": 750 }, { "epoch": 1.1, "learning_rate": 0.0002133523645256946, "loss": 0.1779, "step": 760 }, { "epoch": 1.11, "learning_rate": 0.0002077648682310329, "loss": 0.178, "step": 770 }, { "epoch": 1.12, "learning_rate": 0.0002021990078941832, "loss": 0.1978, "step": 780 }, { "epoch": 1.14, "learning_rate": 0.00019665763476040964, "loss": 0.165, "step": 790 }, { "epoch": 1.15, "learning_rate": 0.00019114358753082, "loss": 0.1731, "step": 800 }, { "epoch": 1.17, "learning_rate": 0.0001856596909081734, "loss": 0.1926, "step": 810 }, { "epoch": 1.18, "learning_rate": 0.0001802087541498581, "loss": 0.1844, "step": 820 }, { "epoch": 1.2, "learning_rate": 0.0001747935696287826, "loss": 0.1672, "step": 830 }, { "epoch": 1.21, "learning_rate": 0.0001694169114029157, "loss": 0.1857, "step": 840 }, { "epoch": 1.22, "learning_rate": 0.0001640815337942086, "loss": 0.171, "step": 850 }, { "epoch": 1.24, "learning_rate": 0.00015879016997762767, "loss": 0.1693, "step": 860 }, { "epoch": 1.25, "learning_rate": 0.00015354553058102009, "loss": 0.175, "step": 870 }, { "epoch": 1.27, "learning_rate": 0.00014835030229652999, "loss": 0.1764, "step": 880 }, { "epoch": 1.28, "learning_rate": 0.00014320714650427552, "loss": 0.1803, "step": 890 }, { "epoch": 1.3, "learning_rate": 0.0001381186979089939, "loss": 0.1916, "step": 900 }, { "epoch": 1.31, "learning_rate": 0.00013308756319035017, "loss": 0.1639, "step": 910 }, { "epoch": 1.33, "learning_rate": 0.0001281163196676035, "loss": 0.1835, "step": 920 }, { "epoch": 1.34, "learning_rate": 0.00012320751397931342, "loss": 0.1869, "step": 930 }, { "epoch": 1.35, "learning_rate": 0.0001183636607787634, "loss": 0.1858, "step": 940 }, { "epoch": 1.37, "learning_rate": 0.00011358724144576943, "loss": 0.1773, "step": 950 }, { "epoch": 1.38, "learning_rate": 0.00010888070281553353, "loss": 0.177, "step": 960 }, { "epoch": 1.4, "learning_rate": 0.0001042464559251943, "loss": 0.2, "step": 970 }, { "epoch": 1.41, "learning_rate": 9.9686874778715e-05, "loss": 0.1809, "step": 980 }, { "epoch": 1.43, "learning_rate": 9.520429513074334e-05, "loss": 0.1784, "step": 990 }, { "epoch": 1.44, "learning_rate": 9.080101329006519e-05, "loss": 0.1873, "step": 1000 }, { "epoch": 1.46, "learning_rate": 8.647928494326535e-05, "loss": 0.1681, "step": 1010 }, { "epoch": 1.47, "learning_rate": 8.224132399919812e-05, "loss": 0.1873, "step": 1020 }, { "epoch": 1.48, "learning_rate": 7.808930145485957e-05, "loss": 0.1849, "step": 1030 }, { "epoch": 1.5, "learning_rate": 7.402534428324195e-05, "loss": 0.2012, "step": 1040 }, { "epoch": 1.51, "learning_rate": 7.005153434374137e-05, "loss": 0.193, "step": 1050 }, { "epoch": 1.53, "learning_rate": 6.616990731567505e-05, "loss": 0.1779, "step": 1060 }, { "epoch": 1.54, "learning_rate": 6.238245165545606e-05, "loss": 0.1651, "step": 1070 }, { "epoch": 1.56, "learning_rate": 5.869110757795876e-05, "loss": 0.1727, "step": 1080 }, { "epoch": 1.57, "learning_rate": 5.509776606259731e-05, "loss": 0.1933, "step": 1090 }, { "epoch": 1.59, "learning_rate": 5.16042678846263e-05, "loss": 0.1969, "step": 1100 }, { "epoch": 1.6, "learning_rate": 4.821240267215918e-05, "loss": 0.1819, "step": 1110 }, { "epoch": 1.61, "learning_rate": 4.492390798938883e-05, "loss": 0.168, "step": 1120 }, { "epoch": 1.63, "learning_rate": 4.174046844647844e-05, "loss": 0.1774, "step": 1130 }, { "epoch": 1.64, "learning_rate": 3.8663714836579624e-05, "loss": 0.1836, "step": 1140 }, { "epoch": 1.66, "learning_rate": 3.569522330041969e-05, "loss": 0.1664, "step": 1150 }, { "epoch": 1.67, "learning_rate": 3.2836514518885615e-05, "loss": 0.1638, "step": 1160 }, { "epoch": 1.69, "learning_rate": 3.008905293401895e-05, "loss": 0.2014, "step": 1170 }, { "epoch": 1.7, "learning_rate": 2.7454245998820376e-05, "loss": 0.1918, "step": 1180 }, { "epoch": 1.71, "learning_rate": 2.4933443456248068e-05, "loss": 0.1968, "step": 1190 }, { "epoch": 1.73, "learning_rate": 2.2527936647779522e-05, "loss": 0.1702, "step": 1200 }, { "epoch": 1.74, "learning_rate": 2.0238957851891172e-05, "loss": 0.1611, "step": 1210 }, { "epoch": 1.76, "learning_rate": 1.8067679652793905e-05, "loss": 0.186, "step": 1220 }, { "epoch": 1.77, "learning_rate": 1.6015214339749063e-05, "loss": 0.1758, "step": 1230 }, { "epoch": 1.79, "learning_rate": 1.4082613337271234e-05, "loss": 0.1624, "step": 1240 }, { "epoch": 1.8, "learning_rate": 1.2270866666511254e-05, "loss": 0.1843, "step": 1250 }, { "epoch": 1.82, "learning_rate": 1.0580902438093925e-05, "loss": 0.191, "step": 1260 }, { "epoch": 1.83, "learning_rate": 9.013586376671157e-06, "loss": 0.1807, "step": 1270 }, { "epoch": 1.84, "learning_rate": 7.569721377433808e-06, "loss": 0.1891, "step": 1280 }, { "epoch": 1.86, "learning_rate": 6.250047094809447e-06, "loss": 0.1855, "step": 1290 }, { "epoch": 1.87, "learning_rate": 5.055239563556496e-06, "loss": 0.1924, "step": 1300 }, { "epoch": 1.89, "learning_rate": 3.985910852449398e-06, "loss": 0.1995, "step": 1310 }, { "epoch": 1.9, "learning_rate": 3.0426087507316735e-06, "loss": 0.1938, "step": 1320 }, { "epoch": 1.92, "learning_rate": 2.2258164874976726e-06, "loss": 0.1813, "step": 1330 }, { "epoch": 1.93, "learning_rate": 1.5359524841470907e-06, "loss": 0.1723, "step": 1340 }, { "epoch": 1.95, "learning_rate": 9.73370140038371e-07, "loss": 0.1804, "step": 1350 }, { "epoch": 1.96, "learning_rate": 5.383576514515487e-07, "loss": 0.1838, "step": 1360 }, { "epoch": 1.97, "learning_rate": 2.3113786395281833e-07, "loss": 0.1753, "step": 1370 }, { "epoch": 1.99, "learning_rate": 5.1868158236434427e-08, "loss": 0.1876, "step": 1380 }, { "epoch": 2.0, "step": 1388, "total_flos": 3.2633842202899907e+18, "train_loss": 0.19332071949837873, "train_runtime": 22791.3585, "train_samples_per_second": 3.893, "train_steps_per_second": 0.061 } ], "logging_steps": 10, "max_steps": 1388, "num_train_epochs": 2, "save_steps": 300, "total_flos": 3.2633842202899907e+18, "trial_name": null, "trial_params": null }