{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.999898011218766, "eval_steps": 500, "global_step": 9804, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 4.7619047619047613e-08, "loss": 5.1575, "step": 32 }, { "epoch": 0.01, "learning_rate": 1.0204081632653061e-07, "loss": 4.7952, "step": 64 }, { "epoch": 0.01, "learning_rate": 1.564625850340136e-07, "loss": 4.5513, "step": 96 }, { "epoch": 0.01, "learning_rate": 2.0918367346938776e-07, "loss": 3.9771, "step": 128 }, { "epoch": 0.02, "learning_rate": 2.619047619047619e-07, "loss": 3.2318, "step": 160 }, { "epoch": 0.02, "learning_rate": 3.163265306122449e-07, "loss": 2.508, "step": 192 }, { "epoch": 0.02, "learning_rate": 3.707482993197279e-07, "loss": 2.0866, "step": 224 }, { "epoch": 0.03, "learning_rate": 4.2517006802721085e-07, "loss": 1.8228, "step": 256 }, { "epoch": 0.03, "learning_rate": 4.795918367346938e-07, "loss": 1.6854, "step": 288 }, { "epoch": 0.03, "learning_rate": 5.340136054421769e-07, "loss": 1.4892, "step": 320 }, { "epoch": 0.04, "learning_rate": 5.884353741496599e-07, "loss": 1.2407, "step": 352 }, { "epoch": 0.04, "learning_rate": 6.428571428571429e-07, "loss": 1.0248, "step": 384 }, { "epoch": 0.04, "learning_rate": 6.95578231292517e-07, "loss": 0.577, "step": 416 }, { "epoch": 0.05, "learning_rate": 7.5e-07, "loss": 0.401, "step": 448 }, { "epoch": 0.05, "learning_rate": 8.04421768707483e-07, "loss": 0.3855, "step": 480 }, { "epoch": 0.05, "learning_rate": 8.58843537414966e-07, "loss": 0.3437, "step": 512 }, { "epoch": 0.06, "learning_rate": 9.132653061224489e-07, "loss": 0.3528, "step": 544 }, { "epoch": 0.06, "learning_rate": 9.67687074829932e-07, "loss": 0.4408, "step": 576 }, { "epoch": 0.06, "learning_rate": 9.985895627644569e-07, "loss": 0.4262, "step": 608 }, { "epoch": 0.07, "learning_rate": 9.951177172615818e-07, "loss": 0.3917, "step": 640 }, { "epoch": 0.07, "learning_rate": 9.916458717587067e-07, "loss": 0.4139, "step": 672 }, { "epoch": 0.07, "learning_rate": 9.881740262558316e-07, "loss": 0.3101, "step": 704 }, { "epoch": 0.08, "learning_rate": 9.847021807529563e-07, "loss": 0.4566, "step": 736 }, { "epoch": 0.08, "learning_rate": 9.812303352500815e-07, "loss": 0.2914, "step": 768 }, { "epoch": 0.08, "learning_rate": 9.777584897472062e-07, "loss": 0.4062, "step": 800 }, { "epoch": 0.08, "learning_rate": 9.74286644244331e-07, "loss": 0.3887, "step": 832 }, { "epoch": 0.09, "learning_rate": 9.70814798741456e-07, "loss": 0.3655, "step": 864 }, { "epoch": 0.09, "learning_rate": 9.67342953238581e-07, "loss": 0.3668, "step": 896 }, { "epoch": 0.09, "learning_rate": 9.638711077357056e-07, "loss": 0.3761, "step": 928 }, { "epoch": 0.1, "learning_rate": 9.603992622328305e-07, "loss": 0.3528, "step": 960 }, { "epoch": 0.1, "learning_rate": 9.569274167299554e-07, "loss": 0.2583, "step": 992 }, { "epoch": 0.1, "learning_rate": 9.534555712270804e-07, "loss": 0.3145, "step": 1024 }, { "epoch": 0.11, "learning_rate": 9.499837257242053e-07, "loss": 0.2842, "step": 1056 }, { "epoch": 0.11, "learning_rate": 9.465118802213301e-07, "loss": 0.2259, "step": 1088 }, { "epoch": 0.11, "learning_rate": 9.430400347184549e-07, "loss": 0.3493, "step": 1120 }, { "epoch": 0.12, "learning_rate": 9.395681892155799e-07, "loss": 0.3421, "step": 1152 }, { "epoch": 0.12, "learning_rate": 9.360963437127047e-07, "loss": 0.4288, "step": 1184 }, { "epoch": 0.12, "learning_rate": 9.326244982098295e-07, "loss": 0.2881, "step": 1216 }, { "epoch": 0.13, "learning_rate": 9.291526527069546e-07, "loss": 0.3361, "step": 1248 }, { "epoch": 0.13, "learning_rate": 9.256808072040794e-07, "loss": 0.2797, "step": 1280 }, { "epoch": 0.13, "learning_rate": 9.222089617012042e-07, "loss": 0.2531, "step": 1312 }, { "epoch": 0.14, "learning_rate": 9.187371161983291e-07, "loss": 0.282, "step": 1344 }, { "epoch": 0.14, "learning_rate": 9.15265270695454e-07, "loss": 0.3408, "step": 1376 }, { "epoch": 0.14, "learning_rate": 9.117934251925788e-07, "loss": 0.2354, "step": 1408 }, { "epoch": 0.15, "learning_rate": 9.083215796897038e-07, "loss": 0.3474, "step": 1440 }, { "epoch": 0.15, "learning_rate": 9.048497341868287e-07, "loss": 0.3546, "step": 1472 }, { "epoch": 0.15, "learning_rate": 9.013778886839535e-07, "loss": 0.3966, "step": 1504 }, { "epoch": 0.16, "learning_rate": 8.979060431810784e-07, "loss": 0.3376, "step": 1536 }, { "epoch": 0.16, "learning_rate": 8.944341976782033e-07, "loss": 0.3938, "step": 1568 }, { "epoch": 0.16, "learning_rate": 8.909623521753281e-07, "loss": 0.3343, "step": 1600 }, { "epoch": 0.17, "learning_rate": 8.87490506672453e-07, "loss": 0.3399, "step": 1632 }, { "epoch": 0.17, "learning_rate": 8.840186611695779e-07, "loss": 0.3072, "step": 1664 }, { "epoch": 0.17, "learning_rate": 8.805468156667028e-07, "loss": 0.262, "step": 1696 }, { "epoch": 0.18, "learning_rate": 8.770749701638277e-07, "loss": 0.2995, "step": 1728 }, { "epoch": 0.18, "learning_rate": 8.736031246609525e-07, "loss": 0.2804, "step": 1760 }, { "epoch": 0.18, "learning_rate": 8.701312791580774e-07, "loss": 0.3693, "step": 1792 }, { "epoch": 0.19, "learning_rate": 8.666594336552023e-07, "loss": 0.2589, "step": 1824 }, { "epoch": 0.19, "learning_rate": 8.631875881523272e-07, "loss": 0.2638, "step": 1856 }, { "epoch": 0.19, "learning_rate": 8.59715742649452e-07, "loss": 0.3516, "step": 1888 }, { "epoch": 0.2, "learning_rate": 8.56243897146577e-07, "loss": 0.369, "step": 1920 }, { "epoch": 0.2, "learning_rate": 8.527720516437018e-07, "loss": 0.3453, "step": 1952 }, { "epoch": 0.2, "learning_rate": 8.493002061408266e-07, "loss": 0.3813, "step": 1984 }, { "epoch": 0.21, "learning_rate": 8.458283606379516e-07, "loss": 0.2657, "step": 2016 }, { "epoch": 0.21, "learning_rate": 8.423565151350764e-07, "loss": 0.3514, "step": 2048 }, { "epoch": 0.21, "learning_rate": 8.388846696322013e-07, "loss": 0.2214, "step": 2080 }, { "epoch": 0.22, "learning_rate": 8.354128241293263e-07, "loss": 0.4351, "step": 2112 }, { "epoch": 0.22, "learning_rate": 8.319409786264511e-07, "loss": 0.2986, "step": 2144 }, { "epoch": 0.22, "learning_rate": 8.284691331235759e-07, "loss": 0.2211, "step": 2176 }, { "epoch": 0.23, "learning_rate": 8.249972876207008e-07, "loss": 0.2291, "step": 2208 }, { "epoch": 0.23, "learning_rate": 8.215254421178257e-07, "loss": 0.3048, "step": 2240 }, { "epoch": 0.23, "learning_rate": 8.180535966149505e-07, "loss": 0.3369, "step": 2272 }, { "epoch": 0.23, "learning_rate": 8.145817511120755e-07, "loss": 0.2413, "step": 2304 }, { "epoch": 0.24, "learning_rate": 8.111099056092004e-07, "loss": 0.2656, "step": 2336 }, { "epoch": 0.24, "learning_rate": 8.076380601063252e-07, "loss": 0.2661, "step": 2368 }, { "epoch": 0.24, "learning_rate": 8.041662146034501e-07, "loss": 0.3069, "step": 2400 }, { "epoch": 0.25, "learning_rate": 8.00694369100575e-07, "loss": 0.2357, "step": 2432 }, { "epoch": 0.25, "learning_rate": 7.972225235976998e-07, "loss": 0.3374, "step": 2464 }, { "epoch": 0.25, "learning_rate": 7.937506780948248e-07, "loss": 0.2678, "step": 2496 }, { "epoch": 0.26, "learning_rate": 7.902788325919496e-07, "loss": 0.2813, "step": 2528 }, { "epoch": 0.26, "learning_rate": 7.868069870890745e-07, "loss": 0.386, "step": 2560 }, { "epoch": 0.26, "learning_rate": 7.833351415861994e-07, "loss": 0.2247, "step": 2592 }, { "epoch": 0.27, "learning_rate": 7.798632960833242e-07, "loss": 0.3458, "step": 2624 }, { "epoch": 0.27, "learning_rate": 7.763914505804491e-07, "loss": 0.2967, "step": 2656 }, { "epoch": 0.27, "learning_rate": 7.72919605077574e-07, "loss": 0.277, "step": 2688 }, { "epoch": 0.28, "learning_rate": 7.694477595746989e-07, "loss": 0.2149, "step": 2720 }, { "epoch": 0.28, "learning_rate": 7.659759140718237e-07, "loss": 0.3624, "step": 2752 }, { "epoch": 0.28, "learning_rate": 7.625040685689487e-07, "loss": 0.3259, "step": 2784 }, { "epoch": 0.29, "learning_rate": 7.590322230660735e-07, "loss": 0.2905, "step": 2816 }, { "epoch": 0.29, "learning_rate": 7.555603775631984e-07, "loss": 0.2397, "step": 2848 }, { "epoch": 0.29, "learning_rate": 7.520885320603233e-07, "loss": 0.4047, "step": 2880 }, { "epoch": 0.3, "learning_rate": 7.486166865574481e-07, "loss": 0.258, "step": 2912 }, { "epoch": 0.3, "learning_rate": 7.451448410545731e-07, "loss": 0.3667, "step": 2944 }, { "epoch": 0.3, "learning_rate": 7.41672995551698e-07, "loss": 0.3022, "step": 2976 }, { "epoch": 0.31, "learning_rate": 7.382011500488228e-07, "loss": 0.2435, "step": 3008 }, { "epoch": 0.31, "learning_rate": 7.347293045459477e-07, "loss": 0.3491, "step": 3040 }, { "epoch": 0.31, "learning_rate": 7.312574590430725e-07, "loss": 0.2885, "step": 3072 }, { "epoch": 0.32, "learning_rate": 7.277856135401974e-07, "loss": 0.3822, "step": 3104 }, { "epoch": 0.32, "learning_rate": 7.243137680373224e-07, "loss": 0.3265, "step": 3136 }, { "epoch": 0.32, "learning_rate": 7.208419225344472e-07, "loss": 0.2879, "step": 3168 }, { "epoch": 0.33, "learning_rate": 7.173700770315721e-07, "loss": 0.3678, "step": 3200 }, { "epoch": 0.33, "learning_rate": 7.13898231528697e-07, "loss": 0.2696, "step": 3232 }, { "epoch": 0.33, "learning_rate": 7.104263860258218e-07, "loss": 0.3353, "step": 3264 }, { "epoch": 0.34, "learning_rate": 7.069545405229467e-07, "loss": 0.3552, "step": 3296 }, { "epoch": 0.34, "learning_rate": 7.034826950200716e-07, "loss": 0.2246, "step": 3328 }, { "epoch": 0.34, "learning_rate": 7.000108495171965e-07, "loss": 0.2622, "step": 3360 }, { "epoch": 0.35, "learning_rate": 6.965390040143213e-07, "loss": 0.286, "step": 3392 }, { "epoch": 0.35, "learning_rate": 6.930671585114463e-07, "loss": 0.2755, "step": 3424 }, { "epoch": 0.35, "learning_rate": 6.895953130085711e-07, "loss": 0.3111, "step": 3456 }, { "epoch": 0.36, "learning_rate": 6.861234675056959e-07, "loss": 0.2431, "step": 3488 }, { "epoch": 0.36, "learning_rate": 6.826516220028209e-07, "loss": 0.2097, "step": 3520 }, { "epoch": 0.36, "learning_rate": 6.791797764999457e-07, "loss": 0.3134, "step": 3552 }, { "epoch": 0.37, "learning_rate": 6.757079309970706e-07, "loss": 0.2535, "step": 3584 }, { "epoch": 0.37, "learning_rate": 6.722360854941956e-07, "loss": 0.3329, "step": 3616 }, { "epoch": 0.37, "learning_rate": 6.687642399913204e-07, "loss": 0.2857, "step": 3648 }, { "epoch": 0.38, "learning_rate": 6.652923944884452e-07, "loss": 0.2697, "step": 3680 }, { "epoch": 0.38, "learning_rate": 6.618205489855701e-07, "loss": 0.2815, "step": 3712 }, { "epoch": 0.38, "learning_rate": 6.58348703482695e-07, "loss": 0.2856, "step": 3744 }, { "epoch": 0.39, "learning_rate": 6.548768579798198e-07, "loss": 0.2912, "step": 3776 }, { "epoch": 0.39, "learning_rate": 6.514050124769448e-07, "loss": 0.2916, "step": 3808 }, { "epoch": 0.39, "learning_rate": 6.479331669740697e-07, "loss": 0.308, "step": 3840 }, { "epoch": 0.39, "learning_rate": 6.444613214711945e-07, "loss": 0.2934, "step": 3872 }, { "epoch": 0.4, "learning_rate": 6.409894759683194e-07, "loss": 0.2869, "step": 3904 }, { "epoch": 0.4, "learning_rate": 6.375176304654442e-07, "loss": 0.3232, "step": 3936 }, { "epoch": 0.4, "learning_rate": 6.340457849625691e-07, "loss": 0.2486, "step": 3968 }, { "epoch": 0.41, "learning_rate": 6.305739394596941e-07, "loss": 0.3123, "step": 4000 }, { "epoch": 0.41, "learning_rate": 6.271020939568189e-07, "loss": 0.3183, "step": 4032 }, { "epoch": 0.41, "learning_rate": 6.236302484539438e-07, "loss": 0.3688, "step": 4064 }, { "epoch": 0.42, "learning_rate": 6.201584029510687e-07, "loss": 0.3236, "step": 4096 }, { "epoch": 0.42, "learning_rate": 6.166865574481935e-07, "loss": 0.2206, "step": 4128 }, { "epoch": 0.42, "learning_rate": 6.132147119453183e-07, "loss": 0.2519, "step": 4160 }, { "epoch": 0.43, "learning_rate": 6.097428664424433e-07, "loss": 0.2467, "step": 4192 }, { "epoch": 0.43, "learning_rate": 6.062710209395682e-07, "loss": 0.2555, "step": 4224 }, { "epoch": 0.43, "learning_rate": 6.02799175436693e-07, "loss": 0.2391, "step": 4256 }, { "epoch": 0.44, "learning_rate": 5.99327329933818e-07, "loss": 0.3167, "step": 4288 }, { "epoch": 0.44, "learning_rate": 5.958554844309428e-07, "loss": 0.2342, "step": 4320 }, { "epoch": 0.44, "learning_rate": 5.923836389280676e-07, "loss": 0.1986, "step": 4352 }, { "epoch": 0.45, "learning_rate": 5.889117934251926e-07, "loss": 0.2359, "step": 4384 }, { "epoch": 0.45, "learning_rate": 5.854399479223174e-07, "loss": 0.2453, "step": 4416 }, { "epoch": 0.45, "learning_rate": 5.819681024194423e-07, "loss": 0.3077, "step": 4448 }, { "epoch": 0.46, "learning_rate": 5.784962569165672e-07, "loss": 0.2052, "step": 4480 }, { "epoch": 0.46, "learning_rate": 5.750244114136921e-07, "loss": 0.2313, "step": 4512 }, { "epoch": 0.46, "learning_rate": 5.715525659108169e-07, "loss": 0.3616, "step": 4544 }, { "epoch": 0.47, "learning_rate": 5.680807204079418e-07, "loss": 0.2848, "step": 4576 }, { "epoch": 0.47, "learning_rate": 5.646088749050667e-07, "loss": 0.2016, "step": 4608 }, { "epoch": 0.47, "learning_rate": 5.611370294021915e-07, "loss": 0.2467, "step": 4640 }, { "epoch": 0.48, "learning_rate": 5.576651838993165e-07, "loss": 0.3043, "step": 4672 }, { "epoch": 0.48, "learning_rate": 5.541933383964414e-07, "loss": 0.2283, "step": 4704 }, { "epoch": 0.48, "learning_rate": 5.507214928935662e-07, "loss": 0.1914, "step": 4736 }, { "epoch": 0.49, "learning_rate": 5.472496473906911e-07, "loss": 0.1776, "step": 4768 }, { "epoch": 0.49, "learning_rate": 5.437778018878159e-07, "loss": 0.2395, "step": 4800 }, { "epoch": 0.49, "learning_rate": 5.403059563849408e-07, "loss": 0.3197, "step": 4832 }, { "epoch": 0.5, "learning_rate": 5.368341108820658e-07, "loss": 0.2135, "step": 4864 }, { "epoch": 0.5, "learning_rate": 5.333622653791906e-07, "loss": 0.2104, "step": 4896 }, { "epoch": 0.5, "learning_rate": 5.298904198763155e-07, "loss": 0.2355, "step": 4928 }, { "epoch": 0.51, "learning_rate": 5.264185743734404e-07, "loss": 0.34, "step": 4960 }, { "epoch": 0.51, "learning_rate": 5.229467288705652e-07, "loss": 0.3004, "step": 4992 }, { "epoch": 0.51, "learning_rate": 5.1947488336769e-07, "loss": 0.3025, "step": 5024 }, { "epoch": 0.52, "learning_rate": 5.16003037864815e-07, "loss": 0.2561, "step": 5056 }, { "epoch": 0.52, "learning_rate": 5.125311923619399e-07, "loss": 0.2891, "step": 5088 }, { "epoch": 0.52, "learning_rate": 5.090593468590647e-07, "loss": 0.2668, "step": 5120 }, { "epoch": 0.53, "learning_rate": 5.055875013561897e-07, "loss": 0.2539, "step": 5152 }, { "epoch": 0.53, "learning_rate": 5.021156558533145e-07, "loss": 0.2294, "step": 5184 }, { "epoch": 0.53, "learning_rate": 4.986438103504394e-07, "loss": 0.2286, "step": 5216 }, { "epoch": 0.54, "learning_rate": 4.951719648475642e-07, "loss": 0.2987, "step": 5248 }, { "epoch": 0.54, "learning_rate": 4.917001193446891e-07, "loss": 0.2871, "step": 5280 }, { "epoch": 0.54, "learning_rate": 4.882282738418141e-07, "loss": 0.3253, "step": 5312 }, { "epoch": 0.55, "learning_rate": 4.847564283389389e-07, "loss": 0.2106, "step": 5344 }, { "epoch": 0.55, "learning_rate": 4.812845828360638e-07, "loss": 0.2428, "step": 5376 }, { "epoch": 0.55, "learning_rate": 4.778127373331887e-07, "loss": 0.1568, "step": 5408 }, { "epoch": 0.55, "learning_rate": 4.743408918303135e-07, "loss": 0.2879, "step": 5440 }, { "epoch": 0.56, "learning_rate": 4.708690463274384e-07, "loss": 0.2221, "step": 5472 }, { "epoch": 0.56, "learning_rate": 4.673972008245633e-07, "loss": 0.3336, "step": 5504 }, { "epoch": 0.56, "learning_rate": 4.639253553216881e-07, "loss": 0.2265, "step": 5536 }, { "epoch": 0.57, "learning_rate": 4.6045350981881303e-07, "loss": 0.2408, "step": 5568 }, { "epoch": 0.57, "learning_rate": 4.5698166431593795e-07, "loss": 0.3562, "step": 5600 }, { "epoch": 0.57, "learning_rate": 4.5350981881306276e-07, "loss": 0.1839, "step": 5632 }, { "epoch": 0.58, "learning_rate": 4.500379733101877e-07, "loss": 0.2341, "step": 5664 }, { "epoch": 0.58, "learning_rate": 4.4656612780731254e-07, "loss": 0.2683, "step": 5696 }, { "epoch": 0.58, "learning_rate": 4.430942823044374e-07, "loss": 0.3978, "step": 5728 }, { "epoch": 0.59, "learning_rate": 4.396224368015623e-07, "loss": 0.2493, "step": 5760 }, { "epoch": 0.59, "learning_rate": 4.361505912986872e-07, "loss": 0.28, "step": 5792 }, { "epoch": 0.59, "learning_rate": 4.3267874579581205e-07, "loss": 0.2263, "step": 5824 }, { "epoch": 0.6, "learning_rate": 4.292069002929369e-07, "loss": 0.3391, "step": 5856 }, { "epoch": 0.6, "learning_rate": 4.2573505479006183e-07, "loss": 0.3003, "step": 5888 }, { "epoch": 0.6, "learning_rate": 4.2226320928718675e-07, "loss": 0.2635, "step": 5920 }, { "epoch": 0.61, "learning_rate": 4.1879136378431156e-07, "loss": 0.2997, "step": 5952 }, { "epoch": 0.61, "learning_rate": 4.153195182814365e-07, "loss": 0.2135, "step": 5984 }, { "epoch": 0.61, "learning_rate": 4.1184767277856134e-07, "loss": 0.2651, "step": 6016 }, { "epoch": 0.62, "learning_rate": 4.083758272756862e-07, "loss": 0.1867, "step": 6048 }, { "epoch": 0.62, "learning_rate": 4.049039817728111e-07, "loss": 0.1986, "step": 6080 }, { "epoch": 0.62, "learning_rate": 4.01432136269936e-07, "loss": 0.2905, "step": 6112 }, { "epoch": 0.63, "learning_rate": 3.9796029076706085e-07, "loss": 0.2226, "step": 6144 }, { "epoch": 0.63, "learning_rate": 3.944884452641857e-07, "loss": 0.2472, "step": 6176 }, { "epoch": 0.63, "learning_rate": 3.9101659976131063e-07, "loss": 0.2103, "step": 6208 }, { "epoch": 0.64, "learning_rate": 3.8754475425843544e-07, "loss": 0.2976, "step": 6240 }, { "epoch": 0.64, "learning_rate": 3.8407290875556036e-07, "loss": 0.2777, "step": 6272 }, { "epoch": 0.64, "learning_rate": 3.806010632526853e-07, "loss": 0.2452, "step": 6304 }, { "epoch": 0.65, "learning_rate": 3.771292177498101e-07, "loss": 0.2243, "step": 6336 }, { "epoch": 0.65, "learning_rate": 3.73657372246935e-07, "loss": 0.3171, "step": 6368 }, { "epoch": 0.65, "learning_rate": 3.7018552674405987e-07, "loss": 0.2472, "step": 6400 }, { "epoch": 0.66, "learning_rate": 3.6671368124118474e-07, "loss": 0.2778, "step": 6432 }, { "epoch": 0.66, "learning_rate": 3.6335033091027445e-07, "loss": 0.2834, "step": 6464 }, { "epoch": 0.66, "learning_rate": 3.5987848540739937e-07, "loss": 0.23, "step": 6496 }, { "epoch": 0.67, "learning_rate": 3.5640663990452423e-07, "loss": 0.2185, "step": 6528 }, { "epoch": 0.67, "learning_rate": 3.529347944016491e-07, "loss": 0.282, "step": 6560 }, { "epoch": 0.67, "learning_rate": 3.4946294889877396e-07, "loss": 0.1885, "step": 6592 }, { "epoch": 0.68, "learning_rate": 3.459911033958989e-07, "loss": 0.263, "step": 6624 }, { "epoch": 0.68, "learning_rate": 3.425192578930238e-07, "loss": 0.2431, "step": 6656 }, { "epoch": 0.68, "learning_rate": 3.390474123901486e-07, "loss": 0.2472, "step": 6688 }, { "epoch": 0.69, "learning_rate": 3.355755668872735e-07, "loss": 0.284, "step": 6720 }, { "epoch": 0.69, "learning_rate": 3.321037213843984e-07, "loss": 0.2947, "step": 6752 }, { "epoch": 0.69, "learning_rate": 3.2863187588152325e-07, "loss": 0.2448, "step": 6784 }, { "epoch": 0.7, "learning_rate": 3.251600303786481e-07, "loss": 0.3035, "step": 6816 }, { "epoch": 0.7, "learning_rate": 3.2168818487577303e-07, "loss": 0.3378, "step": 6848 }, { "epoch": 0.7, "learning_rate": 3.182163393728979e-07, "loss": 0.2878, "step": 6880 }, { "epoch": 0.7, "learning_rate": 3.1474449387002276e-07, "loss": 0.1969, "step": 6912 }, { "epoch": 0.71, "learning_rate": 3.112726483671477e-07, "loss": 0.2282, "step": 6944 }, { "epoch": 0.71, "learning_rate": 3.078008028642725e-07, "loss": 0.2472, "step": 6976 }, { "epoch": 0.71, "learning_rate": 3.043289573613974e-07, "loss": 0.3127, "step": 7008 }, { "epoch": 0.72, "learning_rate": 3.008571118585223e-07, "loss": 0.3757, "step": 7040 }, { "epoch": 0.72, "learning_rate": 2.9738526635564714e-07, "loss": 0.2801, "step": 7072 }, { "epoch": 0.72, "learning_rate": 2.9402191602473685e-07, "loss": 0.3094, "step": 7104 }, { "epoch": 0.73, "learning_rate": 2.9055007052186177e-07, "loss": 0.3006, "step": 7136 }, { "epoch": 0.73, "learning_rate": 2.8707822501898663e-07, "loss": 0.2431, "step": 7168 }, { "epoch": 0.73, "learning_rate": 2.836063795161115e-07, "loss": 0.217, "step": 7200 }, { "epoch": 0.74, "learning_rate": 2.801345340132364e-07, "loss": 0.1981, "step": 7232 }, { "epoch": 0.74, "learning_rate": 2.766626885103613e-07, "loss": 0.242, "step": 7264 }, { "epoch": 0.74, "learning_rate": 2.7319084300748614e-07, "loss": 0.2426, "step": 7296 }, { "epoch": 0.75, "learning_rate": 2.69718997504611e-07, "loss": 0.2771, "step": 7328 }, { "epoch": 0.75, "learning_rate": 2.662471520017359e-07, "loss": 0.2508, "step": 7360 }, { "epoch": 0.75, "learning_rate": 2.6277530649886084e-07, "loss": 0.2495, "step": 7392 }, { "epoch": 0.76, "learning_rate": 2.5930346099598565e-07, "loss": 0.2353, "step": 7424 }, { "epoch": 0.76, "learning_rate": 2.5583161549311057e-07, "loss": 0.2047, "step": 7456 }, { "epoch": 0.76, "learning_rate": 2.5235976999023543e-07, "loss": 0.36, "step": 7488 }, { "epoch": 0.77, "learning_rate": 2.488879244873603e-07, "loss": 0.2685, "step": 7520 }, { "epoch": 0.77, "learning_rate": 2.4541607898448516e-07, "loss": 0.2505, "step": 7552 }, { "epoch": 0.77, "learning_rate": 2.419442334816101e-07, "loss": 0.2223, "step": 7584 }, { "epoch": 0.78, "learning_rate": 2.3847238797873494e-07, "loss": 0.2466, "step": 7616 }, { "epoch": 0.78, "learning_rate": 2.350005424758598e-07, "loss": 0.2872, "step": 7648 }, { "epoch": 0.78, "learning_rate": 2.315286969729847e-07, "loss": 0.2257, "step": 7680 }, { "epoch": 0.79, "learning_rate": 2.2805685147010956e-07, "loss": 0.2488, "step": 7712 }, { "epoch": 0.79, "learning_rate": 2.2458500596723445e-07, "loss": 0.3667, "step": 7744 }, { "epoch": 0.79, "learning_rate": 2.2111316046435932e-07, "loss": 0.2617, "step": 7776 }, { "epoch": 0.8, "learning_rate": 2.1764131496148418e-07, "loss": 0.1686, "step": 7808 }, { "epoch": 0.8, "learning_rate": 2.141694694586091e-07, "loss": 0.2216, "step": 7840 }, { "epoch": 0.8, "learning_rate": 2.1069762395573396e-07, "loss": 0.2418, "step": 7872 }, { "epoch": 0.81, "learning_rate": 2.0722577845285885e-07, "loss": 0.2023, "step": 7904 }, { "epoch": 0.81, "learning_rate": 2.0375393294998372e-07, "loss": 0.3171, "step": 7936 }, { "epoch": 0.81, "learning_rate": 2.0028208744710858e-07, "loss": 0.2518, "step": 7968 }, { "epoch": 0.82, "learning_rate": 1.9681024194423347e-07, "loss": 0.2388, "step": 8000 }, { "epoch": 0.82, "learning_rate": 1.9333839644135836e-07, "loss": 0.2612, "step": 8032 }, { "epoch": 0.82, "learning_rate": 1.8986655093848323e-07, "loss": 0.3058, "step": 8064 }, { "epoch": 0.83, "learning_rate": 1.8639470543560812e-07, "loss": 0.2292, "step": 8096 }, { "epoch": 0.83, "learning_rate": 1.8292285993273298e-07, "loss": 0.3214, "step": 8128 }, { "epoch": 0.83, "learning_rate": 1.7945101442985785e-07, "loss": 0.2081, "step": 8160 }, { "epoch": 0.84, "learning_rate": 1.7597916892698276e-07, "loss": 0.3473, "step": 8192 }, { "epoch": 0.84, "learning_rate": 1.7250732342410763e-07, "loss": 0.2174, "step": 8224 }, { "epoch": 0.84, "learning_rate": 1.690354779212325e-07, "loss": 0.2021, "step": 8256 }, { "epoch": 0.85, "learning_rate": 1.6556363241835738e-07, "loss": 0.2808, "step": 8288 }, { "epoch": 0.85, "learning_rate": 1.6209178691548225e-07, "loss": 0.2753, "step": 8320 }, { "epoch": 0.85, "learning_rate": 1.586199414126071e-07, "loss": 0.1965, "step": 8352 }, { "epoch": 0.86, "learning_rate": 1.5514809590973203e-07, "loss": 0.2076, "step": 8384 }, { "epoch": 0.86, "learning_rate": 1.516762504068569e-07, "loss": 0.2856, "step": 8416 }, { "epoch": 0.86, "learning_rate": 1.4820440490398176e-07, "loss": 0.2812, "step": 8448 }, { "epoch": 0.86, "learning_rate": 1.4473255940110665e-07, "loss": 0.28, "step": 8480 }, { "epoch": 0.87, "learning_rate": 1.412607138982315e-07, "loss": 0.2871, "step": 8512 }, { "epoch": 0.87, "learning_rate": 1.377888683953564e-07, "loss": 0.2063, "step": 8544 }, { "epoch": 0.87, "learning_rate": 1.343170228924813e-07, "loss": 0.2929, "step": 8576 }, { "epoch": 0.88, "learning_rate": 1.3084517738960616e-07, "loss": 0.2784, "step": 8608 }, { "epoch": 0.88, "learning_rate": 1.2737333188673102e-07, "loss": 0.2889, "step": 8640 }, { "epoch": 0.88, "learning_rate": 1.239014863838559e-07, "loss": 0.2367, "step": 8672 }, { "epoch": 0.89, "learning_rate": 1.2042964088098078e-07, "loss": 0.2572, "step": 8704 }, { "epoch": 0.89, "learning_rate": 1.1695779537810567e-07, "loss": 0.2167, "step": 8736 }, { "epoch": 0.89, "learning_rate": 1.1348594987523056e-07, "loss": 0.2448, "step": 8768 }, { "epoch": 0.9, "learning_rate": 1.1001410437235542e-07, "loss": 0.2384, "step": 8800 }, { "epoch": 0.9, "learning_rate": 1.065422588694803e-07, "loss": 0.2742, "step": 8832 }, { "epoch": 0.9, "learning_rate": 1.0307041336660519e-07, "loss": 0.2615, "step": 8864 }, { "epoch": 0.91, "learning_rate": 9.959856786373005e-08, "loss": 0.2511, "step": 8896 }, { "epoch": 0.91, "learning_rate": 9.612672236085493e-08, "loss": 0.218, "step": 8928 }, { "epoch": 0.91, "learning_rate": 9.265487685797982e-08, "loss": 0.1846, "step": 8960 }, { "epoch": 0.92, "learning_rate": 8.918303135510468e-08, "loss": 0.2521, "step": 8992 }, { "epoch": 0.92, "learning_rate": 8.571118585222958e-08, "loss": 0.1836, "step": 9024 }, { "epoch": 0.92, "learning_rate": 8.223934034935445e-08, "loss": 0.2421, "step": 9056 }, { "epoch": 0.93, "learning_rate": 7.876749484647933e-08, "loss": 0.2631, "step": 9088 }, { "epoch": 0.93, "learning_rate": 7.529564934360421e-08, "loss": 0.2602, "step": 9120 }, { "epoch": 0.93, "learning_rate": 7.182380384072909e-08, "loss": 0.2896, "step": 9152 }, { "epoch": 0.94, "learning_rate": 6.835195833785396e-08, "loss": 0.2901, "step": 9184 }, { "epoch": 0.94, "learning_rate": 6.488011283497884e-08, "loss": 0.2488, "step": 9216 }, { "epoch": 0.94, "learning_rate": 6.140826733210372e-08, "loss": 0.2209, "step": 9248 }, { "epoch": 0.95, "learning_rate": 5.7936421829228595e-08, "loss": 0.2494, "step": 9280 }, { "epoch": 0.95, "learning_rate": 5.446457632635348e-08, "loss": 0.251, "step": 9312 }, { "epoch": 0.95, "learning_rate": 5.099273082347835e-08, "loss": 0.2243, "step": 9344 }, { "epoch": 0.96, "learning_rate": 4.752088532060323e-08, "loss": 0.259, "step": 9376 }, { "epoch": 0.96, "learning_rate": 4.404903981772811e-08, "loss": 0.267, "step": 9408 }, { "epoch": 0.96, "learning_rate": 4.057719431485299e-08, "loss": 0.2674, "step": 9440 }, { "epoch": 0.97, "learning_rate": 3.7105348811977866e-08, "loss": 0.2383, "step": 9472 }, { "epoch": 0.97, "learning_rate": 3.363350330910274e-08, "loss": 0.2545, "step": 9504 }, { "epoch": 0.97, "learning_rate": 3.016165780622762e-08, "loss": 0.2904, "step": 9536 }, { "epoch": 0.98, "learning_rate": 2.6689812303352498e-08, "loss": 0.2211, "step": 9568 }, { "epoch": 0.98, "learning_rate": 2.321796680047738e-08, "loss": 0.2802, "step": 9600 }, { "epoch": 0.98, "learning_rate": 1.9746121297602256e-08, "loss": 0.2471, "step": 9632 }, { "epoch": 0.99, "learning_rate": 1.6274275794727136e-08, "loss": 0.2593, "step": 9664 }, { "epoch": 0.99, "learning_rate": 1.2802430291852012e-08, "loss": 0.1877, "step": 9696 }, { "epoch": 0.99, "learning_rate": 9.33058478897689e-09, "loss": 0.2619, "step": 9728 }, { "epoch": 1.0, "learning_rate": 5.858739286101768e-09, "loss": 0.1832, "step": 9760 }, { "epoch": 1.0, "learning_rate": 2.3868937832266464e-09, "loss": 0.1479, "step": 9792 } ], "logging_steps": 32, "max_steps": 9805, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 9804, "total_flos": 4.16311373758464e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }