{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9993906154783668, "eval_steps": 500, "global_step": 615, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 7.142857142857143e-07, "loss": 1.2236, "step": 2 }, { "epoch": 0.01, "learning_rate": 1.4285714285714286e-06, "loss": 1.4631, "step": 4 }, { "epoch": 0.01, "learning_rate": 2.1428571428571427e-06, "loss": 1.405, "step": 6 }, { "epoch": 0.01, "learning_rate": 2.8571428571428573e-06, "loss": 1.7103, "step": 8 }, { "epoch": 0.02, "learning_rate": 3.5714285714285718e-06, "loss": 1.6165, "step": 10 }, { "epoch": 0.02, "learning_rate": 4.2857142857142855e-06, "loss": 1.7487, "step": 12 }, { "epoch": 0.02, "learning_rate": 5e-06, "loss": 1.7254, "step": 14 }, { "epoch": 0.03, "learning_rate": 5.7142857142857145e-06, "loss": 1.6604, "step": 16 }, { "epoch": 0.03, "learning_rate": 6.4285714285714295e-06, "loss": 1.622, "step": 18 }, { "epoch": 0.03, "learning_rate": 7.1428571428571436e-06, "loss": 1.8001, "step": 20 }, { "epoch": 0.04, "learning_rate": 7.857142857142858e-06, "loss": 1.5987, "step": 22 }, { "epoch": 0.04, "learning_rate": 8.571428571428571e-06, "loss": 1.7409, "step": 24 }, { "epoch": 0.04, "learning_rate": 9.285714285714288e-06, "loss": 1.9162, "step": 26 }, { "epoch": 0.05, "learning_rate": 1e-05, "loss": 1.8572, "step": 28 }, { "epoch": 0.05, "learning_rate": 1.0714285714285714e-05, "loss": 1.9669, "step": 30 }, { "epoch": 0.05, "learning_rate": 1.1428571428571429e-05, "loss": 1.9432, "step": 32 }, { "epoch": 0.06, "learning_rate": 1.2142857142857142e-05, "loss": 1.9297, "step": 34 }, { "epoch": 0.06, "learning_rate": 1.2857142857142859e-05, "loss": 2.0854, "step": 36 }, { "epoch": 0.06, "learning_rate": 1.3571428571428574e-05, "loss": 2.092, "step": 38 }, { "epoch": 0.07, "learning_rate": 1.4285714285714287e-05, "loss": 2.1925, "step": 40 }, { "epoch": 0.07, "learning_rate": 1.5000000000000002e-05, "loss": 2.3367, "step": 42 }, { "epoch": 0.07, "learning_rate": 1.5714285714285715e-05, "loss": 2.4136, "step": 44 }, { "epoch": 0.07, "learning_rate": 1.642857142857143e-05, "loss": 2.5252, "step": 46 }, { "epoch": 0.08, "learning_rate": 1.7142857142857142e-05, "loss": 2.5727, "step": 48 }, { "epoch": 0.08, "learning_rate": 1.785714285714286e-05, "loss": 3.1009, "step": 50 }, { "epoch": 0.08, "learning_rate": 1.8571428571428575e-05, "loss": 1.4247, "step": 52 }, { "epoch": 0.09, "learning_rate": 1.928571428571429e-05, "loss": 1.3148, "step": 54 }, { "epoch": 0.09, "learning_rate": 2e-05, "loss": 1.6329, "step": 56 }, { "epoch": 0.09, "learning_rate": 1.999993832507735e-05, "loss": 1.4144, "step": 58 }, { "epoch": 0.1, "learning_rate": 1.9999753301070156e-05, "loss": 1.6329, "step": 60 }, { "epoch": 0.1, "learning_rate": 1.9999444930260684e-05, "loss": 1.5957, "step": 62 }, { "epoch": 0.1, "learning_rate": 1.9999013216452688e-05, "loss": 1.4545, "step": 64 }, { "epoch": 0.11, "learning_rate": 1.999845816497135e-05, "loss": 1.7986, "step": 66 }, { "epoch": 0.11, "learning_rate": 1.9997779782663217e-05, "loss": 1.7363, "step": 68 }, { "epoch": 0.11, "learning_rate": 1.999697807789613e-05, "loss": 1.7775, "step": 70 }, { "epoch": 0.12, "learning_rate": 1.99960530605591e-05, "loss": 1.6756, "step": 72 }, { "epoch": 0.12, "learning_rate": 1.9995004742062206e-05, "loss": 1.5798, "step": 74 }, { "epoch": 0.12, "learning_rate": 1.999383313533644e-05, "loss": 1.8129, "step": 76 }, { "epoch": 0.13, "learning_rate": 1.9992538254833548e-05, "loss": 1.7426, "step": 78 }, { "epoch": 0.13, "learning_rate": 1.9991120116525866e-05, "loss": 1.7854, "step": 80 }, { "epoch": 0.13, "learning_rate": 1.9989578737906107e-05, "loss": 1.9305, "step": 82 }, { "epoch": 0.14, "learning_rate": 1.9987914137987153e-05, "loss": 1.9058, "step": 84 }, { "epoch": 0.14, "learning_rate": 1.9986126337301814e-05, "loss": 1.8474, "step": 86 }, { "epoch": 0.14, "learning_rate": 1.9984215357902586e-05, "loss": 1.8505, "step": 88 }, { "epoch": 0.15, "learning_rate": 1.9982181223361373e-05, "loss": 1.8879, "step": 90 }, { "epoch": 0.15, "learning_rate": 1.9980023958769195e-05, "loss": 2.229, "step": 92 }, { "epoch": 0.15, "learning_rate": 1.9977743590735866e-05, "loss": 2.0788, "step": 94 }, { "epoch": 0.16, "learning_rate": 1.9975340147389707e-05, "loss": 2.3521, "step": 96 }, { "epoch": 0.16, "learning_rate": 1.997281365837714e-05, "loss": 2.2187, "step": 98 }, { "epoch": 0.16, "learning_rate": 1.9970164154862375e-05, "loss": 2.2225, "step": 100 }, { "epoch": 0.17, "learning_rate": 1.9967391669526995e-05, "loss": 1.2585, "step": 102 }, { "epoch": 0.17, "learning_rate": 1.996449623656956e-05, "loss": 1.4347, "step": 104 }, { "epoch": 0.17, "learning_rate": 1.9961477891705203e-05, "loss": 1.552, "step": 106 }, { "epoch": 0.18, "learning_rate": 1.9958336672165147e-05, "loss": 1.571, "step": 108 }, { "epoch": 0.18, "learning_rate": 1.9955072616696294e-05, "loss": 1.5051, "step": 110 }, { "epoch": 0.18, "learning_rate": 1.9951685765560717e-05, "loss": 1.6931, "step": 112 }, { "epoch": 0.19, "learning_rate": 1.994817616053517e-05, "loss": 1.5936, "step": 114 }, { "epoch": 0.19, "learning_rate": 1.994454384491058e-05, "loss": 1.5303, "step": 116 }, { "epoch": 0.19, "learning_rate": 1.9940788863491503e-05, "loss": 1.5092, "step": 118 }, { "epoch": 0.2, "learning_rate": 1.9936911262595574e-05, "loss": 1.678, "step": 120 }, { "epoch": 0.2, "learning_rate": 1.993291109005294e-05, "loss": 1.4946, "step": 122 }, { "epoch": 0.2, "learning_rate": 1.9928788395205673e-05, "loss": 1.7905, "step": 124 }, { "epoch": 0.2, "learning_rate": 1.9924543228907147e-05, "loss": 1.788, "step": 126 }, { "epoch": 0.21, "learning_rate": 1.992017564352142e-05, "loss": 1.6872, "step": 128 }, { "epoch": 0.21, "learning_rate": 1.9915685692922592e-05, "loss": 1.7038, "step": 130 }, { "epoch": 0.21, "learning_rate": 1.9911073432494138e-05, "loss": 1.7661, "step": 132 }, { "epoch": 0.22, "learning_rate": 1.9906338919128214e-05, "loss": 1.7958, "step": 134 }, { "epoch": 0.22, "learning_rate": 1.990148221122497e-05, "loss": 1.8836, "step": 136 }, { "epoch": 0.22, "learning_rate": 1.9896503368691826e-05, "loss": 1.9028, "step": 138 }, { "epoch": 0.23, "learning_rate": 1.989140245294272e-05, "loss": 2.077, "step": 140 }, { "epoch": 0.23, "learning_rate": 1.988617952689738e-05, "loss": 1.9555, "step": 142 }, { "epoch": 0.23, "learning_rate": 1.988083465498051e-05, "loss": 2.2974, "step": 144 }, { "epoch": 0.24, "learning_rate": 1.9875367903121022e-05, "loss": 2.1295, "step": 146 }, { "epoch": 0.24, "learning_rate": 1.9869779338751217e-05, "loss": 2.2783, "step": 148 }, { "epoch": 0.24, "learning_rate": 1.9864069030805955e-05, "loss": 2.0852, "step": 150 }, { "epoch": 0.25, "learning_rate": 1.9858237049721793e-05, "loss": 1.5204, "step": 152 }, { "epoch": 0.25, "learning_rate": 1.9852283467436124e-05, "loss": 1.4498, "step": 154 }, { "epoch": 0.25, "learning_rate": 1.98462083573863e-05, "loss": 1.4835, "step": 156 }, { "epoch": 0.26, "learning_rate": 1.9840011794508702e-05, "loss": 1.5079, "step": 158 }, { "epoch": 0.26, "learning_rate": 1.983369385523784e-05, "loss": 1.5743, "step": 160 }, { "epoch": 0.26, "learning_rate": 1.98272546175054e-05, "loss": 1.5409, "step": 162 }, { "epoch": 0.27, "learning_rate": 1.982069416073928e-05, "loss": 1.5208, "step": 164 }, { "epoch": 0.27, "learning_rate": 1.9814012565862607e-05, "loss": 1.7327, "step": 166 }, { "epoch": 0.27, "learning_rate": 1.9807209915292754e-05, "loss": 1.7466, "step": 168 }, { "epoch": 0.28, "learning_rate": 1.9800286292940313e-05, "loss": 1.6901, "step": 170 }, { "epoch": 0.28, "learning_rate": 1.9793241784208054e-05, "loss": 1.7233, "step": 172 }, { "epoch": 0.28, "learning_rate": 1.978607647598989e-05, "loss": 1.6692, "step": 174 }, { "epoch": 0.29, "learning_rate": 1.9778790456669777e-05, "loss": 1.7248, "step": 176 }, { "epoch": 0.29, "learning_rate": 1.9771383816120658e-05, "loss": 1.8977, "step": 178 }, { "epoch": 0.29, "learning_rate": 1.976385664570333e-05, "loss": 1.8373, "step": 180 }, { "epoch": 0.3, "learning_rate": 1.9756209038265317e-05, "loss": 1.8188, "step": 182 }, { "epoch": 0.3, "learning_rate": 1.9748441088139746e-05, "loss": 1.8918, "step": 184 }, { "epoch": 0.3, "learning_rate": 1.9740552891144157e-05, "loss": 1.9215, "step": 186 }, { "epoch": 0.31, "learning_rate": 1.973254454457934e-05, "loss": 1.9047, "step": 188 }, { "epoch": 0.31, "learning_rate": 1.9724416147228127e-05, "loss": 2.0088, "step": 190 }, { "epoch": 0.31, "learning_rate": 1.971616779935417e-05, "loss": 2.1103, "step": 192 }, { "epoch": 0.32, "learning_rate": 1.9707799602700712e-05, "loss": 2.1014, "step": 194 }, { "epoch": 0.32, "learning_rate": 1.9699311660489333e-05, "loss": 1.8813, "step": 196 }, { "epoch": 0.32, "learning_rate": 1.969070407741867e-05, "loss": 2.2869, "step": 198 }, { "epoch": 0.33, "learning_rate": 1.968197695966312e-05, "loss": 2.2136, "step": 200 }, { "epoch": 0.33, "learning_rate": 1.9673130414871556e-05, "loss": 1.473, "step": 202 }, { "epoch": 0.33, "learning_rate": 1.9664164552165957e-05, "loss": 1.4018, "step": 204 }, { "epoch": 0.33, "learning_rate": 1.9655079482140115e-05, "loss": 1.3433, "step": 206 }, { "epoch": 0.34, "learning_rate": 1.964587531685822e-05, "loss": 1.4818, "step": 208 }, { "epoch": 0.34, "learning_rate": 1.9636552169853514e-05, "loss": 1.5572, "step": 210 }, { "epoch": 0.34, "learning_rate": 1.9627110156126862e-05, "loss": 1.5843, "step": 212 }, { "epoch": 0.35, "learning_rate": 1.9617549392145365e-05, "loss": 1.6586, "step": 214 }, { "epoch": 0.35, "learning_rate": 1.96078699958409e-05, "loss": 1.4021, "step": 216 }, { "epoch": 0.35, "learning_rate": 1.9598072086608663e-05, "loss": 1.4863, "step": 218 }, { "epoch": 0.36, "learning_rate": 1.958815578530572e-05, "loss": 1.6731, "step": 220 }, { "epoch": 0.36, "learning_rate": 1.9578121214249485e-05, "loss": 1.6004, "step": 222 }, { "epoch": 0.36, "learning_rate": 1.956796849721625e-05, "loss": 1.5723, "step": 224 }, { "epoch": 0.37, "learning_rate": 1.9557697759439613e-05, "loss": 1.7214, "step": 226 }, { "epoch": 0.37, "learning_rate": 1.954730912760897e-05, "loss": 1.7316, "step": 228 }, { "epoch": 0.37, "learning_rate": 1.9536802729867926e-05, "loss": 1.7336, "step": 230 }, { "epoch": 0.38, "learning_rate": 1.9526178695812747e-05, "loss": 1.7207, "step": 232 }, { "epoch": 0.38, "learning_rate": 1.9515437156490724e-05, "loss": 2.0022, "step": 234 }, { "epoch": 0.38, "learning_rate": 1.950457824439857e-05, "loss": 1.7205, "step": 236 }, { "epoch": 0.39, "learning_rate": 1.9493602093480807e-05, "loss": 1.9002, "step": 238 }, { "epoch": 0.39, "learning_rate": 1.9482508839128087e-05, "loss": 2.0414, "step": 240 }, { "epoch": 0.39, "learning_rate": 1.9471298618175523e-05, "loss": 2.1061, "step": 242 }, { "epoch": 0.4, "learning_rate": 1.9459971568901026e-05, "loss": 2.2256, "step": 244 }, { "epoch": 0.4, "learning_rate": 1.944852783102357e-05, "loss": 2.0902, "step": 246 }, { "epoch": 0.4, "learning_rate": 1.9436967545701485e-05, "loss": 2.1776, "step": 248 }, { "epoch": 0.41, "learning_rate": 1.9425290855530705e-05, "loss": 2.1136, "step": 250 }, { "epoch": 0.41, "learning_rate": 1.9413497904543033e-05, "loss": 1.2511, "step": 252 }, { "epoch": 0.41, "learning_rate": 1.9401588838204334e-05, "loss": 1.3177, "step": 254 }, { "epoch": 0.42, "learning_rate": 1.9389563803412753e-05, "loss": 1.4358, "step": 256 }, { "epoch": 0.42, "learning_rate": 1.9377422948496912e-05, "loss": 1.672, "step": 258 }, { "epoch": 0.42, "learning_rate": 1.9365166423214065e-05, "loss": 1.6008, "step": 260 }, { "epoch": 0.43, "learning_rate": 1.9352794378748267e-05, "loss": 1.5744, "step": 262 }, { "epoch": 0.43, "learning_rate": 1.934030696770849e-05, "loss": 1.4593, "step": 264 }, { "epoch": 0.43, "learning_rate": 1.932770434412676e-05, "loss": 1.7566, "step": 266 }, { "epoch": 0.44, "learning_rate": 1.931498666345624e-05, "loss": 1.6279, "step": 268 }, { "epoch": 0.44, "learning_rate": 1.9302154082569328e-05, "loss": 1.5859, "step": 270 }, { "epoch": 0.44, "learning_rate": 1.928920675975571e-05, "loss": 1.5577, "step": 272 }, { "epoch": 0.45, "learning_rate": 1.9276144854720412e-05, "loss": 1.5837, "step": 274 }, { "epoch": 0.45, "learning_rate": 1.9262968528581828e-05, "loss": 1.6993, "step": 276 }, { "epoch": 0.45, "learning_rate": 1.9249677943869742e-05, "loss": 1.6503, "step": 278 }, { "epoch": 0.46, "learning_rate": 1.9236273264523304e-05, "loss": 1.8387, "step": 280 }, { "epoch": 0.46, "learning_rate": 1.9222754655889035e-05, "loss": 1.7513, "step": 282 }, { "epoch": 0.46, "learning_rate": 1.9209122284718757e-05, "loss": 1.842, "step": 284 }, { "epoch": 0.46, "learning_rate": 1.919537631916756e-05, "loss": 1.9954, "step": 286 }, { "epoch": 0.47, "learning_rate": 1.9181516928791715e-05, "loss": 2.0364, "step": 288 }, { "epoch": 0.47, "learning_rate": 1.916754428454659e-05, "loss": 2.0113, "step": 290 }, { "epoch": 0.47, "learning_rate": 1.9153458558784536e-05, "loss": 2.0972, "step": 292 }, { "epoch": 0.48, "learning_rate": 1.9139259925252756e-05, "loss": 2.1826, "step": 294 }, { "epoch": 0.48, "learning_rate": 1.912494855909118e-05, "loss": 2.2134, "step": 296 }, { "epoch": 0.48, "learning_rate": 1.911052463683029e-05, "loss": 2.1347, "step": 298 }, { "epoch": 0.49, "learning_rate": 1.9095988336388945e-05, "loss": 2.1557, "step": 300 }, { "epoch": 0.49, "learning_rate": 1.908133983707218e-05, "loss": 1.6206, "step": 302 }, { "epoch": 0.49, "learning_rate": 1.906657931956901e-05, "loss": 1.466, "step": 304 }, { "epoch": 0.5, "learning_rate": 1.9051706965950192e-05, "loss": 1.4884, "step": 306 }, { "epoch": 0.5, "learning_rate": 1.9036722959665975e-05, "loss": 1.485, "step": 308 }, { "epoch": 0.5, "learning_rate": 1.9021627485543844e-05, "loss": 1.6297, "step": 310 }, { "epoch": 0.51, "learning_rate": 1.9006420729786246e-05, "loss": 1.6544, "step": 312 }, { "epoch": 0.51, "learning_rate": 1.899110287996827e-05, "loss": 1.552, "step": 314 }, { "epoch": 0.51, "learning_rate": 1.897567412503536e-05, "loss": 1.7586, "step": 316 }, { "epoch": 0.52, "learning_rate": 1.8960134655300966e-05, "loss": 1.6673, "step": 318 }, { "epoch": 0.52, "learning_rate": 1.894448466244421e-05, "loss": 1.6194, "step": 320 }, { "epoch": 0.52, "learning_rate": 1.8928724339507515e-05, "loss": 1.4731, "step": 322 }, { "epoch": 0.53, "learning_rate": 1.8912853880894215e-05, "loss": 1.6042, "step": 324 }, { "epoch": 0.53, "learning_rate": 1.8896873482366173e-05, "loss": 1.7247, "step": 326 }, { "epoch": 0.53, "learning_rate": 1.8880783341041357e-05, "loss": 1.7958, "step": 328 }, { "epoch": 0.54, "learning_rate": 1.8864583655391417e-05, "loss": 1.6097, "step": 330 }, { "epoch": 0.54, "learning_rate": 1.8848274625239216e-05, "loss": 1.7074, "step": 332 }, { "epoch": 0.54, "learning_rate": 1.8831856451756394e-05, "loss": 1.928, "step": 334 }, { "epoch": 0.55, "learning_rate": 1.881532933746087e-05, "loss": 1.7286, "step": 336 }, { "epoch": 0.55, "learning_rate": 1.879869348621433e-05, "loss": 1.7516, "step": 338 }, { "epoch": 0.55, "learning_rate": 1.8781949103219758e-05, "loss": 1.95, "step": 340 }, { "epoch": 0.56, "learning_rate": 1.876509639501885e-05, "loss": 1.893, "step": 342 }, { "epoch": 0.56, "learning_rate": 1.8748135569489504e-05, "loss": 2.0016, "step": 344 }, { "epoch": 0.56, "learning_rate": 1.8731066835843237e-05, "loss": 2.1922, "step": 346 }, { "epoch": 0.57, "learning_rate": 1.8713890404622618e-05, "loss": 2.1049, "step": 348 }, { "epoch": 0.57, "learning_rate": 1.869660648769866e-05, "loss": 2.3154, "step": 350 }, { "epoch": 0.57, "learning_rate": 1.867921529826821e-05, "loss": 1.2, "step": 352 }, { "epoch": 0.58, "learning_rate": 1.8661717050851323e-05, "loss": 1.3897, "step": 354 }, { "epoch": 0.58, "learning_rate": 1.8644111961288605e-05, "loss": 1.4177, "step": 356 }, { "epoch": 0.58, "learning_rate": 1.8626400246738568e-05, "loss": 1.3511, "step": 358 }, { "epoch": 0.59, "learning_rate": 1.8608582125674933e-05, "loss": 1.6614, "step": 360 }, { "epoch": 0.59, "learning_rate": 1.8590657817883952e-05, "loss": 1.5387, "step": 362 }, { "epoch": 0.59, "learning_rate": 1.8572627544461682e-05, "loss": 1.6492, "step": 364 }, { "epoch": 0.59, "learning_rate": 1.8554491527811266e-05, "loss": 1.5073, "step": 366 }, { "epoch": 0.6, "learning_rate": 1.8536249991640192e-05, "loss": 1.7497, "step": 368 }, { "epoch": 0.6, "learning_rate": 1.8517903160957523e-05, "loss": 1.4961, "step": 370 }, { "epoch": 0.6, "learning_rate": 1.8499451262071134e-05, "loss": 1.7061, "step": 372 }, { "epoch": 0.61, "learning_rate": 1.848089452258491e-05, "loss": 1.6356, "step": 374 }, { "epoch": 0.61, "learning_rate": 1.846223317139595e-05, "loss": 1.6088, "step": 376 }, { "epoch": 0.61, "learning_rate": 1.844346743869173e-05, "loss": 1.6825, "step": 378 }, { "epoch": 0.62, "learning_rate": 1.8424597555947268e-05, "loss": 1.6737, "step": 380 }, { "epoch": 0.62, "learning_rate": 1.840562375592228e-05, "loss": 1.8381, "step": 382 }, { "epoch": 0.62, "learning_rate": 1.8386546272658296e-05, "loss": 1.8862, "step": 384 }, { "epoch": 0.63, "learning_rate": 1.8367365341475777e-05, "loss": 1.8186, "step": 386 }, { "epoch": 0.63, "learning_rate": 1.834808119897121e-05, "loss": 1.8536, "step": 388 }, { "epoch": 0.63, "learning_rate": 1.8328694083014196e-05, "loss": 2.0862, "step": 390 }, { "epoch": 0.64, "learning_rate": 1.830920423274451e-05, "loss": 2.2656, "step": 392 }, { "epoch": 0.64, "learning_rate": 1.8289611888569158e-05, "loss": 2.085, "step": 394 }, { "epoch": 0.64, "learning_rate": 1.8269917292159393e-05, "loss": 2.2241, "step": 396 }, { "epoch": 0.65, "learning_rate": 1.8250120686447767e-05, "loss": 2.0626, "step": 398 }, { "epoch": 0.65, "learning_rate": 1.82302223156251e-05, "loss": 2.3965, "step": 400 }, { "epoch": 0.65, "learning_rate": 1.8210222425137485e-05, "loss": 1.4121, "step": 402 }, { "epoch": 0.66, "learning_rate": 1.8190121261683268e-05, "loss": 1.4354, "step": 404 }, { "epoch": 0.66, "learning_rate": 1.816991907320999e-05, "loss": 1.493, "step": 406 }, { "epoch": 0.66, "learning_rate": 1.8149616108911327e-05, "loss": 1.5228, "step": 408 }, { "epoch": 0.67, "learning_rate": 1.8129212619224034e-05, "loss": 1.4622, "step": 410 }, { "epoch": 0.67, "learning_rate": 1.8108708855824838e-05, "loss": 1.6105, "step": 412 }, { "epoch": 0.67, "learning_rate": 1.808810507162735e-05, "loss": 1.4222, "step": 414 }, { "epoch": 0.68, "learning_rate": 1.8067401520778918e-05, "loss": 1.5116, "step": 416 }, { "epoch": 0.68, "learning_rate": 1.8046598458657528e-05, "loss": 1.6267, "step": 418 }, { "epoch": 0.68, "learning_rate": 1.8025696141868635e-05, "loss": 1.6338, "step": 420 }, { "epoch": 0.69, "learning_rate": 1.800469482824198e-05, "loss": 1.6313, "step": 422 }, { "epoch": 0.69, "learning_rate": 1.798359477682845e-05, "loss": 1.6542, "step": 424 }, { "epoch": 0.69, "learning_rate": 1.7962396247896855e-05, "loss": 1.7611, "step": 426 }, { "epoch": 0.7, "learning_rate": 1.7941099502930716e-05, "loss": 1.6318, "step": 428 }, { "epoch": 0.7, "learning_rate": 1.7919704804625055e-05, "loss": 1.7348, "step": 430 }, { "epoch": 0.7, "learning_rate": 1.789821241688315e-05, "loss": 1.7121, "step": 432 }, { "epoch": 0.71, "learning_rate": 1.787662260481326e-05, "loss": 1.8076, "step": 434 }, { "epoch": 0.71, "learning_rate": 1.785493563472539e-05, "loss": 1.9255, "step": 436 }, { "epoch": 0.71, "learning_rate": 1.7833151774127978e-05, "loss": 1.8057, "step": 438 }, { "epoch": 0.72, "learning_rate": 1.781127129172461e-05, "loss": 1.8363, "step": 440 }, { "epoch": 0.72, "learning_rate": 1.7789294457410693e-05, "loss": 2.1584, "step": 442 }, { "epoch": 0.72, "learning_rate": 1.7767221542270146e-05, "loss": 2.0338, "step": 444 }, { "epoch": 0.72, "learning_rate": 1.7745052818572033e-05, "loss": 2.1332, "step": 446 }, { "epoch": 0.73, "learning_rate": 1.772278855976721e-05, "loss": 2.0207, "step": 448 }, { "epoch": 0.73, "learning_rate": 1.770042904048498e-05, "loss": 1.9562, "step": 450 }, { "epoch": 0.73, "learning_rate": 1.7677974536529657e-05, "loss": 1.4179, "step": 452 }, { "epoch": 0.74, "learning_rate": 1.76554253248772e-05, "loss": 1.306, "step": 454 }, { "epoch": 0.74, "learning_rate": 1.7632781683671787e-05, "loss": 1.5095, "step": 456 }, { "epoch": 0.74, "learning_rate": 1.7610043892222382e-05, "loss": 1.4135, "step": 458 }, { "epoch": 0.75, "learning_rate": 1.7587212230999298e-05, "loss": 1.665, "step": 460 }, { "epoch": 0.75, "learning_rate": 1.7564286981630713e-05, "loss": 1.527, "step": 462 }, { "epoch": 0.75, "learning_rate": 1.7541268426899222e-05, "loss": 1.515, "step": 464 }, { "epoch": 0.76, "learning_rate": 1.751815685073835e-05, "loss": 1.544, "step": 466 }, { "epoch": 0.76, "learning_rate": 1.7494952538229034e-05, "loss": 1.6127, "step": 468 }, { "epoch": 0.76, "learning_rate": 1.7471655775596097e-05, "loss": 1.5762, "step": 470 }, { "epoch": 0.77, "learning_rate": 1.7448266850204754e-05, "loss": 1.7019, "step": 472 }, { "epoch": 0.77, "learning_rate": 1.7424786050557036e-05, "loss": 1.6097, "step": 474 }, { "epoch": 0.77, "learning_rate": 1.740121366628824e-05, "loss": 1.6101, "step": 476 }, { "epoch": 0.78, "learning_rate": 1.7377549988163373e-05, "loss": 1.6759, "step": 478 }, { "epoch": 0.78, "learning_rate": 1.7353795308073526e-05, "loss": 1.7808, "step": 480 }, { "epoch": 0.78, "learning_rate": 1.7329949919032315e-05, "loss": 1.6366, "step": 482 }, { "epoch": 0.79, "learning_rate": 1.7306014115172244e-05, "loss": 1.8784, "step": 484 }, { "epoch": 0.79, "learning_rate": 1.7281988191741085e-05, "loss": 1.7896, "step": 486 }, { "epoch": 0.79, "learning_rate": 1.7257872445098232e-05, "loss": 1.9701, "step": 488 }, { "epoch": 0.8, "learning_rate": 1.7233667172711045e-05, "loss": 1.7598, "step": 490 }, { "epoch": 0.8, "learning_rate": 1.7209372673151186e-05, "loss": 1.993, "step": 492 }, { "epoch": 0.8, "learning_rate": 1.718498924609093e-05, "loss": 1.9188, "step": 494 }, { "epoch": 0.81, "learning_rate": 1.7160517192299474e-05, "loss": 2.099, "step": 496 }, { "epoch": 0.81, "learning_rate": 1.7135956813639222e-05, "loss": 2.0652, "step": 498 }, { "epoch": 0.81, "learning_rate": 1.7111308413062063e-05, "loss": 2.0021, "step": 500 }, { "epoch": 0.82, "learning_rate": 1.7086572294605642e-05, "loss": 1.3524, "step": 502 }, { "epoch": 0.82, "learning_rate": 1.7061748763389593e-05, "loss": 1.3037, "step": 504 }, { "epoch": 0.82, "learning_rate": 1.703683812561179e-05, "loss": 1.6416, "step": 506 }, { "epoch": 0.83, "learning_rate": 1.701184068854457e-05, "loss": 1.3975, "step": 508 }, { "epoch": 0.83, "learning_rate": 1.698675676053092e-05, "loss": 1.3872, "step": 510 }, { "epoch": 0.83, "learning_rate": 1.696158665098072e-05, "loss": 1.5235, "step": 512 }, { "epoch": 0.84, "learning_rate": 1.6936330670366867e-05, "loss": 1.6902, "step": 514 }, { "epoch": 0.84, "learning_rate": 1.69109891302215e-05, "loss": 1.6784, "step": 516 }, { "epoch": 0.84, "learning_rate": 1.6885562343132124e-05, "loss": 1.7635, "step": 518 }, { "epoch": 0.85, "learning_rate": 1.6860050622737764e-05, "loss": 1.5534, "step": 520 }, { "epoch": 0.85, "learning_rate": 1.6834454283725094e-05, "loss": 1.6135, "step": 522 }, { "epoch": 0.85, "learning_rate": 1.6808773641824562e-05, "loss": 1.6691, "step": 524 }, { "epoch": 0.85, "learning_rate": 1.678300901380649e-05, "loss": 1.6357, "step": 526 }, { "epoch": 0.86, "learning_rate": 1.6757160717477157e-05, "loss": 1.6624, "step": 528 }, { "epoch": 0.86, "learning_rate": 1.6731229071674914e-05, "loss": 1.6901, "step": 530 }, { "epoch": 0.86, "learning_rate": 1.6705214396266196e-05, "loss": 1.8505, "step": 532 }, { "epoch": 0.87, "learning_rate": 1.667911701214163e-05, "loss": 1.9421, "step": 534 }, { "epoch": 0.87, "learning_rate": 1.665293724121204e-05, "loss": 1.9344, "step": 536 }, { "epoch": 0.87, "learning_rate": 1.6626675406404503e-05, "loss": 1.9095, "step": 538 }, { "epoch": 0.88, "learning_rate": 1.660033183165834e-05, "loss": 2.0502, "step": 540 }, { "epoch": 0.88, "learning_rate": 1.6573906841921138e-05, "loss": 2.2613, "step": 542 }, { "epoch": 0.88, "learning_rate": 1.654740076314474e-05, "loss": 2.0965, "step": 544 }, { "epoch": 0.89, "learning_rate": 1.652081392228121e-05, "loss": 2.1539, "step": 546 }, { "epoch": 0.89, "learning_rate": 1.649414664727883e-05, "loss": 2.1094, "step": 548 }, { "epoch": 0.89, "learning_rate": 1.646739926707801e-05, "loss": 2.0737, "step": 550 }, { "epoch": 0.9, "learning_rate": 1.6440572111607287e-05, "loss": 1.2207, "step": 552 }, { "epoch": 0.9, "learning_rate": 1.6413665511779197e-05, "loss": 1.45, "step": 554 }, { "epoch": 0.9, "learning_rate": 1.6386679799486236e-05, "loss": 1.4834, "step": 556 }, { "epoch": 0.91, "learning_rate": 1.635961530759675e-05, "loss": 1.3523, "step": 558 }, { "epoch": 0.91, "learning_rate": 1.6332472369950828e-05, "loss": 1.3546, "step": 560 }, { "epoch": 0.91, "learning_rate": 1.6305251321356183e-05, "loss": 1.5468, "step": 562 }, { "epoch": 0.92, "learning_rate": 1.6277952497584027e-05, "loss": 1.5853, "step": 564 }, { "epoch": 0.92, "learning_rate": 1.6250576235364938e-05, "loss": 1.5904, "step": 566 }, { "epoch": 0.92, "learning_rate": 1.6223122872384675e-05, "loss": 1.6946, "step": 568 }, { "epoch": 0.93, "learning_rate": 1.619559274728005e-05, "loss": 1.5193, "step": 570 }, { "epoch": 0.93, "learning_rate": 1.6167986199634732e-05, "loss": 1.6101, "step": 572 }, { "epoch": 0.93, "learning_rate": 1.6140303569975064e-05, "loss": 1.7246, "step": 574 }, { "epoch": 0.94, "learning_rate": 1.6112545199765844e-05, "loss": 1.7018, "step": 576 }, { "epoch": 0.94, "learning_rate": 1.6084711431406144e-05, "loss": 1.7813, "step": 578 }, { "epoch": 0.94, "learning_rate": 1.605680260822507e-05, "loss": 1.7158, "step": 580 }, { "epoch": 0.95, "learning_rate": 1.6028819074477517e-05, "loss": 1.6571, "step": 582 }, { "epoch": 0.95, "learning_rate": 1.6000761175339944e-05, "loss": 1.8461, "step": 584 }, { "epoch": 0.95, "learning_rate": 1.5972629256906105e-05, "loss": 1.7805, "step": 586 }, { "epoch": 0.96, "learning_rate": 1.5944423666182776e-05, "loss": 2.0146, "step": 588 }, { "epoch": 0.96, "learning_rate": 1.5916144751085485e-05, "loss": 1.9301, "step": 590 }, { "epoch": 0.96, "learning_rate": 1.5887792860434207e-05, "loss": 2.0717, "step": 592 }, { "epoch": 0.97, "learning_rate": 1.5859368343949084e-05, "loss": 2.1568, "step": 594 }, { "epoch": 0.97, "learning_rate": 1.5830871552246076e-05, "loss": 2.0925, "step": 596 }, { "epoch": 0.97, "learning_rate": 1.5802302836832673e-05, "loss": 2.119, "step": 598 }, { "epoch": 0.98, "learning_rate": 1.577366255010354e-05, "loss": 2.1738, "step": 600 }, { "epoch": 0.98, "learning_rate": 1.5744951045336166e-05, "loss": 1.5105, "step": 602 }, { "epoch": 0.98, "learning_rate": 1.5716168676686523e-05, "loss": 1.486, "step": 604 }, { "epoch": 0.98, "learning_rate": 1.568731579918468e-05, "loss": 1.6528, "step": 606 }, { "epoch": 0.99, "learning_rate": 1.5658392768730434e-05, "loss": 1.6643, "step": 608 }, { "epoch": 0.99, "learning_rate": 1.562939994208892e-05, "loss": 1.7925, "step": 610 }, { "epoch": 0.99, "learning_rate": 1.5600337676886205e-05, "loss": 1.8235, "step": 612 }, { "epoch": 1.0, "learning_rate": 1.5571206331604885e-05, "loss": 2.0644, "step": 614 } ], "logging_steps": 2, "max_steps": 1845, "num_train_epochs": 3, "save_steps": 500, "total_flos": 3.3933426003492864e+16, "trial_name": null, "trial_params": null }