{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.10003419972640219, "eval_steps": 500, "global_step": 1170, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 4.3728413581848145, "learning_rate": 4.998290013679891e-05, "loss": 10.0117, "step": 4 }, { "epoch": 0.0, "grad_norm": 2.599367380142212, "learning_rate": 4.996580027359781e-05, "loss": 9.1745, "step": 8 }, { "epoch": 0.0, "grad_norm": 2.4951720237731934, "learning_rate": 4.994870041039672e-05, "loss": 8.7624, "step": 12 }, { "epoch": 0.0, "grad_norm": 2.4932913780212402, "learning_rate": 4.9931600547195625e-05, "loss": 8.5709, "step": 16 }, { "epoch": 0.0, "grad_norm": 2.3983781337738037, "learning_rate": 4.991450068399453e-05, "loss": 8.4411, "step": 20 }, { "epoch": 0.0, "grad_norm": 2.2340972423553467, "learning_rate": 4.989740082079344e-05, "loss": 8.1957, "step": 24 }, { "epoch": 0.0, "grad_norm": 2.1964590549468994, "learning_rate": 4.988030095759234e-05, "loss": 8.0598, "step": 28 }, { "epoch": 0.0, "grad_norm": 1.9686617851257324, "learning_rate": 4.986320109439125e-05, "loss": 7.8679, "step": 32 }, { "epoch": 0.0, "grad_norm": 1.8235087394714355, "learning_rate": 4.984610123119015e-05, "loss": 7.7621, "step": 36 }, { "epoch": 0.0, "grad_norm": 1.7598885297775269, "learning_rate": 4.9829001367989056e-05, "loss": 7.5417, "step": 40 }, { "epoch": 0.0, "grad_norm": 1.94789719581604, "learning_rate": 4.981190150478797e-05, "loss": 7.3331, "step": 44 }, { "epoch": 0.0, "grad_norm": 1.9366331100463867, "learning_rate": 4.979480164158687e-05, "loss": 7.1039, "step": 48 }, { "epoch": 0.0, "grad_norm": 2.1228675842285156, "learning_rate": 4.977770177838578e-05, "loss": 6.9965, "step": 52 }, { "epoch": 0.0, "grad_norm": 1.9855167865753174, "learning_rate": 4.976060191518468e-05, "loss": 6.8865, "step": 56 }, { "epoch": 0.01, "grad_norm": 1.8035985231399536, "learning_rate": 4.9743502051983585e-05, "loss": 6.6322, "step": 60 }, { "epoch": 0.01, "grad_norm": 1.8589977025985718, "learning_rate": 4.9726402188782486e-05, "loss": 6.5616, "step": 64 }, { "epoch": 0.01, "grad_norm": 1.745139241218567, "learning_rate": 4.97093023255814e-05, "loss": 6.3479, "step": 68 }, { "epoch": 0.01, "grad_norm": 1.266196608543396, "learning_rate": 4.969220246238031e-05, "loss": 6.2343, "step": 72 }, { "epoch": 0.01, "grad_norm": 2.005223035812378, "learning_rate": 4.967510259917921e-05, "loss": 6.1866, "step": 76 }, { "epoch": 0.01, "grad_norm": 1.5017377138137817, "learning_rate": 4.9658002735978115e-05, "loss": 5.972, "step": 80 }, { "epoch": 0.01, "grad_norm": 1.6136974096298218, "learning_rate": 4.9640902872777016e-05, "loss": 6.1335, "step": 84 }, { "epoch": 0.01, "grad_norm": 1.3865970373153687, "learning_rate": 4.962380300957592e-05, "loss": 5.944, "step": 88 }, { "epoch": 0.01, "grad_norm": 1.283933162689209, "learning_rate": 4.960670314637483e-05, "loss": 5.8713, "step": 92 }, { "epoch": 0.01, "grad_norm": 1.1588549613952637, "learning_rate": 4.958960328317374e-05, "loss": 5.7731, "step": 96 }, { "epoch": 0.01, "grad_norm": 1.3081687688827515, "learning_rate": 4.9572503419972645e-05, "loss": 5.7429, "step": 100 }, { "epoch": 0.01, "grad_norm": 1.6212745904922485, "learning_rate": 4.9555403556771546e-05, "loss": 5.481, "step": 104 }, { "epoch": 0.01, "grad_norm": 1.709843397140503, "learning_rate": 4.953830369357045e-05, "loss": 5.4855, "step": 108 }, { "epoch": 0.01, "grad_norm": 1.2814812660217285, "learning_rate": 4.952120383036936e-05, "loss": 5.4367, "step": 112 }, { "epoch": 0.01, "grad_norm": 1.5539968013763428, "learning_rate": 4.950410396716826e-05, "loss": 5.2652, "step": 116 }, { "epoch": 0.01, "grad_norm": 1.500375509262085, "learning_rate": 4.948700410396717e-05, "loss": 5.5137, "step": 120 }, { "epoch": 0.01, "grad_norm": 1.7938899993896484, "learning_rate": 4.9469904240766076e-05, "loss": 5.4663, "step": 124 }, { "epoch": 0.01, "grad_norm": 1.6169573068618774, "learning_rate": 4.945280437756498e-05, "loss": 5.3137, "step": 128 }, { "epoch": 0.01, "grad_norm": 1.1904114484786987, "learning_rate": 4.943570451436389e-05, "loss": 5.1871, "step": 132 }, { "epoch": 0.01, "grad_norm": 2.5564723014831543, "learning_rate": 4.941860465116279e-05, "loss": 5.4058, "step": 136 }, { "epoch": 0.01, "grad_norm": 1.6187268495559692, "learning_rate": 4.94015047879617e-05, "loss": 5.1919, "step": 140 }, { "epoch": 0.01, "grad_norm": 1.2222367525100708, "learning_rate": 4.93844049247606e-05, "loss": 5.1876, "step": 144 }, { "epoch": 0.01, "grad_norm": 1.5898586511611938, "learning_rate": 4.936730506155951e-05, "loss": 5.1217, "step": 148 }, { "epoch": 0.01, "grad_norm": 1.446902871131897, "learning_rate": 4.935020519835842e-05, "loss": 5.0582, "step": 152 }, { "epoch": 0.01, "grad_norm": 1.862309217453003, "learning_rate": 4.933310533515732e-05, "loss": 4.9028, "step": 156 }, { "epoch": 0.01, "grad_norm": 1.5455704927444458, "learning_rate": 4.931600547195623e-05, "loss": 5.0352, "step": 160 }, { "epoch": 0.01, "grad_norm": 1.772558569908142, "learning_rate": 4.929890560875513e-05, "loss": 4.9707, "step": 164 }, { "epoch": 0.01, "grad_norm": 1.8154480457305908, "learning_rate": 4.9281805745554036e-05, "loss": 5.0588, "step": 168 }, { "epoch": 0.01, "grad_norm": 1.4536504745483398, "learning_rate": 4.9264705882352944e-05, "loss": 4.9997, "step": 172 }, { "epoch": 0.02, "grad_norm": 1.6166527271270752, "learning_rate": 4.924760601915185e-05, "loss": 4.9472, "step": 176 }, { "epoch": 0.02, "grad_norm": 2.1260979175567627, "learning_rate": 4.923050615595076e-05, "loss": 4.944, "step": 180 }, { "epoch": 0.02, "grad_norm": 1.4778213500976562, "learning_rate": 4.921340629274966e-05, "loss": 4.8657, "step": 184 }, { "epoch": 0.02, "grad_norm": 1.255488395690918, "learning_rate": 4.9196306429548566e-05, "loss": 4.7728, "step": 188 }, { "epoch": 0.02, "grad_norm": 1.5698314905166626, "learning_rate": 4.917920656634747e-05, "loss": 4.9433, "step": 192 }, { "epoch": 0.02, "grad_norm": 1.342402696609497, "learning_rate": 4.9162106703146374e-05, "loss": 4.6336, "step": 196 }, { "epoch": 0.02, "grad_norm": 1.4427978992462158, "learning_rate": 4.914500683994528e-05, "loss": 4.7112, "step": 200 }, { "epoch": 0.02, "grad_norm": 1.6528425216674805, "learning_rate": 4.912790697674419e-05, "loss": 4.876, "step": 204 }, { "epoch": 0.02, "grad_norm": 1.706386923789978, "learning_rate": 4.9110807113543096e-05, "loss": 4.5971, "step": 208 }, { "epoch": 0.02, "grad_norm": 1.7773737907409668, "learning_rate": 4.9093707250342e-05, "loss": 4.7944, "step": 212 }, { "epoch": 0.02, "grad_norm": 1.7884222269058228, "learning_rate": 4.9076607387140904e-05, "loss": 4.5912, "step": 216 }, { "epoch": 0.02, "grad_norm": 1.5389587879180908, "learning_rate": 4.905950752393981e-05, "loss": 4.6656, "step": 220 }, { "epoch": 0.02, "grad_norm": 1.7006316184997559, "learning_rate": 4.904240766073871e-05, "loss": 4.6589, "step": 224 }, { "epoch": 0.02, "grad_norm": 1.6314700841903687, "learning_rate": 4.9025307797537626e-05, "loss": 4.5036, "step": 228 }, { "epoch": 0.02, "grad_norm": 1.7512930631637573, "learning_rate": 4.900820793433653e-05, "loss": 4.5105, "step": 232 }, { "epoch": 0.02, "grad_norm": 1.9109033346176147, "learning_rate": 4.8991108071135434e-05, "loss": 4.4449, "step": 236 }, { "epoch": 0.02, "grad_norm": 1.547317385673523, "learning_rate": 4.897400820793434e-05, "loss": 4.4906, "step": 240 }, { "epoch": 0.02, "grad_norm": 1.5143060684204102, "learning_rate": 4.895690834473324e-05, "loss": 4.3545, "step": 244 }, { "epoch": 0.02, "grad_norm": 1.6648136377334595, "learning_rate": 4.893980848153215e-05, "loss": 4.4093, "step": 248 }, { "epoch": 0.02, "grad_norm": 1.5027951002120972, "learning_rate": 4.892270861833106e-05, "loss": 4.3444, "step": 252 }, { "epoch": 0.02, "grad_norm": 1.9429397583007812, "learning_rate": 4.8905608755129964e-05, "loss": 4.4937, "step": 256 }, { "epoch": 0.02, "grad_norm": 1.4009640216827393, "learning_rate": 4.888850889192887e-05, "loss": 4.3625, "step": 260 }, { "epoch": 0.02, "grad_norm": 1.7445279359817505, "learning_rate": 4.887140902872777e-05, "loss": 4.2759, "step": 264 }, { "epoch": 0.02, "grad_norm": 1.4381403923034668, "learning_rate": 4.885430916552668e-05, "loss": 4.574, "step": 268 }, { "epoch": 0.02, "grad_norm": 1.5582979917526245, "learning_rate": 4.883720930232558e-05, "loss": 4.3831, "step": 272 }, { "epoch": 0.02, "grad_norm": 1.461166501045227, "learning_rate": 4.882010943912449e-05, "loss": 4.5049, "step": 276 }, { "epoch": 0.02, "grad_norm": 1.756549596786499, "learning_rate": 4.8803009575923394e-05, "loss": 4.1334, "step": 280 }, { "epoch": 0.02, "grad_norm": 1.9006497859954834, "learning_rate": 4.87859097127223e-05, "loss": 4.3406, "step": 284 }, { "epoch": 0.02, "grad_norm": 1.4551759958267212, "learning_rate": 4.876880984952121e-05, "loss": 4.4688, "step": 288 }, { "epoch": 0.02, "grad_norm": 1.5335747003555298, "learning_rate": 4.875170998632011e-05, "loss": 4.2096, "step": 292 }, { "epoch": 0.03, "grad_norm": 1.4802557229995728, "learning_rate": 4.873461012311902e-05, "loss": 4.0918, "step": 296 }, { "epoch": 0.03, "grad_norm": 1.9993939399719238, "learning_rate": 4.8717510259917924e-05, "loss": 4.3636, "step": 300 }, { "epoch": 0.03, "grad_norm": 1.706895351409912, "learning_rate": 4.8700410396716825e-05, "loss": 4.0891, "step": 304 }, { "epoch": 0.03, "grad_norm": 1.4630484580993652, "learning_rate": 4.868331053351574e-05, "loss": 4.3254, "step": 308 }, { "epoch": 0.03, "grad_norm": 1.4353913068771362, "learning_rate": 4.866621067031464e-05, "loss": 4.1935, "step": 312 }, { "epoch": 0.03, "grad_norm": 1.5042275190353394, "learning_rate": 4.864911080711355e-05, "loss": 4.0913, "step": 316 }, { "epoch": 0.03, "grad_norm": 1.792472004890442, "learning_rate": 4.863201094391245e-05, "loss": 4.2159, "step": 320 }, { "epoch": 0.03, "grad_norm": 1.4059948921203613, "learning_rate": 4.8614911080711355e-05, "loss": 4.2934, "step": 324 }, { "epoch": 0.03, "grad_norm": 1.8408161401748657, "learning_rate": 4.859781121751026e-05, "loss": 4.1482, "step": 328 }, { "epoch": 0.03, "grad_norm": 1.716046690940857, "learning_rate": 4.858071135430917e-05, "loss": 4.0273, "step": 332 }, { "epoch": 0.03, "grad_norm": 1.8415429592132568, "learning_rate": 4.856361149110808e-05, "loss": 4.0837, "step": 336 }, { "epoch": 0.03, "grad_norm": 1.4880731105804443, "learning_rate": 4.854651162790698e-05, "loss": 4.1957, "step": 340 }, { "epoch": 0.03, "grad_norm": 1.8150016069412231, "learning_rate": 4.8529411764705885e-05, "loss": 4.225, "step": 344 }, { "epoch": 0.03, "grad_norm": 1.7828190326690674, "learning_rate": 4.851231190150479e-05, "loss": 4.1053, "step": 348 }, { "epoch": 0.03, "grad_norm": 1.8549565076828003, "learning_rate": 4.849521203830369e-05, "loss": 3.8068, "step": 352 }, { "epoch": 0.03, "grad_norm": 1.4295508861541748, "learning_rate": 4.84781121751026e-05, "loss": 3.9358, "step": 356 }, { "epoch": 0.03, "grad_norm": 1.4277971982955933, "learning_rate": 4.846101231190151e-05, "loss": 4.1512, "step": 360 }, { "epoch": 0.03, "grad_norm": 1.4937466382980347, "learning_rate": 4.8443912448700415e-05, "loss": 4.0162, "step": 364 }, { "epoch": 0.03, "grad_norm": 1.3076913356781006, "learning_rate": 4.842681258549932e-05, "loss": 4.0336, "step": 368 }, { "epoch": 0.03, "grad_norm": 1.673946738243103, "learning_rate": 4.840971272229822e-05, "loss": 4.0915, "step": 372 }, { "epoch": 0.03, "grad_norm": 1.5978224277496338, "learning_rate": 4.839261285909713e-05, "loss": 4.137, "step": 376 }, { "epoch": 0.03, "grad_norm": 2.214573383331299, "learning_rate": 4.837551299589603e-05, "loss": 3.8231, "step": 380 }, { "epoch": 0.03, "grad_norm": 1.4263286590576172, "learning_rate": 4.835841313269494e-05, "loss": 3.9326, "step": 384 }, { "epoch": 0.03, "grad_norm": 1.694931149482727, "learning_rate": 4.8341313269493845e-05, "loss": 4.0673, "step": 388 }, { "epoch": 0.03, "grad_norm": 1.558498740196228, "learning_rate": 4.832421340629275e-05, "loss": 3.9454, "step": 392 }, { "epoch": 0.03, "grad_norm": 1.5231866836547852, "learning_rate": 4.830711354309166e-05, "loss": 4.0214, "step": 396 }, { "epoch": 0.03, "grad_norm": 1.6748324632644653, "learning_rate": 4.829001367989056e-05, "loss": 3.9983, "step": 400 }, { "epoch": 0.03, "grad_norm": 1.3733506202697754, "learning_rate": 4.827291381668947e-05, "loss": 3.8526, "step": 404 }, { "epoch": 0.03, "grad_norm": 1.6695655584335327, "learning_rate": 4.8255813953488375e-05, "loss": 3.8948, "step": 408 }, { "epoch": 0.04, "grad_norm": 1.3915668725967407, "learning_rate": 4.8238714090287276e-05, "loss": 3.8511, "step": 412 }, { "epoch": 0.04, "grad_norm": 1.5736514329910278, "learning_rate": 4.822161422708619e-05, "loss": 3.9446, "step": 416 }, { "epoch": 0.04, "grad_norm": 1.562687635421753, "learning_rate": 4.820451436388509e-05, "loss": 3.6682, "step": 420 }, { "epoch": 0.04, "grad_norm": 1.4215232133865356, "learning_rate": 4.8187414500684e-05, "loss": 3.6618, "step": 424 }, { "epoch": 0.04, "grad_norm": 1.8750267028808594, "learning_rate": 4.8170314637482905e-05, "loss": 3.8246, "step": 428 }, { "epoch": 0.04, "grad_norm": 1.5843207836151123, "learning_rate": 4.8153214774281806e-05, "loss": 3.807, "step": 432 }, { "epoch": 0.04, "grad_norm": 1.6384755373001099, "learning_rate": 4.813611491108071e-05, "loss": 3.8329, "step": 436 }, { "epoch": 0.04, "grad_norm": 1.646612286567688, "learning_rate": 4.811901504787962e-05, "loss": 3.9736, "step": 440 }, { "epoch": 0.04, "grad_norm": 1.592463493347168, "learning_rate": 4.810191518467853e-05, "loss": 3.6107, "step": 444 }, { "epoch": 0.04, "grad_norm": 1.6230803728103638, "learning_rate": 4.808481532147743e-05, "loss": 3.8141, "step": 448 }, { "epoch": 0.04, "grad_norm": 1.7098625898361206, "learning_rate": 4.8067715458276336e-05, "loss": 3.8751, "step": 452 }, { "epoch": 0.04, "grad_norm": 1.441146731376648, "learning_rate": 4.805061559507524e-05, "loss": 3.8461, "step": 456 }, { "epoch": 0.04, "grad_norm": 1.4387036561965942, "learning_rate": 4.8033515731874144e-05, "loss": 3.8432, "step": 460 }, { "epoch": 0.04, "grad_norm": 1.6620376110076904, "learning_rate": 4.801641586867305e-05, "loss": 3.7844, "step": 464 }, { "epoch": 0.04, "grad_norm": 1.5403114557266235, "learning_rate": 4.799931600547196e-05, "loss": 3.6556, "step": 468 }, { "epoch": 0.04, "grad_norm": 1.5642478466033936, "learning_rate": 4.7982216142270866e-05, "loss": 3.7526, "step": 472 }, { "epoch": 0.04, "grad_norm": 1.5027506351470947, "learning_rate": 4.796511627906977e-05, "loss": 3.7862, "step": 476 }, { "epoch": 0.04, "grad_norm": 1.5570485591888428, "learning_rate": 4.7948016415868674e-05, "loss": 3.7656, "step": 480 }, { "epoch": 0.04, "grad_norm": 2.2708029747009277, "learning_rate": 4.793091655266758e-05, "loss": 3.8742, "step": 484 }, { "epoch": 0.04, "grad_norm": 1.5446516275405884, "learning_rate": 4.791381668946648e-05, "loss": 3.6401, "step": 488 }, { "epoch": 0.04, "grad_norm": 1.5480107069015503, "learning_rate": 4.789671682626539e-05, "loss": 3.7441, "step": 492 }, { "epoch": 0.04, "grad_norm": 1.5571659803390503, "learning_rate": 4.78796169630643e-05, "loss": 3.6849, "step": 496 }, { "epoch": 0.04, "grad_norm": 1.9021155834197998, "learning_rate": 4.7862517099863204e-05, "loss": 3.7989, "step": 500 }, { "epoch": 0.04, "grad_norm": 1.5292832851409912, "learning_rate": 4.784541723666211e-05, "loss": 3.8129, "step": 504 }, { "epoch": 0.04, "grad_norm": 1.5750986337661743, "learning_rate": 4.782831737346101e-05, "loss": 3.648, "step": 508 }, { "epoch": 0.04, "grad_norm": 1.6995611190795898, "learning_rate": 4.781121751025992e-05, "loss": 3.7686, "step": 512 }, { "epoch": 0.04, "grad_norm": 1.7573764324188232, "learning_rate": 4.7794117647058826e-05, "loss": 3.6636, "step": 516 }, { "epoch": 0.04, "grad_norm": 1.5131741762161255, "learning_rate": 4.7777017783857733e-05, "loss": 3.5793, "step": 520 }, { "epoch": 0.04, "grad_norm": 1.6838548183441162, "learning_rate": 4.775991792065664e-05, "loss": 3.6336, "step": 524 }, { "epoch": 0.05, "grad_norm": 1.9402954578399658, "learning_rate": 4.774281805745554e-05, "loss": 3.6733, "step": 528 }, { "epoch": 0.05, "grad_norm": 1.7752444744110107, "learning_rate": 4.772571819425445e-05, "loss": 3.8021, "step": 532 }, { "epoch": 0.05, "grad_norm": 1.4470579624176025, "learning_rate": 4.7708618331053356e-05, "loss": 3.6526, "step": 536 }, { "epoch": 0.05, "grad_norm": 1.5560115575790405, "learning_rate": 4.769151846785226e-05, "loss": 3.7236, "step": 540 }, { "epoch": 0.05, "grad_norm": 1.6849740743637085, "learning_rate": 4.7674418604651164e-05, "loss": 3.7586, "step": 544 }, { "epoch": 0.05, "grad_norm": 1.4917711019515991, "learning_rate": 4.765731874145007e-05, "loss": 3.7791, "step": 548 }, { "epoch": 0.05, "grad_norm": 1.791801929473877, "learning_rate": 4.764021887824898e-05, "loss": 3.5659, "step": 552 }, { "epoch": 0.05, "grad_norm": 1.531817078590393, "learning_rate": 4.7623119015047886e-05, "loss": 3.5811, "step": 556 }, { "epoch": 0.05, "grad_norm": 1.8725793361663818, "learning_rate": 4.7606019151846787e-05, "loss": 3.7337, "step": 560 }, { "epoch": 0.05, "grad_norm": 1.5851105451583862, "learning_rate": 4.7588919288645694e-05, "loss": 3.4899, "step": 564 }, { "epoch": 0.05, "grad_norm": 1.4937329292297363, "learning_rate": 4.7571819425444594e-05, "loss": 3.5653, "step": 568 }, { "epoch": 0.05, "grad_norm": 1.8102850914001465, "learning_rate": 4.75547195622435e-05, "loss": 3.517, "step": 572 }, { "epoch": 0.05, "grad_norm": 1.4572982788085938, "learning_rate": 4.753761969904241e-05, "loss": 3.723, "step": 576 }, { "epoch": 0.05, "grad_norm": 1.5815645456314087, "learning_rate": 4.7520519835841317e-05, "loss": 3.6615, "step": 580 }, { "epoch": 0.05, "grad_norm": 1.6406457424163818, "learning_rate": 4.7503419972640224e-05, "loss": 3.6796, "step": 584 }, { "epoch": 0.05, "grad_norm": 1.8512486219406128, "learning_rate": 4.7486320109439124e-05, "loss": 3.5586, "step": 588 }, { "epoch": 0.05, "grad_norm": 1.6738507747650146, "learning_rate": 4.746922024623803e-05, "loss": 3.4724, "step": 592 }, { "epoch": 0.05, "grad_norm": 1.766518235206604, "learning_rate": 4.745212038303693e-05, "loss": 3.7213, "step": 596 }, { "epoch": 0.05, "grad_norm": 1.681229591369629, "learning_rate": 4.7435020519835846e-05, "loss": 3.6538, "step": 600 }, { "epoch": 0.05, "grad_norm": 1.7900785207748413, "learning_rate": 4.7417920656634754e-05, "loss": 3.7952, "step": 604 }, { "epoch": 0.05, "grad_norm": 1.5610637664794922, "learning_rate": 4.7400820793433654e-05, "loss": 3.5255, "step": 608 }, { "epoch": 0.05, "grad_norm": 1.3365826606750488, "learning_rate": 4.738372093023256e-05, "loss": 3.5621, "step": 612 }, { "epoch": 0.05, "grad_norm": 1.8946609497070312, "learning_rate": 4.736662106703146e-05, "loss": 3.5664, "step": 616 }, { "epoch": 0.05, "grad_norm": 1.728330135345459, "learning_rate": 4.734952120383037e-05, "loss": 3.7577, "step": 620 }, { "epoch": 0.05, "grad_norm": 1.7583847045898438, "learning_rate": 4.733242134062928e-05, "loss": 3.5722, "step": 624 }, { "epoch": 0.05, "grad_norm": 1.9061710834503174, "learning_rate": 4.7315321477428184e-05, "loss": 3.3436, "step": 628 }, { "epoch": 0.05, "grad_norm": 1.7074024677276611, "learning_rate": 4.729822161422709e-05, "loss": 3.5426, "step": 632 }, { "epoch": 0.05, "grad_norm": 1.649793028831482, "learning_rate": 4.728112175102599e-05, "loss": 3.5826, "step": 636 }, { "epoch": 0.05, "grad_norm": 1.7260462045669556, "learning_rate": 4.72640218878249e-05, "loss": 3.5353, "step": 640 }, { "epoch": 0.06, "grad_norm": 1.745391607284546, "learning_rate": 4.724692202462381e-05, "loss": 3.3534, "step": 644 }, { "epoch": 0.06, "grad_norm": 1.374226450920105, "learning_rate": 4.722982216142271e-05, "loss": 3.386, "step": 648 }, { "epoch": 0.06, "grad_norm": 1.8732798099517822, "learning_rate": 4.7212722298221615e-05, "loss": 3.6073, "step": 652 }, { "epoch": 0.06, "grad_norm": 2.4075376987457275, "learning_rate": 4.719562243502052e-05, "loss": 3.5393, "step": 656 }, { "epoch": 0.06, "grad_norm": 1.54508638381958, "learning_rate": 4.717852257181943e-05, "loss": 3.475, "step": 660 }, { "epoch": 0.06, "grad_norm": 1.5737498998641968, "learning_rate": 4.716142270861834e-05, "loss": 3.5497, "step": 664 }, { "epoch": 0.06, "grad_norm": 1.8623074293136597, "learning_rate": 4.714432284541724e-05, "loss": 3.4603, "step": 668 }, { "epoch": 0.06, "grad_norm": 1.7199251651763916, "learning_rate": 4.7127222982216145e-05, "loss": 3.5801, "step": 672 }, { "epoch": 0.06, "grad_norm": 1.5866843461990356, "learning_rate": 4.7110123119015045e-05, "loss": 3.3454, "step": 676 }, { "epoch": 0.06, "grad_norm": 1.9907779693603516, "learning_rate": 4.709302325581396e-05, "loss": 3.6247, "step": 680 }, { "epoch": 0.06, "grad_norm": 1.3720687627792358, "learning_rate": 4.707592339261287e-05, "loss": 3.3614, "step": 684 }, { "epoch": 0.06, "grad_norm": 1.739660620689392, "learning_rate": 4.705882352941177e-05, "loss": 3.4211, "step": 688 }, { "epoch": 0.06, "grad_norm": 1.6425236463546753, "learning_rate": 4.7041723666210675e-05, "loss": 3.423, "step": 692 }, { "epoch": 0.06, "grad_norm": 1.5457091331481934, "learning_rate": 4.7024623803009575e-05, "loss": 3.3258, "step": 696 }, { "epoch": 0.06, "grad_norm": 1.5979949235916138, "learning_rate": 4.700752393980848e-05, "loss": 3.2838, "step": 700 }, { "epoch": 0.06, "grad_norm": 1.6761040687561035, "learning_rate": 4.699042407660739e-05, "loss": 3.334, "step": 704 }, { "epoch": 0.06, "grad_norm": 1.552573323249817, "learning_rate": 4.69733242134063e-05, "loss": 3.5355, "step": 708 }, { "epoch": 0.06, "grad_norm": 1.6743354797363281, "learning_rate": 4.6956224350205205e-05, "loss": 3.4076, "step": 712 }, { "epoch": 0.06, "grad_norm": 4.555662155151367, "learning_rate": 4.6939124487004105e-05, "loss": 3.7446, "step": 716 }, { "epoch": 0.06, "grad_norm": 1.5942860841751099, "learning_rate": 4.692202462380301e-05, "loss": 3.553, "step": 720 }, { "epoch": 0.06, "grad_norm": 1.9089189767837524, "learning_rate": 4.690492476060191e-05, "loss": 3.3584, "step": 724 }, { "epoch": 0.06, "grad_norm": 2.0768373012542725, "learning_rate": 4.688782489740082e-05, "loss": 3.4295, "step": 728 }, { "epoch": 0.06, "grad_norm": 3.3060362339019775, "learning_rate": 4.687072503419973e-05, "loss": 3.415, "step": 732 }, { "epoch": 0.06, "grad_norm": 1.9307421445846558, "learning_rate": 4.6853625170998635e-05, "loss": 3.4901, "step": 736 }, { "epoch": 0.06, "grad_norm": 1.7142446041107178, "learning_rate": 4.683652530779754e-05, "loss": 3.6096, "step": 740 }, { "epoch": 0.06, "grad_norm": 1.7756657600402832, "learning_rate": 4.681942544459644e-05, "loss": 3.4534, "step": 744 }, { "epoch": 0.06, "grad_norm": 1.5459442138671875, "learning_rate": 4.680232558139535e-05, "loss": 3.3813, "step": 748 }, { "epoch": 0.06, "grad_norm": 2.1961963176727295, "learning_rate": 4.678522571819426e-05, "loss": 3.3467, "step": 752 }, { "epoch": 0.06, "grad_norm": 1.7320613861083984, "learning_rate": 4.676812585499316e-05, "loss": 3.3927, "step": 756 }, { "epoch": 0.06, "grad_norm": 1.6255303621292114, "learning_rate": 4.6751025991792066e-05, "loss": 3.4644, "step": 760 }, { "epoch": 0.07, "grad_norm": 1.5142388343811035, "learning_rate": 4.673392612859097e-05, "loss": 3.4657, "step": 764 }, { "epoch": 0.07, "grad_norm": 1.5093744993209839, "learning_rate": 4.671682626538988e-05, "loss": 3.3144, "step": 768 }, { "epoch": 0.07, "grad_norm": 1.4730446338653564, "learning_rate": 4.669972640218879e-05, "loss": 3.3382, "step": 772 }, { "epoch": 0.07, "grad_norm": 1.7650116682052612, "learning_rate": 4.668262653898769e-05, "loss": 3.4519, "step": 776 }, { "epoch": 0.07, "grad_norm": 1.637071132659912, "learning_rate": 4.6665526675786596e-05, "loss": 3.3723, "step": 780 }, { "epoch": 0.07, "grad_norm": 1.4582788944244385, "learning_rate": 4.6648426812585496e-05, "loss": 3.4839, "step": 784 }, { "epoch": 0.07, "grad_norm": 1.4654922485351562, "learning_rate": 4.663132694938441e-05, "loss": 3.1257, "step": 788 }, { "epoch": 0.07, "grad_norm": 1.4668978452682495, "learning_rate": 4.661422708618332e-05, "loss": 3.2506, "step": 792 }, { "epoch": 0.07, "grad_norm": 1.5190536975860596, "learning_rate": 4.659712722298222e-05, "loss": 3.4055, "step": 796 }, { "epoch": 0.07, "grad_norm": 1.9057687520980835, "learning_rate": 4.6580027359781126e-05, "loss": 3.4804, "step": 800 }, { "epoch": 0.07, "grad_norm": 1.4573813676834106, "learning_rate": 4.6562927496580026e-05, "loss": 3.3792, "step": 804 }, { "epoch": 0.07, "grad_norm": 2.3757171630859375, "learning_rate": 4.6545827633378933e-05, "loss": 3.4649, "step": 808 }, { "epoch": 0.07, "grad_norm": 1.7024015188217163, "learning_rate": 4.652872777017784e-05, "loss": 3.3149, "step": 812 }, { "epoch": 0.07, "grad_norm": 1.5424153804779053, "learning_rate": 4.651162790697675e-05, "loss": 3.3, "step": 816 }, { "epoch": 0.07, "grad_norm": 1.4667370319366455, "learning_rate": 4.6494528043775655e-05, "loss": 3.2845, "step": 820 }, { "epoch": 0.07, "grad_norm": 1.5216193199157715, "learning_rate": 4.6477428180574556e-05, "loss": 3.3062, "step": 824 }, { "epoch": 0.07, "grad_norm": 1.5499827861785889, "learning_rate": 4.6460328317373463e-05, "loss": 3.263, "step": 828 }, { "epoch": 0.07, "grad_norm": 1.644148588180542, "learning_rate": 4.6443228454172364e-05, "loss": 3.4374, "step": 832 }, { "epoch": 0.07, "grad_norm": 1.6420782804489136, "learning_rate": 4.642612859097127e-05, "loss": 3.4049, "step": 836 }, { "epoch": 0.07, "grad_norm": 1.4325385093688965, "learning_rate": 4.640902872777018e-05, "loss": 3.3499, "step": 840 }, { "epoch": 0.07, "grad_norm": 1.6447879076004028, "learning_rate": 4.6391928864569086e-05, "loss": 3.4098, "step": 844 }, { "epoch": 0.07, "grad_norm": 1.4675137996673584, "learning_rate": 4.637482900136799e-05, "loss": 3.2778, "step": 848 }, { "epoch": 0.07, "grad_norm": 1.5847750902175903, "learning_rate": 4.6357729138166894e-05, "loss": 3.1695, "step": 852 }, { "epoch": 0.07, "grad_norm": 1.514772653579712, "learning_rate": 4.63406292749658e-05, "loss": 3.1556, "step": 856 }, { "epoch": 0.07, "grad_norm": 1.511367917060852, "learning_rate": 4.632352941176471e-05, "loss": 3.3307, "step": 860 }, { "epoch": 0.07, "grad_norm": 1.7640196084976196, "learning_rate": 4.630642954856361e-05, "loss": 3.2985, "step": 864 }, { "epoch": 0.07, "grad_norm": 1.7074265480041504, "learning_rate": 4.628932968536252e-05, "loss": 3.2598, "step": 868 }, { "epoch": 0.07, "grad_norm": 1.52577805519104, "learning_rate": 4.6272229822161424e-05, "loss": 3.0872, "step": 872 }, { "epoch": 0.07, "grad_norm": 1.6908501386642456, "learning_rate": 4.625512995896033e-05, "loss": 3.1931, "step": 876 }, { "epoch": 0.08, "grad_norm": 1.6583012342453003, "learning_rate": 4.623803009575924e-05, "loss": 3.1849, "step": 880 }, { "epoch": 0.08, "grad_norm": 1.4067027568817139, "learning_rate": 4.622093023255814e-05, "loss": 3.1912, "step": 884 }, { "epoch": 0.08, "grad_norm": 1.388487458229065, "learning_rate": 4.6203830369357046e-05, "loss": 3.2845, "step": 888 }, { "epoch": 0.08, "grad_norm": 1.7910948991775513, "learning_rate": 4.6186730506155954e-05, "loss": 3.1146, "step": 892 }, { "epoch": 0.08, "grad_norm": 1.4454721212387085, "learning_rate": 4.616963064295486e-05, "loss": 3.2114, "step": 896 }, { "epoch": 0.08, "grad_norm": 1.4809476137161255, "learning_rate": 4.615253077975377e-05, "loss": 3.3329, "step": 900 }, { "epoch": 0.08, "grad_norm": 1.6668082475662231, "learning_rate": 4.613543091655267e-05, "loss": 3.2517, "step": 904 }, { "epoch": 0.08, "grad_norm": 1.72063410282135, "learning_rate": 4.6118331053351576e-05, "loss": 3.0895, "step": 908 }, { "epoch": 0.08, "grad_norm": 1.5995526313781738, "learning_rate": 4.610123119015048e-05, "loss": 3.2703, "step": 912 }, { "epoch": 0.08, "grad_norm": 1.453747034072876, "learning_rate": 4.6084131326949384e-05, "loss": 3.3619, "step": 916 }, { "epoch": 0.08, "grad_norm": 1.7982878684997559, "learning_rate": 4.606703146374829e-05, "loss": 3.2657, "step": 920 }, { "epoch": 0.08, "grad_norm": 1.3937926292419434, "learning_rate": 4.60499316005472e-05, "loss": 3.1727, "step": 924 }, { "epoch": 0.08, "grad_norm": 1.5705368518829346, "learning_rate": 4.6032831737346106e-05, "loss": 3.0657, "step": 928 }, { "epoch": 0.08, "grad_norm": 1.6821500062942505, "learning_rate": 4.601573187414501e-05, "loss": 3.3191, "step": 932 }, { "epoch": 0.08, "grad_norm": 1.5288244485855103, "learning_rate": 4.5998632010943914e-05, "loss": 3.0876, "step": 936 }, { "epoch": 0.08, "grad_norm": 1.7748781442642212, "learning_rate": 4.598153214774282e-05, "loss": 3.18, "step": 940 }, { "epoch": 0.08, "grad_norm": 1.7024669647216797, "learning_rate": 4.596443228454172e-05, "loss": 3.3601, "step": 944 }, { "epoch": 0.08, "grad_norm": 1.7017521858215332, "learning_rate": 4.5947332421340636e-05, "loss": 2.9751, "step": 948 }, { "epoch": 0.08, "grad_norm": 1.7405025959014893, "learning_rate": 4.593023255813954e-05, "loss": 3.2073, "step": 952 }, { "epoch": 0.08, "grad_norm": 1.6610547304153442, "learning_rate": 4.5913132694938444e-05, "loss": 3.2363, "step": 956 }, { "epoch": 0.08, "grad_norm": 1.4785616397857666, "learning_rate": 4.5896032831737345e-05, "loss": 3.3254, "step": 960 }, { "epoch": 0.08, "grad_norm": 3.453533411026001, "learning_rate": 4.587893296853625e-05, "loss": 3.21, "step": 964 }, { "epoch": 0.08, "grad_norm": 1.4303959608078003, "learning_rate": 4.586183310533516e-05, "loss": 3.3226, "step": 968 }, { "epoch": 0.08, "grad_norm": 1.6272943019866943, "learning_rate": 4.584473324213407e-05, "loss": 3.5944, "step": 972 }, { "epoch": 0.08, "grad_norm": 1.823716402053833, "learning_rate": 4.5827633378932974e-05, "loss": 3.234, "step": 976 }, { "epoch": 0.08, "grad_norm": 1.7006231546401978, "learning_rate": 4.5810533515731875e-05, "loss": 3.0123, "step": 980 }, { "epoch": 0.08, "grad_norm": 1.3645117282867432, "learning_rate": 4.579343365253078e-05, "loss": 3.2243, "step": 984 }, { "epoch": 0.08, "grad_norm": 1.7724872827529907, "learning_rate": 4.577633378932969e-05, "loss": 3.1705, "step": 988 }, { "epoch": 0.08, "grad_norm": 1.6377599239349365, "learning_rate": 4.575923392612859e-05, "loss": 3.2647, "step": 992 }, { "epoch": 0.09, "grad_norm": 1.4828346967697144, "learning_rate": 4.57421340629275e-05, "loss": 3.0613, "step": 996 }, { "epoch": 0.09, "grad_norm": 1.515553593635559, "learning_rate": 4.5725034199726405e-05, "loss": 3.0368, "step": 1000 }, { "epoch": 0.09, "grad_norm": 1.498844861984253, "learning_rate": 4.570793433652531e-05, "loss": 3.102, "step": 1004 }, { "epoch": 0.09, "grad_norm": 1.4339860677719116, "learning_rate": 4.569083447332422e-05, "loss": 3.1874, "step": 1008 }, { "epoch": 0.09, "grad_norm": 1.5611178874969482, "learning_rate": 4.567373461012312e-05, "loss": 3.075, "step": 1012 }, { "epoch": 0.09, "grad_norm": 1.6089402437210083, "learning_rate": 4.565663474692203e-05, "loss": 3.1544, "step": 1016 }, { "epoch": 0.09, "grad_norm": 1.8874709606170654, "learning_rate": 4.563953488372093e-05, "loss": 3.2806, "step": 1020 }, { "epoch": 0.09, "grad_norm": 1.5624805688858032, "learning_rate": 4.5622435020519835e-05, "loss": 3.1442, "step": 1024 }, { "epoch": 0.09, "grad_norm": 1.968634009361267, "learning_rate": 4.560533515731875e-05, "loss": 3.2743, "step": 1028 }, { "epoch": 0.09, "grad_norm": 1.4064279794692993, "learning_rate": 4.558823529411765e-05, "loss": 3.0726, "step": 1032 }, { "epoch": 0.09, "grad_norm": 1.577048897743225, "learning_rate": 4.557113543091656e-05, "loss": 3.1129, "step": 1036 }, { "epoch": 0.09, "grad_norm": 1.548323392868042, "learning_rate": 4.555403556771546e-05, "loss": 3.144, "step": 1040 }, { "epoch": 0.09, "grad_norm": 1.5542961359024048, "learning_rate": 4.5536935704514365e-05, "loss": 3.182, "step": 1044 }, { "epoch": 0.09, "grad_norm": 1.5506356954574585, "learning_rate": 4.551983584131327e-05, "loss": 3.2186, "step": 1048 }, { "epoch": 0.09, "grad_norm": 1.6962274312973022, "learning_rate": 4.550273597811218e-05, "loss": 3.0385, "step": 1052 }, { "epoch": 0.09, "grad_norm": 1.4382833242416382, "learning_rate": 4.548563611491109e-05, "loss": 3.1268, "step": 1056 }, { "epoch": 0.09, "grad_norm": 2.269392728805542, "learning_rate": 4.546853625170999e-05, "loss": 3.2579, "step": 1060 }, { "epoch": 0.09, "grad_norm": 1.5414201021194458, "learning_rate": 4.5451436388508895e-05, "loss": 3.1501, "step": 1064 }, { "epoch": 0.09, "grad_norm": 1.936946153640747, "learning_rate": 4.54343365253078e-05, "loss": 3.1711, "step": 1068 }, { "epoch": 0.09, "grad_norm": 1.7430529594421387, "learning_rate": 4.54172366621067e-05, "loss": 3.1299, "step": 1072 }, { "epoch": 0.09, "grad_norm": 1.3500404357910156, "learning_rate": 4.540013679890561e-05, "loss": 3.0935, "step": 1076 }, { "epoch": 0.09, "grad_norm": 1.5768132209777832, "learning_rate": 4.538303693570452e-05, "loss": 3.1091, "step": 1080 }, { "epoch": 0.09, "grad_norm": 1.4829493761062622, "learning_rate": 4.5365937072503425e-05, "loss": 3.0862, "step": 1084 }, { "epoch": 0.09, "grad_norm": 1.5560483932495117, "learning_rate": 4.5348837209302326e-05, "loss": 3.1024, "step": 1088 }, { "epoch": 0.09, "grad_norm": 1.6295199394226074, "learning_rate": 4.533173734610123e-05, "loss": 2.9539, "step": 1092 }, { "epoch": 0.09, "grad_norm": 2.1724135875701904, "learning_rate": 4.531463748290014e-05, "loss": 3.1936, "step": 1096 }, { "epoch": 0.09, "grad_norm": 1.7400479316711426, "learning_rate": 4.529753761969904e-05, "loss": 3.1873, "step": 1100 }, { "epoch": 0.09, "grad_norm": 1.554962396621704, "learning_rate": 4.528043775649795e-05, "loss": 3.0643, "step": 1104 }, { "epoch": 0.09, "grad_norm": 1.3722119331359863, "learning_rate": 4.5263337893296855e-05, "loss": 3.0088, "step": 1108 }, { "epoch": 0.1, "grad_norm": 1.395577311515808, "learning_rate": 4.524623803009576e-05, "loss": 3.0584, "step": 1112 }, { "epoch": 0.1, "grad_norm": 1.509710431098938, "learning_rate": 4.522913816689467e-05, "loss": 3.2276, "step": 1116 }, { "epoch": 0.1, "grad_norm": 1.5213786363601685, "learning_rate": 4.521203830369357e-05, "loss": 2.9433, "step": 1120 }, { "epoch": 0.1, "grad_norm": 1.51616632938385, "learning_rate": 4.519493844049248e-05, "loss": 2.9902, "step": 1124 }, { "epoch": 0.1, "grad_norm": 1.5429805517196655, "learning_rate": 4.517783857729138e-05, "loss": 3.0884, "step": 1128 }, { "epoch": 0.1, "grad_norm": 1.4714343547821045, "learning_rate": 4.5160738714090286e-05, "loss": 3.0322, "step": 1132 }, { "epoch": 0.1, "grad_norm": 1.4637647867202759, "learning_rate": 4.51436388508892e-05, "loss": 3.0352, "step": 1136 }, { "epoch": 0.1, "grad_norm": 1.3424347639083862, "learning_rate": 4.51265389876881e-05, "loss": 3.0866, "step": 1140 }, { "epoch": 0.1, "grad_norm": 1.689968228340149, "learning_rate": 4.510943912448701e-05, "loss": 3.0002, "step": 1144 }, { "epoch": 0.1, "grad_norm": 1.9935293197631836, "learning_rate": 4.509233926128591e-05, "loss": 2.9518, "step": 1148 }, { "epoch": 0.1, "grad_norm": 1.5461673736572266, "learning_rate": 4.5075239398084816e-05, "loss": 3.0394, "step": 1152 }, { "epoch": 0.1, "grad_norm": 1.5787737369537354, "learning_rate": 4.505813953488372e-05, "loss": 3.0934, "step": 1156 }, { "epoch": 0.1, "grad_norm": 1.4527121782302856, "learning_rate": 4.504103967168263e-05, "loss": 2.9734, "step": 1160 }, { "epoch": 0.1, "grad_norm": 1.6236604452133179, "learning_rate": 4.502393980848154e-05, "loss": 3.021, "step": 1164 }, { "epoch": 0.1, "grad_norm": 1.5616329908370972, "learning_rate": 4.500683994528044e-05, "loss": 3.0602, "step": 1168 } ], "logging_steps": 4, "max_steps": 11696, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1170, "total_flos": 1.585392327327744e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }