{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7991475759190197, "eval_steps": 20.0, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.015982951518380393, "grad_norm": 584.8897461802409, "learning_rate": 3.194888178913738e-07, "loss": 2.787, "step": 10 }, { "epoch": 0.031965903036760786, "grad_norm": 36.67515312992242, "learning_rate": 6.389776357827476e-07, "loss": 2.5634, "step": 20 }, { "epoch": 0.04794885455514118, "grad_norm": 10.762606870071352, "learning_rate": 9.584664536741215e-07, "loss": 2.0774, "step": 30 }, { "epoch": 0.06393180607352157, "grad_norm": 8.222655227801516, "learning_rate": 1.2779552715654952e-06, "loss": 1.7751, "step": 40 }, { "epoch": 0.07991475759190197, "grad_norm": 9.439874512071311, "learning_rate": 1.5974440894568691e-06, "loss": 1.6774, "step": 50 }, { "epoch": 0.09589770911028236, "grad_norm": 7.064833162397198, "learning_rate": 1.916932907348243e-06, "loss": 1.5861, "step": 60 }, { "epoch": 0.11188066062866275, "grad_norm": 10.311171109232708, "learning_rate": 2.2364217252396165e-06, "loss": 1.5787, "step": 70 }, { "epoch": 0.12786361214704314, "grad_norm": 9.640566718774098, "learning_rate": 2.5559105431309904e-06, "loss": 1.5406, "step": 80 }, { "epoch": 0.14384656366542356, "grad_norm": 7.158156117158717, "learning_rate": 2.8753993610223648e-06, "loss": 1.5008, "step": 90 }, { "epoch": 0.15982951518380395, "grad_norm": 6.670618524707829, "learning_rate": 3.1948881789137383e-06, "loss": 1.4853, "step": 100 }, { "epoch": 0.17581246670218434, "grad_norm": 6.9280590132917075, "learning_rate": 3.514376996805112e-06, "loss": 1.4688, "step": 110 }, { "epoch": 0.19179541822056473, "grad_norm": 6.984532799464284, "learning_rate": 3.833865814696486e-06, "loss": 1.4501, "step": 120 }, { "epoch": 0.20777836973894512, "grad_norm": 4.6035696826309165, "learning_rate": 4.15335463258786e-06, "loss": 1.4325, "step": 130 }, { "epoch": 0.2237613212573255, "grad_norm": 7.6475074547231054, "learning_rate": 4.472843450479233e-06, "loss": 1.4058, "step": 140 }, { "epoch": 0.23974427277570592, "grad_norm": 5.215774162230619, "learning_rate": 4.792332268370608e-06, "loss": 1.3873, "step": 150 }, { "epoch": 0.2557272242940863, "grad_norm": 5.284228741443329, "learning_rate": 5.111821086261981e-06, "loss": 1.3882, "step": 160 }, { "epoch": 0.2717101758124667, "grad_norm": 4.315985648750639, "learning_rate": 5.431309904153355e-06, "loss": 1.3684, "step": 170 }, { "epoch": 0.2876931273308471, "grad_norm": 4.037073137808611, "learning_rate": 5.7507987220447296e-06, "loss": 1.359, "step": 180 }, { "epoch": 0.3036760788492275, "grad_norm": 6.346300332686573, "learning_rate": 6.070287539936103e-06, "loss": 1.3435, "step": 190 }, { "epoch": 0.3196590303676079, "grad_norm": 4.63092372720661, "learning_rate": 6.3897763578274765e-06, "loss": 1.3468, "step": 200 }, { "epoch": 0.33564198188598826, "grad_norm": 5.057903719523363, "learning_rate": 6.709265175718851e-06, "loss": 1.3341, "step": 210 }, { "epoch": 0.3516249334043687, "grad_norm": 5.25790384573804, "learning_rate": 7.028753993610224e-06, "loss": 1.3233, "step": 220 }, { "epoch": 0.3676078849227491, "grad_norm": 3.3267127292309517, "learning_rate": 7.348242811501598e-06, "loss": 1.3025, "step": 230 }, { "epoch": 0.38359083644112946, "grad_norm": 4.174952594356688, "learning_rate": 7.667731629392972e-06, "loss": 1.3, "step": 240 }, { "epoch": 0.3995737879595099, "grad_norm": 2.7547093096631166, "learning_rate": 7.987220447284347e-06, "loss": 1.299, "step": 250 }, { "epoch": 0.41555673947789024, "grad_norm": 3.0076000117279023, "learning_rate": 8.30670926517572e-06, "loss": 1.2789, "step": 260 }, { "epoch": 0.43153969099627065, "grad_norm": 3.6118960867167695, "learning_rate": 8.626198083067093e-06, "loss": 1.289, "step": 270 }, { "epoch": 0.447522642514651, "grad_norm": 4.187154082722337, "learning_rate": 8.945686900958466e-06, "loss": 1.2772, "step": 280 }, { "epoch": 0.46350559403303143, "grad_norm": 3.3209936896861265, "learning_rate": 9.265175718849841e-06, "loss": 1.2714, "step": 290 }, { "epoch": 0.47948854555141185, "grad_norm": 3.4443001562081874, "learning_rate": 9.584664536741216e-06, "loss": 1.2615, "step": 300 }, { "epoch": 0.4954714970697922, "grad_norm": 3.4861656497713764, "learning_rate": 9.904153354632589e-06, "loss": 1.2756, "step": 310 }, { "epoch": 0.5114544485881726, "grad_norm": 3.2211114109625165, "learning_rate": 9.999847101583393e-06, "loss": 1.2434, "step": 320 }, { "epoch": 0.527437400106553, "grad_norm": 3.1006035508008485, "learning_rate": 9.999098233890869e-06, "loss": 1.2489, "step": 330 }, { "epoch": 0.5434203516249334, "grad_norm": 3.0983188498194516, "learning_rate": 9.997725406892392e-06, "loss": 1.2407, "step": 340 }, { "epoch": 0.5594033031433138, "grad_norm": 3.350092298430153, "learning_rate": 9.995728791936505e-06, "loss": 1.243, "step": 350 }, { "epoch": 0.5753862546616942, "grad_norm": 2.712885657555514, "learning_rate": 9.993108638229449e-06, "loss": 1.2677, "step": 360 }, { "epoch": 0.5913692061800746, "grad_norm": 2.6927845803955983, "learning_rate": 9.989865272804064e-06, "loss": 1.2272, "step": 370 }, { "epoch": 0.607352157698455, "grad_norm": 2.9181755886200675, "learning_rate": 9.985999100478964e-06, "loss": 1.2217, "step": 380 }, { "epoch": 0.6233351092168353, "grad_norm": 2.663582794855719, "learning_rate": 9.981510603808024e-06, "loss": 1.2225, "step": 390 }, { "epoch": 0.6393180607352158, "grad_norm": 2.81821319928569, "learning_rate": 9.976400343020134e-06, "loss": 1.2291, "step": 400 }, { "epoch": 0.6553010122535962, "grad_norm": 2.7190463266479363, "learning_rate": 9.970668955949285e-06, "loss": 1.2017, "step": 410 }, { "epoch": 0.6712839637719765, "grad_norm": 2.6121849551347056, "learning_rate": 9.964317157954955e-06, "loss": 1.1999, "step": 420 }, { "epoch": 0.687266915290357, "grad_norm": 2.8153860907477988, "learning_rate": 9.95734574183282e-06, "loss": 1.2119, "step": 430 }, { "epoch": 0.7032498668087374, "grad_norm": 2.8604210593081016, "learning_rate": 9.949755577715806e-06, "loss": 1.1973, "step": 440 }, { "epoch": 0.7192328183271177, "grad_norm": 2.683531260179304, "learning_rate": 9.941547612965475e-06, "loss": 1.2138, "step": 450 }, { "epoch": 0.7352157698454982, "grad_norm": 3.0411589457571377, "learning_rate": 9.932722872053797e-06, "loss": 1.2221, "step": 460 }, { "epoch": 0.7511987213638786, "grad_norm": 2.993913433024178, "learning_rate": 9.923282456435262e-06, "loss": 1.209, "step": 470 }, { "epoch": 0.7671816728822589, "grad_norm": 2.6707946625116072, "learning_rate": 9.913227544409416e-06, "loss": 1.1918, "step": 480 }, { "epoch": 0.7831646244006393, "grad_norm": 2.7683148756228033, "learning_rate": 9.90255939097379e-06, "loss": 1.1784, "step": 490 }, { "epoch": 0.7991475759190197, "grad_norm": 2.803374407208519, "learning_rate": 9.891279327667252e-06, "loss": 1.1769, "step": 500 } ], "logging_steps": 10, "max_steps": 3125, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "total_flos": 7.826917846385951e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }