{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.7991475759190197,
  "eval_steps": 20.0,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.015982951518380393,
      "grad_norm": 584.8897461802409,
      "learning_rate": 3.194888178913738e-07,
      "loss": 2.787,
      "step": 10
    },
    {
      "epoch": 0.031965903036760786,
      "grad_norm": 36.67515312992242,
      "learning_rate": 6.389776357827476e-07,
      "loss": 2.5634,
      "step": 20
    },
    {
      "epoch": 0.04794885455514118,
      "grad_norm": 10.762606870071352,
      "learning_rate": 9.584664536741215e-07,
      "loss": 2.0774,
      "step": 30
    },
    {
      "epoch": 0.06393180607352157,
      "grad_norm": 8.222655227801516,
      "learning_rate": 1.2779552715654952e-06,
      "loss": 1.7751,
      "step": 40
    },
    {
      "epoch": 0.07991475759190197,
      "grad_norm": 9.439874512071311,
      "learning_rate": 1.5974440894568691e-06,
      "loss": 1.6774,
      "step": 50
    },
    {
      "epoch": 0.09589770911028236,
      "grad_norm": 7.064833162397198,
      "learning_rate": 1.916932907348243e-06,
      "loss": 1.5861,
      "step": 60
    },
    {
      "epoch": 0.11188066062866275,
      "grad_norm": 10.311171109232708,
      "learning_rate": 2.2364217252396165e-06,
      "loss": 1.5787,
      "step": 70
    },
    {
      "epoch": 0.12786361214704314,
      "grad_norm": 9.640566718774098,
      "learning_rate": 2.5559105431309904e-06,
      "loss": 1.5406,
      "step": 80
    },
    {
      "epoch": 0.14384656366542356,
      "grad_norm": 7.158156117158717,
      "learning_rate": 2.8753993610223648e-06,
      "loss": 1.5008,
      "step": 90
    },
    {
      "epoch": 0.15982951518380395,
      "grad_norm": 6.670618524707829,
      "learning_rate": 3.1948881789137383e-06,
      "loss": 1.4853,
      "step": 100
    },
    {
      "epoch": 0.17581246670218434,
      "grad_norm": 6.9280590132917075,
      "learning_rate": 3.514376996805112e-06,
      "loss": 1.4688,
      "step": 110
    },
    {
      "epoch": 0.19179541822056473,
      "grad_norm": 6.984532799464284,
      "learning_rate": 3.833865814696486e-06,
      "loss": 1.4501,
      "step": 120
    },
    {
      "epoch": 0.20777836973894512,
      "grad_norm": 4.6035696826309165,
      "learning_rate": 4.15335463258786e-06,
      "loss": 1.4325,
      "step": 130
    },
    {
      "epoch": 0.2237613212573255,
      "grad_norm": 7.6475074547231054,
      "learning_rate": 4.472843450479233e-06,
      "loss": 1.4058,
      "step": 140
    },
    {
      "epoch": 0.23974427277570592,
      "grad_norm": 5.215774162230619,
      "learning_rate": 4.792332268370608e-06,
      "loss": 1.3873,
      "step": 150
    },
    {
      "epoch": 0.2557272242940863,
      "grad_norm": 5.284228741443329,
      "learning_rate": 5.111821086261981e-06,
      "loss": 1.3882,
      "step": 160
    },
    {
      "epoch": 0.2717101758124667,
      "grad_norm": 4.315985648750639,
      "learning_rate": 5.431309904153355e-06,
      "loss": 1.3684,
      "step": 170
    },
    {
      "epoch": 0.2876931273308471,
      "grad_norm": 4.037073137808611,
      "learning_rate": 5.7507987220447296e-06,
      "loss": 1.359,
      "step": 180
    },
    {
      "epoch": 0.3036760788492275,
      "grad_norm": 6.346300332686573,
      "learning_rate": 6.070287539936103e-06,
      "loss": 1.3435,
      "step": 190
    },
    {
      "epoch": 0.3196590303676079,
      "grad_norm": 4.63092372720661,
      "learning_rate": 6.3897763578274765e-06,
      "loss": 1.3468,
      "step": 200
    },
    {
      "epoch": 0.33564198188598826,
      "grad_norm": 5.057903719523363,
      "learning_rate": 6.709265175718851e-06,
      "loss": 1.3341,
      "step": 210
    },
    {
      "epoch": 0.3516249334043687,
      "grad_norm": 5.25790384573804,
      "learning_rate": 7.028753993610224e-06,
      "loss": 1.3233,
      "step": 220
    },
    {
      "epoch": 0.3676078849227491,
      "grad_norm": 3.3267127292309517,
      "learning_rate": 7.348242811501598e-06,
      "loss": 1.3025,
      "step": 230
    },
    {
      "epoch": 0.38359083644112946,
      "grad_norm": 4.174952594356688,
      "learning_rate": 7.667731629392972e-06,
      "loss": 1.3,
      "step": 240
    },
    {
      "epoch": 0.3995737879595099,
      "grad_norm": 2.7547093096631166,
      "learning_rate": 7.987220447284347e-06,
      "loss": 1.299,
      "step": 250
    },
    {
      "epoch": 0.41555673947789024,
      "grad_norm": 3.0076000117279023,
      "learning_rate": 8.30670926517572e-06,
      "loss": 1.2789,
      "step": 260
    },
    {
      "epoch": 0.43153969099627065,
      "grad_norm": 3.6118960867167695,
      "learning_rate": 8.626198083067093e-06,
      "loss": 1.289,
      "step": 270
    },
    {
      "epoch": 0.447522642514651,
      "grad_norm": 4.187154082722337,
      "learning_rate": 8.945686900958466e-06,
      "loss": 1.2772,
      "step": 280
    },
    {
      "epoch": 0.46350559403303143,
      "grad_norm": 3.3209936896861265,
      "learning_rate": 9.265175718849841e-06,
      "loss": 1.2714,
      "step": 290
    },
    {
      "epoch": 0.47948854555141185,
      "grad_norm": 3.4443001562081874,
      "learning_rate": 9.584664536741216e-06,
      "loss": 1.2615,
      "step": 300
    },
    {
      "epoch": 0.4954714970697922,
      "grad_norm": 3.4861656497713764,
      "learning_rate": 9.904153354632589e-06,
      "loss": 1.2756,
      "step": 310
    },
    {
      "epoch": 0.5114544485881726,
      "grad_norm": 3.2211114109625165,
      "learning_rate": 9.999847101583393e-06,
      "loss": 1.2434,
      "step": 320
    },
    {
      "epoch": 0.527437400106553,
      "grad_norm": 3.1006035508008485,
      "learning_rate": 9.999098233890869e-06,
      "loss": 1.2489,
      "step": 330
    },
    {
      "epoch": 0.5434203516249334,
      "grad_norm": 3.0983188498194516,
      "learning_rate": 9.997725406892392e-06,
      "loss": 1.2407,
      "step": 340
    },
    {
      "epoch": 0.5594033031433138,
      "grad_norm": 3.350092298430153,
      "learning_rate": 9.995728791936505e-06,
      "loss": 1.243,
      "step": 350
    },
    {
      "epoch": 0.5753862546616942,
      "grad_norm": 2.712885657555514,
      "learning_rate": 9.993108638229449e-06,
      "loss": 1.2677,
      "step": 360
    },
    {
      "epoch": 0.5913692061800746,
      "grad_norm": 2.6927845803955983,
      "learning_rate": 9.989865272804064e-06,
      "loss": 1.2272,
      "step": 370
    },
    {
      "epoch": 0.607352157698455,
      "grad_norm": 2.9181755886200675,
      "learning_rate": 9.985999100478964e-06,
      "loss": 1.2217,
      "step": 380
    },
    {
      "epoch": 0.6233351092168353,
      "grad_norm": 2.663582794855719,
      "learning_rate": 9.981510603808024e-06,
      "loss": 1.2225,
      "step": 390
    },
    {
      "epoch": 0.6393180607352158,
      "grad_norm": 2.81821319928569,
      "learning_rate": 9.976400343020134e-06,
      "loss": 1.2291,
      "step": 400
    },
    {
      "epoch": 0.6553010122535962,
      "grad_norm": 2.7190463266479363,
      "learning_rate": 9.970668955949285e-06,
      "loss": 1.2017,
      "step": 410
    },
    {
      "epoch": 0.6712839637719765,
      "grad_norm": 2.6121849551347056,
      "learning_rate": 9.964317157954955e-06,
      "loss": 1.1999,
      "step": 420
    },
    {
      "epoch": 0.687266915290357,
      "grad_norm": 2.8153860907477988,
      "learning_rate": 9.95734574183282e-06,
      "loss": 1.2119,
      "step": 430
    },
    {
      "epoch": 0.7032498668087374,
      "grad_norm": 2.8604210593081016,
      "learning_rate": 9.949755577715806e-06,
      "loss": 1.1973,
      "step": 440
    },
    {
      "epoch": 0.7192328183271177,
      "grad_norm": 2.683531260179304,
      "learning_rate": 9.941547612965475e-06,
      "loss": 1.2138,
      "step": 450
    },
    {
      "epoch": 0.7352157698454982,
      "grad_norm": 3.0411589457571377,
      "learning_rate": 9.932722872053797e-06,
      "loss": 1.2221,
      "step": 460
    },
    {
      "epoch": 0.7511987213638786,
      "grad_norm": 2.993913433024178,
      "learning_rate": 9.923282456435262e-06,
      "loss": 1.209,
      "step": 470
    },
    {
      "epoch": 0.7671816728822589,
      "grad_norm": 2.6707946625116072,
      "learning_rate": 9.913227544409416e-06,
      "loss": 1.1918,
      "step": 480
    },
    {
      "epoch": 0.7831646244006393,
      "grad_norm": 2.7683148756228033,
      "learning_rate": 9.90255939097379e-06,
      "loss": 1.1784,
      "step": 490
    },
    {
      "epoch": 0.7991475759190197,
      "grad_norm": 2.803374407208519,
      "learning_rate": 9.891279327667252e-06,
      "loss": 1.1769,
      "step": 500
    }
  ],
  "logging_steps": 10,
  "max_steps": 3125,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 100,
  "total_flos": 7.826917846385951e+18,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}