|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 671,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0014903129657228018,
      "grad_norm": 0.7207538217956673,
      "learning_rate": 4.411764705882353e-08,
      "loss": 1.6077,
      "step": 1
    },
    {
      "epoch": 0.007451564828614009,
      "grad_norm": 0.65849854443806,
      "learning_rate": 2.2058823529411765e-07,
      "loss": 1.5945,
      "step": 5
    },
    {
      "epoch": 0.014903129657228018,
      "grad_norm": 0.6800875831730971,
      "learning_rate": 4.411764705882353e-07,
      "loss": 1.6315,
      "step": 10
    },
    {
      "epoch": 0.022354694485842028,
      "grad_norm": 0.8644630471455123,
      "learning_rate": 6.61764705882353e-07,
      "loss": 1.6271,
      "step": 15
    },
    {
      "epoch": 0.029806259314456036,
      "grad_norm": 0.7771872952949634,
      "learning_rate": 8.823529411764706e-07,
      "loss": 1.5795,
      "step": 20
    },
    {
      "epoch": 0.037257824143070044,
      "grad_norm": 0.7397975175907138,
      "learning_rate": 1.1029411764705884e-06,
      "loss": 1.6225,
      "step": 25
    },
    {
      "epoch": 0.044709388971684055,
      "grad_norm": 0.7702121293753343,
      "learning_rate": 1.323529411764706e-06,
      "loss": 1.6526,
      "step": 30
    },
    {
      "epoch": 0.05216095380029806,
      "grad_norm": 0.6644067274265962,
      "learning_rate": 1.5441176470588234e-06,
      "loss": 1.6006,
      "step": 35
    },
    {
      "epoch": 0.05961251862891207,
      "grad_norm": 0.6017633448350802,
      "learning_rate": 1.7647058823529412e-06,
      "loss": 1.6434,
      "step": 40
    },
    {
      "epoch": 0.06706408345752608,
      "grad_norm": 0.4509127973744261,
      "learning_rate": 1.9852941176470586e-06,
      "loss": 1.5392,
      "step": 45
    },
    {
      "epoch": 0.07451564828614009,
      "grad_norm": 0.4610064447084038,
      "learning_rate": 2.2058823529411767e-06,
      "loss": 1.589,
      "step": 50
    },
    {
      "epoch": 0.08196721311475409,
      "grad_norm": 0.35955958459549675,
      "learning_rate": 2.4264705882352943e-06,
      "loss": 1.5594,
      "step": 55
    },
    {
      "epoch": 0.08941877794336811,
      "grad_norm": 0.3478335776850152,
      "learning_rate": 2.647058823529412e-06,
      "loss": 1.5798,
      "step": 60
    },
    {
      "epoch": 0.09687034277198212,
      "grad_norm": 0.3527200373964567,
      "learning_rate": 2.8676470588235296e-06,
      "loss": 1.5693,
      "step": 65
    },
    {
      "epoch": 0.10432190760059612,
      "grad_norm": 0.29600438876672497,
      "learning_rate": 2.999918570372821e-06,
      "loss": 1.5045,
      "step": 70
    },
    {
      "epoch": 0.11177347242921014,
      "grad_norm": 0.2900544302757877,
      "learning_rate": 2.9990025885979037e-06,
      "loss": 1.483,
      "step": 75
    },
    {
      "epoch": 0.11922503725782414,
      "grad_norm": 0.2408233475597495,
      "learning_rate": 2.997069461623824e-06,
      "loss": 1.5672,
      "step": 80
    },
    {
      "epoch": 0.12667660208643816,
      "grad_norm": 0.28176854976259386,
      "learning_rate": 2.9941205011700118e-06,
      "loss": 1.5082,
      "step": 85
    },
    {
      "epoch": 0.13412816691505217,
      "grad_norm": 0.19913384731707087,
      "learning_rate": 2.990157708247667e-06,
      "loss": 1.4819,
      "step": 90
    },
    {
      "epoch": 0.14157973174366617,
      "grad_norm": 0.21083590352306433,
      "learning_rate": 2.9851837718019762e-06,
      "loss": 1.5531,
      "step": 95
    },
    {
      "epoch": 0.14903129657228018,
      "grad_norm": 0.20730699550159845,
      "learning_rate": 2.9792020668875367e-06,
      "loss": 1.481,
      "step": 100
    },
    {
      "epoch": 0.15648286140089418,
      "grad_norm": 0.18165412005960324,
      "learning_rate": 2.9722166523782167e-06,
      "loss": 1.4844,
      "step": 105
    },
    {
      "epoch": 0.16393442622950818,
      "grad_norm": 0.15838156755040073,
      "learning_rate": 2.964232268213018e-06,
      "loss": 1.4455,
      "step": 110
    },
    {
      "epoch": 0.17138599105812222,
      "grad_norm": 0.16340737391300794,
      "learning_rate": 2.955254332179797e-06,
      "loss": 1.469,
      "step": 115
    },
    {
      "epoch": 0.17883755588673622,
      "grad_norm": 0.14759052394480532,
      "learning_rate": 2.9452889362390366e-06,
      "loss": 1.5597,
      "step": 120
    },
    {
      "epoch": 0.18628912071535023,
      "grad_norm": 0.12914782899327282,
      "learning_rate": 2.9343428423901614e-06,
      "loss": 1.4765,
      "step": 125
    },
    {
      "epoch": 0.19374068554396423,
      "grad_norm": 0.12460225982681429,
      "learning_rate": 2.9224234780831905e-06,
      "loss": 1.4656,
      "step": 130
    },
    {
      "epoch": 0.20119225037257824,
      "grad_norm": 0.14217631629657101,
      "learning_rate": 2.9095389311788626e-06,
      "loss": 1.4872,
      "step": 135
    },
    {
      "epoch": 0.20864381520119224,
      "grad_norm": 0.1272863323148308,
      "learning_rate": 2.8956979444606303e-06,
      "loss": 1.4314,
      "step": 140
    },
    {
      "epoch": 0.21609538002980627,
      "grad_norm": 0.11983511963861297,
      "learning_rate": 2.8809099097022624e-06,
      "loss": 1.4734,
      "step": 145
    },
    {
      "epoch": 0.22354694485842028,
      "grad_norm": 0.12007606131316517,
      "learning_rate": 2.8651848612950768e-06,
      "loss": 1.4997,
      "step": 150
    },
    {
      "epoch": 0.23099850968703428,
      "grad_norm": 0.11593831929818996,
      "learning_rate": 2.848533469439122e-06,
      "loss": 1.3879,
      "step": 155
    },
    {
      "epoch": 0.23845007451564829,
      "grad_norm": 0.11598915986268504,
      "learning_rate": 2.8309670329029358e-06,
      "loss": 1.4192,
      "step": 160
    },
    {
      "epoch": 0.2459016393442623,
      "grad_norm": 0.11190528057252458,
      "learning_rate": 2.8124974713567872e-06,
      "loss": 1.452,
      "step": 165
    },
    {
      "epoch": 0.2533532041728763,
      "grad_norm": 0.1203691003615664,
      "learning_rate": 2.79313731728461e-06,
      "loss": 1.4819,
      "step": 170
    },
    {
      "epoch": 0.2608047690014903,
      "grad_norm": 0.10952238143478435,
      "learning_rate": 2.772899707480108e-06,
      "loss": 1.4173,
      "step": 175
    },
    {
      "epoch": 0.26825633383010433,
      "grad_norm": 0.10685505615273189,
      "learning_rate": 2.7517983741328146e-06,
      "loss": 1.4078,
      "step": 180
    },
    {
      "epoch": 0.2757078986587183,
      "grad_norm": 0.11464827031091669,
      "learning_rate": 2.729847635510137e-06,
      "loss": 1.5007,
      "step": 185
    },
    {
      "epoch": 0.28315946348733234,
      "grad_norm": 0.11122715995599067,
      "learning_rate": 2.70706238624173e-06,
      "loss": 1.4541,
      "step": 190
    },
    {
      "epoch": 0.2906110283159464,
      "grad_norm": 0.10752842550767219,
      "learning_rate": 2.6834580872127733e-06,
      "loss": 1.4129,
      "step": 195
    },
    {
      "epoch": 0.29806259314456035,
      "grad_norm": 0.11490358820200401,
      "learning_rate": 2.6590507550730175e-06,
      "loss": 1.3986,
      "step": 200
    },
    {
      "epoch": 0.3055141579731744,
      "grad_norm": 0.10706266255477938,
      "learning_rate": 2.6338569513687182e-06,
      "loss": 1.4404,
      "step": 205
    },
    {
      "epoch": 0.31296572280178836,
      "grad_norm": 0.1069097244298094,
      "learning_rate": 2.6078937713048357e-06,
      "loss": 1.3969,
      "step": 210
    },
    {
      "epoch": 0.3204172876304024,
      "grad_norm": 0.1160964307611302,
      "learning_rate": 2.581178832145114e-06,
      "loss": 1.4672,
      "step": 215
    },
    {
      "epoch": 0.32786885245901637,
      "grad_norm": 0.1097725420424586,
      "learning_rate": 2.553730261257924e-06,
      "loss": 1.387,
      "step": 220
    },
    {
      "epoch": 0.3353204172876304,
      "grad_norm": 0.10407180190964474,
      "learning_rate": 2.525566683815973e-06,
      "loss": 1.3973,
      "step": 225
    },
    {
      "epoch": 0.34277198211624443,
      "grad_norm": 0.11536030542488797,
      "learning_rate": 2.496707210158233e-06,
      "loss": 1.4624,
      "step": 230
    },
    {
      "epoch": 0.3502235469448584,
      "grad_norm": 0.10982670956660034,
      "learning_rate": 2.4671714228226542e-06,
      "loss": 1.4078,
      "step": 235
    },
    {
      "epoch": 0.35767511177347244,
      "grad_norm": 0.10587771136409477,
      "learning_rate": 2.4369793632584796e-06,
      "loss": 1.4285,
      "step": 240
    },
    {
      "epoch": 0.3651266766020864,
      "grad_norm": 0.1208719398977082,
      "learning_rate": 2.4061515182271535e-06,
      "loss": 1.4726,
      "step": 245
    },
    {
      "epoch": 0.37257824143070045,
      "grad_norm": 0.10511587252373167,
      "learning_rate": 2.3747088059010745e-06,
      "loss": 1.3548,
      "step": 250
    },
    {
      "epoch": 0.38002980625931443,
      "grad_norm": 0.10957225921751226,
      "learning_rate": 2.342672561669611e-06,
      "loss": 1.4678,
      "step": 255
    },
    {
      "epoch": 0.38748137108792846,
      "grad_norm": 0.10693917993857549,
      "learning_rate": 2.3100645236620133e-06,
      "loss": 1.3974,
      "step": 260
    },
    {
      "epoch": 0.3949329359165425,
      "grad_norm": 0.11581766322136706,
      "learning_rate": 2.276906817997054e-06,
      "loss": 1.4098,
      "step": 265
    },
    {
      "epoch": 0.40238450074515647,
      "grad_norm": 0.10904876454822796,
      "learning_rate": 2.2432219437693897e-06,
      "loss": 1.4056,
      "step": 270
    },
    {
      "epoch": 0.4098360655737705,
      "grad_norm": 0.10831229242143947,
      "learning_rate": 2.209032757782848e-06,
      "loss": 1.3776,
      "step": 275
    },
    {
      "epoch": 0.4172876304023845,
      "grad_norm": 0.11150103499322561,
      "learning_rate": 2.174362459040989e-06,
      "loss": 1.4279,
      "step": 280
    },
    {
      "epoch": 0.4247391952309985,
      "grad_norm": 0.11628953499023266,
      "learning_rate": 2.139234573005468e-06,
      "loss": 1.4328,
      "step": 285
    },
    {
      "epoch": 0.43219076005961254,
      "grad_norm": 0.1154264307480958,
      "learning_rate": 2.1036729356328806e-06,
      "loss": 1.4109,
      "step": 290
    },
    {
      "epoch": 0.4396423248882265,
      "grad_norm": 0.1060574220828385,
      "learning_rate": 2.06770167720092e-06,
      "loss": 1.3781,
      "step": 295
    },
    {
      "epoch": 0.44709388971684055,
      "grad_norm": 0.1053110442104737,
      "learning_rate": 2.0313452059348308e-06,
      "loss": 1.4444,
      "step": 300
    },
    {
      "epoch": 0.45454545454545453,
      "grad_norm": 0.10693306802856366,
      "learning_rate": 1.99462819144525e-06,
      "loss": 1.4359,
      "step": 305
    },
    {
      "epoch": 0.46199701937406856,
      "grad_norm": 0.11652642471380918,
      "learning_rate": 1.957575547988697e-06,
      "loss": 1.4028,
      "step": 310
    },
    {
      "epoch": 0.46944858420268254,
      "grad_norm": 0.11274466501498542,
      "learning_rate": 1.9202124175620545e-06,
      "loss": 1.3804,
      "step": 315
    },
    {
      "epoch": 0.47690014903129657,
      "grad_norm": 0.10861122941192258,
      "learning_rate": 1.8825641528425148e-06,
      "loss": 1.3951,
      "step": 320
    },
    {
      "epoch": 0.4843517138599106,
      "grad_norm": 0.11658514112563267,
      "learning_rate": 1.8446562999845715e-06,
      "loss": 1.4377,
      "step": 325
    },
    {
      "epoch": 0.4918032786885246,
      "grad_norm": 0.11414477697164481,
      "learning_rate": 1.8065145812857305e-06,
      "loss": 1.3746,
      "step": 330
    },
    {
      "epoch": 0.4992548435171386,
      "grad_norm": 0.10815121580373825,
      "learning_rate": 1.7681648777326943e-06,
      "loss": 1.4422,
      "step": 335
    },
    {
      "epoch": 0.5067064083457526,
      "grad_norm": 0.11037495106798235,
      "learning_rate": 1.7296332114398704e-06,
      "loss": 1.3666,
      "step": 340
    },
    {
      "epoch": 0.5141579731743666,
      "grad_norm": 0.11676408141846001,
      "learning_rate": 1.6909457279921186e-06,
      "loss": 1.3734,
      "step": 345
    },
    {
      "epoch": 0.5216095380029806,
      "grad_norm": 0.1113201753442444,
      "learning_rate": 1.6521286787037178e-06,
      "loss": 1.4135,
      "step": 350
    },
    {
      "epoch": 0.5290611028315947,
      "grad_norm": 0.10247556885521109,
      "learning_rate": 1.613208402805586e-06,
      "loss": 1.4152,
      "step": 355
    },
    {
      "epoch": 0.5365126676602087,
      "grad_norm": 0.10445203605746987,
      "learning_rate": 1.5742113095728515e-06,
      "loss": 1.4376,
      "step": 360
    },
    {
      "epoch": 0.5439642324888226,
      "grad_norm": 0.10879333034477139,
      "learning_rate": 1.535163860404891e-06,
      "loss": 1.455,
      "step": 365
    },
    {
      "epoch": 0.5514157973174366,
      "grad_norm": 0.12869015642921014,
      "learning_rate": 1.4960925508699984e-06,
      "loss": 1.3821,
      "step": 370
    },
    {
      "epoch": 0.5588673621460507,
      "grad_norm": 0.10954384701245683,
      "learning_rate": 1.4570238927268746e-06,
      "loss": 1.4397,
      "step": 375
    },
    {
      "epoch": 0.5663189269746647,
      "grad_norm": 0.10796984723976862,
      "learning_rate": 1.4179843959351213e-06,
      "loss": 1.4644,
      "step": 380
    },
    {
      "epoch": 0.5737704918032787,
      "grad_norm": 0.11517040756519294,
      "learning_rate": 1.3790005506669643e-06,
      "loss": 1.4344,
      "step": 385
    },
    {
      "epoch": 0.5812220566318927,
      "grad_norm": 0.10926990342446409,
      "learning_rate": 1.340098809332401e-06,
      "loss": 1.4211,
      "step": 390
    },
    {
      "epoch": 0.5886736214605067,
      "grad_norm": 0.1115330774443057,
      "learning_rate": 1.3013055686299683e-06,
      "loss": 1.3726,
      "step": 395
    },
    {
      "epoch": 0.5961251862891207,
      "grad_norm": 0.10589693766514106,
      "learning_rate": 1.2626471516353158e-06,
      "loss": 1.4593,
      "step": 400
    },
    {
      "epoch": 0.6035767511177347,
      "grad_norm": 0.11370583262546306,
      "learning_rate": 1.22414978993974e-06,
      "loss": 1.3516,
      "step": 405
    },
    {
      "epoch": 0.6110283159463488,
      "grad_norm": 0.12504328760615357,
      "learning_rate": 1.1858396058507837e-06,
      "loss": 1.5039,
      "step": 410
    },
    {
      "epoch": 0.6184798807749627,
      "grad_norm": 0.11743701858841901,
      "learning_rate": 1.1477425946670016e-06,
      "loss": 1.413,
      "step": 415
    },
    {
      "epoch": 0.6259314456035767,
      "grad_norm": 0.11045498569848493,
      "learning_rate": 1.1098846070389027e-06,
      "loss": 1.3941,
      "step": 420
    },
    {
      "epoch": 0.6333830104321908,
      "grad_norm": 0.12144356998222744,
      "learning_rate": 1.0722913314280395e-06,
      "loss": 1.4172,
      "step": 425
    },
    {
      "epoch": 0.6408345752608048,
      "grad_norm": 0.10365373765687404,
      "learning_rate": 1.0349882766761573e-06,
      "loss": 1.3337,
      "step": 430
    },
    {
      "epoch": 0.6482861400894188,
      "grad_norm": 0.10647853031859235,
      "learning_rate": 9.980007546962206e-07,
      "loss": 1.3959,
      "step": 435
    },
    {
      "epoch": 0.6557377049180327,
      "grad_norm": 0.10731308955650487,
      "learning_rate": 9.613538632970634e-07,
      "loss": 1.4023,
      "step": 440
    },
    {
      "epoch": 0.6631892697466468,
      "grad_norm": 0.10450909600455241,
      "learning_rate": 9.250724691533223e-07,
      "loss": 1.3872,
      "step": 445
    },
    {
      "epoch": 0.6706408345752608,
      "grad_norm": 0.1041016914874247,
      "learning_rate": 8.891811909322058e-07,
      "loss": 1.4465,
      "step": 450
    },
    {
      "epoch": 0.6780923994038748,
      "grad_norm": 0.120460469459905,
      "learning_rate": 8.537043825885445e-07,
      "loss": 1.4501,
      "step": 455
    },
    {
      "epoch": 0.6855439642324889,
      "grad_norm": 0.1136665297817651,
      "learning_rate": 8.186661168394658e-07,
      "loss": 1.3963,
      "step": 460
    },
    {
      "epoch": 0.6929955290611028,
      "grad_norm": 0.11135508784322877,
      "learning_rate": 7.840901688299e-07,
      "loss": 1.4571,
      "step": 465
    },
    {
      "epoch": 0.7004470938897168,
      "grad_norm": 0.09679813596282592,
      "learning_rate": 7.500000000000003e-07,
      "loss": 1.3349,
      "step": 470
    },
    {
      "epoch": 0.7078986587183308,
      "grad_norm": 0.11033066007905798,
      "learning_rate": 7.16418742165435e-07,
      "loss": 1.4129,
      "step": 475
    },
    {
      "epoch": 0.7153502235469449,
      "grad_norm": 0.10642022161621655,
      "learning_rate": 6.83369181821336e-07,
      "loss": 1.4064,
      "step": 480
    },
    {
      "epoch": 0.7228017883755589,
      "grad_norm": 0.11256587052450513,
      "learning_rate": 6.508737446805704e-07,
      "loss": 1.4562,
      "step": 485
    },
    {
      "epoch": 0.7302533532041728,
      "grad_norm": 0.11453839206549224,
      "learning_rate": 6.189544804568165e-07,
      "loss": 1.375,
      "step": 490
    },
    {
      "epoch": 0.7377049180327869,
      "grad_norm": 0.10923398824173401,
      "learning_rate": 5.876330479027766e-07,
      "loss": 1.3996,
      "step": 495
    },
    {
      "epoch": 0.7451564828614009,
      "grad_norm": 0.10567442549564693,
      "learning_rate": 5.56930700113673e-07,
      "loss": 1.3151,
      "step": 500
    },
    {
      "epoch": 0.7526080476900149,
      "grad_norm": 0.11241400299999167,
      "learning_rate": 5.268682701060012e-07,
      "loss": 1.3493,
      "step": 505
    },
    {
      "epoch": 0.7600596125186289,
      "grad_norm": 0.12270530511888589,
      "learning_rate": 4.974661566813315e-07,
      "loss": 1.4556,
      "step": 510
    },
    {
      "epoch": 0.767511177347243,
      "grad_norm": 0.11388307104576878,
      "learning_rate": 4.6874431058474127e-07,
      "loss": 1.3664,
      "step": 515
    },
    {
      "epoch": 0.7749627421758569,
      "grad_norm": 0.11683819283505842,
      "learning_rate": 4.4072222096727663e-07,
      "loss": 1.3655,
      "step": 520
    },
    {
      "epoch": 0.7824143070044709,
      "grad_norm": 0.11164956386870885,
      "learning_rate": 4.1341890216162934e-07,
      "loss": 1.4173,
      "step": 525
    },
    {
      "epoch": 0.789865871833085,
      "grad_norm": 0.115617873949176,
      "learning_rate": 3.868528807799988e-07,
      "loss": 1.4142,
      "step": 530
    },
    {
      "epoch": 0.797317436661699,
      "grad_norm": 0.11850575487319875,
      "learning_rate": 3.610421831428953e-07,
      "loss": 1.4074,
      "step": 535
    },
    {
      "epoch": 0.8047690014903129,
      "grad_norm": 0.10945804497786706,
      "learning_rate": 3.36004323047419e-07,
      "loss": 1.4065,
      "step": 540
    },
    {
      "epoch": 0.812220566318927,
      "grad_norm": 0.1207296244907688,
      "learning_rate": 3.11756289883306e-07,
      "loss": 1.4036,
      "step": 545
    },
    {
      "epoch": 0.819672131147541,
      "grad_norm": 0.11721523752905044,
      "learning_rate": 2.883145371048133e-07,
      "loss": 1.4412,
      "step": 550
    },
    {
      "epoch": 0.827123695976155,
      "grad_norm": 0.1082841170490658,
      "learning_rate": 2.656949710662591e-07,
      "loss": 1.4002,
      "step": 555
    },
    {
      "epoch": 0.834575260804769,
      "grad_norm": 0.10851598999478153,
      "learning_rate": 2.4391294022879947e-07,
      "loss": 1.3989,
      "step": 560
    },
    {
      "epoch": 0.842026825633383,
      "grad_norm": 0.11994637882887171,
      "learning_rate": 2.2298322474575838e-07,
      "loss": 1.4249,
      "step": 565
    },
    {
      "epoch": 0.849478390461997,
      "grad_norm": 0.11657770238088389,
      "learning_rate": 2.0292002643358892e-07,
      "loss": 1.3606,
      "step": 570
    },
    {
      "epoch": 0.856929955290611,
      "grad_norm": 0.12222665777231102,
      "learning_rate": 1.8373695913525317e-07,
      "loss": 1.4042,
      "step": 575
    },
    {
      "epoch": 0.8643815201192251,
      "grad_norm": 0.11933591684408021,
      "learning_rate": 1.6544703948258172e-07,
      "loss": 1.3801,
      "step": 580
    },
    {
      "epoch": 0.8718330849478391,
      "grad_norm": 0.11512513535214042,
      "learning_rate": 1.4806267806386093e-07,
      "loss": 1.4339,
      "step": 585
    },
    {
      "epoch": 0.879284649776453,
      "grad_norm": 0.10887999147831859,
      "learning_rate": 1.3159567100265506e-07,
      "loss": 1.3578,
      "step": 590
    },
    {
      "epoch": 0.886736214605067,
      "grad_norm": 0.10936406330567362,
      "learning_rate": 1.1605719195356806e-07,
      "loss": 1.4209,
      "step": 595
    },
    {
      "epoch": 0.8941877794336811,
      "grad_norm": 0.11262116794609218,
      "learning_rate": 1.0145778452038629e-07,
      "loss": 1.3144,
      "step": 600
    },
    {
      "epoch": 0.9016393442622951,
      "grad_norm": 0.10899372973775676,
      "learning_rate": 8.780735510173316e-08,
      "loss": 1.3914,
      "step": 605
    },
    {
      "epoch": 0.9090909090909091,
      "grad_norm": 0.10967422102767481,
      "learning_rate": 7.51151661691048e-08,
      "loss": 1.4328,
      "step": 610
    },
    {
      "epoch": 0.9165424739195231,
      "grad_norm": 0.1195412132912122,
      "learning_rate": 6.338982998183856e-08,
      "loss": 1.4603,
      "step": 615
    },
    {
      "epoch": 0.9239940387481371,
      "grad_norm": 0.11487548750982872,
      "learning_rate": 5.263930274328044e-08,
      "loss": 1.3718,
      "step": 620
    },
    {
      "epoch": 0.9314456035767511,
      "grad_norm": 0.10425903788261691,
      "learning_rate": 4.287087920212035e-08,
      "loss": 1.3577,
      "step": 625
    },
    {
      "epoch": 0.9388971684053651,
      "grad_norm": 0.10782320142147576,
      "learning_rate": 3.4091187702554485e-08,
      "loss": 1.4204,
      "step": 630
    },
    {
      "epoch": 0.9463487332339792,
      "grad_norm": 0.11831408481668713,
      "learning_rate": 2.630618568663584e-08,
      "loss": 1.3797,
      "step": 635
    },
    {
      "epoch": 0.9538002980625931,
      "grad_norm": 0.10973676611006619,
      "learning_rate": 1.9521155651863854e-08,
      "loss": 1.4409,
      "step": 640
    },
    {
      "epoch": 0.9612518628912071,
      "grad_norm": 0.1141554461719489,
      "learning_rate": 1.3740701566756276e-08,
      "loss": 1.3842,
      "step": 645
    },
    {
      "epoch": 0.9687034277198212,
      "grad_norm": 0.11403561646592845,
      "learning_rate": 8.968745746835983e-09,
      "loss": 1.3295,
      "step": 650
    },
    {
      "epoch": 0.9761549925484352,
      "grad_norm": 0.11455606102253561,
      "learning_rate": 5.208526193150764e-09,
      "loss": 1.434,
      "step": 655
    },
    {
      "epoch": 0.9836065573770492,
      "grad_norm": 0.10758938639100596,
      "learning_rate": 2.462594395134854e-09,
      "loss": 1.433,
      "step": 660
    },
    {
      "epoch": 0.9910581222056631,
      "grad_norm": 0.11199281621555614,
      "learning_rate": 7.328135993011631e-10,
      "loss": 1.4072,
      "step": 665
    },
    {
      "epoch": 0.9985096870342772,
      "grad_norm": 0.10839803620089897,
      "learning_rate": 2.035754493812103e-11,
      "loss": 1.4364,
      "step": 670
    },
    {
      "epoch": 1.0,
      "eval_loss": 1.4019134044647217,
      "eval_runtime": 563.36,
      "eval_samples_per_second": 4.235,
      "eval_steps_per_second": 0.133,
      "step": 671
    },
    {
      "epoch": 1.0,
      "step": 671,
      "total_flos": 2236028102377472.0,
      "train_loss": 1.4386844755817987,
      "train_runtime": 17062.4084,
      "train_samples_per_second": 1.258,
      "train_steps_per_second": 0.039
    }
  ],
  "logging_steps": 5,
  "max_steps": 671,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 200,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2236028102377472.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}