|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 14.952978056426332, |
|
"eval_steps": 500, |
|
"global_step": 2385, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.006269592476489028, |
|
"grad_norm": 5.8125, |
|
"learning_rate": 8.368200836820084e-07, |
|
"loss": 3.8384, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.03134796238244514, |
|
"grad_norm": 4.96875, |
|
"learning_rate": 4.184100418410042e-06, |
|
"loss": 3.8397, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.06269592476489028, |
|
"grad_norm": 5.6875, |
|
"learning_rate": 8.368200836820084e-06, |
|
"loss": 3.739, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.09404388714733543, |
|
"grad_norm": 4.15625, |
|
"learning_rate": 1.2552301255230125e-05, |
|
"loss": 3.6737, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.12539184952978055, |
|
"grad_norm": 3.375, |
|
"learning_rate": 1.6736401673640167e-05, |
|
"loss": 3.4019, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.15673981191222572, |
|
"grad_norm": 3.1875, |
|
"learning_rate": 2.092050209205021e-05, |
|
"loss": 3.1898, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.18808777429467086, |
|
"grad_norm": 2.0, |
|
"learning_rate": 2.510460251046025e-05, |
|
"loss": 2.9745, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.219435736677116, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 2.9288702928870294e-05, |
|
"loss": 2.7919, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.2507836990595611, |
|
"grad_norm": 7.0625, |
|
"learning_rate": 3.3472803347280334e-05, |
|
"loss": 2.6087, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.28213166144200624, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 3.765690376569038e-05, |
|
"loss": 2.4252, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.31347962382445144, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 4.184100418410042e-05, |
|
"loss": 2.2774, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.3448275862068966, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 4.602510460251046e-05, |
|
"loss": 2.1221, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.3761755485893417, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 5.02092050209205e-05, |
|
"loss": 2.0078, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.40752351097178685, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 5.4393305439330545e-05, |
|
"loss": 1.9116, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.438871473354232, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 5.857740585774059e-05, |
|
"loss": 1.8231, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.4702194357366771, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 6.276150627615063e-05, |
|
"loss": 1.7418, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.5015673981191222, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 6.694560669456067e-05, |
|
"loss": 1.6812, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.5329153605015674, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 7.11297071129707e-05, |
|
"loss": 1.6494, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.5642633228840125, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 7.531380753138076e-05, |
|
"loss": 1.6192, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.5956112852664577, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 7.949790794979079e-05, |
|
"loss": 1.6012, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.6269592476489029, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 8.368200836820084e-05, |
|
"loss": 1.5678, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.658307210031348, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 8.786610878661088e-05, |
|
"loss": 1.5555, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.6896551724137931, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 9.205020920502092e-05, |
|
"loss": 1.5386, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.7210031347962382, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 9.623430962343097e-05, |
|
"loss": 1.5285, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.7523510971786834, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.000100418410041841, |
|
"loss": 1.5008, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.7836990595611285, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 0.00010460251046025104, |
|
"loss": 1.485, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.8150470219435737, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 0.00010878661087866109, |
|
"loss": 1.4749, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.8463949843260188, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 0.00011297071129707113, |
|
"loss": 1.4928, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.877742946708464, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 0.00011715481171548118, |
|
"loss": 1.4692, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.9090909090909091, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 0.00012133891213389121, |
|
"loss": 1.4803, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.9404388714733543, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 0.00012552301255230126, |
|
"loss": 1.4609, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.9717868338557993, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.0001297071129707113, |
|
"loss": 1.4266, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.9968652037617555, |
|
"eval_loss": 1.9879965782165527, |
|
"eval_runtime": 0.5558, |
|
"eval_samples_per_second": 3.599, |
|
"eval_steps_per_second": 1.799, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 1.0031347962382444, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.00013389121338912134, |
|
"loss": 1.4278, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.0344827586206897, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 0.00013807531380753137, |
|
"loss": 1.4211, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.0658307210031348, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.0001422594142259414, |
|
"loss": 1.4012, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.09717868338558, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 0.00014644351464435147, |
|
"loss": 1.4179, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.1285266457680252, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 0.0001506276150627615, |
|
"loss": 1.4237, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.1598746081504703, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 0.00015481171548117155, |
|
"loss": 1.4272, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.1912225705329154, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 0.00015899581589958158, |
|
"loss": 1.4069, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.2225705329153604, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 0.00016317991631799162, |
|
"loss": 1.3882, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.2539184952978055, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 0.00016736401673640169, |
|
"loss": 1.376, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.2852664576802508, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 0.00017154811715481172, |
|
"loss": 1.3793, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.316614420062696, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.00017573221757322176, |
|
"loss": 1.3698, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.347962382445141, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.0001799163179916318, |
|
"loss": 1.3695, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.3793103448275863, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.00018410041841004183, |
|
"loss": 1.3761, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.4106583072100314, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.0001882845188284519, |
|
"loss": 1.3639, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.4420062695924765, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.00019246861924686193, |
|
"loss": 1.3572, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.4733542319749215, |
|
"grad_norm": 0.875, |
|
"learning_rate": 0.00019665271966527197, |
|
"loss": 1.3505, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.5047021943573666, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.00019999989284554375, |
|
"loss": 1.367, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.536050156739812, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.00019999614246368665, |
|
"loss": 1.3631, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.567398119122257, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 0.0001999870345886555, |
|
"loss": 1.3484, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.5987460815047023, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 0.00019997256970842288, |
|
"loss": 1.335, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.6300940438871474, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 0.00019995274859797366, |
|
"loss": 1.3461, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.6614420062695925, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 0.00019992757231926343, |
|
"loss": 1.3332, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.6927899686520376, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 0.00019989704222116167, |
|
"loss": 1.3424, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.7241379310344827, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.00019986115993937938, |
|
"loss": 1.3278, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.7554858934169277, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.00019981992739638148, |
|
"loss": 1.3329, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.786833855799373, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.00019977334680128394, |
|
"loss": 1.3246, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 1.8181818181818183, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 0.00019972142064973519, |
|
"loss": 1.3346, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.8495297805642634, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 0.00019966415172378255, |
|
"loss": 1.3236, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 1.8808777429467085, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 0.00019960154309172322, |
|
"loss": 1.3059, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.9122257053291536, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.00019953359810793978, |
|
"loss": 1.2962, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 1.9435736677115987, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 0.00019946032041272052, |
|
"loss": 1.3079, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.9749216300940438, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.0001993817139320644, |
|
"loss": 1.3029, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 1.9709800481796265, |
|
"eval_runtime": 0.5506, |
|
"eval_samples_per_second": 3.632, |
|
"eval_steps_per_second": 1.816, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 2.006269592476489, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.00019929778287747072, |
|
"loss": 1.3202, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.0376175548589344, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 0.00019920853174571347, |
|
"loss": 1.2698, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 2.0689655172413794, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 0.00019911396531860037, |
|
"loss": 1.2581, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.1003134796238245, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 0.00019901408866271678, |
|
"loss": 1.2547, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 2.1316614420062696, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.00019890890712915416, |
|
"loss": 1.275, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.1630094043887147, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 0.0001987984263532233, |
|
"loss": 1.2573, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 2.19435736677116, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 0.00019868265225415265, |
|
"loss": 1.2681, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.225705329153605, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.00019856159103477086, |
|
"loss": 1.2735, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 2.2570532915360504, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 0.00019843524918117475, |
|
"loss": 1.2757, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.2884012539184955, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 0.00019830363346238163, |
|
"loss": 1.2594, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 2.3197492163009406, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 0.00019816675092996665, |
|
"loss": 1.248, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.3510971786833856, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 0.000198024608917685, |
|
"loss": 1.2508, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 2.3824451410658307, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.00019787721504107916, |
|
"loss": 1.2488, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.413793103448276, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 0.00019772457719707053, |
|
"loss": 1.2454, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 2.445141065830721, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.0001975667035635367, |
|
"loss": 1.2667, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.476489028213166, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.00019740360259887308, |
|
"loss": 1.2558, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 2.507836990595611, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.00019723528304153984, |
|
"loss": 1.2674, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.5391849529780566, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 0.00019706175390959364, |
|
"loss": 1.2715, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 2.5705329153605017, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.00019688302450020446, |
|
"loss": 1.2679, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.6018808777429467, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 0.00019669910438915763, |
|
"loss": 1.2521, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 2.633228840125392, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.00019651000343034073, |
|
"loss": 1.2567, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.664576802507837, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 0.00019631573175521547, |
|
"loss": 1.2437, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 2.695924764890282, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.0001961162997722751, |
|
"loss": 1.242, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.7272727272727275, |
|
"grad_norm": 0.423828125, |
|
"learning_rate": 0.0001959117181664867, |
|
"loss": 1.2463, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 2.7586206896551726, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.00019570199789871863, |
|
"loss": 1.2465, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.7899686520376177, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.0001954871502051534, |
|
"loss": 1.24, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 2.8213166144200628, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 0.00019526718659668553, |
|
"loss": 1.2382, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.852664576802508, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.00019504211885830493, |
|
"loss": 1.2405, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 2.884012539184953, |
|
"grad_norm": 0.384765625, |
|
"learning_rate": 0.00019481195904846548, |
|
"loss": 1.2526, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.915360501567398, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 0.000194576719498439, |
|
"loss": 1.2287, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 2.946708463949843, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 0.0001943364128116545, |
|
"loss": 1.2388, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 2.978056426332288, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.00019409105186302293, |
|
"loss": 1.2414, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 2.9968652037617556, |
|
"eval_loss": 1.9793964624404907, |
|
"eval_runtime": 0.5545, |
|
"eval_samples_per_second": 3.607, |
|
"eval_steps_per_second": 1.804, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 3.0094043887147337, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00019384064979824752, |
|
"loss": 1.2176, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 3.040752351097179, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00019358522003311927, |
|
"loss": 1.187, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 3.072100313479624, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.0001933247762527984, |
|
"loss": 1.1861, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 3.103448275862069, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 0.00019305933241108085, |
|
"loss": 1.1895, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 3.134796238244514, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 0.00019278890272965096, |
|
"loss": 1.1912, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 3.166144200626959, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.00019251350169731935, |
|
"loss": 1.1844, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 3.197492163009404, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 0.00019223314406924673, |
|
"loss": 1.1933, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 3.2288401253918497, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 0.0001919478448661533, |
|
"loss": 1.1837, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 3.260188087774295, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.0001916576193735141, |
|
"loss": 1.1817, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 3.29153605015674, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.00019136248314073983, |
|
"loss": 1.1935, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 3.322884012539185, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 0.00019106245198034403, |
|
"loss": 1.1726, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 3.35423197492163, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 0.00019075754196709572, |
|
"loss": 1.1995, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 3.385579937304075, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.0001904477694371582, |
|
"loss": 1.1782, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 3.41692789968652, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 0.00019013315098721388, |
|
"loss": 1.2003, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 3.4482758620689653, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.00018981370347357493, |
|
"loss": 1.1869, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 3.479623824451411, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 0.00018948944401128034, |
|
"loss": 1.1821, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 3.510971786833856, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.00018916038997317887, |
|
"loss": 1.1851, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 3.542319749216301, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.0001888265589889981, |
|
"loss": 1.1873, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 3.573667711598746, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 0.00018848796894440031, |
|
"loss": 1.1952, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 3.605015673981191, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.00018814463798002372, |
|
"loss": 1.1829, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 3.6363636363636362, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.00018779658449051092, |
|
"loss": 1.1979, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 3.6677115987460818, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 0.00018744382712352318, |
|
"loss": 1.1867, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 3.699059561128527, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.00018708638477874144, |
|
"loss": 1.1933, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 3.730407523510972, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 0.00018672427660685364, |
|
"loss": 1.1699, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 3.761755485893417, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.00018635752200852877, |
|
"loss": 1.1757, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 3.793103448275862, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 0.00018598614063337744, |
|
"loss": 1.1991, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 3.824451410658307, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 0.00018561015237889895, |
|
"loss": 1.1871, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 3.8557993730407523, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.0001852295773894155, |
|
"loss": 1.1968, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 3.8871473354231973, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 0.00018484443605499266, |
|
"loss": 1.1792, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 3.9184952978056424, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.0001844547490103472, |
|
"loss": 1.2007, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 3.9498432601880875, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.0001840605371337413, |
|
"loss": 1.1966, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 3.981191222570533, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.00018366182154586406, |
|
"loss": 1.2012, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 2.0134387016296387, |
|
"eval_runtime": 0.5449, |
|
"eval_samples_per_second": 3.67, |
|
"eval_steps_per_second": 1.835, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 4.012539184952978, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.00018325862360869994, |
|
"loss": 1.1633, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 4.043887147335423, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 0.00018285096492438424, |
|
"loss": 1.1407, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 4.075235109717869, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 0.00018243886733404564, |
|
"loss": 1.1271, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 4.106583072100314, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 0.0001820223529166361, |
|
"loss": 1.1199, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 4.137931034482759, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.00018160144398774797, |
|
"loss": 1.1245, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 4.169278996865204, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 0.0001811761630984183, |
|
"loss": 1.1182, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 4.200626959247649, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 0.00018074653303392063, |
|
"loss": 1.1331, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 4.231974921630094, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 0.0001803125768125443, |
|
"loss": 1.1308, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 4.263322884012539, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.0001798743176843611, |
|
"loss": 1.1312, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 4.294670846394984, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 0.00017943177912997971, |
|
"loss": 1.1162, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 4.326018808777429, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 0.00017898498485928763, |
|
"loss": 1.1379, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 4.3573667711598745, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.00017853395881018073, |
|
"loss": 1.1399, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 4.38871473354232, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 0.00017807872514728106, |
|
"loss": 1.1272, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 4.420062695924765, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 0.00017761930826064182, |
|
"loss": 1.1293, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 4.45141065830721, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 0.00017715573276444086, |
|
"loss": 1.1315, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 4.482758620689655, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 0.0001766880234956619, |
|
"loss": 1.1355, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 4.514106583072101, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 0.00017621620551276366, |
|
"loss": 1.1434, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 4.545454545454545, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 0.00017574030409433751, |
|
"loss": 1.1433, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 4.576802507836991, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 0.00017526034473775307, |
|
"loss": 1.1341, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 4.608150470219436, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 0.00017477635315779204, |
|
"loss": 1.1352, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 4.639498432601881, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 0.0001742883552852706, |
|
"loss": 1.1428, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 4.670846394984326, |
|
"grad_norm": 0.439453125, |
|
"learning_rate": 0.00017379637726564994, |
|
"loss": 1.1337, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 4.702194357366771, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 0.00017330044545763574, |
|
"loss": 1.1469, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 4.733542319749216, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 0.00017280058643176578, |
|
"loss": 1.1318, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 4.764890282131661, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.00017229682696898624, |
|
"loss": 1.1402, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 4.7962382445141065, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00017178919405921717, |
|
"loss": 1.1288, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 4.827586206896552, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 0.00017127771489990613, |
|
"loss": 1.1298, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 4.858934169278997, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 0.00017076241689457136, |
|
"loss": 1.1386, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 4.890282131661442, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 0.00017024332765133325, |
|
"loss": 1.14, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 4.921630094043887, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 0.00016972047498143544, |
|
"loss": 1.1444, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 4.952978056426332, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 0.00016919388689775464, |
|
"loss": 1.1466, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 4.984326018808778, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 0.0001686635916132998, |
|
"loss": 1.1513, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 4.996865203761756, |
|
"eval_loss": 2.0582528114318848, |
|
"eval_runtime": 0.5554, |
|
"eval_samples_per_second": 3.601, |
|
"eval_steps_per_second": 1.801, |
|
"step": 797 |
|
}, |
|
{ |
|
"epoch": 5.015673981191223, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00016812961753970054, |
|
"loss": 1.1118, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 5.047021943573668, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.00016759199328568504, |
|
"loss": 1.0654, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 5.078369905956113, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.00016705074765554717, |
|
"loss": 1.0557, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 5.109717868338558, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 0.0001665059096476032, |
|
"loss": 1.0685, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 5.141065830721003, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 0.00016595750845263825, |
|
"loss": 1.073, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 5.172413793103448, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 0.00016540557345234237, |
|
"loss": 1.0784, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 5.2037617554858935, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 0.00016485013421773615, |
|
"loss": 1.0628, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 5.235109717868339, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 0.00016429122050758672, |
|
"loss": 1.0822, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 5.266457680250784, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.00016372886226681302, |
|
"loss": 1.0748, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 5.297805642633229, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 0.00016316308962488173, |
|
"loss": 1.0867, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 5.329153605015674, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.00016259393289419277, |
|
"loss": 1.0796, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 5.360501567398119, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 0.00016202142256845553, |
|
"loss": 1.0896, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 5.391849529780564, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 0.00016144558932105473, |
|
"loss": 1.0802, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 5.423197492163009, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 0.00016086646400340757, |
|
"loss": 1.0688, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 5.454545454545454, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 0.00016028407764331014, |
|
"loss": 1.0836, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 5.485893416927899, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.00015969846144327574, |
|
"loss": 1.0807, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 5.517241379310345, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 0.0001591096467788625, |
|
"loss": 1.0957, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 5.54858934169279, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 0.00015851766519699295, |
|
"loss": 1.0724, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 5.579937304075235, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 0.00015792254841426328, |
|
"loss": 1.0989, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 5.61128526645768, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 0.00015732432831524448, |
|
"loss": 1.0886, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 5.6426332288401255, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.00015672303695077398, |
|
"loss": 1.0961, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 5.673981191222571, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.00015611870653623825, |
|
"loss": 1.0964, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 5.705329153605016, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 0.00015551136944984699, |
|
"loss": 1.0895, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 5.736677115987461, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 0.0001549010582308984, |
|
"loss": 1.0814, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 5.768025078369906, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 0.00015428780557803567, |
|
"loss": 1.0926, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 5.799373040752351, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.00015367164434749534, |
|
"loss": 1.0849, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 5.830721003134796, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.00015305260755134667, |
|
"loss": 1.0934, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 5.862068965517241, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 0.00015243072835572318, |
|
"loss": 1.0942, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 5.893416927899686, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 0.0001518060400790456, |
|
"loss": 1.0832, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 5.924764890282132, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 0.00015117857619023677, |
|
"loss": 1.0944, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 5.956112852664576, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 0.00015054837030692854, |
|
"loss": 1.0972, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 5.987460815047022, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 0.00014991545619366054, |
|
"loss": 1.0951, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 2.108396291732788, |
|
"eval_runtime": 0.554, |
|
"eval_samples_per_second": 3.61, |
|
"eval_steps_per_second": 1.805, |
|
"step": 957 |
|
}, |
|
{ |
|
"epoch": 6.018808777429467, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 0.00014927986776007128, |
|
"loss": 1.054, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 6.0501567398119125, |
|
"grad_norm": 0.45703125, |
|
"learning_rate": 0.00014864163905908132, |
|
"loss": 1.0222, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 6.081504702194358, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 0.00014800080428506882, |
|
"loss": 1.0209, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 6.112852664576803, |
|
"grad_norm": 0.384765625, |
|
"learning_rate": 0.00014735739777203745, |
|
"loss": 1.0167, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 6.144200626959248, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.000146711453991777, |
|
"loss": 1.0113, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 6.175548589341693, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 0.00014606300755201645, |
|
"loss": 1.019, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 6.206896551724138, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 0.00014541209319456972, |
|
"loss": 1.0317, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 6.238244514106583, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.00014475874579347435, |
|
"loss": 1.0342, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 6.269592476489028, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.00014410300035312302, |
|
"loss": 1.0258, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 6.300940438871473, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.00014344489200638827, |
|
"loss": 1.0393, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 6.332288401253918, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.00014278445601274, |
|
"loss": 1.038, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 6.363636363636363, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.00014212172775635633, |
|
"loss": 1.0334, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 6.394984326018808, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 0.0001414567427442282, |
|
"loss": 1.0272, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 6.4263322884012535, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.00014078953660425652, |
|
"loss": 1.0298, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 6.4576802507836994, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 0.00014012014508334365, |
|
"loss": 1.0337, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 6.4890282131661445, |
|
"grad_norm": 0.416015625, |
|
"learning_rate": 0.00013944860404547816, |
|
"loss": 1.0285, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 6.52037617554859, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 0.00013877494946981314, |
|
"loss": 1.041, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 6.551724137931035, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 0.00013809921744873885, |
|
"loss": 1.0319, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 6.58307210031348, |
|
"grad_norm": 0.416015625, |
|
"learning_rate": 0.0001374214441859487, |
|
"loss": 1.0311, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 6.614420062695925, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 0.00013674166599449977, |
|
"loss": 1.0299, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 6.64576802507837, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 0.0001360599192948673, |
|
"loss": 1.0435, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 6.677115987460815, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 0.00013537624061299303, |
|
"loss": 1.0342, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 6.70846394984326, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 0.0001346906665783288, |
|
"loss": 1.0426, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 6.739811912225705, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 0.00013400323392187357, |
|
"loss": 1.0424, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 6.77115987460815, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 0.00013331397947420576, |
|
"loss": 1.0644, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 6.802507836990595, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 0.00013262294016350986, |
|
"loss": 1.0373, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 6.83385579937304, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 0.000131930153013598, |
|
"loss": 1.0423, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 6.8652037617554855, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 0.00013123565514192625, |
|
"loss": 1.0421, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 6.896551724137931, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 0.00013053948375760604, |
|
"loss": 1.04, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 6.927899686520377, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 0.00012984167615941056, |
|
"loss": 1.0378, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 6.959247648902822, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 0.00012914226973377644, |
|
"loss": 1.0383, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 6.990595611285267, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 0.00012844130195280076, |
|
"loss": 1.0414, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 6.996865203761756, |
|
"eval_loss": 2.2094361782073975, |
|
"eval_runtime": 0.5549, |
|
"eval_samples_per_second": 3.604, |
|
"eval_steps_per_second": 1.802, |
|
"step": 1116 |
|
}, |
|
{ |
|
"epoch": 7.021943573667712, |
|
"grad_norm": 0.416015625, |
|
"learning_rate": 0.0001277388103722332, |
|
"loss": 0.9864, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 7.053291536050157, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 0.00012703483262946415, |
|
"loss": 0.9734, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 7.084639498432602, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 0.000126329406441508, |
|
"loss": 0.9689, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 7.115987460815047, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 0.00012562256960298266, |
|
"loss": 0.9804, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 7.147335423197492, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 0.0001249143599840843, |
|
"loss": 0.9741, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 7.178683385579937, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.00012420481552855863, |
|
"loss": 0.9766, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 7.210031347962382, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 0.00012349397425166786, |
|
"loss": 0.9763, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 7.241379310344827, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 0.000122781874238154, |
|
"loss": 0.9791, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 7.2727272727272725, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 0.00012206855364019845, |
|
"loss": 0.9773, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 7.304075235109718, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 0.00012135405067537777, |
|
"loss": 0.9873, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 7.335423197492163, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 0.0001206384036246162, |
|
"loss": 0.9888, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 7.366771159874608, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 0.0001199216508301348, |
|
"loss": 0.9731, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 7.398119122257054, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 0.00011920383069339684, |
|
"loss": 0.9975, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 7.429467084639499, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 0.00011848498167305078, |
|
"loss": 0.9835, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 7.460815047021944, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 0.0001177651422828695, |
|
"loss": 0.9779, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 7.492163009404389, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 0.00011704435108968688, |
|
"loss": 0.9782, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 7.523510971786834, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 0.00011632264671133162, |
|
"loss": 0.9797, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 7.554858934169279, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 0.00011560006781455812, |
|
"loss": 0.9956, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 7.586206896551724, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 0.00011487665311297484, |
|
"loss": 0.9923, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 7.617554858934169, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 0.00011415244136497013, |
|
"loss": 0.9866, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 7.648902821316614, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 0.00011342747137163572, |
|
"loss": 0.9932, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 7.6802507836990594, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 0.00011270178197468789, |
|
"loss": 0.9841, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 7.7115987460815045, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 0.00011197541205438634, |
|
"loss": 0.9863, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 7.74294670846395, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 0.0001112484005274512, |
|
"loss": 0.9951, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 7.774294670846395, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 0.00011052078634497796, |
|
"loss": 0.9847, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 7.80564263322884, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 0.00010979260849035054, |
|
"loss": 0.9868, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 7.836990595611285, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 0.00010906390597715282, |
|
"loss": 0.9874, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 7.868338557993731, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 0.00010833471784707824, |
|
"loss": 0.9928, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 7.899686520376176, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 0.00010760508316783808, |
|
"loss": 1.0034, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 7.931034482758621, |
|
"grad_norm": 0.416015625, |
|
"learning_rate": 0.00010687504103106854, |
|
"loss": 0.9844, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 7.962382445141066, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 0.000106144630550236, |
|
"loss": 0.986, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 7.993730407523511, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 0.00010541389085854176, |
|
"loss": 1.0041, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 2.304290294647217, |
|
"eval_runtime": 0.554, |
|
"eval_samples_per_second": 3.61, |
|
"eval_steps_per_second": 1.805, |
|
"step": 1276 |
|
}, |
|
{ |
|
"epoch": 8.025078369905955, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 0.00010468286110682517, |
|
"loss": 0.9349, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 8.056426332288401, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 0.00010395158046146606, |
|
"loss": 0.915, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 8.087774294670846, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 0.00010322008810228657, |
|
"loss": 0.935, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 8.119122257053291, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 0.00010248842322045164, |
|
"loss": 0.9215, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 8.150470219435737, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 0.0001017566250163696, |
|
"loss": 0.9316, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 8.181818181818182, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 0.00010102473269759171, |
|
"loss": 0.9211, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 8.213166144200628, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 0.00010029278547671161, |
|
"loss": 0.9244, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 8.244514106583072, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 9.956082256926448e-05, |
|
"loss": 0.9338, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 8.275862068965518, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 9.88288831916259e-05, |
|
"loss": 0.9279, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 8.307210031347962, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 9.80970065589108e-05, |
|
"loss": 0.9312, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 8.338557993730408, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 9.73652318828724e-05, |
|
"loss": 0.9378, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 8.369905956112852, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 9.663359836980144e-05, |
|
"loss": 0.934, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 8.401253918495298, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 9.590214521842556e-05, |
|
"loss": 0.9366, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 8.432601880877742, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 9.517091161780914e-05, |
|
"loss": 0.9317, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 8.463949843260188, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 9.443993674525368e-05, |
|
"loss": 0.9535, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 8.495297805642632, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 9.370925976419885e-05, |
|
"loss": 0.9418, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 8.526645768025078, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 9.297891982212415e-05, |
|
"loss": 0.9457, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 8.557993730407524, |
|
"grad_norm": 0.5, |
|
"learning_rate": 9.224895604845156e-05, |
|
"loss": 0.9307, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 8.589341692789969, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 9.151940755244912e-05, |
|
"loss": 0.9359, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 8.620689655172415, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 9.07903134211354e-05, |
|
"loss": 0.9451, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 8.652037617554859, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 9.006171271718566e-05, |
|
"loss": 0.9396, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 8.683385579937305, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 8.933364447683868e-05, |
|
"loss": 0.9376, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 8.714733542319749, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 8.860614770780553e-05, |
|
"loss": 0.9465, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 8.746081504702195, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 8.787926138717943e-05, |
|
"loss": 0.9391, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 8.77742946708464, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 8.715302445934773e-05, |
|
"loss": 0.9545, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 8.808777429467085, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 8.642747583390521e-05, |
|
"loss": 0.9418, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 8.84012539184953, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 8.570265438356948e-05, |
|
"loss": 0.9383, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 8.871473354231975, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 8.497859894209828e-05, |
|
"loss": 0.9524, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 8.90282131661442, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 8.425534830220893e-05, |
|
"loss": 0.9504, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 8.934169278996865, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 8.353294121349992e-05, |
|
"loss": 0.9448, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 8.96551724137931, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 8.281141638037464e-05, |
|
"loss": 0.9385, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 8.996865203761756, |
|
"grad_norm": 0.439453125, |
|
"learning_rate": 8.209081245996807e-05, |
|
"loss": 0.9481, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 8.996865203761756, |
|
"eval_loss": 2.398902416229248, |
|
"eval_runtime": 0.5512, |
|
"eval_samples_per_second": 3.628, |
|
"eval_steps_per_second": 1.814, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 9.0282131661442, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 8.137116806007531e-05, |
|
"loss": 0.8853, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 9.059561128526646, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 8.065252173708333e-05, |
|
"loss": 0.8874, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 9.090909090909092, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 7.993491199390507e-05, |
|
"loss": 0.8784, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 9.122257053291536, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 7.921837727791673e-05, |
|
"loss": 0.8917, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 9.153605015673982, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 7.85029559788976e-05, |
|
"loss": 0.8781, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 9.184952978056426, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 7.778868642697359e-05, |
|
"loss": 0.8851, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 9.216300940438872, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 7.707560689056343e-05, |
|
"loss": 0.8892, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 9.247648902821316, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 7.636375557432835e-05, |
|
"loss": 0.8863, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 9.278996865203762, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 7.565317061712525e-05, |
|
"loss": 0.8907, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 9.310344827586206, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 7.494389008996327e-05, |
|
"loss": 0.8906, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 9.341692789968652, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 7.423595199396419e-05, |
|
"loss": 0.8987, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 9.373040752351097, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 7.35293942583263e-05, |
|
"loss": 0.8996, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 9.404388714733543, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 7.282425473829236e-05, |
|
"loss": 0.8985, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 9.435736677115987, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 7.212057121312133e-05, |
|
"loss": 0.8923, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 9.467084639498433, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 7.141838138406438e-05, |
|
"loss": 0.8873, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 9.498432601880877, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 7.071772287234497e-05, |
|
"loss": 0.8872, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 9.529780564263323, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 7.001863321714309e-05, |
|
"loss": 0.8988, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 9.561128526645769, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 6.932114987358413e-05, |
|
"loss": 0.895, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 9.592476489028213, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 6.862531021073222e-05, |
|
"loss": 0.8905, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 9.623824451410659, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 6.79311515095878e-05, |
|
"loss": 0.9014, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 9.655172413793103, |
|
"grad_norm": 0.625, |
|
"learning_rate": 6.723871096109064e-05, |
|
"loss": 0.9016, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 9.68652037617555, |
|
"grad_norm": 0.5, |
|
"learning_rate": 6.654802566412697e-05, |
|
"loss": 0.9134, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 9.717868338557993, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 6.585913262354184e-05, |
|
"loss": 0.9018, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 9.74921630094044, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 6.51720687481567e-05, |
|
"loss": 0.8992, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 9.780564263322884, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 6.448687084879175e-05, |
|
"loss": 0.9016, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 9.81191222570533, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 6.380357563629381e-05, |
|
"loss": 0.8973, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 9.843260188087774, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 6.312221971956944e-05, |
|
"loss": 0.8979, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 9.87460815047022, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 6.24428396036236e-05, |
|
"loss": 0.8956, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 9.905956112852664, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 6.176547168760373e-05, |
|
"loss": 0.9019, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 9.93730407523511, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 6.109015226284961e-05, |
|
"loss": 0.9004, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 9.968652037617554, |
|
"grad_norm": 0.45703125, |
|
"learning_rate": 6.041691751094908e-05, |
|
"loss": 0.8983, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 5.974580350179938e-05, |
|
"loss": 0.9006, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_loss": 2.5172829627990723, |
|
"eval_runtime": 0.5456, |
|
"eval_samples_per_second": 3.666, |
|
"eval_steps_per_second": 1.833, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 10.031347962382446, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 5.9076846191674803e-05, |
|
"loss": 0.8494, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 10.06269592476489, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 5.8410081421300154e-05, |
|
"loss": 0.8491, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 10.094043887147336, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 5.7745544913930496e-05, |
|
"loss": 0.8479, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 10.12539184952978, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 5.7083272273437346e-05, |
|
"loss": 0.8561, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 10.156739811912226, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 5.642329898240089e-05, |
|
"loss": 0.8459, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 10.18808777429467, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 5.5765660400209174e-05, |
|
"loss": 0.8513, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 10.219435736677116, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 5.511039176116357e-05, |
|
"loss": 0.8604, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 10.25078369905956, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 5.44575281725909e-05, |
|
"loss": 0.8602, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 10.282131661442007, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 5.3807104612962676e-05, |
|
"loss": 0.8559, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 10.31347962382445, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 5.3159155930021e-05, |
|
"loss": 0.8642, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 10.344827586206897, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 5.251371683891146e-05, |
|
"loss": 0.8565, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 10.376175548589341, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 5.1870821920323275e-05, |
|
"loss": 0.8513, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 10.407523510971787, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 5.123050561863657e-05, |
|
"loss": 0.8552, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 10.438871473354231, |
|
"grad_norm": 0.5, |
|
"learning_rate": 5.05928022400768e-05, |
|
"loss": 0.8521, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 10.470219435736677, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 4.9957745950876945e-05, |
|
"loss": 0.8661, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 10.501567398119121, |
|
"grad_norm": 0.494140625, |
|
"learning_rate": 4.9325370775446864e-05, |
|
"loss": 0.8551, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 10.532915360501567, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 4.869571059455039e-05, |
|
"loss": 0.864, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 10.564263322884013, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 4.806879914349009e-05, |
|
"loss": 0.8631, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 10.595611285266457, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 4.74446700102998e-05, |
|
"loss": 0.8589, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 10.626959247648903, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 4.6823356633945136e-05, |
|
"loss": 0.8682, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 10.658307210031348, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 4.620489230253198e-05, |
|
"loss": 0.8628, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 10.689655172413794, |
|
"grad_norm": 0.5, |
|
"learning_rate": 4.558931015152288e-05, |
|
"loss": 0.868, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 10.721003134796238, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 4.497664316196175e-05, |
|
"loss": 0.8608, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 10.752351097178684, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 4.4366924158707014e-05, |
|
"loss": 0.8676, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 10.783699059561128, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 4.3760185808672784e-05, |
|
"loss": 0.8652, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 10.815047021943574, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 4.315646061907872e-05, |
|
"loss": 0.8578, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 10.846394984326018, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 4.25557809357084e-05, |
|
"loss": 0.856, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 10.877742946708464, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 4.195817894117635e-05, |
|
"loss": 0.862, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 10.909090909090908, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 4.136368665320366e-05, |
|
"loss": 0.8602, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 10.940438871473354, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 4.0772335922902784e-05, |
|
"loss": 0.8572, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 10.971786833855798, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 4.0184158433070937e-05, |
|
"loss": 0.8626, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 10.996865203761756, |
|
"eval_loss": 2.6419336795806885, |
|
"eval_runtime": 0.557, |
|
"eval_samples_per_second": 3.591, |
|
"eval_steps_per_second": 1.795, |
|
"step": 1754 |
|
}, |
    {
      "epoch": 11.003134796238244,
      "grad_norm": 0.4765625,
      "learning_rate": 3.9599185696492544e-05,
      "loss": 0.8655,
      "step": 1755
    },
    {
      "epoch": 11.03448275862069,
      "grad_norm": 0.4765625,
      "learning_rate": 3.9017449054251055e-05,
      "loss": 0.8346,
      "step": 1760
    },
    {
      "epoch": 11.065830721003135,
      "grad_norm": 0.494140625,
      "learning_rate": 3.843897967404968e-05,
      "loss": 0.8387,
      "step": 1765
    },
    {
      "epoch": 11.09717868338558,
      "grad_norm": 0.498046875,
      "learning_rate": 3.7863808548541535e-05,
      "loss": 0.8205,
      "step": 1770
    },
    {
      "epoch": 11.128526645768025,
      "grad_norm": 0.470703125,
      "learning_rate": 3.729196649366914e-05,
      "loss": 0.8316,
      "step": 1775
    },
    {
      "epoch": 11.15987460815047,
      "grad_norm": 0.4921875,
      "learning_rate": 3.672348414701341e-05,
      "loss": 0.8391,
      "step": 1780
    },
    {
      "epoch": 11.191222570532915,
      "grad_norm": 0.484375,
      "learning_rate": 3.615839196615217e-05,
      "loss": 0.8264,
      "step": 1785
    },
    {
      "epoch": 11.22257053291536,
      "grad_norm": 0.4765625,
      "learning_rate": 3.5596720227028376e-05,
      "loss": 0.831,
      "step": 1790
    },
    {
      "epoch": 11.253918495297805,
      "grad_norm": 0.484375,
      "learning_rate": 3.503849902232792e-05,
      "loss": 0.8312,
      "step": 1795
    },
    {
      "epoch": 11.285266457680251,
      "grad_norm": 0.48046875,
      "learning_rate": 3.448375825986741e-05,
      "loss": 0.8382,
      "step": 1800
    },
    {
      "epoch": 11.316614420062695,
      "grad_norm": 0.4921875,
      "learning_rate": 3.393252766099187e-05,
      "loss": 0.8166,
      "step": 1805
    },
    {
      "epoch": 11.347962382445141,
      "grad_norm": 0.482421875,
      "learning_rate": 3.338483675898227e-05,
      "loss": 0.8285,
      "step": 1810
    },
    {
      "epoch": 11.379310344827585,
      "grad_norm": 0.48046875,
      "learning_rate": 3.284071489747325e-05,
      "loss": 0.8384,
      "step": 1815
    },
    {
      "epoch": 11.410658307210031,
      "grad_norm": 0.48046875,
      "learning_rate": 3.230019122888094e-05,
      "loss": 0.8332,
      "step": 1820
    },
    {
      "epoch": 11.442006269592476,
      "grad_norm": 0.51171875,
      "learning_rate": 3.176329471284113e-05,
      "loss": 0.8301,
      "step": 1825
    },
    {
      "epoch": 11.473354231974922,
      "grad_norm": 0.49609375,
      "learning_rate": 3.123005411465766e-05,
      "loss": 0.8411,
      "step": 1830
    },
    {
      "epoch": 11.504702194357368,
      "grad_norm": 0.4765625,
      "learning_rate": 3.070049800376127e-05,
      "loss": 0.8308,
      "step": 1835
    },
    {
      "epoch": 11.536050156739812,
      "grad_norm": 0.498046875,
      "learning_rate": 3.01746547521789e-05,
      "loss": 0.8285,
      "step": 1840
    },
    {
      "epoch": 11.567398119122258,
      "grad_norm": 0.486328125,
      "learning_rate": 2.96525525330136e-05,
      "loss": 0.835,
      "step": 1845
    },
    {
      "epoch": 11.598746081504702,
      "grad_norm": 0.5,
      "learning_rate": 2.9134219318935228e-05,
      "loss": 0.8454,
      "step": 1850
    },
    {
      "epoch": 11.630094043887148,
      "grad_norm": 0.5,
      "learning_rate": 2.8619682880681596e-05,
      "loss": 0.8331,
      "step": 1855
    },
    {
      "epoch": 11.661442006269592,
      "grad_norm": 0.486328125,
      "learning_rate": 2.8108970785570698e-05,
      "loss": 0.8363,
      "step": 1860
    },
    {
      "epoch": 11.692789968652038,
      "grad_norm": 0.482421875,
      "learning_rate": 2.7602110396023673e-05,
      "loss": 0.8324,
      "step": 1865
    },
    {
      "epoch": 11.724137931034482,
      "grad_norm": 0.486328125,
      "learning_rate": 2.7099128868098846e-05,
      "loss": 0.8368,
      "step": 1870
    },
    {
      "epoch": 11.755485893416928,
      "grad_norm": 0.48828125,
      "learning_rate": 2.6600053150036797e-05,
      "loss": 0.834,
      "step": 1875
    },
    {
      "epoch": 11.786833855799372,
      "grad_norm": 0.48046875,
      "learning_rate": 2.610490998081653e-05,
      "loss": 0.8374,
      "step": 1880
    },
    {
      "epoch": 11.818181818181818,
      "grad_norm": 0.49609375,
      "learning_rate": 2.5613725888722828e-05,
      "loss": 0.8436,
      "step": 1885
    },
    {
      "epoch": 11.849529780564263,
      "grad_norm": 0.482421875,
      "learning_rate": 2.5126527189925076e-05,
      "loss": 0.8318,
      "step": 1890
    },
    {
      "epoch": 11.880877742946709,
      "grad_norm": 0.482421875,
      "learning_rate": 2.464333998706726e-05,
      "loss": 0.8339,
      "step": 1895
    },
    {
      "epoch": 11.912225705329153,
      "grad_norm": 0.48828125,
      "learning_rate": 2.416419016786936e-05,
      "loss": 0.844,
      "step": 1900
    },
    {
      "epoch": 11.943573667711599,
      "grad_norm": 0.490234375,
      "learning_rate": 2.3689103403740543e-05,
      "loss": 0.8424,
      "step": 1905
    },
    {
      "epoch": 11.974921630094045,
      "grad_norm": 0.478515625,
      "learning_rate": 2.3218105148403656e-05,
      "loss": 0.8351,
      "step": 1910
    },
    {
      "epoch": 12.0,
      "eval_loss": 2.7330658435821533,
      "eval_runtime": 0.544,
      "eval_samples_per_second": 3.676,
      "eval_steps_per_second": 1.838,
      "step": 1914
    },
    {
      "epoch": 12.006269592476489,
      "grad_norm": 0.47265625,
      "learning_rate": 2.2751220636531522e-05,
      "loss": 0.8286,
      "step": 1915
    },
    {
      "epoch": 12.037617554858935,
      "grad_norm": 0.48046875,
      "learning_rate": 2.2288474882394917e-05,
      "loss": 0.8207,
      "step": 1920
    },
    {
      "epoch": 12.068965517241379,
      "grad_norm": 0.482421875,
      "learning_rate": 2.1829892678522458e-05,
      "loss": 0.8146,
      "step": 1925
    },
    {
      "epoch": 12.100313479623825,
      "grad_norm": 0.48828125,
      "learning_rate": 2.1375498594372113e-05,
      "loss": 0.8151,
      "step": 1930
    },
    {
      "epoch": 12.13166144200627,
      "grad_norm": 0.498046875,
      "learning_rate": 2.0925316975015087e-05,
      "loss": 0.8178,
      "step": 1935
    },
    {
      "epoch": 12.163009404388715,
      "grad_norm": 0.478515625,
      "learning_rate": 2.0479371939831325e-05,
      "loss": 0.8197,
      "step": 1940
    },
    {
      "epoch": 12.19435736677116,
      "grad_norm": 0.4765625,
      "learning_rate": 2.003768738121732e-05,
      "loss": 0.8224,
      "step": 1945
    },
    {
      "epoch": 12.225705329153605,
      "grad_norm": 0.494140625,
      "learning_rate": 1.9600286963305957e-05,
      "loss": 0.8195,
      "step": 1950
    },
    {
      "epoch": 12.25705329153605,
      "grad_norm": 0.486328125,
      "learning_rate": 1.9167194120698795e-05,
      "loss": 0.8232,
      "step": 1955
    },
    {
      "epoch": 12.288401253918495,
      "grad_norm": 0.478515625,
      "learning_rate": 1.87384320572104e-05,
      "loss": 0.8164,
      "step": 1960
    },
    {
      "epoch": 12.31974921630094,
      "grad_norm": 0.4921875,
      "learning_rate": 1.8314023744625208e-05,
      "loss": 0.8123,
      "step": 1965
    },
    {
      "epoch": 12.351097178683386,
      "grad_norm": 0.478515625,
      "learning_rate": 1.789399192146678e-05,
      "loss": 0.824,
      "step": 1970
    },
    {
      "epoch": 12.38244514106583,
      "grad_norm": 0.498046875,
      "learning_rate": 1.7478359091779394e-05,
      "loss": 0.8155,
      "step": 1975
    },
    {
      "epoch": 12.413793103448276,
      "grad_norm": 0.484375,
      "learning_rate": 1.706714752392259e-05,
      "loss": 0.8314,
      "step": 1980
    },
    {
      "epoch": 12.445141065830722,
      "grad_norm": 0.490234375,
      "learning_rate": 1.666037924937791e-05,
      "loss": 0.8257,
      "step": 1985
    },
    {
      "epoch": 12.476489028213166,
      "grad_norm": 0.48046875,
      "learning_rate": 1.6258076061568582e-05,
      "loss": 0.8244,
      "step": 1990
    },
    {
      "epoch": 12.507836990595612,
      "grad_norm": 0.490234375,
      "learning_rate": 1.5860259514691933e-05,
      "loss": 0.8147,
      "step": 1995
    },
    {
      "epoch": 12.539184952978056,
      "grad_norm": 0.478515625,
      "learning_rate": 1.5466950922564426e-05,
      "loss": 0.8277,
      "step": 2000
    },
    {
      "epoch": 12.570532915360502,
      "grad_norm": 0.48828125,
      "learning_rate": 1.5078171357479942e-05,
      "loss": 0.8243,
      "step": 2005
    },
    {
      "epoch": 12.601880877742946,
      "grad_norm": 0.48828125,
      "learning_rate": 1.4693941649080655e-05,
      "loss": 0.8269,
      "step": 2010
    },
    {
      "epoch": 12.633228840125392,
      "grad_norm": 0.498046875,
      "learning_rate": 1.4314282383241096e-05,
      "loss": 0.8155,
      "step": 2015
    },
    {
      "epoch": 12.664576802507836,
      "grad_norm": 0.490234375,
      "learning_rate": 1.3939213900965132e-05,
      "loss": 0.8249,
      "step": 2020
    },
    {
      "epoch": 12.695924764890282,
      "grad_norm": 0.48046875,
      "learning_rate": 1.3568756297296292e-05,
      "loss": 0.8218,
      "step": 2025
    },
    {
      "epoch": 12.727272727272727,
      "grad_norm": 0.486328125,
      "learning_rate": 1.3202929420241051e-05,
      "loss": 0.8158,
      "step": 2030
    },
    {
      "epoch": 12.758620689655173,
      "grad_norm": 0.4921875,
      "learning_rate": 1.284175286970546e-05,
      "loss": 0.8216,
      "step": 2035
    },
    {
      "epoch": 12.789968652037617,
      "grad_norm": 0.486328125,
      "learning_rate": 1.2485245996445006e-05,
      "loss": 0.8241,
      "step": 2040
    },
    {
      "epoch": 12.821316614420063,
      "grad_norm": 0.494140625,
      "learning_rate": 1.2133427901027917e-05,
      "loss": 0.8241,
      "step": 2045
    },
    {
      "epoch": 12.852664576802507,
      "grad_norm": 0.498046875,
      "learning_rate": 1.1786317432811767e-05,
      "loss": 0.8234,
      "step": 2050
    },
    {
      "epoch": 12.884012539184953,
      "grad_norm": 0.490234375,
      "learning_rate": 1.1443933188933553e-05,
      "loss": 0.8206,
      "step": 2055
    },
    {
      "epoch": 12.915360501567399,
      "grad_norm": 0.486328125,
      "learning_rate": 1.1106293513313436e-05,
      "loss": 0.8188,
      "step": 2060
    },
    {
      "epoch": 12.946708463949843,
      "grad_norm": 0.478515625,
      "learning_rate": 1.0773416495671773e-05,
      "loss": 0.8234,
      "step": 2065
    },
    {
      "epoch": 12.978056426332289,
      "grad_norm": 0.484375,
      "learning_rate": 1.0445319970560041e-05,
      "loss": 0.8265,
      "step": 2070
    },
    {
      "epoch": 12.996865203761756,
      "eval_loss": 2.783811569213867,
      "eval_runtime": 0.5489,
      "eval_samples_per_second": 3.643,
      "eval_steps_per_second": 1.822,
      "step": 2073
    },
    {
      "epoch": 13.009404388714733,
      "grad_norm": 0.470703125,
      "learning_rate": 1.0122021516405278e-05,
      "loss": 0.8204,
      "step": 2075
    },
    {
      "epoch": 13.04075235109718,
      "grad_norm": 0.4921875,
      "learning_rate": 9.803538454568284e-06,
      "loss": 0.8004,
      "step": 2080
    },
    {
      "epoch": 13.072100313479623,
      "grad_norm": 0.498046875,
      "learning_rate": 9.489887848415569e-06,
      "loss": 0.8145,
      "step": 2085
    },
    {
      "epoch": 13.10344827586207,
      "grad_norm": 0.48828125,
      "learning_rate": 9.1810865024052e-06,
      "loss": 0.8177,
      "step": 2090
    },
    {
      "epoch": 13.134796238244514,
      "grad_norm": 0.48046875,
      "learning_rate": 8.87715096118642e-06,
      "loss": 0.8189,
      "step": 2095
    },
    {
      "epoch": 13.16614420062696,
      "grad_norm": 0.48046875,
      "learning_rate": 8.578097508713279e-06,
      "loss": 0.8142,
      "step": 2100
    },
    {
      "epoch": 13.197492163009404,
      "grad_norm": 0.47265625,
      "learning_rate": 8.283942167372127e-06,
      "loss": 0.8273,
      "step": 2105
    },
    {
      "epoch": 13.22884012539185,
      "grad_norm": 0.4765625,
      "learning_rate": 7.994700697123247e-06,
      "loss": 0.8079,
      "step": 2110
    },
    {
      "epoch": 13.260188087774294,
      "grad_norm": 0.482421875,
      "learning_rate": 7.710388594656449e-06,
      "loss": 0.8126,
      "step": 2115
    },
    {
      "epoch": 13.29153605015674,
      "grad_norm": 0.48828125,
      "learning_rate": 7.431021092560819e-06,
      "loss": 0.813,
      "step": 2120
    },
    {
      "epoch": 13.322884012539184,
      "grad_norm": 0.4921875,
      "learning_rate": 7.156613158508619e-06,
      "loss": 0.8156,
      "step": 2125
    },
    {
      "epoch": 13.35423197492163,
      "grad_norm": 0.482421875,
      "learning_rate": 6.887179494453288e-06,
      "loss": 0.8058,
      "step": 2130
    },
    {
      "epoch": 13.385579937304076,
      "grad_norm": 0.48046875,
      "learning_rate": 6.622734535841868e-06,
      "loss": 0.8222,
      "step": 2135
    },
    {
      "epoch": 13.41692789968652,
      "grad_norm": 0.4765625,
      "learning_rate": 6.363292450841485e-06,
      "loss": 0.8177,
      "step": 2140
    },
    {
      "epoch": 13.448275862068966,
      "grad_norm": 0.482421875,
      "learning_rate": 6.108867139580365e-06,
      "loss": 0.8204,
      "step": 2145
    },
    {
      "epoch": 13.47962382445141,
      "grad_norm": 0.48828125,
      "learning_rate": 5.859472233402985e-06,
      "loss": 0.8132,
      "step": 2150
    },
    {
      "epoch": 13.510971786833856,
      "grad_norm": 0.490234375,
      "learning_rate": 5.615121094139897e-06,
      "loss": 0.8177,
      "step": 2155
    },
    {
      "epoch": 13.5423197492163,
      "grad_norm": 0.494140625,
      "learning_rate": 5.3758268133916825e-06,
      "loss": 0.8137,
      "step": 2160
    },
    {
      "epoch": 13.573667711598747,
      "grad_norm": 0.48828125,
      "learning_rate": 5.14160221182769e-06,
      "loss": 0.8241,
      "step": 2165
    },
    {
      "epoch": 13.60501567398119,
      "grad_norm": 0.474609375,
      "learning_rate": 4.912459838499028e-06,
      "loss": 0.8184,
      "step": 2170
    },
    {
      "epoch": 13.636363636363637,
      "grad_norm": 0.48046875,
      "learning_rate": 4.688411970166295e-06,
      "loss": 0.8203,
      "step": 2175
    },
    {
      "epoch": 13.66771159874608,
      "grad_norm": 0.490234375,
      "learning_rate": 4.469470610641802e-06,
      "loss": 0.8107,
      "step": 2180
    },
    {
      "epoch": 13.699059561128527,
      "grad_norm": 0.486328125,
      "learning_rate": 4.2556474901464195e-06,
      "loss": 0.8115,
      "step": 2185
    },
    {
      "epoch": 13.730407523510971,
      "grad_norm": 0.494140625,
      "learning_rate": 4.046954064681185e-06,
      "loss": 0.8156,
      "step": 2190
    },
    {
      "epoch": 13.761755485893417,
      "grad_norm": 0.48828125,
      "learning_rate": 3.843401515413392e-06,
      "loss": 0.8246,
      "step": 2195
    },
    {
      "epoch": 13.793103448275861,
      "grad_norm": 0.47265625,
      "learning_rate": 3.6450007480777093e-06,
      "loss": 0.8191,
      "step": 2200
    },
    {
      "epoch": 13.824451410658307,
      "grad_norm": 0.490234375,
      "learning_rate": 3.451762392391733e-06,
      "loss": 0.824,
      "step": 2205
    },
    {
      "epoch": 13.855799373040753,
      "grad_norm": 0.482421875,
      "learning_rate": 3.2636968014865378e-06,
      "loss": 0.8202,
      "step": 2210
    },
    {
      "epoch": 13.887147335423197,
      "grad_norm": 0.474609375,
      "learning_rate": 3.080814051352021e-06,
      "loss": 0.8148,
      "step": 2215
    },
    {
      "epoch": 13.918495297805643,
      "grad_norm": 0.486328125,
      "learning_rate": 2.9031239402970144e-06,
      "loss": 0.8245,
      "step": 2220
    },
    {
      "epoch": 13.949843260188088,
      "grad_norm": 0.486328125,
      "learning_rate": 2.730635988424335e-06,
      "loss": 0.8265,
      "step": 2225
    },
    {
      "epoch": 13.981191222570533,
      "grad_norm": 0.46875,
      "learning_rate": 2.5633594371206937e-06,
      "loss": 0.8167,
      "step": 2230
    },
    {
      "epoch": 14.0,
      "eval_loss": 2.799032211303711,
      "eval_runtime": 0.5421,
      "eval_samples_per_second": 3.689,
      "eval_steps_per_second": 1.845,
      "step": 2233
    },
    {
      "epoch": 14.012539184952978,
      "grad_norm": 0.466796875,
      "learning_rate": 2.401303248561659e-06,
      "loss": 0.8138,
      "step": 2235
    },
    {
      "epoch": 14.043887147335424,
      "grad_norm": 0.48046875,
      "learning_rate": 2.2444761052313856e-06,
      "loss": 0.8159,
      "step": 2240
    },
    {
      "epoch": 14.075235109717868,
      "grad_norm": 0.478515625,
      "learning_rate": 2.0928864094574842e-06,
      "loss": 0.8174,
      "step": 2245
    },
    {
      "epoch": 14.106583072100314,
      "grad_norm": 0.49609375,
      "learning_rate": 1.9465422829608837e-06,
      "loss": 0.8186,
      "step": 2250
    },
    {
      "epoch": 14.137931034482758,
      "grad_norm": 0.48828125,
      "learning_rate": 1.8054515664206128e-06,
      "loss": 0.8183,
      "step": 2255
    },
    {
      "epoch": 14.169278996865204,
      "grad_norm": 0.48046875,
      "learning_rate": 1.6696218190537683e-06,
      "loss": 0.814,
      "step": 2260
    },
    {
      "epoch": 14.200626959247648,
      "grad_norm": 0.490234375,
      "learning_rate": 1.539060318210539e-06,
      "loss": 0.8215,
      "step": 2265
    },
    {
      "epoch": 14.231974921630094,
      "grad_norm": 0.474609375,
      "learning_rate": 1.413774058984252e-06,
      "loss": 0.8152,
      "step": 2270
    },
    {
      "epoch": 14.263322884012538,
      "grad_norm": 0.486328125,
      "learning_rate": 1.2937697538366378e-06,
      "loss": 0.8136,
      "step": 2275
    },
    {
      "epoch": 14.294670846394984,
      "grad_norm": 0.4765625,
      "learning_rate": 1.1790538322381527e-06,
      "loss": 0.8116,
      "step": 2280
    },
    {
      "epoch": 14.32601880877743,
      "grad_norm": 0.490234375,
      "learning_rate": 1.0696324403235757e-06,
      "loss": 0.824,
      "step": 2285
    },
    {
      "epoch": 14.357366771159874,
      "grad_norm": 0.474609375,
      "learning_rate": 9.655114405626386e-07,
      "loss": 0.8171,
      "step": 2290
    },
    {
      "epoch": 14.38871473354232,
      "grad_norm": 0.474609375,
      "learning_rate": 8.666964114459997e-07,
      "loss": 0.8055,
      "step": 2295
    },
    {
      "epoch": 14.420062695924765,
      "grad_norm": 0.474609375,
      "learning_rate": 7.73192647186316e-07,
      "loss": 0.8262,
      "step": 2300
    },
    {
      "epoch": 14.45141065830721,
      "grad_norm": 0.482421875,
      "learning_rate": 6.850051574346372e-07,
      "loss": 0.8127,
      "step": 2305
    },
    {
      "epoch": 14.482758620689655,
      "grad_norm": 0.486328125,
      "learning_rate": 6.021386670119756e-07,
      "loss": 0.8089,
      "step": 2310
    },
    {
      "epoch": 14.5141065830721,
      "grad_norm": 0.48828125,
      "learning_rate": 5.245976156561305e-07,
      "loss": 0.8186,
      "step": 2315
    },
    {
      "epoch": 14.545454545454545,
      "grad_norm": 0.498046875,
      "learning_rate": 4.523861577839239e-07,
      "loss": 0.8223,
      "step": 2320
    },
    {
      "epoch": 14.576802507836991,
      "grad_norm": 0.490234375,
      "learning_rate": 3.8550816226852196e-07,
      "loss": 0.8151,
      "step": 2325
    },
    {
      "epoch": 14.608150470219435,
      "grad_norm": 0.48046875,
      "learning_rate": 3.23967212232168e-07,
      "loss": 0.8152,
      "step": 2330
    },
    {
      "epoch": 14.639498432601881,
      "grad_norm": 0.48046875,
      "learning_rate": 2.677666048542693e-07,
      "loss": 0.8097,
      "step": 2335
    },
    {
      "epoch": 14.670846394984325,
      "grad_norm": 0.484375,
      "learning_rate": 2.1690935119468293e-07,
      "loss": 0.827,
      "step": 2340
    },
    {
      "epoch": 14.702194357366771,
      "grad_norm": 0.48828125,
      "learning_rate": 1.7139817603240016e-07,
      "loss": 0.8203,
      "step": 2345
    },
    {
      "epoch": 14.733542319749215,
      "grad_norm": 0.484375,
      "learning_rate": 1.3123551771958564e-07,
      "loss": 0.8204,
      "step": 2350
    },
    {
      "epoch": 14.764890282131661,
      "grad_norm": 0.482421875,
      "learning_rate": 9.642352805093734e-08,
      "loss": 0.8137,
      "step": 2355
    },
    {
      "epoch": 14.796238244514107,
      "grad_norm": 0.482421875,
      "learning_rate": 6.696407214835664e-08,
      "loss": 0.8149,
      "step": 2360
    },
    {
      "epoch": 14.827586206896552,
      "grad_norm": 0.474609375,
      "learning_rate": 4.285872836108373e-08,
      "loss": 0.8119,
      "step": 2365
    },
    {
      "epoch": 14.858934169278998,
      "grad_norm": 0.47265625,
      "learning_rate": 2.4108788181076423e-08,
      "loss": 0.8128,
      "step": 2370
    },
    {
      "epoch": 14.890282131661442,
      "grad_norm": 0.474609375,
      "learning_rate": 1.071525617384328e-08,
      "loss": 0.818,
      "step": 2375
    },
    {
      "epoch": 14.921630094043888,
      "grad_norm": 0.474609375,
      "learning_rate": 2.6788499246421795e-09,
      "loss": 0.8068,
      "step": 2380
    },
    {
      "epoch": 14.952978056426332,
      "grad_norm": 0.47265625,
      "learning_rate": 0.0,
      "loss": 0.8075,
      "step": 2385
    },
    {
      "epoch": 14.952978056426332,
      "eval_loss": 2.8001084327697754,
      "eval_runtime": 0.5613,
      "eval_samples_per_second": 3.563,
      "eval_steps_per_second": 1.782,
      "step": 2385
    },
    {
      "epoch": 14.952978056426332,
      "step": 2385,
      "total_flos": 1.4215766364399862e+18,
      "train_loss": 1.0738131565117985,
      "train_runtime": 14553.4383,
      "train_samples_per_second": 7.888,
      "train_steps_per_second": 0.164
    }
  ],
  "logging_steps": 5,
  "max_steps": 2385,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 15,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.4215766364399862e+18,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}