|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 90669, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.005514563963427412, |
|
"grad_norm": 4.689566135406494, |
|
"learning_rate": 4.972427180182863e-05, |
|
"loss": 6.5185, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.011029127926854823, |
|
"grad_norm": 4.601550102233887, |
|
"learning_rate": 4.944854360365726e-05, |
|
"loss": 5.5109, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.016543691890282236, |
|
"grad_norm": 4.204834461212158, |
|
"learning_rate": 4.917281540548589e-05, |
|
"loss": 5.1513, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.022058255853709647, |
|
"grad_norm": 4.12762451171875, |
|
"learning_rate": 4.889708720731452e-05, |
|
"loss": 4.9528, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.027572819817137058, |
|
"grad_norm": 4.254393100738525, |
|
"learning_rate": 4.862135900914315e-05, |
|
"loss": 4.7961, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.03308738378056447, |
|
"grad_norm": 4.859085559844971, |
|
"learning_rate": 4.834563081097178e-05, |
|
"loss": 4.7004, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.03860194774399188, |
|
"grad_norm": 4.5257368087768555, |
|
"learning_rate": 4.806990261280041e-05, |
|
"loss": 4.5864, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.044116511707419294, |
|
"grad_norm": 4.193604946136475, |
|
"learning_rate": 4.779417441462904e-05, |
|
"loss": 4.5161, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.04963107567084671, |
|
"grad_norm": 4.394799709320068, |
|
"learning_rate": 4.7518446216457665e-05, |
|
"loss": 4.4474, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.055145639634274116, |
|
"grad_norm": 4.351430892944336, |
|
"learning_rate": 4.72427180182863e-05, |
|
"loss": 4.3782, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.06066020359770153, |
|
"grad_norm": 4.718796730041504, |
|
"learning_rate": 4.696698982011493e-05, |
|
"loss": 4.3183, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.06617476756112894, |
|
"grad_norm": 4.186001300811768, |
|
"learning_rate": 4.6691261621943555e-05, |
|
"loss": 4.2597, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.07168933152455635, |
|
"grad_norm": 4.420439720153809, |
|
"learning_rate": 4.641553342377218e-05, |
|
"loss": 4.2321, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.07720389548798376, |
|
"grad_norm": 4.082899570465088, |
|
"learning_rate": 4.613980522560081e-05, |
|
"loss": 4.1731, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.08271845945141118, |
|
"grad_norm": 4.149295806884766, |
|
"learning_rate": 4.5864077027429445e-05, |
|
"loss": 4.1479, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.08823302341483859, |
|
"grad_norm": 4.364389419555664, |
|
"learning_rate": 4.558834882925807e-05, |
|
"loss": 4.1119, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.093747587378266, |
|
"grad_norm": 4.409417629241943, |
|
"learning_rate": 4.53126206310867e-05, |
|
"loss": 4.0806, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.09926215134169342, |
|
"grad_norm": 4.639771938323975, |
|
"learning_rate": 4.5036892432915335e-05, |
|
"loss": 4.0431, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.10477671530512082, |
|
"grad_norm": 4.332629203796387, |
|
"learning_rate": 4.476116423474396e-05, |
|
"loss": 4.0352, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.11029127926854823, |
|
"grad_norm": 4.82522439956665, |
|
"learning_rate": 4.448543603657259e-05, |
|
"loss": 3.9858, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.11580584323197565, |
|
"grad_norm": 4.305941104888916, |
|
"learning_rate": 4.420970783840122e-05, |
|
"loss": 3.9731, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.12132040719540306, |
|
"grad_norm": 4.728514194488525, |
|
"learning_rate": 4.393397964022985e-05, |
|
"loss": 3.9417, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.12683497115883047, |
|
"grad_norm": 4.233896732330322, |
|
"learning_rate": 4.365825144205848e-05, |
|
"loss": 3.9396, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.1323495351222579, |
|
"grad_norm": 4.335183143615723, |
|
"learning_rate": 4.338252324388711e-05, |
|
"loss": 3.9091, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.13786409908568528, |
|
"grad_norm": 4.590264797210693, |
|
"learning_rate": 4.3106795045715735e-05, |
|
"loss": 3.8935, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.1433786630491127, |
|
"grad_norm": 4.33479642868042, |
|
"learning_rate": 4.283106684754436e-05, |
|
"loss": 3.8875, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.14889322701254012, |
|
"grad_norm": 4.2722697257995605, |
|
"learning_rate": 4.2555338649373e-05, |
|
"loss": 3.8566, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.15440779097596752, |
|
"grad_norm": 4.284050464630127, |
|
"learning_rate": 4.2279610451201625e-05, |
|
"loss": 3.8433, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.15992235493939494, |
|
"grad_norm": 4.086195945739746, |
|
"learning_rate": 4.200388225303025e-05, |
|
"loss": 3.8381, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.16543691890282236, |
|
"grad_norm": 4.229586124420166, |
|
"learning_rate": 4.172815405485889e-05, |
|
"loss": 3.8148, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.17095148286624975, |
|
"grad_norm": 4.43237829208374, |
|
"learning_rate": 4.1452425856687515e-05, |
|
"loss": 3.7945, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.17646604682967718, |
|
"grad_norm": 4.232430934906006, |
|
"learning_rate": 4.117669765851614e-05, |
|
"loss": 3.8078, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.1819806107931046, |
|
"grad_norm": 5.106810092926025, |
|
"learning_rate": 4.090096946034477e-05, |
|
"loss": 3.7684, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.187495174756532, |
|
"grad_norm": 4.939910411834717, |
|
"learning_rate": 4.0625241262173405e-05, |
|
"loss": 3.7634, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.1930097387199594, |
|
"grad_norm": 4.215509414672852, |
|
"learning_rate": 4.034951306400203e-05, |
|
"loss": 3.7487, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 0.19852430268338683, |
|
"grad_norm": 4.279122829437256, |
|
"learning_rate": 4.007378486583066e-05, |
|
"loss": 3.7697, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.20403886664681423, |
|
"grad_norm": 4.503846168518066, |
|
"learning_rate": 3.979805666765929e-05, |
|
"loss": 3.7488, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 0.20955343061024165, |
|
"grad_norm": 3.935098648071289, |
|
"learning_rate": 3.9522328469487916e-05, |
|
"loss": 3.7347, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.21506799457366907, |
|
"grad_norm": 4.217621326446533, |
|
"learning_rate": 3.924660027131655e-05, |
|
"loss": 3.7063, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 0.22058255853709646, |
|
"grad_norm": 4.404201030731201, |
|
"learning_rate": 3.897087207314518e-05, |
|
"loss": 3.6878, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.22609712250052388, |
|
"grad_norm": 4.507588863372803, |
|
"learning_rate": 3.869514387497381e-05, |
|
"loss": 3.7021, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 0.2316116864639513, |
|
"grad_norm": 4.7501220703125, |
|
"learning_rate": 3.841941567680244e-05, |
|
"loss": 3.6856, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.2371262504273787, |
|
"grad_norm": 4.5834879875183105, |
|
"learning_rate": 3.814368747863107e-05, |
|
"loss": 3.6799, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 0.24264081439080612, |
|
"grad_norm": 4.500739574432373, |
|
"learning_rate": 3.7867959280459695e-05, |
|
"loss": 3.6695, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.24815537835423354, |
|
"grad_norm": 4.357424736022949, |
|
"learning_rate": 3.759223108228832e-05, |
|
"loss": 3.6648, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 0.25366994231766093, |
|
"grad_norm": 4.667726039886475, |
|
"learning_rate": 3.731650288411696e-05, |
|
"loss": 3.6623, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 0.25918450628108836, |
|
"grad_norm": 4.472695827484131, |
|
"learning_rate": 3.7040774685945585e-05, |
|
"loss": 3.6336, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 0.2646990702445158, |
|
"grad_norm": 4.226781368255615, |
|
"learning_rate": 3.676504648777421e-05, |
|
"loss": 3.6362, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.2702136342079432, |
|
"grad_norm": 4.575997829437256, |
|
"learning_rate": 3.648931828960284e-05, |
|
"loss": 3.6146, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 0.27572819817137056, |
|
"grad_norm": 4.648991584777832, |
|
"learning_rate": 3.621359009143147e-05, |
|
"loss": 3.6352, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 0.281242762134798, |
|
"grad_norm": 4.165131092071533, |
|
"learning_rate": 3.59378618932601e-05, |
|
"loss": 3.6165, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 0.2867573260982254, |
|
"grad_norm": 4.220915794372559, |
|
"learning_rate": 3.566213369508873e-05, |
|
"loss": 3.6067, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 0.29227189006165283, |
|
"grad_norm": 4.650350093841553, |
|
"learning_rate": 3.5386405496917365e-05, |
|
"loss": 3.5922, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 0.29778645402508025, |
|
"grad_norm": 4.175040245056152, |
|
"learning_rate": 3.511067729874599e-05, |
|
"loss": 3.6063, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.30330101798850767, |
|
"grad_norm": 4.40975284576416, |
|
"learning_rate": 3.483494910057462e-05, |
|
"loss": 3.5945, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 0.30881558195193504, |
|
"grad_norm": 3.9698615074157715, |
|
"learning_rate": 3.455922090240325e-05, |
|
"loss": 3.5887, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 0.31433014591536246, |
|
"grad_norm": 4.441317081451416, |
|
"learning_rate": 3.4283492704231876e-05, |
|
"loss": 3.5741, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 0.3198447098787899, |
|
"grad_norm": 4.244263648986816, |
|
"learning_rate": 3.400776450606051e-05, |
|
"loss": 3.5688, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 0.3253592738422173, |
|
"grad_norm": 4.017004013061523, |
|
"learning_rate": 3.373203630788914e-05, |
|
"loss": 3.568, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 0.3308738378056447, |
|
"grad_norm": 4.664565563201904, |
|
"learning_rate": 3.3456308109717765e-05, |
|
"loss": 3.5571, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.33638840176907214, |
|
"grad_norm": 4.073508262634277, |
|
"learning_rate": 3.318057991154639e-05, |
|
"loss": 3.5189, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 0.3419029657324995, |
|
"grad_norm": 4.424101829528809, |
|
"learning_rate": 3.290485171337502e-05, |
|
"loss": 3.554, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 0.34741752969592693, |
|
"grad_norm": 4.302523136138916, |
|
"learning_rate": 3.2629123515203655e-05, |
|
"loss": 3.5388, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 0.35293209365935435, |
|
"grad_norm": 4.329090118408203, |
|
"learning_rate": 3.235339531703228e-05, |
|
"loss": 3.5485, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 0.3584466576227818, |
|
"grad_norm": 4.2849531173706055, |
|
"learning_rate": 3.207766711886092e-05, |
|
"loss": 3.5388, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 0.3639612215862092, |
|
"grad_norm": 4.334972381591797, |
|
"learning_rate": 3.1801938920689545e-05, |
|
"loss": 3.5225, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 0.3694757855496366, |
|
"grad_norm": 4.848361492156982, |
|
"learning_rate": 3.152621072251817e-05, |
|
"loss": 3.5226, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 0.374990349513064, |
|
"grad_norm": 4.436476230621338, |
|
"learning_rate": 3.12504825243468e-05, |
|
"loss": 3.5046, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 0.3805049134764914, |
|
"grad_norm": 4.017549991607666, |
|
"learning_rate": 3.097475432617543e-05, |
|
"loss": 3.5091, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 0.3860194774399188, |
|
"grad_norm": 4.507646083831787, |
|
"learning_rate": 3.069902612800406e-05, |
|
"loss": 3.4951, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 0.39153404140334624, |
|
"grad_norm": 4.1406989097595215, |
|
"learning_rate": 3.042329792983269e-05, |
|
"loss": 3.496, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 0.39704860536677367, |
|
"grad_norm": 4.320881366729736, |
|
"learning_rate": 3.0147569731661318e-05, |
|
"loss": 3.5025, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 0.4025631693302011, |
|
"grad_norm": 4.030999183654785, |
|
"learning_rate": 2.9871841533489946e-05, |
|
"loss": 3.4974, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 0.40807773329362845, |
|
"grad_norm": 4.489917755126953, |
|
"learning_rate": 2.9596113335318577e-05, |
|
"loss": 3.4867, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 0.4135922972570559, |
|
"grad_norm": 4.384711742401123, |
|
"learning_rate": 2.9320385137147204e-05, |
|
"loss": 3.4753, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 0.4191068612204833, |
|
"grad_norm": 4.64800500869751, |
|
"learning_rate": 2.9044656938975836e-05, |
|
"loss": 3.4703, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 0.4246214251839107, |
|
"grad_norm": 4.490517616271973, |
|
"learning_rate": 2.876892874080447e-05, |
|
"loss": 3.471, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 0.43013598914733814, |
|
"grad_norm": 4.496025085449219, |
|
"learning_rate": 2.8493200542633098e-05, |
|
"loss": 3.4579, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 0.43565055311076556, |
|
"grad_norm": 4.765578746795654, |
|
"learning_rate": 2.8217472344461725e-05, |
|
"loss": 3.4646, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 0.4411651170741929, |
|
"grad_norm": 4.592626094818115, |
|
"learning_rate": 2.7941744146290356e-05, |
|
"loss": 3.4656, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 0.44667968103762035, |
|
"grad_norm": 4.292928695678711, |
|
"learning_rate": 2.7666015948118984e-05, |
|
"loss": 3.4533, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 0.45219424500104777, |
|
"grad_norm": 4.014820098876953, |
|
"learning_rate": 2.7390287749947612e-05, |
|
"loss": 3.4576, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 0.4577088089644752, |
|
"grad_norm": 4.129273891448975, |
|
"learning_rate": 2.7114559551776243e-05, |
|
"loss": 3.4509, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 0.4632233729279026, |
|
"grad_norm": 4.679018497467041, |
|
"learning_rate": 2.683883135360487e-05, |
|
"loss": 3.4382, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 0.46873793689133, |
|
"grad_norm": 4.382132053375244, |
|
"learning_rate": 2.6563103155433498e-05, |
|
"loss": 3.4435, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 0.4742525008547574, |
|
"grad_norm": 4.3672380447387695, |
|
"learning_rate": 2.628737495726213e-05, |
|
"loss": 3.4398, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 0.4797670648181848, |
|
"grad_norm": 4.159623622894287, |
|
"learning_rate": 2.6011646759090757e-05, |
|
"loss": 3.4529, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 0.48528162878161224, |
|
"grad_norm": 4.100943565368652, |
|
"learning_rate": 2.573591856091939e-05, |
|
"loss": 3.411, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 0.49079619274503966, |
|
"grad_norm": 4.237346649169922, |
|
"learning_rate": 2.5460190362748023e-05, |
|
"loss": 3.4153, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 0.4963107567084671, |
|
"grad_norm": 4.697793960571289, |
|
"learning_rate": 2.518446216457665e-05, |
|
"loss": 3.4153, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 0.5018253206718944, |
|
"grad_norm": 4.274381160736084, |
|
"learning_rate": 2.4908733966405278e-05, |
|
"loss": 3.4228, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 0.5073398846353219, |
|
"grad_norm": 4.421125411987305, |
|
"learning_rate": 2.463300576823391e-05, |
|
"loss": 3.3963, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 0.5128544485987493, |
|
"grad_norm": 4.356249809265137, |
|
"learning_rate": 2.4357277570062537e-05, |
|
"loss": 3.4186, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 0.5183690125621767, |
|
"grad_norm": 4.516757965087891, |
|
"learning_rate": 2.4081549371891164e-05, |
|
"loss": 3.4222, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 0.5238835765256041, |
|
"grad_norm": 5.137631416320801, |
|
"learning_rate": 2.3805821173719796e-05, |
|
"loss": 3.4121, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 0.5293981404890316, |
|
"grad_norm": 4.224301338195801, |
|
"learning_rate": 2.3530092975548423e-05, |
|
"loss": 3.3854, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 0.534912704452459, |
|
"grad_norm": 4.442605972290039, |
|
"learning_rate": 2.3254364777377054e-05, |
|
"loss": 3.3857, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 0.5404272684158864, |
|
"grad_norm": 4.190525531768799, |
|
"learning_rate": 2.2978636579205685e-05, |
|
"loss": 3.3958, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 0.5459418323793138, |
|
"grad_norm": 4.089470386505127, |
|
"learning_rate": 2.2702908381034313e-05, |
|
"loss": 3.4038, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 0.5514563963427411, |
|
"grad_norm": 4.37148380279541, |
|
"learning_rate": 2.242718018286294e-05, |
|
"loss": 3.3908, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 0.5569709603061685, |
|
"grad_norm": 4.484643936157227, |
|
"learning_rate": 2.2151451984691572e-05, |
|
"loss": 3.392, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 0.562485524269596, |
|
"grad_norm": 4.480875492095947, |
|
"learning_rate": 2.18757237865202e-05, |
|
"loss": 3.38, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 0.5680000882330234, |
|
"grad_norm": 4.415227890014648, |
|
"learning_rate": 2.159999558834883e-05, |
|
"loss": 3.4014, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 0.5735146521964508, |
|
"grad_norm": 4.461363315582275, |
|
"learning_rate": 2.132426739017746e-05, |
|
"loss": 3.3768, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 0.5790292161598782, |
|
"grad_norm": 4.364917755126953, |
|
"learning_rate": 2.104853919200609e-05, |
|
"loss": 3.3769, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 0.5845437801233057, |
|
"grad_norm": 4.509827613830566, |
|
"learning_rate": 2.0772810993834717e-05, |
|
"loss": 3.3567, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 0.5900583440867331, |
|
"grad_norm": 4.165256023406982, |
|
"learning_rate": 2.0497082795663348e-05, |
|
"loss": 3.3649, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 0.5955729080501605, |
|
"grad_norm": 4.39963436126709, |
|
"learning_rate": 2.0221354597491976e-05, |
|
"loss": 3.3688, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 0.6010874720135879, |
|
"grad_norm": 4.492909908294678, |
|
"learning_rate": 1.9945626399320607e-05, |
|
"loss": 3.3654, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 0.6066020359770153, |
|
"grad_norm": 4.136989593505859, |
|
"learning_rate": 1.9669898201149238e-05, |
|
"loss": 3.3588, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 0.6121165999404428, |
|
"grad_norm": 4.091104030609131, |
|
"learning_rate": 1.9394170002977866e-05, |
|
"loss": 3.3714, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 0.6176311639038701, |
|
"grad_norm": 4.557612895965576, |
|
"learning_rate": 1.9118441804806493e-05, |
|
"loss": 3.3783, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 0.6231457278672975, |
|
"grad_norm": 4.4669718742370605, |
|
"learning_rate": 1.8842713606635124e-05, |
|
"loss": 3.3576, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 0.6286602918307249, |
|
"grad_norm": 4.214612007141113, |
|
"learning_rate": 1.8566985408463752e-05, |
|
"loss": 3.3769, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 0.6341748557941523, |
|
"grad_norm": 4.079827785491943, |
|
"learning_rate": 1.8291257210292383e-05, |
|
"loss": 3.3584, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 0.6396894197575798, |
|
"grad_norm": 4.0199713706970215, |
|
"learning_rate": 1.8015529012121014e-05, |
|
"loss": 3.3593, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 0.6452039837210072, |
|
"grad_norm": 4.746074199676514, |
|
"learning_rate": 1.7739800813949642e-05, |
|
"loss": 3.3417, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 0.6507185476844346, |
|
"grad_norm": 4.219590187072754, |
|
"learning_rate": 1.746407261577827e-05, |
|
"loss": 3.3518, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 0.656233111647862, |
|
"grad_norm": 4.15669584274292, |
|
"learning_rate": 1.71883444176069e-05, |
|
"loss": 3.3401, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 0.6617476756112894, |
|
"grad_norm": 4.129217147827148, |
|
"learning_rate": 1.6912616219435532e-05, |
|
"loss": 3.3504, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 0.6672622395747169, |
|
"grad_norm": 4.176223278045654, |
|
"learning_rate": 1.663688802126416e-05, |
|
"loss": 3.3429, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 0.6727768035381443, |
|
"grad_norm": 3.982861042022705, |
|
"learning_rate": 1.636115982309279e-05, |
|
"loss": 3.3239, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 0.6782913675015716, |
|
"grad_norm": 4.495360851287842, |
|
"learning_rate": 1.6085431624921418e-05, |
|
"loss": 3.334, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 0.683805931464999, |
|
"grad_norm": 4.5026679039001465, |
|
"learning_rate": 1.5809703426750046e-05, |
|
"loss": 3.3434, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 0.6893204954284264, |
|
"grad_norm": 4.469930648803711, |
|
"learning_rate": 1.5533975228578677e-05, |
|
"loss": 3.3085, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 0.6948350593918539, |
|
"grad_norm": 4.942314147949219, |
|
"learning_rate": 1.5258247030407308e-05, |
|
"loss": 3.3169, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 0.7003496233552813, |
|
"grad_norm": 4.131747245788574, |
|
"learning_rate": 1.4982518832235937e-05, |
|
"loss": 3.3242, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 0.7058641873187087, |
|
"grad_norm": 4.662265777587891, |
|
"learning_rate": 1.4706790634064565e-05, |
|
"loss": 3.3455, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 0.7113787512821361, |
|
"grad_norm": 4.53313684463501, |
|
"learning_rate": 1.4431062435893195e-05, |
|
"loss": 3.3093, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 0.7168933152455635, |
|
"grad_norm": 4.306014537811279, |
|
"learning_rate": 1.4155334237721824e-05, |
|
"loss": 3.3146, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 0.722407879208991, |
|
"grad_norm": 4.205687999725342, |
|
"learning_rate": 1.3879606039550452e-05, |
|
"loss": 3.3182, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 0.7279224431724184, |
|
"grad_norm": 4.351266384124756, |
|
"learning_rate": 1.3603877841379084e-05, |
|
"loss": 3.3363, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 0.7334370071358458, |
|
"grad_norm": 4.580765724182129, |
|
"learning_rate": 1.3328149643207714e-05, |
|
"loss": 3.2988, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 0.7389515710992732, |
|
"grad_norm": 4.511965274810791, |
|
"learning_rate": 1.3052421445036341e-05, |
|
"loss": 3.3153, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 0.7444661350627005, |
|
"grad_norm": 4.1504950523376465, |
|
"learning_rate": 1.277669324686497e-05, |
|
"loss": 3.3193, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 0.749980699026128, |
|
"grad_norm": 4.668148994445801, |
|
"learning_rate": 1.25009650486936e-05, |
|
"loss": 3.2916, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 0.7554952629895554, |
|
"grad_norm": 4.106932163238525, |
|
"learning_rate": 1.222523685052223e-05, |
|
"loss": 3.3268, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 0.7610098269529828, |
|
"grad_norm": 4.127325057983398, |
|
"learning_rate": 1.1949508652350859e-05, |
|
"loss": 3.3036, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 0.7665243909164102, |
|
"grad_norm": 4.444930076599121, |
|
"learning_rate": 1.1673780454179488e-05, |
|
"loss": 3.3024, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 0.7720389548798376, |
|
"grad_norm": 4.07660436630249, |
|
"learning_rate": 1.1398052256008118e-05, |
|
"loss": 3.3053, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 0.7775535188432651, |
|
"grad_norm": 4.658594131469727, |
|
"learning_rate": 1.1122324057836747e-05, |
|
"loss": 3.3048, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 0.7830680828066925, |
|
"grad_norm": 4.439772129058838, |
|
"learning_rate": 1.0846595859665378e-05, |
|
"loss": 3.2943, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 0.7885826467701199, |
|
"grad_norm": 4.101642608642578, |
|
"learning_rate": 1.0570867661494006e-05, |
|
"loss": 3.2914, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 0.7940972107335473, |
|
"grad_norm": 4.650053024291992, |
|
"learning_rate": 1.0295139463322635e-05, |
|
"loss": 3.2977, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 0.7996117746969748, |
|
"grad_norm": 4.005575180053711, |
|
"learning_rate": 1.0019411265151266e-05, |
|
"loss": 3.2971, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 0.8051263386604022, |
|
"grad_norm": 4.499767780303955, |
|
"learning_rate": 9.743683066979894e-06, |
|
"loss": 3.3002, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 0.8106409026238295, |
|
"grad_norm": 4.143964767456055, |
|
"learning_rate": 9.467954868808523e-06, |
|
"loss": 3.2879, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 0.8161554665872569, |
|
"grad_norm": 4.027842998504639, |
|
"learning_rate": 9.192226670637155e-06, |
|
"loss": 3.2974, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 0.8216700305506843, |
|
"grad_norm": 4.330503463745117, |
|
"learning_rate": 8.916498472465782e-06, |
|
"loss": 3.2971, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 0.8271845945141117, |
|
"grad_norm": 4.108890056610107, |
|
"learning_rate": 8.640770274294412e-06, |
|
"loss": 3.2951, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 0.8326991584775392, |
|
"grad_norm": 4.396561622619629, |
|
"learning_rate": 8.365042076123043e-06, |
|
"loss": 3.3018, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 0.8382137224409666, |
|
"grad_norm": 4.230642795562744, |
|
"learning_rate": 8.08931387795167e-06, |
|
"loss": 3.2832, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 0.843728286404394, |
|
"grad_norm": 4.438147068023682, |
|
"learning_rate": 7.8135856797803e-06, |
|
"loss": 3.2845, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 0.8492428503678214, |
|
"grad_norm": 4.5078325271606445, |
|
"learning_rate": 7.53785748160893e-06, |
|
"loss": 3.2977, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 0.8547574143312489, |
|
"grad_norm": 4.344171524047852, |
|
"learning_rate": 7.262129283437559e-06, |
|
"loss": 3.2785, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 0.8602719782946763, |
|
"grad_norm": 4.636903762817383, |
|
"learning_rate": 6.986401085266188e-06, |
|
"loss": 3.2784, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 0.8657865422581037, |
|
"grad_norm": 4.955584526062012, |
|
"learning_rate": 6.710672887094818e-06, |
|
"loss": 3.2676, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 0.8713011062215311, |
|
"grad_norm": 4.11828088760376, |
|
"learning_rate": 6.4349446889234475e-06, |
|
"loss": 3.2901, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 0.8768156701849584, |
|
"grad_norm": 4.916619777679443, |
|
"learning_rate": 6.159216490752077e-06, |
|
"loss": 3.2964, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 0.8823302341483859, |
|
"grad_norm": 4.217592239379883, |
|
"learning_rate": 5.883488292580705e-06, |
|
"loss": 3.2923, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 0.8878447981118133, |
|
"grad_norm": 4.360821723937988, |
|
"learning_rate": 5.607760094409336e-06, |
|
"loss": 3.2807, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 0.8933593620752407, |
|
"grad_norm": 4.062866687774658, |
|
"learning_rate": 5.332031896237965e-06, |
|
"loss": 3.2936, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 0.8988739260386681, |
|
"grad_norm": 4.373243808746338, |
|
"learning_rate": 5.0563036980665936e-06, |
|
"loss": 3.2592, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 0.9043884900020955, |
|
"grad_norm": 4.527072906494141, |
|
"learning_rate": 4.780575499895224e-06, |
|
"loss": 3.2754, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 0.909903053965523, |
|
"grad_norm": 4.206862926483154, |
|
"learning_rate": 4.504847301723853e-06, |
|
"loss": 3.2609, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 0.9154176179289504, |
|
"grad_norm": 4.414552688598633, |
|
"learning_rate": 4.229119103552482e-06, |
|
"loss": 3.2936, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 0.9209321818923778, |
|
"grad_norm": 4.722365856170654, |
|
"learning_rate": 3.953390905381112e-06, |
|
"loss": 3.2733, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 0.9264467458558052, |
|
"grad_norm": 4.194797515869141, |
|
"learning_rate": 3.6776627072097413e-06, |
|
"loss": 3.272, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 0.9319613098192326, |
|
"grad_norm": 4.476502895355225, |
|
"learning_rate": 3.4019345090383703e-06, |
|
"loss": 3.286, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 0.93747587378266, |
|
"grad_norm": 4.418745994567871, |
|
"learning_rate": 3.1262063108669997e-06, |
|
"loss": 3.2611, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 0.9429904377460874, |
|
"grad_norm": 4.2887797355651855, |
|
"learning_rate": 2.8504781126956295e-06, |
|
"loss": 3.2749, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 0.9485050017095148, |
|
"grad_norm": 4.276843547821045, |
|
"learning_rate": 2.574749914524259e-06, |
|
"loss": 3.2739, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 0.9540195656729422, |
|
"grad_norm": 4.448587417602539, |
|
"learning_rate": 2.299021716352888e-06, |
|
"loss": 3.2756, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 0.9595341296363696, |
|
"grad_norm": 4.589288711547852, |
|
"learning_rate": 2.0232935181815176e-06, |
|
"loss": 3.2734, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 0.9650486935997971, |
|
"grad_norm": 4.560390472412109, |
|
"learning_rate": 1.7475653200101468e-06, |
|
"loss": 3.2652, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 0.9705632575632245, |
|
"grad_norm": 4.633021831512451, |
|
"learning_rate": 1.4718371218387762e-06, |
|
"loss": 3.2831, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 0.9760778215266519, |
|
"grad_norm": 4.438389301300049, |
|
"learning_rate": 1.1961089236674058e-06, |
|
"loss": 3.2584, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 0.9815923854900793, |
|
"grad_norm": 4.40911340713501, |
|
"learning_rate": 9.203807254960352e-07, |
|
"loss": 3.2831, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 0.9871069494535067, |
|
"grad_norm": 4.270487308502197, |
|
"learning_rate": 6.446525273246646e-07, |
|
"loss": 3.2702, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 0.9926215134169342, |
|
"grad_norm": 4.149001598358154, |
|
"learning_rate": 3.689243291532939e-07, |
|
"loss": 3.2504, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 0.9981360773803616, |
|
"grad_norm": 4.047245502471924, |
|
"learning_rate": 9.319613098192327e-08, |
|
"loss": 3.2622, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 90669, |
|
"total_flos": 2.4124793739319296e+16, |
|
"train_loss": 3.571538051929227, |
|
"train_runtime": 3663.4284, |
|
"train_samples_per_second": 197.997, |
|
"train_steps_per_second": 24.75 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 90669, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.4124793739319296e+16, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|