|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.0, |
|
"eval_steps": 100, |
|
"global_step": 1578, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0063371356147021544, |
|
"grad_norm": 7.199723076955636, |
|
"learning_rate": 3.164556962025317e-07, |
|
"loss": 1.4397, |
|
"mean_token_accuracy": 0.6951015710830688, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.012674271229404309, |
|
"grad_norm": 7.116001217650177, |
|
"learning_rate": 6.329113924050634e-07, |
|
"loss": 1.4552, |
|
"mean_token_accuracy": 0.6930991888046265, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.019011406844106463, |
|
"grad_norm": 4.64551484224683, |
|
"learning_rate": 9.493670886075951e-07, |
|
"loss": 1.3993, |
|
"mean_token_accuracy": 0.6986153647303581, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.025348542458808618, |
|
"grad_norm": 3.0027875956032366, |
|
"learning_rate": 1.2658227848101267e-06, |
|
"loss": 1.3103, |
|
"mean_token_accuracy": 0.7071203991770745, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.031685678073510776, |
|
"grad_norm": 3.1251366551644244, |
|
"learning_rate": 1.5822784810126585e-06, |
|
"loss": 1.2458, |
|
"mean_token_accuracy": 0.7130974352359771, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.03802281368821293, |
|
"grad_norm": 2.289149516732586, |
|
"learning_rate": 1.8987341772151901e-06, |
|
"loss": 1.1709, |
|
"mean_token_accuracy": 0.7238569274544716, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.044359949302915085, |
|
"grad_norm": 2.0282034523858945, |
|
"learning_rate": 2.2151898734177215e-06, |
|
"loss": 1.1025, |
|
"mean_token_accuracy": 0.7365155085921288, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.050697084917617236, |
|
"grad_norm": 1.4393865600169713, |
|
"learning_rate": 2.5316455696202535e-06, |
|
"loss": 1.0754, |
|
"mean_token_accuracy": 0.7417579337954521, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.057034220532319393, |
|
"grad_norm": 0.9968108242787993, |
|
"learning_rate": 2.848101265822785e-06, |
|
"loss": 1.0382, |
|
"mean_token_accuracy": 0.7486504480242729, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.06337135614702155, |
|
"grad_norm": 0.9450280587173171, |
|
"learning_rate": 3.164556962025317e-06, |
|
"loss": 1.0089, |
|
"mean_token_accuracy": 0.7545697376132011, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0697084917617237, |
|
"grad_norm": 0.9255531790602832, |
|
"learning_rate": 3.4810126582278487e-06, |
|
"loss": 0.974, |
|
"mean_token_accuracy": 0.7610737249255181, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.07604562737642585, |
|
"grad_norm": 0.784862301963824, |
|
"learning_rate": 3.7974683544303802e-06, |
|
"loss": 0.9446, |
|
"mean_token_accuracy": 0.7658546909689903, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.08238276299112801, |
|
"grad_norm": 0.8654366770506445, |
|
"learning_rate": 4.113924050632912e-06, |
|
"loss": 0.9532, |
|
"mean_token_accuracy": 0.764112365245819, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.08871989860583017, |
|
"grad_norm": 0.8440093108811262, |
|
"learning_rate": 4.430379746835443e-06, |
|
"loss": 0.9019, |
|
"mean_token_accuracy": 0.7735387146472931, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.09505703422053231, |
|
"grad_norm": 0.7242515634546599, |
|
"learning_rate": 4.746835443037975e-06, |
|
"loss": 0.8972, |
|
"mean_token_accuracy": 0.7742968738079071, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.10139416983523447, |
|
"grad_norm": 0.7207614994286641, |
|
"learning_rate": 5.063291139240507e-06, |
|
"loss": 0.8872, |
|
"mean_token_accuracy": 0.7761535227298737, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.10773130544993663, |
|
"grad_norm": 0.742791199954622, |
|
"learning_rate": 5.379746835443038e-06, |
|
"loss": 0.8559, |
|
"mean_token_accuracy": 0.7819930538535118, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.11406844106463879, |
|
"grad_norm": 0.7678641716925835, |
|
"learning_rate": 5.69620253164557e-06, |
|
"loss": 0.8473, |
|
"mean_token_accuracy": 0.7834332928061485, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.12040557667934093, |
|
"grad_norm": 0.71180994773894, |
|
"learning_rate": 6.012658227848101e-06, |
|
"loss": 0.8352, |
|
"mean_token_accuracy": 0.7855397373437881, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.1267427122940431, |
|
"grad_norm": 0.7993738785147041, |
|
"learning_rate": 6.329113924050634e-06, |
|
"loss": 0.8589, |
|
"mean_token_accuracy": 0.7812221512198448, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.13307984790874525, |
|
"grad_norm": 0.7568194042750847, |
|
"learning_rate": 6.645569620253165e-06, |
|
"loss": 0.8431, |
|
"mean_token_accuracy": 0.7850423708558083, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.1394169835234474, |
|
"grad_norm": 0.7969657403354691, |
|
"learning_rate": 6.962025316455697e-06, |
|
"loss": 0.8146, |
|
"mean_token_accuracy": 0.7894491747021675, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.14575411913814956, |
|
"grad_norm": 0.7814384559927074, |
|
"learning_rate": 7.2784810126582285e-06, |
|
"loss": 0.816, |
|
"mean_token_accuracy": 0.7893038675189018, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.1520912547528517, |
|
"grad_norm": 0.7970973600599863, |
|
"learning_rate": 7.5949367088607605e-06, |
|
"loss": 0.8168, |
|
"mean_token_accuracy": 0.7892953917384148, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.15842839036755388, |
|
"grad_norm": 0.7289531586042841, |
|
"learning_rate": 7.911392405063292e-06, |
|
"loss": 0.8036, |
|
"mean_token_accuracy": 0.7918314695358276, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.16476552598225602, |
|
"grad_norm": 0.8996289034177167, |
|
"learning_rate": 8.227848101265824e-06, |
|
"loss": 0.7886, |
|
"mean_token_accuracy": 0.7948763906955719, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.17110266159695817, |
|
"grad_norm": 0.9505048466982942, |
|
"learning_rate": 8.544303797468356e-06, |
|
"loss": 0.7765, |
|
"mean_token_accuracy": 0.7972663462162017, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.17743979721166034, |
|
"grad_norm": 0.8547089186827208, |
|
"learning_rate": 8.860759493670886e-06, |
|
"loss": 0.7778, |
|
"mean_token_accuracy": 0.7966541960835457, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.18377693282636248, |
|
"grad_norm": 0.8115832093940138, |
|
"learning_rate": 9.177215189873418e-06, |
|
"loss": 0.7755, |
|
"mean_token_accuracy": 0.7976241648197174, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.19011406844106463, |
|
"grad_norm": 0.7240367508508893, |
|
"learning_rate": 9.49367088607595e-06, |
|
"loss": 0.7679, |
|
"mean_token_accuracy": 0.7988486766815186, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.1964512040557668, |
|
"grad_norm": 0.8604548037210017, |
|
"learning_rate": 9.810126582278482e-06, |
|
"loss": 0.7666, |
|
"mean_token_accuracy": 0.7985615819692612, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.20278833967046894, |
|
"grad_norm": 0.7650650036074902, |
|
"learning_rate": 9.99995105342046e-06, |
|
"loss": 0.7615, |
|
"mean_token_accuracy": 0.8001444712281227, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.20912547528517111, |
|
"grad_norm": 0.7907667013526878, |
|
"learning_rate": 9.999400415406145e-06, |
|
"loss": 0.7662, |
|
"mean_token_accuracy": 0.7991914421319961, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.21546261089987326, |
|
"grad_norm": 0.8483269008499935, |
|
"learning_rate": 9.998238023756727e-06, |
|
"loss": 0.7597, |
|
"mean_token_accuracy": 0.800473365187645, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.2217997465145754, |
|
"grad_norm": 0.8423617289320647, |
|
"learning_rate": 9.996464020708734e-06, |
|
"loss": 0.7598, |
|
"mean_token_accuracy": 0.7996020078659057, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.22813688212927757, |
|
"grad_norm": 0.8057636513151334, |
|
"learning_rate": 9.994078623338757e-06, |
|
"loss": 0.7566, |
|
"mean_token_accuracy": 0.800896917283535, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.23447401774397972, |
|
"grad_norm": 0.8990102449117932, |
|
"learning_rate": 9.991082123536902e-06, |
|
"loss": 0.7522, |
|
"mean_token_accuracy": 0.8013818353414536, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.24081115335868186, |
|
"grad_norm": 0.9698242122380497, |
|
"learning_rate": 9.987474887971067e-06, |
|
"loss": 0.7463, |
|
"mean_token_accuracy": 0.8028701841831207, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.24714828897338403, |
|
"grad_norm": 0.960138107922893, |
|
"learning_rate": 9.983257358042076e-06, |
|
"loss": 0.7401, |
|
"mean_token_accuracy": 0.8041222214698791, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.2534854245880862, |
|
"grad_norm": 0.8305799821436418, |
|
"learning_rate": 9.978430049829672e-06, |
|
"loss": 0.7601, |
|
"mean_token_accuracy": 0.8001280605793, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.2598225602027883, |
|
"grad_norm": 0.7129991697669212, |
|
"learning_rate": 9.972993554029357e-06, |
|
"loss": 0.7575, |
|
"mean_token_accuracy": 0.8003058210015297, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.2661596958174905, |
|
"grad_norm": 0.8819778135317846, |
|
"learning_rate": 9.966948535880118e-06, |
|
"loss": 0.7444, |
|
"mean_token_accuracy": 0.8032929092645645, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.27249683143219267, |
|
"grad_norm": 0.7833769376025013, |
|
"learning_rate": 9.960295735083023e-06, |
|
"loss": 0.7151, |
|
"mean_token_accuracy": 0.8091372177004814, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.2788339670468948, |
|
"grad_norm": 1.2183716099383828, |
|
"learning_rate": 9.953035965710707e-06, |
|
"loss": 0.7346, |
|
"mean_token_accuracy": 0.8045761153101921, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.28517110266159695, |
|
"grad_norm": 0.901875985078722, |
|
"learning_rate": 9.945170116107758e-06, |
|
"loss": 0.7337, |
|
"mean_token_accuracy": 0.805341312289238, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.2915082382762991, |
|
"grad_norm": 0.8010789142797352, |
|
"learning_rate": 9.936699148782018e-06, |
|
"loss": 0.737, |
|
"mean_token_accuracy": 0.8051745280623436, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.29784537389100124, |
|
"grad_norm": 0.7625075089546256, |
|
"learning_rate": 9.927624100286795e-06, |
|
"loss": 0.7288, |
|
"mean_token_accuracy": 0.8064413368701935, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.3041825095057034, |
|
"grad_norm": 0.7665506308733163, |
|
"learning_rate": 9.917946081094033e-06, |
|
"loss": 0.7001, |
|
"mean_token_accuracy": 0.8119662031531334, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.3105196451204056, |
|
"grad_norm": 0.8173005458001997, |
|
"learning_rate": 9.907666275458432e-06, |
|
"loss": 0.7171, |
|
"mean_token_accuracy": 0.8087792381644249, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.31685678073510776, |
|
"grad_norm": 0.7944707699072153, |
|
"learning_rate": 9.896785941272524e-06, |
|
"loss": 0.7169, |
|
"mean_token_accuracy": 0.808886106312275, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.3231939163498099, |
|
"grad_norm": 0.7652270272633694, |
|
"learning_rate": 9.885306409912767e-06, |
|
"loss": 0.7122, |
|
"mean_token_accuracy": 0.8092179223895073, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.32953105196451205, |
|
"grad_norm": 0.8298720811434571, |
|
"learning_rate": 9.87322908607661e-06, |
|
"loss": 0.7106, |
|
"mean_token_accuracy": 0.8099273145198822, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.3358681875792142, |
|
"grad_norm": 0.6635750146125453, |
|
"learning_rate": 9.860555447610626e-06, |
|
"loss": 0.7205, |
|
"mean_token_accuracy": 0.8083759486675263, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.34220532319391633, |
|
"grad_norm": 0.7701167215766528, |
|
"learning_rate": 9.847287045329665e-06, |
|
"loss": 0.7178, |
|
"mean_token_accuracy": 0.8084105476737022, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.3485424588086185, |
|
"grad_norm": 0.8129554147937268, |
|
"learning_rate": 9.833425502827087e-06, |
|
"loss": 0.7191, |
|
"mean_token_accuracy": 0.8078344166278839, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.3548795944233207, |
|
"grad_norm": 0.7153635278024493, |
|
"learning_rate": 9.818972516276096e-06, |
|
"loss": 0.6973, |
|
"mean_token_accuracy": 0.8126269072294235, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.3612167300380228, |
|
"grad_norm": 0.7019835045316667, |
|
"learning_rate": 9.803929854222182e-06, |
|
"loss": 0.704, |
|
"mean_token_accuracy": 0.8114176645874978, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.36755386565272496, |
|
"grad_norm": 0.7615682616292789, |
|
"learning_rate": 9.788299357366717e-06, |
|
"loss": 0.7089, |
|
"mean_token_accuracy": 0.8106587365269661, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.37389100126742714, |
|
"grad_norm": 0.9786947635111585, |
|
"learning_rate": 9.772082938341706e-06, |
|
"loss": 0.7014, |
|
"mean_token_accuracy": 0.8121261984109879, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.38022813688212925, |
|
"grad_norm": 0.8212453521500733, |
|
"learning_rate": 9.755282581475769e-06, |
|
"loss": 0.7072, |
|
"mean_token_accuracy": 0.8106094494462013, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.3865652724968314, |
|
"grad_norm": 0.865055505917381, |
|
"learning_rate": 9.7379003425513e-06, |
|
"loss": 0.7163, |
|
"mean_token_accuracy": 0.8092033118009567, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.3929024081115336, |
|
"grad_norm": 0.6716631342519259, |
|
"learning_rate": 9.71993834855293e-06, |
|
"loss": 0.7045, |
|
"mean_token_accuracy": 0.8109571009874343, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.39923954372623577, |
|
"grad_norm": 0.7649280200479434, |
|
"learning_rate": 9.701398797407258e-06, |
|
"loss": 0.7044, |
|
"mean_token_accuracy": 0.8110996559262276, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.4055766793409379, |
|
"grad_norm": 0.732205588585856, |
|
"learning_rate": 9.68228395771388e-06, |
|
"loss": 0.6906, |
|
"mean_token_accuracy": 0.8138323068618775, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.41191381495564006, |
|
"grad_norm": 0.8349157198044287, |
|
"learning_rate": 9.662596168467823e-06, |
|
"loss": 0.6963, |
|
"mean_token_accuracy": 0.8128764078021049, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.41825095057034223, |
|
"grad_norm": 0.7284163977404773, |
|
"learning_rate": 9.6423378387733e-06, |
|
"loss": 0.6926, |
|
"mean_token_accuracy": 0.8138028383255005, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.42458808618504434, |
|
"grad_norm": 0.6903566919121088, |
|
"learning_rate": 9.621511447548946e-06, |
|
"loss": 0.6992, |
|
"mean_token_accuracy": 0.8125665381550788, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.4309252217997465, |
|
"grad_norm": 0.7031778472809708, |
|
"learning_rate": 9.600119543224467e-06, |
|
"loss": 0.6935, |
|
"mean_token_accuracy": 0.8134042397141457, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.4372623574144487, |
|
"grad_norm": 0.8781454719155253, |
|
"learning_rate": 9.578164743428808e-06, |
|
"loss": 0.6938, |
|
"mean_token_accuracy": 0.8132070809602737, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.4435994930291508, |
|
"grad_norm": 0.8306105321262149, |
|
"learning_rate": 9.55564973466984e-06, |
|
"loss": 0.6928, |
|
"mean_token_accuracy": 0.8133361831307411, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.449936628643853, |
|
"grad_norm": 0.7046997598602632, |
|
"learning_rate": 9.532577272005637e-06, |
|
"loss": 0.679, |
|
"mean_token_accuracy": 0.8159057974815369, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.45627376425855515, |
|
"grad_norm": 0.889929783644245, |
|
"learning_rate": 9.508950178707335e-06, |
|
"loss": 0.6872, |
|
"mean_token_accuracy": 0.8148621737957, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.46261089987325726, |
|
"grad_norm": 0.9776823189593525, |
|
"learning_rate": 9.484771345913673e-06, |
|
"loss": 0.6902, |
|
"mean_token_accuracy": 0.8141683742403985, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.46894803548795944, |
|
"grad_norm": 0.7706175050493524, |
|
"learning_rate": 9.460043732277213e-06, |
|
"loss": 0.6908, |
|
"mean_token_accuracy": 0.8145220652222633, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.4752851711026616, |
|
"grad_norm": 0.6524214668295174, |
|
"learning_rate": 9.434770363602307e-06, |
|
"loss": 0.6983, |
|
"mean_token_accuracy": 0.8123016864061355, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.4816223067173637, |
|
"grad_norm": 0.717843515473759, |
|
"learning_rate": 9.408954332474845e-06, |
|
"loss": 0.6677, |
|
"mean_token_accuracy": 0.8185531318187713, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.4879594423320659, |
|
"grad_norm": 0.7618453217486776, |
|
"learning_rate": 9.382598797883811e-06, |
|
"loss": 0.6795, |
|
"mean_token_accuracy": 0.8164624303579331, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.49429657794676807, |
|
"grad_norm": 0.7324610745032628, |
|
"learning_rate": 9.355706984834765e-06, |
|
"loss": 0.6836, |
|
"mean_token_accuracy": 0.8149291038513183, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.5006337135614702, |
|
"grad_norm": 0.6779242960146721, |
|
"learning_rate": 9.328282183955179e-06, |
|
"loss": 0.6884, |
|
"mean_token_accuracy": 0.8146958678960801, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.5069708491761724, |
|
"grad_norm": 0.827105664596769, |
|
"learning_rate": 9.300327751091806e-06, |
|
"loss": 0.6873, |
|
"mean_token_accuracy": 0.814927139878273, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.5133079847908745, |
|
"grad_norm": 0.6798030291558349, |
|
"learning_rate": 9.271847106900022e-06, |
|
"loss": 0.6659, |
|
"mean_token_accuracy": 0.8187542855739594, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.5196451204055766, |
|
"grad_norm": 0.6549281773308409, |
|
"learning_rate": 9.242843736425269e-06, |
|
"loss": 0.6749, |
|
"mean_token_accuracy": 0.8172334164381028, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.5259822560202788, |
|
"grad_norm": 0.702757870059226, |
|
"learning_rate": 9.213321188676595e-06, |
|
"loss": 0.6799, |
|
"mean_token_accuracy": 0.8162769109010697, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.532319391634981, |
|
"grad_norm": 0.663720096138884, |
|
"learning_rate": 9.183283076192386e-06, |
|
"loss": 0.6688, |
|
"mean_token_accuracy": 0.8184930950403213, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.5386565272496832, |
|
"grad_norm": 0.6874302061015839, |
|
"learning_rate": 9.152733074598312e-06, |
|
"loss": 0.6742, |
|
"mean_token_accuracy": 0.8174020066857338, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.5449936628643853, |
|
"grad_norm": 0.7315428759079102, |
|
"learning_rate": 9.121674922157558e-06, |
|
"loss": 0.6738, |
|
"mean_token_accuracy": 0.817636775970459, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.5513307984790875, |
|
"grad_norm": 0.786643495084509, |
|
"learning_rate": 9.090112419313395e-06, |
|
"loss": 0.6736, |
|
"mean_token_accuracy": 0.817160977423191, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.5576679340937896, |
|
"grad_norm": 0.6591137928729695, |
|
"learning_rate": 9.058049428224128e-06, |
|
"loss": 0.6617, |
|
"mean_token_accuracy": 0.8197388723492622, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.5640050697084917, |
|
"grad_norm": 0.7812371618484959, |
|
"learning_rate": 9.025489872290511e-06, |
|
"loss": 0.6634, |
|
"mean_token_accuracy": 0.8193035304546357, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.5703422053231939, |
|
"grad_norm": 0.6845961589145344, |
|
"learning_rate": 8.99243773567565e-06, |
|
"loss": 0.6834, |
|
"mean_token_accuracy": 0.8159178540110588, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.5766793409378961, |
|
"grad_norm": 0.6657472122738636, |
|
"learning_rate": 8.958897062817491e-06, |
|
"loss": 0.6892, |
|
"mean_token_accuracy": 0.8144657433032989, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.5830164765525983, |
|
"grad_norm": 0.6161660996953076, |
|
"learning_rate": 8.924871957933904e-06, |
|
"loss": 0.6746, |
|
"mean_token_accuracy": 0.8171708762645722, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.5893536121673004, |
|
"grad_norm": 0.702287591616562, |
|
"learning_rate": 8.890366584520482e-06, |
|
"loss": 0.6696, |
|
"mean_token_accuracy": 0.8184025406837463, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.5956907477820025, |
|
"grad_norm": 0.6857028656369465, |
|
"learning_rate": 8.855385164841072e-06, |
|
"loss": 0.6758, |
|
"mean_token_accuracy": 0.8170812010765076, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.6020278833967047, |
|
"grad_norm": 0.6231296442226781, |
|
"learning_rate": 8.819931979411107e-06, |
|
"loss": 0.6734, |
|
"mean_token_accuracy": 0.81716128885746, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.6083650190114068, |
|
"grad_norm": 0.6948180768376443, |
|
"learning_rate": 8.78401136647383e-06, |
|
"loss": 0.654, |
|
"mean_token_accuracy": 0.8216980487108231, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.614702154626109, |
|
"grad_norm": 0.6911375954678098, |
|
"learning_rate": 8.747627721469437e-06, |
|
"loss": 0.6635, |
|
"mean_token_accuracy": 0.8201975762844086, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.6210392902408112, |
|
"grad_norm": 0.7213580744708442, |
|
"learning_rate": 8.710785496497226e-06, |
|
"loss": 0.6651, |
|
"mean_token_accuracy": 0.8194010749459266, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.6273764258555133, |
|
"grad_norm": 0.6543956981962712, |
|
"learning_rate": 8.673489199770819e-06, |
|
"loss": 0.6607, |
|
"mean_token_accuracy": 0.8201611772179603, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.6337135614702155, |
|
"grad_norm": 0.7036498788596002, |
|
"learning_rate": 8.635743395066511e-06, |
|
"loss": 0.651, |
|
"mean_token_accuracy": 0.8222277790307999, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.6400506970849176, |
|
"grad_norm": 0.6168799535449692, |
|
"learning_rate": 8.597552701164818e-06, |
|
"loss": 0.6592, |
|
"mean_token_accuracy": 0.8199419066309929, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.6463878326996197, |
|
"grad_norm": 0.697481072279894, |
|
"learning_rate": 8.558921791285304e-06, |
|
"loss": 0.6513, |
|
"mean_token_accuracy": 0.8216616123914718, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.6527249683143219, |
|
"grad_norm": 0.7978637983888536, |
|
"learning_rate": 8.519855392514734e-06, |
|
"loss": 0.6469, |
|
"mean_token_accuracy": 0.8225123390555382, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.6590621039290241, |
|
"grad_norm": 0.709080907162204, |
|
"learning_rate": 8.480358285228648e-06, |
|
"loss": 0.6656, |
|
"mean_token_accuracy": 0.8191539570689201, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.6653992395437263, |
|
"grad_norm": 0.7897211062263881, |
|
"learning_rate": 8.440435302506405e-06, |
|
"loss": 0.6412, |
|
"mean_token_accuracy": 0.8238195776939392, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.6717363751584284, |
|
"grad_norm": 0.6661911891034582, |
|
"learning_rate": 8.400091329539784e-06, |
|
"loss": 0.6611, |
|
"mean_token_accuracy": 0.8201816022396088, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.6780735107731305, |
|
"grad_norm": 0.6195761512766995, |
|
"learning_rate": 8.359331303035205e-06, |
|
"loss": 0.6593, |
|
"mean_token_accuracy": 0.8203893005847931, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.6844106463878327, |
|
"grad_norm": 0.6310438648482855, |
|
"learning_rate": 8.31816021060964e-06, |
|
"loss": 0.6634, |
|
"mean_token_accuracy": 0.8192948743700981, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.6907477820025348, |
|
"grad_norm": 0.6512895144306936, |
|
"learning_rate": 8.276583090180311e-06, |
|
"loss": 0.6666, |
|
"mean_token_accuracy": 0.8186753287911415, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.697084917617237, |
|
"grad_norm": 0.6124105415248355, |
|
"learning_rate": 8.234605029348224e-06, |
|
"loss": 0.6511, |
|
"mean_token_accuracy": 0.8219994261860848, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.7034220532319392, |
|
"grad_norm": 0.6601442192692848, |
|
"learning_rate": 8.192231164775609e-06, |
|
"loss": 0.6391, |
|
"mean_token_accuracy": 0.8252027094364166, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.7097591888466414, |
|
"grad_norm": 0.7028934088221325, |
|
"learning_rate": 8.149466681557384e-06, |
|
"loss": 0.6558, |
|
"mean_token_accuracy": 0.8209778189659118, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.7160963244613435, |
|
"grad_norm": 0.7857750596740971, |
|
"learning_rate": 8.106316812586676e-06, |
|
"loss": 0.6486, |
|
"mean_token_accuracy": 0.8220974311232567, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.7224334600760456, |
|
"grad_norm": 0.8961234969300941, |
|
"learning_rate": 8.062786837914492e-06, |
|
"loss": 0.6386, |
|
"mean_token_accuracy": 0.824979268014431, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.7287705956907478, |
|
"grad_norm": 0.686280664966789, |
|
"learning_rate": 8.01888208410362e-06, |
|
"loss": 0.6622, |
|
"mean_token_accuracy": 0.8198226556181908, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.7351077313054499, |
|
"grad_norm": 0.8344864616701414, |
|
"learning_rate": 7.974607923576859e-06, |
|
"loss": 0.6537, |
|
"mean_token_accuracy": 0.821578212082386, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.7414448669201521, |
|
"grad_norm": 0.9938826585970929, |
|
"learning_rate": 7.9299697739596e-06, |
|
"loss": 0.6544, |
|
"mean_token_accuracy": 0.8208117336034775, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.7477820025348543, |
|
"grad_norm": 0.6249233717628465, |
|
"learning_rate": 7.884973097416908e-06, |
|
"loss": 0.6591, |
|
"mean_token_accuracy": 0.8208227157592773, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.7541191381495564, |
|
"grad_norm": 0.6761543444596165, |
|
"learning_rate": 7.83962339998514e-06, |
|
"loss": 0.6439, |
|
"mean_token_accuracy": 0.8236203759908676, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.7604562737642585, |
|
"grad_norm": 0.8850862666109794, |
|
"learning_rate": 7.793926230898187e-06, |
|
"loss": 0.6418, |
|
"mean_token_accuracy": 0.8238036289811135, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.7667934093789607, |
|
"grad_norm": 0.6931013119334469, |
|
"learning_rate": 7.747887181908464e-06, |
|
"loss": 0.6513, |
|
"mean_token_accuracy": 0.8221172288060188, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.7731305449936628, |
|
"grad_norm": 0.9142852384539434, |
|
"learning_rate": 7.701511886602643e-06, |
|
"loss": 0.6522, |
|
"mean_token_accuracy": 0.8214233443140984, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.779467680608365, |
|
"grad_norm": 0.693942629552867, |
|
"learning_rate": 7.65480601971232e-06, |
|
"loss": 0.6555, |
|
"mean_token_accuracy": 0.8214162334799766, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.7858048162230672, |
|
"grad_norm": 0.7185534733688809, |
|
"learning_rate": 7.6077752964196095e-06, |
|
"loss": 0.6514, |
|
"mean_token_accuracy": 0.821819719672203, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.7921419518377694, |
|
"grad_norm": 0.7933553753697458, |
|
"learning_rate": 7.560425471657814e-06, |
|
"loss": 0.6507, |
|
"mean_token_accuracy": 0.8215969070792198, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.7984790874524715, |
|
"grad_norm": 0.9942445013974323, |
|
"learning_rate": 7.512762339407214e-06, |
|
"loss": 0.6426, |
|
"mean_token_accuracy": 0.8233709827065467, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.8048162230671736, |
|
"grad_norm": 0.7111122316238967, |
|
"learning_rate": 7.464791731986084e-06, |
|
"loss": 0.6446, |
|
"mean_token_accuracy": 0.8233424022793769, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.8111533586818758, |
|
"grad_norm": 0.6760326829400325, |
|
"learning_rate": 7.4165195193370245e-06, |
|
"loss": 0.6411, |
|
"mean_token_accuracy": 0.8234749510884285, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.8174904942965779, |
|
"grad_norm": 0.7157859491675397, |
|
"learning_rate": 7.3679516083086785e-06, |
|
"loss": 0.6403, |
|
"mean_token_accuracy": 0.8245514526963234, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.8238276299112801, |
|
"grad_norm": 0.6125130117848593, |
|
"learning_rate": 7.319093941932941e-06, |
|
"loss": 0.648, |
|
"mean_token_accuracy": 0.8229272648692131, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.8301647655259823, |
|
"grad_norm": 0.6193392226038144, |
|
"learning_rate": 7.269952498697734e-06, |
|
"loss": 0.6568, |
|
"mean_token_accuracy": 0.8208993718028068, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.8365019011406845, |
|
"grad_norm": 0.5569382668639404, |
|
"learning_rate": 7.2205332918154525e-06, |
|
"loss": 0.6471, |
|
"mean_token_accuracy": 0.8230623930692673, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.8428390367553865, |
|
"grad_norm": 0.6854397276184668, |
|
"learning_rate": 7.170842368487145e-06, |
|
"loss": 0.6394, |
|
"mean_token_accuracy": 0.8240847915410996, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.8491761723700887, |
|
"grad_norm": 0.7247430930413721, |
|
"learning_rate": 7.120885809162561e-06, |
|
"loss": 0.6496, |
|
"mean_token_accuracy": 0.8226393803954124, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.8555133079847909, |
|
"grad_norm": 0.5833185802048395, |
|
"learning_rate": 7.070669726796095e-06, |
|
"loss": 0.644, |
|
"mean_token_accuracy": 0.8238432243466377, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.861850443599493, |
|
"grad_norm": 0.6587621871435737, |
|
"learning_rate": 7.020200266098791e-06, |
|
"loss": 0.6367, |
|
"mean_token_accuracy": 0.8251640364527703, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.8681875792141952, |
|
"grad_norm": 0.9240470458812879, |
|
"learning_rate": 6.969483602786429e-06, |
|
"loss": 0.6335, |
|
"mean_token_accuracy": 0.8250990778207778, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.8745247148288974, |
|
"grad_norm": 0.6647921620988979, |
|
"learning_rate": 6.918525942823836e-06, |
|
"loss": 0.6358, |
|
"mean_token_accuracy": 0.8253032699227333, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.8808618504435995, |
|
"grad_norm": 0.7460235517208977, |
|
"learning_rate": 6.8673335216654945e-06, |
|
"loss": 0.6364, |
|
"mean_token_accuracy": 0.8251613467931748, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.8871989860583016, |
|
"grad_norm": 0.5692165964237054, |
|
"learning_rate": 6.815912603492531e-06, |
|
"loss": 0.63, |
|
"mean_token_accuracy": 0.8269012838602066, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.8935361216730038, |
|
"grad_norm": 0.7678044598257266, |
|
"learning_rate": 6.7642694804462026e-06, |
|
"loss": 0.641, |
|
"mean_token_accuracy": 0.8240568235516548, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.899873257287706, |
|
"grad_norm": 0.6476587911488177, |
|
"learning_rate": 6.712410471857955e-06, |
|
"loss": 0.6389, |
|
"mean_token_accuracy": 0.8243090897798538, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.9062103929024081, |
|
"grad_norm": 0.6996232991940935, |
|
"learning_rate": 6.660341923476152e-06, |
|
"loss": 0.6309, |
|
"mean_token_accuracy": 0.8264057099819183, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.9125475285171103, |
|
"grad_norm": 0.6140056059724183, |
|
"learning_rate": 6.608070206689583e-06, |
|
"loss": 0.6284, |
|
"mean_token_accuracy": 0.826878672838211, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.9188846641318125, |
|
"grad_norm": 0.5994244215051143, |
|
"learning_rate": 6.555601717747815e-06, |
|
"loss": 0.6469, |
|
"mean_token_accuracy": 0.8231760680675506, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.9252217997465145, |
|
"grad_norm": 0.671715865180922, |
|
"learning_rate": 6.502942876978524e-06, |
|
"loss": 0.626, |
|
"mean_token_accuracy": 0.8275385439395905, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.9315589353612167, |
|
"grad_norm": 0.6964725986892187, |
|
"learning_rate": 6.450100128001861e-06, |
|
"loss": 0.615, |
|
"mean_token_accuracy": 0.8296460658311844, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.9378960709759189, |
|
"grad_norm": 0.6643867039068622, |
|
"learning_rate": 6.397079936941975e-06, |
|
"loss": 0.6425, |
|
"mean_token_accuracy": 0.823666226863861, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.944233206590621, |
|
"grad_norm": 0.612108400302355, |
|
"learning_rate": 6.343888791635797e-06, |
|
"loss": 0.6222, |
|
"mean_token_accuracy": 0.8274678066372871, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.9505703422053232, |
|
"grad_norm": 0.5888135214791528, |
|
"learning_rate": 6.2905332008391304e-06, |
|
"loss": 0.6457, |
|
"mean_token_accuracy": 0.8232318565249443, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.9569074778200254, |
|
"grad_norm": 0.6023978340437303, |
|
"learning_rate": 6.237019693430227e-06, |
|
"loss": 0.6244, |
|
"mean_token_accuracy": 0.8275379940867424, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.9632446134347274, |
|
"grad_norm": 0.5860893552553069, |
|
"learning_rate": 6.18335481761086e-06, |
|
"loss": 0.6258, |
|
"mean_token_accuracy": 0.8275753378868103, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.9695817490494296, |
|
"grad_norm": 0.6183329734308459, |
|
"learning_rate": 6.1295451401050645e-06, |
|
"loss": 0.6487, |
|
"mean_token_accuracy": 0.8231626331806183, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.9759188846641318, |
|
"grad_norm": 0.6472859730529533, |
|
"learning_rate": 6.075597245355589e-06, |
|
"loss": 0.6367, |
|
"mean_token_accuracy": 0.8252906337380409, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.982256020278834, |
|
"grad_norm": 0.7048827333728572, |
|
"learning_rate": 6.021517734718193e-06, |
|
"loss": 0.6331, |
|
"mean_token_accuracy": 0.8252324685454369, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.9885931558935361, |
|
"grad_norm": 0.670917489168749, |
|
"learning_rate": 5.967313225653863e-06, |
|
"loss": 0.6311, |
|
"mean_token_accuracy": 0.8262254923582077, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.9949302915082383, |
|
"grad_norm": 0.6294039276801214, |
|
"learning_rate": 5.912990350919075e-06, |
|
"loss": 0.6366, |
|
"mean_token_accuracy": 0.8250793889164925, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 1.0012674271229405, |
|
"grad_norm": 0.5750660780176277, |
|
"learning_rate": 5.85855575775416e-06, |
|
"loss": 0.6356, |
|
"mean_token_accuracy": 0.8255759388208389, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.0076045627376427, |
|
"grad_norm": 0.5873083803261483, |
|
"learning_rate": 5.804016107069922e-06, |
|
"loss": 0.5899, |
|
"mean_token_accuracy": 0.8365576922893524, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 1.0139416983523448, |
|
"grad_norm": 0.7245732012982048, |
|
"learning_rate": 5.749378072632572e-06, |
|
"loss": 0.5924, |
|
"mean_token_accuracy": 0.8353384211659431, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.020278833967047, |
|
"grad_norm": 0.5625102045050165, |
|
"learning_rate": 5.694648340247087e-06, |
|
"loss": 0.5855, |
|
"mean_token_accuracy": 0.8365451633930207, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 1.026615969581749, |
|
"grad_norm": 0.5811796962078384, |
|
"learning_rate": 5.639833606939103e-06, |
|
"loss": 0.5835, |
|
"mean_token_accuracy": 0.8374374285340309, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.0329531051964511, |
|
"grad_norm": 0.6064815307080854, |
|
"learning_rate": 5.584940580135423e-06, |
|
"loss": 0.5918, |
|
"mean_token_accuracy": 0.835510890185833, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 1.0392902408111533, |
|
"grad_norm": 0.5469353793310435, |
|
"learning_rate": 5.529975976843268e-06, |
|
"loss": 0.5765, |
|
"mean_token_accuracy": 0.839336322247982, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.0456273764258555, |
|
"grad_norm": 0.591194718615289, |
|
"learning_rate": 5.474946522828344e-06, |
|
"loss": 0.571, |
|
"mean_token_accuracy": 0.8397138401865959, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 1.0519645120405576, |
|
"grad_norm": 0.6529351075330402, |
|
"learning_rate": 5.419858951791842e-06, |
|
"loss": 0.587, |
|
"mean_token_accuracy": 0.8367372244596482, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.0583016476552598, |
|
"grad_norm": 0.5750159656566077, |
|
"learning_rate": 5.364720004546467e-06, |
|
"loss": 0.5713, |
|
"mean_token_accuracy": 0.8396085217595101, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 1.064638783269962, |
|
"grad_norm": 0.5356356446036812, |
|
"learning_rate": 5.3095364281915905e-06, |
|
"loss": 0.5743, |
|
"mean_token_accuracy": 0.8390779420733452, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.0709759188846641, |
|
"grad_norm": 0.5657627605570825, |
|
"learning_rate": 5.254314975287649e-06, |
|
"loss": 0.5768, |
|
"mean_token_accuracy": 0.8388962477445603, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 1.0773130544993663, |
|
"grad_norm": 0.5994046834255601, |
|
"learning_rate": 5.199062403029851e-06, |
|
"loss": 0.5779, |
|
"mean_token_accuracy": 0.838576190173626, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.0836501901140685, |
|
"grad_norm": 0.5512378922303693, |
|
"learning_rate": 5.143785472421341e-06, |
|
"loss": 0.5736, |
|
"mean_token_accuracy": 0.8392498835921287, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 1.0899873257287707, |
|
"grad_norm": 0.6231063067990558, |
|
"learning_rate": 5.088490947445884e-06, |
|
"loss": 0.5787, |
|
"mean_token_accuracy": 0.8382582783699035, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.0963244613434728, |
|
"grad_norm": 0.6258759211004005, |
|
"learning_rate": 5.033185594240184e-06, |
|
"loss": 0.5867, |
|
"mean_token_accuracy": 0.8368578165769577, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 1.102661596958175, |
|
"grad_norm": 0.5758426513877979, |
|
"learning_rate": 4.977876180265948e-06, |
|
"loss": 0.5781, |
|
"mean_token_accuracy": 0.8380098447203637, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.1089987325728772, |
|
"grad_norm": 0.5362099307940532, |
|
"learning_rate": 4.922569473481779e-06, |
|
"loss": 0.579, |
|
"mean_token_accuracy": 0.8374864637851716, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 1.1153358681875791, |
|
"grad_norm": 0.6117359275633708, |
|
"learning_rate": 4.867272241515013e-06, |
|
"loss": 0.5745, |
|
"mean_token_accuracy": 0.8394086301326752, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.1216730038022813, |
|
"grad_norm": 0.6163525031585341, |
|
"learning_rate": 4.811991250833598e-06, |
|
"loss": 0.575, |
|
"mean_token_accuracy": 0.8387202203273774, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 1.1280101394169835, |
|
"grad_norm": 0.5344005108748248, |
|
"learning_rate": 4.756733265918111e-06, |
|
"loss": 0.5805, |
|
"mean_token_accuracy": 0.8385160818696022, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.1343472750316856, |
|
"grad_norm": 0.5606427842219186, |
|
"learning_rate": 4.701505048434017e-06, |
|
"loss": 0.58, |
|
"mean_token_accuracy": 0.837983712553978, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 1.1406844106463878, |
|
"grad_norm": 0.5635525365545201, |
|
"learning_rate": 4.646313356404278e-06, |
|
"loss": 0.5721, |
|
"mean_token_accuracy": 0.8402201250195503, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.14702154626109, |
|
"grad_norm": 0.5279253266696204, |
|
"learning_rate": 4.5911649433824055e-06, |
|
"loss": 0.5722, |
|
"mean_token_accuracy": 0.8398120388388634, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 1.1533586818757922, |
|
"grad_norm": 0.5355638715895371, |
|
"learning_rate": 4.536066557626057e-06, |
|
"loss": 0.5717, |
|
"mean_token_accuracy": 0.8396236389875412, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.1596958174904943, |
|
"grad_norm": 0.5298050755566127, |
|
"learning_rate": 4.481024941271283e-06, |
|
"loss": 0.5825, |
|
"mean_token_accuracy": 0.837471354007721, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 1.1660329531051965, |
|
"grad_norm": 0.6099091835516977, |
|
"learning_rate": 4.426046829507525e-06, |
|
"loss": 0.5739, |
|
"mean_token_accuracy": 0.8395572647452354, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.1723700887198987, |
|
"grad_norm": 0.5282685583180019, |
|
"learning_rate": 4.371138949753457e-06, |
|
"loss": 0.5758, |
|
"mean_token_accuracy": 0.8386889979243278, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 1.1787072243346008, |
|
"grad_norm": 0.5498053929758666, |
|
"learning_rate": 4.316308020833788e-06, |
|
"loss": 0.5717, |
|
"mean_token_accuracy": 0.8401581376791001, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.1850443599493028, |
|
"grad_norm": 0.545684866299052, |
|
"learning_rate": 4.261560752157106e-06, |
|
"loss": 0.5821, |
|
"mean_token_accuracy": 0.8375889748334885, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 1.1913814955640052, |
|
"grad_norm": 0.5275676276739754, |
|
"learning_rate": 4.20690384289488e-06, |
|
"loss": 0.5865, |
|
"mean_token_accuracy": 0.8369634434580803, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.1977186311787071, |
|
"grad_norm": 0.5147709084468725, |
|
"learning_rate": 4.152343981161713e-06, |
|
"loss": 0.5735, |
|
"mean_token_accuracy": 0.8388126537203788, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 1.2040557667934093, |
|
"grad_norm": 0.5553445767071536, |
|
"learning_rate": 4.097887843196949e-06, |
|
"loss": 0.5706, |
|
"mean_token_accuracy": 0.8400391504168511, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.2103929024081115, |
|
"grad_norm": 0.5755093000837989, |
|
"learning_rate": 4.043542092547729e-06, |
|
"loss": 0.5738, |
|
"mean_token_accuracy": 0.8393745362758637, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 1.2167300380228137, |
|
"grad_norm": 0.5323369182306833, |
|
"learning_rate": 3.989313379253609e-06, |
|
"loss": 0.5707, |
|
"mean_token_accuracy": 0.8395906254649163, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.2230671736375158, |
|
"grad_norm": 0.5398923057065517, |
|
"learning_rate": 3.935208339032819e-06, |
|
"loss": 0.5773, |
|
"mean_token_accuracy": 0.8380544230341911, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 1.229404309252218, |
|
"grad_norm": 0.5138506118324425, |
|
"learning_rate": 3.881233592470287e-06, |
|
"loss": 0.5697, |
|
"mean_token_accuracy": 0.8401115134358406, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.2357414448669202, |
|
"grad_norm": 0.531254594806762, |
|
"learning_rate": 3.827395744207504e-06, |
|
"loss": 0.5802, |
|
"mean_token_accuracy": 0.8385789826512337, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 1.2420785804816223, |
|
"grad_norm": 0.5209427066358759, |
|
"learning_rate": 3.773701382134345e-06, |
|
"loss": 0.5788, |
|
"mean_token_accuracy": 0.8383644595742226, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.2484157160963245, |
|
"grad_norm": 0.4981386065382922, |
|
"learning_rate": 3.7201570765829405e-06, |
|
"loss": 0.5803, |
|
"mean_token_accuracy": 0.8378679618239403, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 1.2547528517110267, |
|
"grad_norm": 0.5310216835837045, |
|
"learning_rate": 3.666769379523695e-06, |
|
"loss": 0.5816, |
|
"mean_token_accuracy": 0.8382963240146637, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.2610899873257289, |
|
"grad_norm": 0.5302964748937399, |
|
"learning_rate": 3.6135448237635505e-06, |
|
"loss": 0.568, |
|
"mean_token_accuracy": 0.8408621445298194, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 1.2674271229404308, |
|
"grad_norm": 0.6043312455865852, |
|
"learning_rate": 3.5604899221466003e-06, |
|
"loss": 0.5797, |
|
"mean_token_accuracy": 0.837955892086029, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.2737642585551332, |
|
"grad_norm": 0.5404711838738012, |
|
"learning_rate": 3.507611166757141e-06, |
|
"loss": 0.577, |
|
"mean_token_accuracy": 0.8382121488451958, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 1.2801013941698351, |
|
"grad_norm": 0.5313905403777647, |
|
"learning_rate": 3.4549150281252635e-06, |
|
"loss": 0.5759, |
|
"mean_token_accuracy": 0.8386555135250091, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.2864385297845373, |
|
"grad_norm": 0.5312545340451698, |
|
"learning_rate": 3.4024079544350874e-06, |
|
"loss": 0.5766, |
|
"mean_token_accuracy": 0.8384982272982597, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 1.2927756653992395, |
|
"grad_norm": 0.574010488002488, |
|
"learning_rate": 3.3500963707357236e-06, |
|
"loss": 0.5817, |
|
"mean_token_accuracy": 0.838199020922184, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.2991128010139417, |
|
"grad_norm": 0.5162313236359333, |
|
"learning_rate": 3.297986678155074e-06, |
|
"loss": 0.5596, |
|
"mean_token_accuracy": 0.8421908557415009, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 1.3054499366286438, |
|
"grad_norm": 0.6187258006031299, |
|
"learning_rate": 3.24608525311655e-06, |
|
"loss": 0.5633, |
|
"mean_token_accuracy": 0.842179323732853, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 1.311787072243346, |
|
"grad_norm": 0.5140882862368508, |
|
"learning_rate": 3.1943984465588253e-06, |
|
"loss": 0.5704, |
|
"mean_token_accuracy": 0.8403183802962303, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 1.3181242078580482, |
|
"grad_norm": 0.5261806551468972, |
|
"learning_rate": 3.142932583158693e-06, |
|
"loss": 0.5664, |
|
"mean_token_accuracy": 0.8412504211068154, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.3244613434727504, |
|
"grad_norm": 0.5355046745744655, |
|
"learning_rate": 3.0916939605571534e-06, |
|
"loss": 0.5668, |
|
"mean_token_accuracy": 0.8411947041749954, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 1.3307984790874525, |
|
"grad_norm": 0.5828342485781398, |
|
"learning_rate": 3.040688848588788e-06, |
|
"loss": 0.5683, |
|
"mean_token_accuracy": 0.8403848618268966, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.3371356147021547, |
|
"grad_norm": 0.515568887419182, |
|
"learning_rate": 2.989923488514566e-06, |
|
"loss": 0.5734, |
|
"mean_token_accuracy": 0.8396067947149277, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 1.3434727503168569, |
|
"grad_norm": 0.533119717549416, |
|
"learning_rate": 2.9394040922581123e-06, |
|
"loss": 0.5788, |
|
"mean_token_accuracy": 0.8387560814619064, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.3498098859315588, |
|
"grad_norm": 0.5574493299249907, |
|
"learning_rate": 2.889136841645592e-06, |
|
"loss": 0.5738, |
|
"mean_token_accuracy": 0.839569516479969, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 1.3561470215462612, |
|
"grad_norm": 0.5301348229908708, |
|
"learning_rate": 2.839127887649271e-06, |
|
"loss": 0.5751, |
|
"mean_token_accuracy": 0.8394772946834564, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 1.3624841571609632, |
|
"grad_norm": 0.5071728571486687, |
|
"learning_rate": 2.789383349634841e-06, |
|
"loss": 0.5711, |
|
"mean_token_accuracy": 0.8398226588964463, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 1.3688212927756653, |
|
"grad_norm": 0.4997381831510659, |
|
"learning_rate": 2.73990931461263e-06, |
|
"loss": 0.5783, |
|
"mean_token_accuracy": 0.8384912863373757, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.3751584283903675, |
|
"grad_norm": 0.5019388182436546, |
|
"learning_rate": 2.690711836492758e-06, |
|
"loss": 0.5711, |
|
"mean_token_accuracy": 0.8396464511752129, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 1.3814955640050697, |
|
"grad_norm": 0.5165116686484276, |
|
"learning_rate": 2.6417969353443484e-06, |
|
"loss": 0.5721, |
|
"mean_token_accuracy": 0.8395859107375145, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 1.3878326996197718, |
|
"grad_norm": 0.5372603660779312, |
|
"learning_rate": 2.5931705966588803e-06, |
|
"loss": 0.5826, |
|
"mean_token_accuracy": 0.8370852112770081, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 1.394169835234474, |
|
"grad_norm": 0.5104565997924485, |
|
"learning_rate": 2.544838770617772e-06, |
|
"loss": 0.5785, |
|
"mean_token_accuracy": 0.8393797069787979, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.4005069708491762, |
|
"grad_norm": 0.5336610190327751, |
|
"learning_rate": 2.496807371364283e-06, |
|
"loss": 0.5759, |
|
"mean_token_accuracy": 0.8390834912657738, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 1.4068441064638784, |
|
"grad_norm": 0.662951455066245, |
|
"learning_rate": 2.44908227627983e-06, |
|
"loss": 0.5712, |
|
"mean_token_accuracy": 0.8397842928767204, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 1.4131812420785805, |
|
"grad_norm": 0.5438222471825553, |
|
"learning_rate": 2.4016693252647954e-06, |
|
"loss": 0.5703, |
|
"mean_token_accuracy": 0.8397609844803811, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 1.4195183776932827, |
|
"grad_norm": 0.5457903944622784, |
|
"learning_rate": 2.3545743200239303e-06, |
|
"loss": 0.5756, |
|
"mean_token_accuracy": 0.8387856274843216, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 1.4258555133079849, |
|
"grad_norm": 0.5413159299268847, |
|
"learning_rate": 2.3078030233564203e-06, |
|
"loss": 0.5796, |
|
"mean_token_accuracy": 0.8379950270056724, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 1.4321926489226868, |
|
"grad_norm": 0.5017485230997426, |
|
"learning_rate": 2.2613611584507227e-06, |
|
"loss": 0.5843, |
|
"mean_token_accuracy": 0.8371415048837662, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 1.4385297845373892, |
|
"grad_norm": 0.5036035556859302, |
|
"learning_rate": 2.215254408184249e-06, |
|
"loss": 0.5733, |
|
"mean_token_accuracy": 0.8397385001182556, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 1.4448669201520912, |
|
"grad_norm": 0.5512472367603704, |
|
"learning_rate": 2.169488414427969e-06, |
|
"loss": 0.5665, |
|
"mean_token_accuracy": 0.8411229193210602, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 1.4512040557667933, |
|
"grad_norm": 0.5122324337296091, |
|
"learning_rate": 2.1240687773560476e-06, |
|
"loss": 0.5754, |
|
"mean_token_accuracy": 0.838901475071907, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 1.4575411913814955, |
|
"grad_norm": 0.514428924855705, |
|
"learning_rate": 2.0790010547605743e-06, |
|
"loss": 0.5773, |
|
"mean_token_accuracy": 0.8385174334049225, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.4638783269961977, |
|
"grad_norm": 0.541489817693485, |
|
"learning_rate": 2.0342907613714837e-06, |
|
"loss": 0.5724, |
|
"mean_token_accuracy": 0.839878860116005, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 1.4702154626108999, |
|
"grad_norm": 0.5233399327286699, |
|
"learning_rate": 1.989943368181741e-06, |
|
"loss": 0.5683, |
|
"mean_token_accuracy": 0.8406485706567765, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 1.476552598225602, |
|
"grad_norm": 0.4977622157535387, |
|
"learning_rate": 1.945964301777883e-06, |
|
"loss": 0.5568, |
|
"mean_token_accuracy": 0.8429565221071244, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 1.4828897338403042, |
|
"grad_norm": 0.502171168050283, |
|
"learning_rate": 1.9023589436759954e-06, |
|
"loss": 0.555, |
|
"mean_token_accuracy": 0.8435925453901291, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 1.4892268694550064, |
|
"grad_norm": 0.5026240018805591, |
|
"learning_rate": 1.859132629663194e-06, |
|
"loss": 0.5609, |
|
"mean_token_accuracy": 0.8420811951160431, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 1.4955640050697085, |
|
"grad_norm": 0.5071369135189446, |
|
"learning_rate": 1.8162906491447136e-06, |
|
"loss": 0.5751, |
|
"mean_token_accuracy": 0.8397066414356231, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 1.5019011406844105, |
|
"grad_norm": 0.5012155091792143, |
|
"learning_rate": 1.7738382444966668e-06, |
|
"loss": 0.5714, |
|
"mean_token_accuracy": 0.839833353459835, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 1.508238276299113, |
|
"grad_norm": 0.4943163959620169, |
|
"learning_rate": 1.7317806104245599e-06, |
|
"loss": 0.5614, |
|
"mean_token_accuracy": 0.8422631338238716, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 1.5145754119138148, |
|
"grad_norm": 0.5168969148185261, |
|
"learning_rate": 1.6901228933276381e-06, |
|
"loss": 0.5734, |
|
"mean_token_accuracy": 0.8398737594485283, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 1.5209125475285172, |
|
"grad_norm": 0.5085722470934201, |
|
"learning_rate": 1.6488701906691462e-06, |
|
"loss": 0.5743, |
|
"mean_token_accuracy": 0.8395018294453621, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.5272496831432192, |
|
"grad_norm": 0.5145560441594629, |
|
"learning_rate": 1.6080275503525754e-06, |
|
"loss": 0.5714, |
|
"mean_token_accuracy": 0.8400074362754821, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 1.5335868187579216, |
|
"grad_norm": 0.5142209477213089, |
|
"learning_rate": 1.5675999701039734e-06, |
|
"loss": 0.5731, |
|
"mean_token_accuracy": 0.8395378664135933, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 1.5399239543726235, |
|
"grad_norm": 0.4817695083655761, |
|
"learning_rate": 1.5275923968603967e-06, |
|
"loss": 0.5668, |
|
"mean_token_accuracy": 0.840859878063202, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 1.5462610899873257, |
|
"grad_norm": 0.4958218170076731, |
|
"learning_rate": 1.4880097261645765e-06, |
|
"loss": 0.575, |
|
"mean_token_accuracy": 0.8392793446779251, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 1.5525982256020279, |
|
"grad_norm": 0.5150469794513786, |
|
"learning_rate": 1.4488568015658738e-06, |
|
"loss": 0.5702, |
|
"mean_token_accuracy": 0.8403733685612679, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 1.55893536121673, |
|
"grad_norm": 0.5415616286404993, |
|
"learning_rate": 1.4101384140275947e-06, |
|
"loss": 0.5724, |
|
"mean_token_accuracy": 0.8399771124124527, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 1.5652724968314322, |
|
"grad_norm": 0.5125659970580118, |
|
"learning_rate": 1.3718593013407455e-06, |
|
"loss": 0.565, |
|
"mean_token_accuracy": 0.8413113921880722, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 1.5716096324461344, |
|
"grad_norm": 0.5172557001838594, |
|
"learning_rate": 1.3340241475442889e-06, |
|
"loss": 0.5666, |
|
"mean_token_accuracy": 0.8413270160555839, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 1.5779467680608366, |
|
"grad_norm": 0.5218390924731011, |
|
"learning_rate": 1.296637582351979e-06, |
|
"loss": 0.5811, |
|
"mean_token_accuracy": 0.8378918588161468, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 1.5842839036755385, |
|
"grad_norm": 0.49941956793616216, |
|
"learning_rate": 1.2597041805858469e-06, |
|
"loss": 0.5597, |
|
"mean_token_accuracy": 0.8421694174408912, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.590621039290241, |
|
"grad_norm": 0.4810003693281146, |
|
"learning_rate": 1.2232284616163986e-06, |
|
"loss": 0.5646, |
|
"mean_token_accuracy": 0.8418364375829697, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 1.5969581749049429, |
|
"grad_norm": 0.49642278969512443, |
|
"learning_rate": 1.1872148888096024e-06, |
|
"loss": 0.5686, |
|
"mean_token_accuracy": 0.840269310772419, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 1.6032953105196452, |
|
"grad_norm": 0.5258808050772633, |
|
"learning_rate": 1.1516678689807249e-06, |
|
"loss": 0.5665, |
|
"mean_token_accuracy": 0.8409392833709717, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 1.6096324461343472, |
|
"grad_norm": 0.4807160453689938, |
|
"learning_rate": 1.1165917518550913e-06, |
|
"loss": 0.5671, |
|
"mean_token_accuracy": 0.8411058440804482, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 1.6159695817490496, |
|
"grad_norm": 0.48965855513910594, |
|
"learning_rate": 1.0819908295358284e-06, |
|
"loss": 0.5588, |
|
"mean_token_accuracy": 0.8429983571171761, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 1.6223067173637515, |
|
"grad_norm": 0.5202990154276527, |
|
"learning_rate": 1.0478693359786612e-06, |
|
"loss": 0.5716, |
|
"mean_token_accuracy": 0.8400727063417435, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 1.6286438529784537, |
|
"grad_norm": 0.5171890350253132, |
|
"learning_rate": 1.0142314464738195e-06, |
|
"loss": 0.5517, |
|
"mean_token_accuracy": 0.8443869799375534, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 1.6349809885931559, |
|
"grad_norm": 0.48132181865431867, |
|
"learning_rate": 9.810812771351335e-07, |
|
"loss": 0.5784, |
|
"mean_token_accuracy": 0.8387523666024208, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 1.641318124207858, |
|
"grad_norm": 0.48031587809861415, |
|
"learning_rate": 9.484228843963577e-07, |
|
"loss": 0.5609, |
|
"mean_token_accuracy": 0.8421882972121238, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 1.6476552598225602, |
|
"grad_norm": 0.48862410273482815, |
|
"learning_rate": 9.16260264514805e-07, |
|
"loss": 0.5739, |
|
"mean_token_accuracy": 0.8393760696053505, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.6539923954372624, |
|
"grad_norm": 0.4974345984726092, |
|
"learning_rate": 8.845973530823443e-07, |
|
"loss": 0.5623, |
|
"mean_token_accuracy": 0.842260554432869, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 1.6603295310519646, |
|
"grad_norm": 0.4969870292569671, |
|
"learning_rate": 8.534380245438212e-07, |
|
"loss": 0.5806, |
|
"mean_token_accuracy": 0.8379565149545669, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 1.6666666666666665, |
|
"grad_norm": 0.51170305488906, |
|
"learning_rate": 8.22786091722958e-07, |
|
"loss": 0.5744, |
|
"mean_token_accuracy": 0.8394851118326188, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 1.673003802281369, |
|
"grad_norm": 0.4882536601279716, |
|
"learning_rate": 7.926453053557948e-07, |
|
"loss": 0.5694, |
|
"mean_token_accuracy": 0.8412208631634712, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 1.6793409378960709, |
|
"grad_norm": 0.5201633381345815, |
|
"learning_rate": 7.630193536317354e-07, |
|
"loss": 0.5779, |
|
"mean_token_accuracy": 0.8387572214007377, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 1.6856780735107733, |
|
"grad_norm": 0.4872309884092355, |
|
"learning_rate": 7.339118617422325e-07, |
|
"loss": 0.5721, |
|
"mean_token_accuracy": 0.840134784579277, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 1.6920152091254752, |
|
"grad_norm": 0.4742262519043048, |
|
"learning_rate": 7.05326391437195e-07, |
|
"loss": 0.567, |
|
"mean_token_accuracy": 0.8408115699887275, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 1.6983523447401776, |
|
"grad_norm": 0.48084605496078786, |
|
"learning_rate": 6.772664405891505e-07, |
|
"loss": 0.5739, |
|
"mean_token_accuracy": 0.8401078969240189, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 1.7046894803548795, |
|
"grad_norm": 0.4836055364313366, |
|
"learning_rate": 6.49735442765228e-07, |
|
"loss": 0.5771, |
|
"mean_token_accuracy": 0.8388657510280609, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 1.7110266159695817, |
|
"grad_norm": 0.4955193703741457, |
|
"learning_rate": 6.227367668070084e-07, |
|
"loss": 0.5641, |
|
"mean_token_accuracy": 0.8420116931200028, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.717363751584284, |
|
"grad_norm": 0.47888043666477453, |
|
"learning_rate": 5.962737164182942e-07, |
|
"loss": 0.5695, |
|
"mean_token_accuracy": 0.8411467924714089, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 1.723700887198986, |
|
"grad_norm": 0.48358100558875267, |
|
"learning_rate": 5.703495297608486e-07, |
|
"loss": 0.5672, |
|
"mean_token_accuracy": 0.8408854246139527, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 1.7300380228136882, |
|
"grad_norm": 0.48450381732440073, |
|
"learning_rate": 5.449673790581611e-07, |
|
"loss": 0.5756, |
|
"mean_token_accuracy": 0.8394671693444252, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 1.7363751584283904, |
|
"grad_norm": 0.524224789009983, |
|
"learning_rate": 5.201303702072724e-07, |
|
"loss": 0.564, |
|
"mean_token_accuracy": 0.8414558693766594, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 1.7427122940430926, |
|
"grad_norm": 0.47448699280100953, |
|
"learning_rate": 4.958415423987229e-07, |
|
"loss": 0.5576, |
|
"mean_token_accuracy": 0.8432327851653099, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 1.7490494296577945, |
|
"grad_norm": 0.4999589058798834, |
|
"learning_rate": 4.721038677446599e-07, |
|
"loss": 0.5543, |
|
"mean_token_accuracy": 0.8434969082474708, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 1.755386565272497, |
|
"grad_norm": 0.49519130319734356, |
|
"learning_rate": 4.4892025091515465e-07, |
|
"loss": 0.5744, |
|
"mean_token_accuracy": 0.8392727747559547, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 1.7617237008871989, |
|
"grad_norm": 0.47996153862103574, |
|
"learning_rate": 4.2629352878276964e-07, |
|
"loss": 0.5757, |
|
"mean_token_accuracy": 0.8395681723952293, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 1.7680608365019013, |
|
"grad_norm": 0.4743677174789034, |
|
"learning_rate": 4.04226470075425e-07, |
|
"loss": 0.5793, |
|
"mean_token_accuracy": 0.8383775666356087, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 1.7743979721166032, |
|
"grad_norm": 0.47358657093352546, |
|
"learning_rate": 3.8272177503760277e-07, |
|
"loss": 0.5666, |
|
"mean_token_accuracy": 0.8409555062651635, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.7807351077313056, |
|
"grad_norm": 0.47898043422535136, |
|
"learning_rate": 3.6178207509992623e-07, |
|
"loss": 0.5588, |
|
"mean_token_accuracy": 0.8429359510540962, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 1.7870722433460076, |
|
"grad_norm": 0.48612638069980213, |
|
"learning_rate": 3.4140993255717123e-07, |
|
"loss": 0.5687, |
|
"mean_token_accuracy": 0.840995529294014, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 1.7934093789607097, |
|
"grad_norm": 0.47802067614271637, |
|
"learning_rate": 3.216078402547218e-07, |
|
"loss": 0.5651, |
|
"mean_token_accuracy": 0.8413813829421997, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 1.799746514575412, |
|
"grad_norm": 0.45575767680162316, |
|
"learning_rate": 3.0237822128353744e-07, |
|
"loss": 0.5551, |
|
"mean_token_accuracy": 0.8439073666930199, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 1.806083650190114, |
|
"grad_norm": 0.5008888425261698, |
|
"learning_rate": 2.8372342868364934e-07, |
|
"loss": 0.5763, |
|
"mean_token_accuracy": 0.8394736155867577, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 1.8124207858048162, |
|
"grad_norm": 0.47883052147679717, |
|
"learning_rate": 2.656457451562283e-07, |
|
"loss": 0.5847, |
|
"mean_token_accuracy": 0.8371838569641114, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 1.8187579214195184, |
|
"grad_norm": 0.48136837053701437, |
|
"learning_rate": 2.4814738278426287e-07, |
|
"loss": 0.5713, |
|
"mean_token_accuracy": 0.8400285989046097, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 1.8250950570342206, |
|
"grad_norm": 0.47630227923243995, |
|
"learning_rate": 2.3123048276187722e-07, |
|
"loss": 0.5663, |
|
"mean_token_accuracy": 0.8415055811405182, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 1.8314321926489225, |
|
"grad_norm": 0.48067639897927306, |
|
"learning_rate": 2.1489711513232038e-07, |
|
"loss": 0.5702, |
|
"mean_token_accuracy": 0.8404717803001404, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 1.837769328263625, |
|
"grad_norm": 0.48817733468841595, |
|
"learning_rate": 1.991492785346677e-07, |
|
"loss": 0.5659, |
|
"mean_token_accuracy": 0.8410487651824952, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.8441064638783269, |
|
"grad_norm": 0.4753854139627654, |
|
"learning_rate": 1.8398889995925428e-07, |
|
"loss": 0.5612, |
|
"mean_token_accuracy": 0.842425537109375, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 1.8504435994930293, |
|
"grad_norm": 0.4979097318389579, |
|
"learning_rate": 1.694178345118791e-07, |
|
"loss": 0.5554, |
|
"mean_token_accuracy": 0.843775661289692, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 1.8567807351077312, |
|
"grad_norm": 0.4829356927499738, |
|
"learning_rate": 1.5543786518680436e-07, |
|
"loss": 0.556, |
|
"mean_token_accuracy": 0.8434767201542854, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 1.8631178707224336, |
|
"grad_norm": 0.4651233227253299, |
|
"learning_rate": 1.4205070264857901e-07, |
|
"loss": 0.5704, |
|
"mean_token_accuracy": 0.8402711316943169, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 1.8694550063371356, |
|
"grad_norm": 0.47253676852018517, |
|
"learning_rate": 1.292579850227099e-07, |
|
"loss": 0.5777, |
|
"mean_token_accuracy": 0.8392020970582962, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 1.8757921419518377, |
|
"grad_norm": 0.4800740772721781, |
|
"learning_rate": 1.170612776952168e-07, |
|
"loss": 0.566, |
|
"mean_token_accuracy": 0.8414452761411667, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 1.88212927756654, |
|
"grad_norm": 0.46528025174750537, |
|
"learning_rate": 1.0546207312107814e-07, |
|
"loss": 0.5636, |
|
"mean_token_accuracy": 0.8416185140609741, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 1.888466413181242, |
|
"grad_norm": 0.47693097112640276, |
|
"learning_rate": 9.44617906416101e-08, |
|
"loss": 0.5727, |
|
"mean_token_accuracy": 0.8405211389064788, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 1.8948035487959443, |
|
"grad_norm": 0.4787485103517413, |
|
"learning_rate": 8.406177631078594e-08, |
|
"loss": 0.5708, |
|
"mean_token_accuracy": 0.8403903424739838, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 1.9011406844106464, |
|
"grad_norm": 0.45967120152380847, |
|
"learning_rate": 7.426330273052618e-08, |
|
"loss": 0.5496, |
|
"mean_token_accuracy": 0.8449963420629502, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.9074778200253486, |
|
"grad_norm": 0.46451147059266606, |
|
"learning_rate": 6.506756889497756e-08, |
|
"loss": 0.5608, |
|
"mean_token_accuracy": 0.8425014033913613, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 1.9138149556400506, |
|
"grad_norm": 0.5057760468937542, |
|
"learning_rate": 5.647570004379432e-08, |
|
"loss": 0.5602, |
|
"mean_token_accuracy": 0.8427406966686248, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 1.920152091254753, |
|
"grad_norm": 0.48061481353459495, |
|
"learning_rate": 4.848874752445221e-08, |
|
"loss": 0.5675, |
|
"mean_token_accuracy": 0.8411912024021149, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 1.926489226869455, |
|
"grad_norm": 0.4689935228428535, |
|
"learning_rate": 4.110768866359638e-08, |
|
"loss": 0.5631, |
|
"mean_token_accuracy": 0.8418816044926644, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 1.9328263624841573, |
|
"grad_norm": 0.4698265767310371, |
|
"learning_rate": 3.43334266474521e-08, |
|
"loss": 0.5635, |
|
"mean_token_accuracy": 0.8423062637448311, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 1.9391634980988592, |
|
"grad_norm": 0.49190745957035076, |
|
"learning_rate": 2.8166790411304766e-08, |
|
"loss": 0.5644, |
|
"mean_token_accuracy": 0.8418506249785424, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 1.9455006337135616, |
|
"grad_norm": 0.4676519055114557, |
|
"learning_rate": 2.260853453806944e-08, |
|
"loss": 0.5691, |
|
"mean_token_accuracy": 0.8408907786011696, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 1.9518377693282636, |
|
"grad_norm": 0.4857147511585138, |
|
"learning_rate": 1.7659339165952417e-08, |
|
"loss": 0.5699, |
|
"mean_token_accuracy": 0.8406305849552155, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 1.9581749049429658, |
|
"grad_norm": 0.48303973039403075, |
|
"learning_rate": 1.3319809905228409e-08, |
|
"loss": 0.5765, |
|
"mean_token_accuracy": 0.8395203098654747, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 1.964512040557668, |
|
"grad_norm": 0.47745966065458134, |
|
"learning_rate": 9.590477764135353e-09, |
|
"loss": 0.5641, |
|
"mean_token_accuracy": 0.8417988792061806, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.97084917617237, |
|
"grad_norm": 0.4675012014451023, |
|
"learning_rate": 6.47179908389417e-09, |
|
"loss": 0.5699, |
|
"mean_token_accuracy": 0.8404615536332131, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 1.9771863117870723, |
|
"grad_norm": 0.4956658011789385, |
|
"learning_rate": 3.964155482871213e-09, |
|
"loss": 0.5592, |
|
"mean_token_accuracy": 0.842540180683136, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 1.9835234474017744, |
|
"grad_norm": 0.4689038627708401, |
|
"learning_rate": 2.0678538098806158e-09, |
|
"loss": 0.5745, |
|
"mean_token_accuracy": 0.8394525855779648, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 1.9898605830164766, |
|
"grad_norm": 0.4660374899806601, |
|
"learning_rate": 7.83126106637111e-10, |
|
"loss": 0.5643, |
|
"mean_token_accuracy": 0.8416339352726936, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 1.9961977186311786, |
|
"grad_norm": 0.4729664926744204, |
|
"learning_rate": 1.1012957935985224e-10, |
|
"loss": 0.5636, |
|
"mean_token_accuracy": 0.8414568796753883, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"mean_token_accuracy": 0.8406301041444143, |
|
"step": 1578, |
|
"total_flos": 827207983300608.0, |
|
"train_loss": 0.6521280055868006, |
|
"train_runtime": 235151.6683, |
|
"train_samples_per_second": 1.718, |
|
"train_steps_per_second": 0.007 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 1578, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 827207983300608.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|