OpenR1-Qwen-7B-Turkish / trainer_state.json
bezir's picture
Model save
d07b4df verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 100,
"global_step": 1578,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0063371356147021544,
"grad_norm": 7.199723076955636,
"learning_rate": 3.164556962025317e-07,
"loss": 1.4397,
"mean_token_accuracy": 0.6951015710830688,
"step": 5
},
{
"epoch": 0.012674271229404309,
"grad_norm": 7.116001217650177,
"learning_rate": 6.329113924050634e-07,
"loss": 1.4552,
"mean_token_accuracy": 0.6930991888046265,
"step": 10
},
{
"epoch": 0.019011406844106463,
"grad_norm": 4.64551484224683,
"learning_rate": 9.493670886075951e-07,
"loss": 1.3993,
"mean_token_accuracy": 0.6986153647303581,
"step": 15
},
{
"epoch": 0.025348542458808618,
"grad_norm": 3.0027875956032366,
"learning_rate": 1.2658227848101267e-06,
"loss": 1.3103,
"mean_token_accuracy": 0.7071203991770745,
"step": 20
},
{
"epoch": 0.031685678073510776,
"grad_norm": 3.1251366551644244,
"learning_rate": 1.5822784810126585e-06,
"loss": 1.2458,
"mean_token_accuracy": 0.7130974352359771,
"step": 25
},
{
"epoch": 0.03802281368821293,
"grad_norm": 2.289149516732586,
"learning_rate": 1.8987341772151901e-06,
"loss": 1.1709,
"mean_token_accuracy": 0.7238569274544716,
"step": 30
},
{
"epoch": 0.044359949302915085,
"grad_norm": 2.0282034523858945,
"learning_rate": 2.2151898734177215e-06,
"loss": 1.1025,
"mean_token_accuracy": 0.7365155085921288,
"step": 35
},
{
"epoch": 0.050697084917617236,
"grad_norm": 1.4393865600169713,
"learning_rate": 2.5316455696202535e-06,
"loss": 1.0754,
"mean_token_accuracy": 0.7417579337954521,
"step": 40
},
{
"epoch": 0.057034220532319393,
"grad_norm": 0.9968108242787993,
"learning_rate": 2.848101265822785e-06,
"loss": 1.0382,
"mean_token_accuracy": 0.7486504480242729,
"step": 45
},
{
"epoch": 0.06337135614702155,
"grad_norm": 0.9450280587173171,
"learning_rate": 3.164556962025317e-06,
"loss": 1.0089,
"mean_token_accuracy": 0.7545697376132011,
"step": 50
},
{
"epoch": 0.0697084917617237,
"grad_norm": 0.9255531790602832,
"learning_rate": 3.4810126582278487e-06,
"loss": 0.974,
"mean_token_accuracy": 0.7610737249255181,
"step": 55
},
{
"epoch": 0.07604562737642585,
"grad_norm": 0.784862301963824,
"learning_rate": 3.7974683544303802e-06,
"loss": 0.9446,
"mean_token_accuracy": 0.7658546909689903,
"step": 60
},
{
"epoch": 0.08238276299112801,
"grad_norm": 0.8654366770506445,
"learning_rate": 4.113924050632912e-06,
"loss": 0.9532,
"mean_token_accuracy": 0.764112365245819,
"step": 65
},
{
"epoch": 0.08871989860583017,
"grad_norm": 0.8440093108811262,
"learning_rate": 4.430379746835443e-06,
"loss": 0.9019,
"mean_token_accuracy": 0.7735387146472931,
"step": 70
},
{
"epoch": 0.09505703422053231,
"grad_norm": 0.7242515634546599,
"learning_rate": 4.746835443037975e-06,
"loss": 0.8972,
"mean_token_accuracy": 0.7742968738079071,
"step": 75
},
{
"epoch": 0.10139416983523447,
"grad_norm": 0.7207614994286641,
"learning_rate": 5.063291139240507e-06,
"loss": 0.8872,
"mean_token_accuracy": 0.7761535227298737,
"step": 80
},
{
"epoch": 0.10773130544993663,
"grad_norm": 0.742791199954622,
"learning_rate": 5.379746835443038e-06,
"loss": 0.8559,
"mean_token_accuracy": 0.7819930538535118,
"step": 85
},
{
"epoch": 0.11406844106463879,
"grad_norm": 0.7678641716925835,
"learning_rate": 5.69620253164557e-06,
"loss": 0.8473,
"mean_token_accuracy": 0.7834332928061485,
"step": 90
},
{
"epoch": 0.12040557667934093,
"grad_norm": 0.71180994773894,
"learning_rate": 6.012658227848101e-06,
"loss": 0.8352,
"mean_token_accuracy": 0.7855397373437881,
"step": 95
},
{
"epoch": 0.1267427122940431,
"grad_norm": 0.7993738785147041,
"learning_rate": 6.329113924050634e-06,
"loss": 0.8589,
"mean_token_accuracy": 0.7812221512198448,
"step": 100
},
{
"epoch": 0.13307984790874525,
"grad_norm": 0.7568194042750847,
"learning_rate": 6.645569620253165e-06,
"loss": 0.8431,
"mean_token_accuracy": 0.7850423708558083,
"step": 105
},
{
"epoch": 0.1394169835234474,
"grad_norm": 0.7969657403354691,
"learning_rate": 6.962025316455697e-06,
"loss": 0.8146,
"mean_token_accuracy": 0.7894491747021675,
"step": 110
},
{
"epoch": 0.14575411913814956,
"grad_norm": 0.7814384559927074,
"learning_rate": 7.2784810126582285e-06,
"loss": 0.816,
"mean_token_accuracy": 0.7893038675189018,
"step": 115
},
{
"epoch": 0.1520912547528517,
"grad_norm": 0.7970973600599863,
"learning_rate": 7.5949367088607605e-06,
"loss": 0.8168,
"mean_token_accuracy": 0.7892953917384148,
"step": 120
},
{
"epoch": 0.15842839036755388,
"grad_norm": 0.7289531586042841,
"learning_rate": 7.911392405063292e-06,
"loss": 0.8036,
"mean_token_accuracy": 0.7918314695358276,
"step": 125
},
{
"epoch": 0.16476552598225602,
"grad_norm": 0.8996289034177167,
"learning_rate": 8.227848101265824e-06,
"loss": 0.7886,
"mean_token_accuracy": 0.7948763906955719,
"step": 130
},
{
"epoch": 0.17110266159695817,
"grad_norm": 0.9505048466982942,
"learning_rate": 8.544303797468356e-06,
"loss": 0.7765,
"mean_token_accuracy": 0.7972663462162017,
"step": 135
},
{
"epoch": 0.17743979721166034,
"grad_norm": 0.8547089186827208,
"learning_rate": 8.860759493670886e-06,
"loss": 0.7778,
"mean_token_accuracy": 0.7966541960835457,
"step": 140
},
{
"epoch": 0.18377693282636248,
"grad_norm": 0.8115832093940138,
"learning_rate": 9.177215189873418e-06,
"loss": 0.7755,
"mean_token_accuracy": 0.7976241648197174,
"step": 145
},
{
"epoch": 0.19011406844106463,
"grad_norm": 0.7240367508508893,
"learning_rate": 9.49367088607595e-06,
"loss": 0.7679,
"mean_token_accuracy": 0.7988486766815186,
"step": 150
},
{
"epoch": 0.1964512040557668,
"grad_norm": 0.8604548037210017,
"learning_rate": 9.810126582278482e-06,
"loss": 0.7666,
"mean_token_accuracy": 0.7985615819692612,
"step": 155
},
{
"epoch": 0.20278833967046894,
"grad_norm": 0.7650650036074902,
"learning_rate": 9.99995105342046e-06,
"loss": 0.7615,
"mean_token_accuracy": 0.8001444712281227,
"step": 160
},
{
"epoch": 0.20912547528517111,
"grad_norm": 0.7907667013526878,
"learning_rate": 9.999400415406145e-06,
"loss": 0.7662,
"mean_token_accuracy": 0.7991914421319961,
"step": 165
},
{
"epoch": 0.21546261089987326,
"grad_norm": 0.8483269008499935,
"learning_rate": 9.998238023756727e-06,
"loss": 0.7597,
"mean_token_accuracy": 0.800473365187645,
"step": 170
},
{
"epoch": 0.2217997465145754,
"grad_norm": 0.8423617289320647,
"learning_rate": 9.996464020708734e-06,
"loss": 0.7598,
"mean_token_accuracy": 0.7996020078659057,
"step": 175
},
{
"epoch": 0.22813688212927757,
"grad_norm": 0.8057636513151334,
"learning_rate": 9.994078623338757e-06,
"loss": 0.7566,
"mean_token_accuracy": 0.800896917283535,
"step": 180
},
{
"epoch": 0.23447401774397972,
"grad_norm": 0.8990102449117932,
"learning_rate": 9.991082123536902e-06,
"loss": 0.7522,
"mean_token_accuracy": 0.8013818353414536,
"step": 185
},
{
"epoch": 0.24081115335868186,
"grad_norm": 0.9698242122380497,
"learning_rate": 9.987474887971067e-06,
"loss": 0.7463,
"mean_token_accuracy": 0.8028701841831207,
"step": 190
},
{
"epoch": 0.24714828897338403,
"grad_norm": 0.960138107922893,
"learning_rate": 9.983257358042076e-06,
"loss": 0.7401,
"mean_token_accuracy": 0.8041222214698791,
"step": 195
},
{
"epoch": 0.2534854245880862,
"grad_norm": 0.8305799821436418,
"learning_rate": 9.978430049829672e-06,
"loss": 0.7601,
"mean_token_accuracy": 0.8001280605793,
"step": 200
},
{
"epoch": 0.2598225602027883,
"grad_norm": 0.7129991697669212,
"learning_rate": 9.972993554029357e-06,
"loss": 0.7575,
"mean_token_accuracy": 0.8003058210015297,
"step": 205
},
{
"epoch": 0.2661596958174905,
"grad_norm": 0.8819778135317846,
"learning_rate": 9.966948535880118e-06,
"loss": 0.7444,
"mean_token_accuracy": 0.8032929092645645,
"step": 210
},
{
"epoch": 0.27249683143219267,
"grad_norm": 0.7833769376025013,
"learning_rate": 9.960295735083023e-06,
"loss": 0.7151,
"mean_token_accuracy": 0.8091372177004814,
"step": 215
},
{
"epoch": 0.2788339670468948,
"grad_norm": 1.2183716099383828,
"learning_rate": 9.953035965710707e-06,
"loss": 0.7346,
"mean_token_accuracy": 0.8045761153101921,
"step": 220
},
{
"epoch": 0.28517110266159695,
"grad_norm": 0.901875985078722,
"learning_rate": 9.945170116107758e-06,
"loss": 0.7337,
"mean_token_accuracy": 0.805341312289238,
"step": 225
},
{
"epoch": 0.2915082382762991,
"grad_norm": 0.8010789142797352,
"learning_rate": 9.936699148782018e-06,
"loss": 0.737,
"mean_token_accuracy": 0.8051745280623436,
"step": 230
},
{
"epoch": 0.29784537389100124,
"grad_norm": 0.7625075089546256,
"learning_rate": 9.927624100286795e-06,
"loss": 0.7288,
"mean_token_accuracy": 0.8064413368701935,
"step": 235
},
{
"epoch": 0.3041825095057034,
"grad_norm": 0.7665506308733163,
"learning_rate": 9.917946081094033e-06,
"loss": 0.7001,
"mean_token_accuracy": 0.8119662031531334,
"step": 240
},
{
"epoch": 0.3105196451204056,
"grad_norm": 0.8173005458001997,
"learning_rate": 9.907666275458432e-06,
"loss": 0.7171,
"mean_token_accuracy": 0.8087792381644249,
"step": 245
},
{
"epoch": 0.31685678073510776,
"grad_norm": 0.7944707699072153,
"learning_rate": 9.896785941272524e-06,
"loss": 0.7169,
"mean_token_accuracy": 0.808886106312275,
"step": 250
},
{
"epoch": 0.3231939163498099,
"grad_norm": 0.7652270272633694,
"learning_rate": 9.885306409912767e-06,
"loss": 0.7122,
"mean_token_accuracy": 0.8092179223895073,
"step": 255
},
{
"epoch": 0.32953105196451205,
"grad_norm": 0.8298720811434571,
"learning_rate": 9.87322908607661e-06,
"loss": 0.7106,
"mean_token_accuracy": 0.8099273145198822,
"step": 260
},
{
"epoch": 0.3358681875792142,
"grad_norm": 0.6635750146125453,
"learning_rate": 9.860555447610626e-06,
"loss": 0.7205,
"mean_token_accuracy": 0.8083759486675263,
"step": 265
},
{
"epoch": 0.34220532319391633,
"grad_norm": 0.7701167215766528,
"learning_rate": 9.847287045329665e-06,
"loss": 0.7178,
"mean_token_accuracy": 0.8084105476737022,
"step": 270
},
{
"epoch": 0.3485424588086185,
"grad_norm": 0.8129554147937268,
"learning_rate": 9.833425502827087e-06,
"loss": 0.7191,
"mean_token_accuracy": 0.8078344166278839,
"step": 275
},
{
"epoch": 0.3548795944233207,
"grad_norm": 0.7153635278024493,
"learning_rate": 9.818972516276096e-06,
"loss": 0.6973,
"mean_token_accuracy": 0.8126269072294235,
"step": 280
},
{
"epoch": 0.3612167300380228,
"grad_norm": 0.7019835045316667,
"learning_rate": 9.803929854222182e-06,
"loss": 0.704,
"mean_token_accuracy": 0.8114176645874978,
"step": 285
},
{
"epoch": 0.36755386565272496,
"grad_norm": 0.7615682616292789,
"learning_rate": 9.788299357366717e-06,
"loss": 0.7089,
"mean_token_accuracy": 0.8106587365269661,
"step": 290
},
{
"epoch": 0.37389100126742714,
"grad_norm": 0.9786947635111585,
"learning_rate": 9.772082938341706e-06,
"loss": 0.7014,
"mean_token_accuracy": 0.8121261984109879,
"step": 295
},
{
"epoch": 0.38022813688212925,
"grad_norm": 0.8212453521500733,
"learning_rate": 9.755282581475769e-06,
"loss": 0.7072,
"mean_token_accuracy": 0.8106094494462013,
"step": 300
},
{
"epoch": 0.3865652724968314,
"grad_norm": 0.865055505917381,
"learning_rate": 9.7379003425513e-06,
"loss": 0.7163,
"mean_token_accuracy": 0.8092033118009567,
"step": 305
},
{
"epoch": 0.3929024081115336,
"grad_norm": 0.6716631342519259,
"learning_rate": 9.71993834855293e-06,
"loss": 0.7045,
"mean_token_accuracy": 0.8109571009874343,
"step": 310
},
{
"epoch": 0.39923954372623577,
"grad_norm": 0.7649280200479434,
"learning_rate": 9.701398797407258e-06,
"loss": 0.7044,
"mean_token_accuracy": 0.8110996559262276,
"step": 315
},
{
"epoch": 0.4055766793409379,
"grad_norm": 0.732205588585856,
"learning_rate": 9.68228395771388e-06,
"loss": 0.6906,
"mean_token_accuracy": 0.8138323068618775,
"step": 320
},
{
"epoch": 0.41191381495564006,
"grad_norm": 0.8349157198044287,
"learning_rate": 9.662596168467823e-06,
"loss": 0.6963,
"mean_token_accuracy": 0.8128764078021049,
"step": 325
},
{
"epoch": 0.41825095057034223,
"grad_norm": 0.7284163977404773,
"learning_rate": 9.6423378387733e-06,
"loss": 0.6926,
"mean_token_accuracy": 0.8138028383255005,
"step": 330
},
{
"epoch": 0.42458808618504434,
"grad_norm": 0.6903566919121088,
"learning_rate": 9.621511447548946e-06,
"loss": 0.6992,
"mean_token_accuracy": 0.8125665381550788,
"step": 335
},
{
"epoch": 0.4309252217997465,
"grad_norm": 0.7031778472809708,
"learning_rate": 9.600119543224467e-06,
"loss": 0.6935,
"mean_token_accuracy": 0.8134042397141457,
"step": 340
},
{
"epoch": 0.4372623574144487,
"grad_norm": 0.8781454719155253,
"learning_rate": 9.578164743428808e-06,
"loss": 0.6938,
"mean_token_accuracy": 0.8132070809602737,
"step": 345
},
{
"epoch": 0.4435994930291508,
"grad_norm": 0.8306105321262149,
"learning_rate": 9.55564973466984e-06,
"loss": 0.6928,
"mean_token_accuracy": 0.8133361831307411,
"step": 350
},
{
"epoch": 0.449936628643853,
"grad_norm": 0.7046997598602632,
"learning_rate": 9.532577272005637e-06,
"loss": 0.679,
"mean_token_accuracy": 0.8159057974815369,
"step": 355
},
{
"epoch": 0.45627376425855515,
"grad_norm": 0.889929783644245,
"learning_rate": 9.508950178707335e-06,
"loss": 0.6872,
"mean_token_accuracy": 0.8148621737957,
"step": 360
},
{
"epoch": 0.46261089987325726,
"grad_norm": 0.9776823189593525,
"learning_rate": 9.484771345913673e-06,
"loss": 0.6902,
"mean_token_accuracy": 0.8141683742403985,
"step": 365
},
{
"epoch": 0.46894803548795944,
"grad_norm": 0.7706175050493524,
"learning_rate": 9.460043732277213e-06,
"loss": 0.6908,
"mean_token_accuracy": 0.8145220652222633,
"step": 370
},
{
"epoch": 0.4752851711026616,
"grad_norm": 0.6524214668295174,
"learning_rate": 9.434770363602307e-06,
"loss": 0.6983,
"mean_token_accuracy": 0.8123016864061355,
"step": 375
},
{
"epoch": 0.4816223067173637,
"grad_norm": 0.717843515473759,
"learning_rate": 9.408954332474845e-06,
"loss": 0.6677,
"mean_token_accuracy": 0.8185531318187713,
"step": 380
},
{
"epoch": 0.4879594423320659,
"grad_norm": 0.7618453217486776,
"learning_rate": 9.382598797883811e-06,
"loss": 0.6795,
"mean_token_accuracy": 0.8164624303579331,
"step": 385
},
{
"epoch": 0.49429657794676807,
"grad_norm": 0.7324610745032628,
"learning_rate": 9.355706984834765e-06,
"loss": 0.6836,
"mean_token_accuracy": 0.8149291038513183,
"step": 390
},
{
"epoch": 0.5006337135614702,
"grad_norm": 0.6779242960146721,
"learning_rate": 9.328282183955179e-06,
"loss": 0.6884,
"mean_token_accuracy": 0.8146958678960801,
"step": 395
},
{
"epoch": 0.5069708491761724,
"grad_norm": 0.827105664596769,
"learning_rate": 9.300327751091806e-06,
"loss": 0.6873,
"mean_token_accuracy": 0.814927139878273,
"step": 400
},
{
"epoch": 0.5133079847908745,
"grad_norm": 0.6798030291558349,
"learning_rate": 9.271847106900022e-06,
"loss": 0.6659,
"mean_token_accuracy": 0.8187542855739594,
"step": 405
},
{
"epoch": 0.5196451204055766,
"grad_norm": 0.6549281773308409,
"learning_rate": 9.242843736425269e-06,
"loss": 0.6749,
"mean_token_accuracy": 0.8172334164381028,
"step": 410
},
{
"epoch": 0.5259822560202788,
"grad_norm": 0.702757870059226,
"learning_rate": 9.213321188676595e-06,
"loss": 0.6799,
"mean_token_accuracy": 0.8162769109010697,
"step": 415
},
{
"epoch": 0.532319391634981,
"grad_norm": 0.663720096138884,
"learning_rate": 9.183283076192386e-06,
"loss": 0.6688,
"mean_token_accuracy": 0.8184930950403213,
"step": 420
},
{
"epoch": 0.5386565272496832,
"grad_norm": 0.6874302061015839,
"learning_rate": 9.152733074598312e-06,
"loss": 0.6742,
"mean_token_accuracy": 0.8174020066857338,
"step": 425
},
{
"epoch": 0.5449936628643853,
"grad_norm": 0.7315428759079102,
"learning_rate": 9.121674922157558e-06,
"loss": 0.6738,
"mean_token_accuracy": 0.817636775970459,
"step": 430
},
{
"epoch": 0.5513307984790875,
"grad_norm": 0.786643495084509,
"learning_rate": 9.090112419313395e-06,
"loss": 0.6736,
"mean_token_accuracy": 0.817160977423191,
"step": 435
},
{
"epoch": 0.5576679340937896,
"grad_norm": 0.6591137928729695,
"learning_rate": 9.058049428224128e-06,
"loss": 0.6617,
"mean_token_accuracy": 0.8197388723492622,
"step": 440
},
{
"epoch": 0.5640050697084917,
"grad_norm": 0.7812371618484959,
"learning_rate": 9.025489872290511e-06,
"loss": 0.6634,
"mean_token_accuracy": 0.8193035304546357,
"step": 445
},
{
"epoch": 0.5703422053231939,
"grad_norm": 0.6845961589145344,
"learning_rate": 8.99243773567565e-06,
"loss": 0.6834,
"mean_token_accuracy": 0.8159178540110588,
"step": 450
},
{
"epoch": 0.5766793409378961,
"grad_norm": 0.6657472122738636,
"learning_rate": 8.958897062817491e-06,
"loss": 0.6892,
"mean_token_accuracy": 0.8144657433032989,
"step": 455
},
{
"epoch": 0.5830164765525983,
"grad_norm": 0.6161660996953076,
"learning_rate": 8.924871957933904e-06,
"loss": 0.6746,
"mean_token_accuracy": 0.8171708762645722,
"step": 460
},
{
"epoch": 0.5893536121673004,
"grad_norm": 0.702287591616562,
"learning_rate": 8.890366584520482e-06,
"loss": 0.6696,
"mean_token_accuracy": 0.8184025406837463,
"step": 465
},
{
"epoch": 0.5956907477820025,
"grad_norm": 0.6857028656369465,
"learning_rate": 8.855385164841072e-06,
"loss": 0.6758,
"mean_token_accuracy": 0.8170812010765076,
"step": 470
},
{
"epoch": 0.6020278833967047,
"grad_norm": 0.6231296442226781,
"learning_rate": 8.819931979411107e-06,
"loss": 0.6734,
"mean_token_accuracy": 0.81716128885746,
"step": 475
},
{
"epoch": 0.6083650190114068,
"grad_norm": 0.6948180768376443,
"learning_rate": 8.78401136647383e-06,
"loss": 0.654,
"mean_token_accuracy": 0.8216980487108231,
"step": 480
},
{
"epoch": 0.614702154626109,
"grad_norm": 0.6911375954678098,
"learning_rate": 8.747627721469437e-06,
"loss": 0.6635,
"mean_token_accuracy": 0.8201975762844086,
"step": 485
},
{
"epoch": 0.6210392902408112,
"grad_norm": 0.7213580744708442,
"learning_rate": 8.710785496497226e-06,
"loss": 0.6651,
"mean_token_accuracy": 0.8194010749459266,
"step": 490
},
{
"epoch": 0.6273764258555133,
"grad_norm": 0.6543956981962712,
"learning_rate": 8.673489199770819e-06,
"loss": 0.6607,
"mean_token_accuracy": 0.8201611772179603,
"step": 495
},
{
"epoch": 0.6337135614702155,
"grad_norm": 0.7036498788596002,
"learning_rate": 8.635743395066511e-06,
"loss": 0.651,
"mean_token_accuracy": 0.8222277790307999,
"step": 500
},
{
"epoch": 0.6400506970849176,
"grad_norm": 0.6168799535449692,
"learning_rate": 8.597552701164818e-06,
"loss": 0.6592,
"mean_token_accuracy": 0.8199419066309929,
"step": 505
},
{
"epoch": 0.6463878326996197,
"grad_norm": 0.697481072279894,
"learning_rate": 8.558921791285304e-06,
"loss": 0.6513,
"mean_token_accuracy": 0.8216616123914718,
"step": 510
},
{
"epoch": 0.6527249683143219,
"grad_norm": 0.7978637983888536,
"learning_rate": 8.519855392514734e-06,
"loss": 0.6469,
"mean_token_accuracy": 0.8225123390555382,
"step": 515
},
{
"epoch": 0.6590621039290241,
"grad_norm": 0.709080907162204,
"learning_rate": 8.480358285228648e-06,
"loss": 0.6656,
"mean_token_accuracy": 0.8191539570689201,
"step": 520
},
{
"epoch": 0.6653992395437263,
"grad_norm": 0.7897211062263881,
"learning_rate": 8.440435302506405e-06,
"loss": 0.6412,
"mean_token_accuracy": 0.8238195776939392,
"step": 525
},
{
"epoch": 0.6717363751584284,
"grad_norm": 0.6661911891034582,
"learning_rate": 8.400091329539784e-06,
"loss": 0.6611,
"mean_token_accuracy": 0.8201816022396088,
"step": 530
},
{
"epoch": 0.6780735107731305,
"grad_norm": 0.6195761512766995,
"learning_rate": 8.359331303035205e-06,
"loss": 0.6593,
"mean_token_accuracy": 0.8203893005847931,
"step": 535
},
{
"epoch": 0.6844106463878327,
"grad_norm": 0.6310438648482855,
"learning_rate": 8.31816021060964e-06,
"loss": 0.6634,
"mean_token_accuracy": 0.8192948743700981,
"step": 540
},
{
"epoch": 0.6907477820025348,
"grad_norm": 0.6512895144306936,
"learning_rate": 8.276583090180311e-06,
"loss": 0.6666,
"mean_token_accuracy": 0.8186753287911415,
"step": 545
},
{
"epoch": 0.697084917617237,
"grad_norm": 0.6124105415248355,
"learning_rate": 8.234605029348224e-06,
"loss": 0.6511,
"mean_token_accuracy": 0.8219994261860848,
"step": 550
},
{
"epoch": 0.7034220532319392,
"grad_norm": 0.6601442192692848,
"learning_rate": 8.192231164775609e-06,
"loss": 0.6391,
"mean_token_accuracy": 0.8252027094364166,
"step": 555
},
{
"epoch": 0.7097591888466414,
"grad_norm": 0.7028934088221325,
"learning_rate": 8.149466681557384e-06,
"loss": 0.6558,
"mean_token_accuracy": 0.8209778189659118,
"step": 560
},
{
"epoch": 0.7160963244613435,
"grad_norm": 0.7857750596740971,
"learning_rate": 8.106316812586676e-06,
"loss": 0.6486,
"mean_token_accuracy": 0.8220974311232567,
"step": 565
},
{
"epoch": 0.7224334600760456,
"grad_norm": 0.8961234969300941,
"learning_rate": 8.062786837914492e-06,
"loss": 0.6386,
"mean_token_accuracy": 0.824979268014431,
"step": 570
},
{
"epoch": 0.7287705956907478,
"grad_norm": 0.686280664966789,
"learning_rate": 8.01888208410362e-06,
"loss": 0.6622,
"mean_token_accuracy": 0.8198226556181908,
"step": 575
},
{
"epoch": 0.7351077313054499,
"grad_norm": 0.8344864616701414,
"learning_rate": 7.974607923576859e-06,
"loss": 0.6537,
"mean_token_accuracy": 0.821578212082386,
"step": 580
},
{
"epoch": 0.7414448669201521,
"grad_norm": 0.9938826585970929,
"learning_rate": 7.9299697739596e-06,
"loss": 0.6544,
"mean_token_accuracy": 0.8208117336034775,
"step": 585
},
{
"epoch": 0.7477820025348543,
"grad_norm": 0.6249233717628465,
"learning_rate": 7.884973097416908e-06,
"loss": 0.6591,
"mean_token_accuracy": 0.8208227157592773,
"step": 590
},
{
"epoch": 0.7541191381495564,
"grad_norm": 0.6761543444596165,
"learning_rate": 7.83962339998514e-06,
"loss": 0.6439,
"mean_token_accuracy": 0.8236203759908676,
"step": 595
},
{
"epoch": 0.7604562737642585,
"grad_norm": 0.8850862666109794,
"learning_rate": 7.793926230898187e-06,
"loss": 0.6418,
"mean_token_accuracy": 0.8238036289811135,
"step": 600
},
{
"epoch": 0.7667934093789607,
"grad_norm": 0.6931013119334469,
"learning_rate": 7.747887181908464e-06,
"loss": 0.6513,
"mean_token_accuracy": 0.8221172288060188,
"step": 605
},
{
"epoch": 0.7731305449936628,
"grad_norm": 0.9142852384539434,
"learning_rate": 7.701511886602643e-06,
"loss": 0.6522,
"mean_token_accuracy": 0.8214233443140984,
"step": 610
},
{
"epoch": 0.779467680608365,
"grad_norm": 0.693942629552867,
"learning_rate": 7.65480601971232e-06,
"loss": 0.6555,
"mean_token_accuracy": 0.8214162334799766,
"step": 615
},
{
"epoch": 0.7858048162230672,
"grad_norm": 0.7185534733688809,
"learning_rate": 7.6077752964196095e-06,
"loss": 0.6514,
"mean_token_accuracy": 0.821819719672203,
"step": 620
},
{
"epoch": 0.7921419518377694,
"grad_norm": 0.7933553753697458,
"learning_rate": 7.560425471657814e-06,
"loss": 0.6507,
"mean_token_accuracy": 0.8215969070792198,
"step": 625
},
{
"epoch": 0.7984790874524715,
"grad_norm": 0.9942445013974323,
"learning_rate": 7.512762339407214e-06,
"loss": 0.6426,
"mean_token_accuracy": 0.8233709827065467,
"step": 630
},
{
"epoch": 0.8048162230671736,
"grad_norm": 0.7111122316238967,
"learning_rate": 7.464791731986084e-06,
"loss": 0.6446,
"mean_token_accuracy": 0.8233424022793769,
"step": 635
},
{
"epoch": 0.8111533586818758,
"grad_norm": 0.6760326829400325,
"learning_rate": 7.4165195193370245e-06,
"loss": 0.6411,
"mean_token_accuracy": 0.8234749510884285,
"step": 640
},
{
"epoch": 0.8174904942965779,
"grad_norm": 0.7157859491675397,
"learning_rate": 7.3679516083086785e-06,
"loss": 0.6403,
"mean_token_accuracy": 0.8245514526963234,
"step": 645
},
{
"epoch": 0.8238276299112801,
"grad_norm": 0.6125130117848593,
"learning_rate": 7.319093941932941e-06,
"loss": 0.648,
"mean_token_accuracy": 0.8229272648692131,
"step": 650
},
{
"epoch": 0.8301647655259823,
"grad_norm": 0.6193392226038144,
"learning_rate": 7.269952498697734e-06,
"loss": 0.6568,
"mean_token_accuracy": 0.8208993718028068,
"step": 655
},
{
"epoch": 0.8365019011406845,
"grad_norm": 0.5569382668639404,
"learning_rate": 7.2205332918154525e-06,
"loss": 0.6471,
"mean_token_accuracy": 0.8230623930692673,
"step": 660
},
{
"epoch": 0.8428390367553865,
"grad_norm": 0.6854397276184668,
"learning_rate": 7.170842368487145e-06,
"loss": 0.6394,
"mean_token_accuracy": 0.8240847915410996,
"step": 665
},
{
"epoch": 0.8491761723700887,
"grad_norm": 0.7247430930413721,
"learning_rate": 7.120885809162561e-06,
"loss": 0.6496,
"mean_token_accuracy": 0.8226393803954124,
"step": 670
},
{
"epoch": 0.8555133079847909,
"grad_norm": 0.5833185802048395,
"learning_rate": 7.070669726796095e-06,
"loss": 0.644,
"mean_token_accuracy": 0.8238432243466377,
"step": 675
},
{
"epoch": 0.861850443599493,
"grad_norm": 0.6587621871435737,
"learning_rate": 7.020200266098791e-06,
"loss": 0.6367,
"mean_token_accuracy": 0.8251640364527703,
"step": 680
},
{
"epoch": 0.8681875792141952,
"grad_norm": 0.9240470458812879,
"learning_rate": 6.969483602786429e-06,
"loss": 0.6335,
"mean_token_accuracy": 0.8250990778207778,
"step": 685
},
{
"epoch": 0.8745247148288974,
"grad_norm": 0.6647921620988979,
"learning_rate": 6.918525942823836e-06,
"loss": 0.6358,
"mean_token_accuracy": 0.8253032699227333,
"step": 690
},
{
"epoch": 0.8808618504435995,
"grad_norm": 0.7460235517208977,
"learning_rate": 6.8673335216654945e-06,
"loss": 0.6364,
"mean_token_accuracy": 0.8251613467931748,
"step": 695
},
{
"epoch": 0.8871989860583016,
"grad_norm": 0.5692165964237054,
"learning_rate": 6.815912603492531e-06,
"loss": 0.63,
"mean_token_accuracy": 0.8269012838602066,
"step": 700
},
{
"epoch": 0.8935361216730038,
"grad_norm": 0.7678044598257266,
"learning_rate": 6.7642694804462026e-06,
"loss": 0.641,
"mean_token_accuracy": 0.8240568235516548,
"step": 705
},
{
"epoch": 0.899873257287706,
"grad_norm": 0.6476587911488177,
"learning_rate": 6.712410471857955e-06,
"loss": 0.6389,
"mean_token_accuracy": 0.8243090897798538,
"step": 710
},
{
"epoch": 0.9062103929024081,
"grad_norm": 0.6996232991940935,
"learning_rate": 6.660341923476152e-06,
"loss": 0.6309,
"mean_token_accuracy": 0.8264057099819183,
"step": 715
},
{
"epoch": 0.9125475285171103,
"grad_norm": 0.6140056059724183,
"learning_rate": 6.608070206689583e-06,
"loss": 0.6284,
"mean_token_accuracy": 0.826878672838211,
"step": 720
},
{
"epoch": 0.9188846641318125,
"grad_norm": 0.5994244215051143,
"learning_rate": 6.555601717747815e-06,
"loss": 0.6469,
"mean_token_accuracy": 0.8231760680675506,
"step": 725
},
{
"epoch": 0.9252217997465145,
"grad_norm": 0.671715865180922,
"learning_rate": 6.502942876978524e-06,
"loss": 0.626,
"mean_token_accuracy": 0.8275385439395905,
"step": 730
},
{
"epoch": 0.9315589353612167,
"grad_norm": 0.6964725986892187,
"learning_rate": 6.450100128001861e-06,
"loss": 0.615,
"mean_token_accuracy": 0.8296460658311844,
"step": 735
},
{
"epoch": 0.9378960709759189,
"grad_norm": 0.6643867039068622,
"learning_rate": 6.397079936941975e-06,
"loss": 0.6425,
"mean_token_accuracy": 0.823666226863861,
"step": 740
},
{
"epoch": 0.944233206590621,
"grad_norm": 0.612108400302355,
"learning_rate": 6.343888791635797e-06,
"loss": 0.6222,
"mean_token_accuracy": 0.8274678066372871,
"step": 745
},
{
"epoch": 0.9505703422053232,
"grad_norm": 0.5888135214791528,
"learning_rate": 6.2905332008391304e-06,
"loss": 0.6457,
"mean_token_accuracy": 0.8232318565249443,
"step": 750
},
{
"epoch": 0.9569074778200254,
"grad_norm": 0.6023978340437303,
"learning_rate": 6.237019693430227e-06,
"loss": 0.6244,
"mean_token_accuracy": 0.8275379940867424,
"step": 755
},
{
"epoch": 0.9632446134347274,
"grad_norm": 0.5860893552553069,
"learning_rate": 6.18335481761086e-06,
"loss": 0.6258,
"mean_token_accuracy": 0.8275753378868103,
"step": 760
},
{
"epoch": 0.9695817490494296,
"grad_norm": 0.6183329734308459,
"learning_rate": 6.1295451401050645e-06,
"loss": 0.6487,
"mean_token_accuracy": 0.8231626331806183,
"step": 765
},
{
"epoch": 0.9759188846641318,
"grad_norm": 0.6472859730529533,
"learning_rate": 6.075597245355589e-06,
"loss": 0.6367,
"mean_token_accuracy": 0.8252906337380409,
"step": 770
},
{
"epoch": 0.982256020278834,
"grad_norm": 0.7048827333728572,
"learning_rate": 6.021517734718193e-06,
"loss": 0.6331,
"mean_token_accuracy": 0.8252324685454369,
"step": 775
},
{
"epoch": 0.9885931558935361,
"grad_norm": 0.670917489168749,
"learning_rate": 5.967313225653863e-06,
"loss": 0.6311,
"mean_token_accuracy": 0.8262254923582077,
"step": 780
},
{
"epoch": 0.9949302915082383,
"grad_norm": 0.6294039276801214,
"learning_rate": 5.912990350919075e-06,
"loss": 0.6366,
"mean_token_accuracy": 0.8250793889164925,
"step": 785
},
{
"epoch": 1.0012674271229405,
"grad_norm": 0.5750660780176277,
"learning_rate": 5.85855575775416e-06,
"loss": 0.6356,
"mean_token_accuracy": 0.8255759388208389,
"step": 790
},
{
"epoch": 1.0076045627376427,
"grad_norm": 0.5873083803261483,
"learning_rate": 5.804016107069922e-06,
"loss": 0.5899,
"mean_token_accuracy": 0.8365576922893524,
"step": 795
},
{
"epoch": 1.0139416983523448,
"grad_norm": 0.7245732012982048,
"learning_rate": 5.749378072632572e-06,
"loss": 0.5924,
"mean_token_accuracy": 0.8353384211659431,
"step": 800
},
{
"epoch": 1.020278833967047,
"grad_norm": 0.5625102045050165,
"learning_rate": 5.694648340247087e-06,
"loss": 0.5855,
"mean_token_accuracy": 0.8365451633930207,
"step": 805
},
{
"epoch": 1.026615969581749,
"grad_norm": 0.5811796962078384,
"learning_rate": 5.639833606939103e-06,
"loss": 0.5835,
"mean_token_accuracy": 0.8374374285340309,
"step": 810
},
{
"epoch": 1.0329531051964511,
"grad_norm": 0.6064815307080854,
"learning_rate": 5.584940580135423e-06,
"loss": 0.5918,
"mean_token_accuracy": 0.835510890185833,
"step": 815
},
{
"epoch": 1.0392902408111533,
"grad_norm": 0.5469353793310435,
"learning_rate": 5.529975976843268e-06,
"loss": 0.5765,
"mean_token_accuracy": 0.839336322247982,
"step": 820
},
{
"epoch": 1.0456273764258555,
"grad_norm": 0.591194718615289,
"learning_rate": 5.474946522828344e-06,
"loss": 0.571,
"mean_token_accuracy": 0.8397138401865959,
"step": 825
},
{
"epoch": 1.0519645120405576,
"grad_norm": 0.6529351075330402,
"learning_rate": 5.419858951791842e-06,
"loss": 0.587,
"mean_token_accuracy": 0.8367372244596482,
"step": 830
},
{
"epoch": 1.0583016476552598,
"grad_norm": 0.5750159656566077,
"learning_rate": 5.364720004546467e-06,
"loss": 0.5713,
"mean_token_accuracy": 0.8396085217595101,
"step": 835
},
{
"epoch": 1.064638783269962,
"grad_norm": 0.5356356446036812,
"learning_rate": 5.3095364281915905e-06,
"loss": 0.5743,
"mean_token_accuracy": 0.8390779420733452,
"step": 840
},
{
"epoch": 1.0709759188846641,
"grad_norm": 0.5657627605570825,
"learning_rate": 5.254314975287649e-06,
"loss": 0.5768,
"mean_token_accuracy": 0.8388962477445603,
"step": 845
},
{
"epoch": 1.0773130544993663,
"grad_norm": 0.5994046834255601,
"learning_rate": 5.199062403029851e-06,
"loss": 0.5779,
"mean_token_accuracy": 0.838576190173626,
"step": 850
},
{
"epoch": 1.0836501901140685,
"grad_norm": 0.5512378922303693,
"learning_rate": 5.143785472421341e-06,
"loss": 0.5736,
"mean_token_accuracy": 0.8392498835921287,
"step": 855
},
{
"epoch": 1.0899873257287707,
"grad_norm": 0.6231063067990558,
"learning_rate": 5.088490947445884e-06,
"loss": 0.5787,
"mean_token_accuracy": 0.8382582783699035,
"step": 860
},
{
"epoch": 1.0963244613434728,
"grad_norm": 0.6258759211004005,
"learning_rate": 5.033185594240184e-06,
"loss": 0.5867,
"mean_token_accuracy": 0.8368578165769577,
"step": 865
},
{
"epoch": 1.102661596958175,
"grad_norm": 0.5758426513877979,
"learning_rate": 4.977876180265948e-06,
"loss": 0.5781,
"mean_token_accuracy": 0.8380098447203637,
"step": 870
},
{
"epoch": 1.1089987325728772,
"grad_norm": 0.5362099307940532,
"learning_rate": 4.922569473481779e-06,
"loss": 0.579,
"mean_token_accuracy": 0.8374864637851716,
"step": 875
},
{
"epoch": 1.1153358681875791,
"grad_norm": 0.6117359275633708,
"learning_rate": 4.867272241515013e-06,
"loss": 0.5745,
"mean_token_accuracy": 0.8394086301326752,
"step": 880
},
{
"epoch": 1.1216730038022813,
"grad_norm": 0.6163525031585341,
"learning_rate": 4.811991250833598e-06,
"loss": 0.575,
"mean_token_accuracy": 0.8387202203273774,
"step": 885
},
{
"epoch": 1.1280101394169835,
"grad_norm": 0.5344005108748248,
"learning_rate": 4.756733265918111e-06,
"loss": 0.5805,
"mean_token_accuracy": 0.8385160818696022,
"step": 890
},
{
"epoch": 1.1343472750316856,
"grad_norm": 0.5606427842219186,
"learning_rate": 4.701505048434017e-06,
"loss": 0.58,
"mean_token_accuracy": 0.837983712553978,
"step": 895
},
{
"epoch": 1.1406844106463878,
"grad_norm": 0.5635525365545201,
"learning_rate": 4.646313356404278e-06,
"loss": 0.5721,
"mean_token_accuracy": 0.8402201250195503,
"step": 900
},
{
"epoch": 1.14702154626109,
"grad_norm": 0.5279253266696204,
"learning_rate": 4.5911649433824055e-06,
"loss": 0.5722,
"mean_token_accuracy": 0.8398120388388634,
"step": 905
},
{
"epoch": 1.1533586818757922,
"grad_norm": 0.5355638715895371,
"learning_rate": 4.536066557626057e-06,
"loss": 0.5717,
"mean_token_accuracy": 0.8396236389875412,
"step": 910
},
{
"epoch": 1.1596958174904943,
"grad_norm": 0.5298050755566127,
"learning_rate": 4.481024941271283e-06,
"loss": 0.5825,
"mean_token_accuracy": 0.837471354007721,
"step": 915
},
{
"epoch": 1.1660329531051965,
"grad_norm": 0.6099091835516977,
"learning_rate": 4.426046829507525e-06,
"loss": 0.5739,
"mean_token_accuracy": 0.8395572647452354,
"step": 920
},
{
"epoch": 1.1723700887198987,
"grad_norm": 0.5282685583180019,
"learning_rate": 4.371138949753457e-06,
"loss": 0.5758,
"mean_token_accuracy": 0.8386889979243278,
"step": 925
},
{
"epoch": 1.1787072243346008,
"grad_norm": 0.5498053929758666,
"learning_rate": 4.316308020833788e-06,
"loss": 0.5717,
"mean_token_accuracy": 0.8401581376791001,
"step": 930
},
{
"epoch": 1.1850443599493028,
"grad_norm": 0.545684866299052,
"learning_rate": 4.261560752157106e-06,
"loss": 0.5821,
"mean_token_accuracy": 0.8375889748334885,
"step": 935
},
{
"epoch": 1.1913814955640052,
"grad_norm": 0.5275676276739754,
"learning_rate": 4.20690384289488e-06,
"loss": 0.5865,
"mean_token_accuracy": 0.8369634434580803,
"step": 940
},
{
"epoch": 1.1977186311787071,
"grad_norm": 0.5147709084468725,
"learning_rate": 4.152343981161713e-06,
"loss": 0.5735,
"mean_token_accuracy": 0.8388126537203788,
"step": 945
},
{
"epoch": 1.2040557667934093,
"grad_norm": 0.5553445767071536,
"learning_rate": 4.097887843196949e-06,
"loss": 0.5706,
"mean_token_accuracy": 0.8400391504168511,
"step": 950
},
{
"epoch": 1.2103929024081115,
"grad_norm": 0.5755093000837989,
"learning_rate": 4.043542092547729e-06,
"loss": 0.5738,
"mean_token_accuracy": 0.8393745362758637,
"step": 955
},
{
"epoch": 1.2167300380228137,
"grad_norm": 0.5323369182306833,
"learning_rate": 3.989313379253609e-06,
"loss": 0.5707,
"mean_token_accuracy": 0.8395906254649163,
"step": 960
},
{
"epoch": 1.2230671736375158,
"grad_norm": 0.5398923057065517,
"learning_rate": 3.935208339032819e-06,
"loss": 0.5773,
"mean_token_accuracy": 0.8380544230341911,
"step": 965
},
{
"epoch": 1.229404309252218,
"grad_norm": 0.5138506118324425,
"learning_rate": 3.881233592470287e-06,
"loss": 0.5697,
"mean_token_accuracy": 0.8401115134358406,
"step": 970
},
{
"epoch": 1.2357414448669202,
"grad_norm": 0.531254594806762,
"learning_rate": 3.827395744207504e-06,
"loss": 0.5802,
"mean_token_accuracy": 0.8385789826512337,
"step": 975
},
{
"epoch": 1.2420785804816223,
"grad_norm": 0.5209427066358759,
"learning_rate": 3.773701382134345e-06,
"loss": 0.5788,
"mean_token_accuracy": 0.8383644595742226,
"step": 980
},
{
"epoch": 1.2484157160963245,
"grad_norm": 0.4981386065382922,
"learning_rate": 3.7201570765829405e-06,
"loss": 0.5803,
"mean_token_accuracy": 0.8378679618239403,
"step": 985
},
{
"epoch": 1.2547528517110267,
"grad_norm": 0.5310216835837045,
"learning_rate": 3.666769379523695e-06,
"loss": 0.5816,
"mean_token_accuracy": 0.8382963240146637,
"step": 990
},
{
"epoch": 1.2610899873257289,
"grad_norm": 0.5302964748937399,
"learning_rate": 3.6135448237635505e-06,
"loss": 0.568,
"mean_token_accuracy": 0.8408621445298194,
"step": 995
},
{
"epoch": 1.2674271229404308,
"grad_norm": 0.6043312455865852,
"learning_rate": 3.5604899221466003e-06,
"loss": 0.5797,
"mean_token_accuracy": 0.837955892086029,
"step": 1000
},
{
"epoch": 1.2737642585551332,
"grad_norm": 0.5404711838738012,
"learning_rate": 3.507611166757141e-06,
"loss": 0.577,
"mean_token_accuracy": 0.8382121488451958,
"step": 1005
},
{
"epoch": 1.2801013941698351,
"grad_norm": 0.5313905403777647,
"learning_rate": 3.4549150281252635e-06,
"loss": 0.5759,
"mean_token_accuracy": 0.8386555135250091,
"step": 1010
},
{
"epoch": 1.2864385297845373,
"grad_norm": 0.5312545340451698,
"learning_rate": 3.4024079544350874e-06,
"loss": 0.5766,
"mean_token_accuracy": 0.8384982272982597,
"step": 1015
},
{
"epoch": 1.2927756653992395,
"grad_norm": 0.574010488002488,
"learning_rate": 3.3500963707357236e-06,
"loss": 0.5817,
"mean_token_accuracy": 0.838199020922184,
"step": 1020
},
{
"epoch": 1.2991128010139417,
"grad_norm": 0.5162313236359333,
"learning_rate": 3.297986678155074e-06,
"loss": 0.5596,
"mean_token_accuracy": 0.8421908557415009,
"step": 1025
},
{
"epoch": 1.3054499366286438,
"grad_norm": 0.6187258006031299,
"learning_rate": 3.24608525311655e-06,
"loss": 0.5633,
"mean_token_accuracy": 0.842179323732853,
"step": 1030
},
{
"epoch": 1.311787072243346,
"grad_norm": 0.5140882862368508,
"learning_rate": 3.1943984465588253e-06,
"loss": 0.5704,
"mean_token_accuracy": 0.8403183802962303,
"step": 1035
},
{
"epoch": 1.3181242078580482,
"grad_norm": 0.5261806551468972,
"learning_rate": 3.142932583158693e-06,
"loss": 0.5664,
"mean_token_accuracy": 0.8412504211068154,
"step": 1040
},
{
"epoch": 1.3244613434727504,
"grad_norm": 0.5355046745744655,
"learning_rate": 3.0916939605571534e-06,
"loss": 0.5668,
"mean_token_accuracy": 0.8411947041749954,
"step": 1045
},
{
"epoch": 1.3307984790874525,
"grad_norm": 0.5828342485781398,
"learning_rate": 3.040688848588788e-06,
"loss": 0.5683,
"mean_token_accuracy": 0.8403848618268966,
"step": 1050
},
{
"epoch": 1.3371356147021547,
"grad_norm": 0.515568887419182,
"learning_rate": 2.989923488514566e-06,
"loss": 0.5734,
"mean_token_accuracy": 0.8396067947149277,
"step": 1055
},
{
"epoch": 1.3434727503168569,
"grad_norm": 0.533119717549416,
"learning_rate": 2.9394040922581123e-06,
"loss": 0.5788,
"mean_token_accuracy": 0.8387560814619064,
"step": 1060
},
{
"epoch": 1.3498098859315588,
"grad_norm": 0.5574493299249907,
"learning_rate": 2.889136841645592e-06,
"loss": 0.5738,
"mean_token_accuracy": 0.839569516479969,
"step": 1065
},
{
"epoch": 1.3561470215462612,
"grad_norm": 0.5301348229908708,
"learning_rate": 2.839127887649271e-06,
"loss": 0.5751,
"mean_token_accuracy": 0.8394772946834564,
"step": 1070
},
{
"epoch": 1.3624841571609632,
"grad_norm": 0.5071728571486687,
"learning_rate": 2.789383349634841e-06,
"loss": 0.5711,
"mean_token_accuracy": 0.8398226588964463,
"step": 1075
},
{
"epoch": 1.3688212927756653,
"grad_norm": 0.4997381831510659,
"learning_rate": 2.73990931461263e-06,
"loss": 0.5783,
"mean_token_accuracy": 0.8384912863373757,
"step": 1080
},
{
"epoch": 1.3751584283903675,
"grad_norm": 0.5019388182436546,
"learning_rate": 2.690711836492758e-06,
"loss": 0.5711,
"mean_token_accuracy": 0.8396464511752129,
"step": 1085
},
{
"epoch": 1.3814955640050697,
"grad_norm": 0.5165116686484276,
"learning_rate": 2.6417969353443484e-06,
"loss": 0.5721,
"mean_token_accuracy": 0.8395859107375145,
"step": 1090
},
{
"epoch": 1.3878326996197718,
"grad_norm": 0.5372603660779312,
"learning_rate": 2.5931705966588803e-06,
"loss": 0.5826,
"mean_token_accuracy": 0.8370852112770081,
"step": 1095
},
{
"epoch": 1.394169835234474,
"grad_norm": 0.5104565997924485,
"learning_rate": 2.544838770617772e-06,
"loss": 0.5785,
"mean_token_accuracy": 0.8393797069787979,
"step": 1100
},
{
"epoch": 1.4005069708491762,
"grad_norm": 0.5336610190327751,
"learning_rate": 2.496807371364283e-06,
"loss": 0.5759,
"mean_token_accuracy": 0.8390834912657738,
"step": 1105
},
{
"epoch": 1.4068441064638784,
"grad_norm": 0.662951455066245,
"learning_rate": 2.44908227627983e-06,
"loss": 0.5712,
"mean_token_accuracy": 0.8397842928767204,
"step": 1110
},
{
"epoch": 1.4131812420785805,
"grad_norm": 0.5438222471825553,
"learning_rate": 2.4016693252647954e-06,
"loss": 0.5703,
"mean_token_accuracy": 0.8397609844803811,
"step": 1115
},
{
"epoch": 1.4195183776932827,
"grad_norm": 0.5457903944622784,
"learning_rate": 2.3545743200239303e-06,
"loss": 0.5756,
"mean_token_accuracy": 0.8387856274843216,
"step": 1120
},
{
"epoch": 1.4258555133079849,
"grad_norm": 0.5413159299268847,
"learning_rate": 2.3078030233564203e-06,
"loss": 0.5796,
"mean_token_accuracy": 0.8379950270056724,
"step": 1125
},
{
"epoch": 1.4321926489226868,
"grad_norm": 0.5017485230997426,
"learning_rate": 2.2613611584507227e-06,
"loss": 0.5843,
"mean_token_accuracy": 0.8371415048837662,
"step": 1130
},
{
"epoch": 1.4385297845373892,
"grad_norm": 0.5036035556859302,
"learning_rate": 2.215254408184249e-06,
"loss": 0.5733,
"mean_token_accuracy": 0.8397385001182556,
"step": 1135
},
{
"epoch": 1.4448669201520912,
"grad_norm": 0.5512472367603704,
"learning_rate": 2.169488414427969e-06,
"loss": 0.5665,
"mean_token_accuracy": 0.8411229193210602,
"step": 1140
},
{
"epoch": 1.4512040557667933,
"grad_norm": 0.5122324337296091,
"learning_rate": 2.1240687773560476e-06,
"loss": 0.5754,
"mean_token_accuracy": 0.838901475071907,
"step": 1145
},
{
"epoch": 1.4575411913814955,
"grad_norm": 0.514428924855705,
"learning_rate": 2.0790010547605743e-06,
"loss": 0.5773,
"mean_token_accuracy": 0.8385174334049225,
"step": 1150
},
{
"epoch": 1.4638783269961977,
"grad_norm": 0.541489817693485,
"learning_rate": 2.0342907613714837e-06,
"loss": 0.5724,
"mean_token_accuracy": 0.839878860116005,
"step": 1155
},
{
"epoch": 1.4702154626108999,
"grad_norm": 0.5233399327286699,
"learning_rate": 1.989943368181741e-06,
"loss": 0.5683,
"mean_token_accuracy": 0.8406485706567765,
"step": 1160
},
{
"epoch": 1.476552598225602,
"grad_norm": 0.4977622157535387,
"learning_rate": 1.945964301777883e-06,
"loss": 0.5568,
"mean_token_accuracy": 0.8429565221071244,
"step": 1165
},
{
"epoch": 1.4828897338403042,
"grad_norm": 0.502171168050283,
"learning_rate": 1.9023589436759954e-06,
"loss": 0.555,
"mean_token_accuracy": 0.8435925453901291,
"step": 1170
},
{
"epoch": 1.4892268694550064,
"grad_norm": 0.5026240018805591,
"learning_rate": 1.859132629663194e-06,
"loss": 0.5609,
"mean_token_accuracy": 0.8420811951160431,
"step": 1175
},
{
"epoch": 1.4955640050697085,
"grad_norm": 0.5071369135189446,
"learning_rate": 1.8162906491447136e-06,
"loss": 0.5751,
"mean_token_accuracy": 0.8397066414356231,
"step": 1180
},
{
"epoch": 1.5019011406844105,
"grad_norm": 0.5012155091792143,
"learning_rate": 1.7738382444966668e-06,
"loss": 0.5714,
"mean_token_accuracy": 0.839833353459835,
"step": 1185
},
{
"epoch": 1.508238276299113,
"grad_norm": 0.4943163959620169,
"learning_rate": 1.7317806104245599e-06,
"loss": 0.5614,
"mean_token_accuracy": 0.8422631338238716,
"step": 1190
},
{
"epoch": 1.5145754119138148,
"grad_norm": 0.5168969148185261,
"learning_rate": 1.6901228933276381e-06,
"loss": 0.5734,
"mean_token_accuracy": 0.8398737594485283,
"step": 1195
},
{
"epoch": 1.5209125475285172,
"grad_norm": 0.5085722470934201,
"learning_rate": 1.6488701906691462e-06,
"loss": 0.5743,
"mean_token_accuracy": 0.8395018294453621,
"step": 1200
},
{
"epoch": 1.5272496831432192,
"grad_norm": 0.5145560441594629,
"learning_rate": 1.6080275503525754e-06,
"loss": 0.5714,
"mean_token_accuracy": 0.8400074362754821,
"step": 1205
},
{
"epoch": 1.5335868187579216,
"grad_norm": 0.5142209477213089,
"learning_rate": 1.5675999701039734e-06,
"loss": 0.5731,
"mean_token_accuracy": 0.8395378664135933,
"step": 1210
},
{
"epoch": 1.5399239543726235,
"grad_norm": 0.4817695083655761,
"learning_rate": 1.5275923968603967e-06,
"loss": 0.5668,
"mean_token_accuracy": 0.840859878063202,
"step": 1215
},
{
"epoch": 1.5462610899873257,
"grad_norm": 0.4958218170076731,
"learning_rate": 1.4880097261645765e-06,
"loss": 0.575,
"mean_token_accuracy": 0.8392793446779251,
"step": 1220
},
{
"epoch": 1.5525982256020279,
"grad_norm": 0.5150469794513786,
"learning_rate": 1.4488568015658738e-06,
"loss": 0.5702,
"mean_token_accuracy": 0.8403733685612679,
"step": 1225
},
{
"epoch": 1.55893536121673,
"grad_norm": 0.5415616286404993,
"learning_rate": 1.4101384140275947e-06,
"loss": 0.5724,
"mean_token_accuracy": 0.8399771124124527,
"step": 1230
},
{
"epoch": 1.5652724968314322,
"grad_norm": 0.5125659970580118,
"learning_rate": 1.3718593013407455e-06,
"loss": 0.565,
"mean_token_accuracy": 0.8413113921880722,
"step": 1235
},
{
"epoch": 1.5716096324461344,
"grad_norm": 0.5172557001838594,
"learning_rate": 1.3340241475442889e-06,
"loss": 0.5666,
"mean_token_accuracy": 0.8413270160555839,
"step": 1240
},
{
"epoch": 1.5779467680608366,
"grad_norm": 0.5218390924731011,
"learning_rate": 1.296637582351979e-06,
"loss": 0.5811,
"mean_token_accuracy": 0.8378918588161468,
"step": 1245
},
{
"epoch": 1.5842839036755385,
"grad_norm": 0.49941956793616216,
"learning_rate": 1.2597041805858469e-06,
"loss": 0.5597,
"mean_token_accuracy": 0.8421694174408912,
"step": 1250
},
{
"epoch": 1.590621039290241,
"grad_norm": 0.4810003693281146,
"learning_rate": 1.2232284616163986e-06,
"loss": 0.5646,
"mean_token_accuracy": 0.8418364375829697,
"step": 1255
},
{
"epoch": 1.5969581749049429,
"grad_norm": 0.49642278969512443,
"learning_rate": 1.1872148888096024e-06,
"loss": 0.5686,
"mean_token_accuracy": 0.840269310772419,
"step": 1260
},
{
"epoch": 1.6032953105196452,
"grad_norm": 0.5258808050772633,
"learning_rate": 1.1516678689807249e-06,
"loss": 0.5665,
"mean_token_accuracy": 0.8409392833709717,
"step": 1265
},
{
"epoch": 1.6096324461343472,
"grad_norm": 0.4807160453689938,
"learning_rate": 1.1165917518550913e-06,
"loss": 0.5671,
"mean_token_accuracy": 0.8411058440804482,
"step": 1270
},
{
"epoch": 1.6159695817490496,
"grad_norm": 0.48965855513910594,
"learning_rate": 1.0819908295358284e-06,
"loss": 0.5588,
"mean_token_accuracy": 0.8429983571171761,
"step": 1275
},
{
"epoch": 1.6223067173637515,
"grad_norm": 0.5202990154276527,
"learning_rate": 1.0478693359786612e-06,
"loss": 0.5716,
"mean_token_accuracy": 0.8400727063417435,
"step": 1280
},
{
"epoch": 1.6286438529784537,
"grad_norm": 0.5171890350253132,
"learning_rate": 1.0142314464738195e-06,
"loss": 0.5517,
"mean_token_accuracy": 0.8443869799375534,
"step": 1285
},
{
"epoch": 1.6349809885931559,
"grad_norm": 0.48132181865431867,
"learning_rate": 9.810812771351335e-07,
"loss": 0.5784,
"mean_token_accuracy": 0.8387523666024208,
"step": 1290
},
{
"epoch": 1.641318124207858,
"grad_norm": 0.48031587809861415,
"learning_rate": 9.484228843963577e-07,
"loss": 0.5609,
"mean_token_accuracy": 0.8421882972121238,
"step": 1295
},
{
"epoch": 1.6476552598225602,
"grad_norm": 0.48862410273482815,
"learning_rate": 9.16260264514805e-07,
"loss": 0.5739,
"mean_token_accuracy": 0.8393760696053505,
"step": 1300
},
{
"epoch": 1.6539923954372624,
"grad_norm": 0.4974345984726092,
"learning_rate": 8.845973530823443e-07,
"loss": 0.5623,
"mean_token_accuracy": 0.842260554432869,
"step": 1305
},
{
"epoch": 1.6603295310519646,
"grad_norm": 0.4969870292569671,
"learning_rate": 8.534380245438212e-07,
"loss": 0.5806,
"mean_token_accuracy": 0.8379565149545669,
"step": 1310
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.51170305488906,
"learning_rate": 8.22786091722958e-07,
"loss": 0.5744,
"mean_token_accuracy": 0.8394851118326188,
"step": 1315
},
{
"epoch": 1.673003802281369,
"grad_norm": 0.4882536601279716,
"learning_rate": 7.926453053557948e-07,
"loss": 0.5694,
"mean_token_accuracy": 0.8412208631634712,
"step": 1320
},
{
"epoch": 1.6793409378960709,
"grad_norm": 0.5201633381345815,
"learning_rate": 7.630193536317354e-07,
"loss": 0.5779,
"mean_token_accuracy": 0.8387572214007377,
"step": 1325
},
{
"epoch": 1.6856780735107733,
"grad_norm": 0.4872309884092355,
"learning_rate": 7.339118617422325e-07,
"loss": 0.5721,
"mean_token_accuracy": 0.840134784579277,
"step": 1330
},
{
"epoch": 1.6920152091254752,
"grad_norm": 0.4742262519043048,
"learning_rate": 7.05326391437195e-07,
"loss": 0.567,
"mean_token_accuracy": 0.8408115699887275,
"step": 1335
},
{
"epoch": 1.6983523447401776,
"grad_norm": 0.48084605496078786,
"learning_rate": 6.772664405891505e-07,
"loss": 0.5739,
"mean_token_accuracy": 0.8401078969240189,
"step": 1340
},
{
"epoch": 1.7046894803548795,
"grad_norm": 0.4836055364313366,
"learning_rate": 6.49735442765228e-07,
"loss": 0.5771,
"mean_token_accuracy": 0.8388657510280609,
"step": 1345
},
{
"epoch": 1.7110266159695817,
"grad_norm": 0.4955193703741457,
"learning_rate": 6.227367668070084e-07,
"loss": 0.5641,
"mean_token_accuracy": 0.8420116931200028,
"step": 1350
},
{
"epoch": 1.717363751584284,
"grad_norm": 0.47888043666477453,
"learning_rate": 5.962737164182942e-07,
"loss": 0.5695,
"mean_token_accuracy": 0.8411467924714089,
"step": 1355
},
{
"epoch": 1.723700887198986,
"grad_norm": 0.48358100558875267,
"learning_rate": 5.703495297608486e-07,
"loss": 0.5672,
"mean_token_accuracy": 0.8408854246139527,
"step": 1360
},
{
"epoch": 1.7300380228136882,
"grad_norm": 0.48450381732440073,
"learning_rate": 5.449673790581611e-07,
"loss": 0.5756,
"mean_token_accuracy": 0.8394671693444252,
"step": 1365
},
{
"epoch": 1.7363751584283904,
"grad_norm": 0.524224789009983,
"learning_rate": 5.201303702072724e-07,
"loss": 0.564,
"mean_token_accuracy": 0.8414558693766594,
"step": 1370
},
{
"epoch": 1.7427122940430926,
"grad_norm": 0.47448699280100953,
"learning_rate": 4.958415423987229e-07,
"loss": 0.5576,
"mean_token_accuracy": 0.8432327851653099,
"step": 1375
},
{
"epoch": 1.7490494296577945,
"grad_norm": 0.4999589058798834,
"learning_rate": 4.721038677446599e-07,
"loss": 0.5543,
"mean_token_accuracy": 0.8434969082474708,
"step": 1380
},
{
"epoch": 1.755386565272497,
"grad_norm": 0.49519130319734356,
"learning_rate": 4.4892025091515465e-07,
"loss": 0.5744,
"mean_token_accuracy": 0.8392727747559547,
"step": 1385
},
{
"epoch": 1.7617237008871989,
"grad_norm": 0.47996153862103574,
"learning_rate": 4.2629352878276964e-07,
"loss": 0.5757,
"mean_token_accuracy": 0.8395681723952293,
"step": 1390
},
{
"epoch": 1.7680608365019013,
"grad_norm": 0.4743677174789034,
"learning_rate": 4.04226470075425e-07,
"loss": 0.5793,
"mean_token_accuracy": 0.8383775666356087,
"step": 1395
},
{
"epoch": 1.7743979721166032,
"grad_norm": 0.47358657093352546,
"learning_rate": 3.8272177503760277e-07,
"loss": 0.5666,
"mean_token_accuracy": 0.8409555062651635,
"step": 1400
},
{
"epoch": 1.7807351077313056,
"grad_norm": 0.47898043422535136,
"learning_rate": 3.6178207509992623e-07,
"loss": 0.5588,
"mean_token_accuracy": 0.8429359510540962,
"step": 1405
},
{
"epoch": 1.7870722433460076,
"grad_norm": 0.48612638069980213,
"learning_rate": 3.4140993255717123e-07,
"loss": 0.5687,
"mean_token_accuracy": 0.840995529294014,
"step": 1410
},
{
"epoch": 1.7934093789607097,
"grad_norm": 0.47802067614271637,
"learning_rate": 3.216078402547218e-07,
"loss": 0.5651,
"mean_token_accuracy": 0.8413813829421997,
"step": 1415
},
{
"epoch": 1.799746514575412,
"grad_norm": 0.45575767680162316,
"learning_rate": 3.0237822128353744e-07,
"loss": 0.5551,
"mean_token_accuracy": 0.8439073666930199,
"step": 1420
},
{
"epoch": 1.806083650190114,
"grad_norm": 0.5008888425261698,
"learning_rate": 2.8372342868364934e-07,
"loss": 0.5763,
"mean_token_accuracy": 0.8394736155867577,
"step": 1425
},
{
"epoch": 1.8124207858048162,
"grad_norm": 0.47883052147679717,
"learning_rate": 2.656457451562283e-07,
"loss": 0.5847,
"mean_token_accuracy": 0.8371838569641114,
"step": 1430
},
{
"epoch": 1.8187579214195184,
"grad_norm": 0.48136837053701437,
"learning_rate": 2.4814738278426287e-07,
"loss": 0.5713,
"mean_token_accuracy": 0.8400285989046097,
"step": 1435
},
{
"epoch": 1.8250950570342206,
"grad_norm": 0.47630227923243995,
"learning_rate": 2.3123048276187722e-07,
"loss": 0.5663,
"mean_token_accuracy": 0.8415055811405182,
"step": 1440
},
{
"epoch": 1.8314321926489225,
"grad_norm": 0.48067639897927306,
"learning_rate": 2.1489711513232038e-07,
"loss": 0.5702,
"mean_token_accuracy": 0.8404717803001404,
"step": 1445
},
{
"epoch": 1.837769328263625,
"grad_norm": 0.48817733468841595,
"learning_rate": 1.991492785346677e-07,
"loss": 0.5659,
"mean_token_accuracy": 0.8410487651824952,
"step": 1450
},
{
"epoch": 1.8441064638783269,
"grad_norm": 0.4753854139627654,
"learning_rate": 1.8398889995925428e-07,
"loss": 0.5612,
"mean_token_accuracy": 0.842425537109375,
"step": 1455
},
{
"epoch": 1.8504435994930293,
"grad_norm": 0.4979097318389579,
"learning_rate": 1.694178345118791e-07,
"loss": 0.5554,
"mean_token_accuracy": 0.843775661289692,
"step": 1460
},
{
"epoch": 1.8567807351077312,
"grad_norm": 0.4829356927499738,
"learning_rate": 1.5543786518680436e-07,
"loss": 0.556,
"mean_token_accuracy": 0.8434767201542854,
"step": 1465
},
{
"epoch": 1.8631178707224336,
"grad_norm": 0.4651233227253299,
"learning_rate": 1.4205070264857901e-07,
"loss": 0.5704,
"mean_token_accuracy": 0.8402711316943169,
"step": 1470
},
{
"epoch": 1.8694550063371356,
"grad_norm": 0.47253676852018517,
"learning_rate": 1.292579850227099e-07,
"loss": 0.5777,
"mean_token_accuracy": 0.8392020970582962,
"step": 1475
},
{
"epoch": 1.8757921419518377,
"grad_norm": 0.4800740772721781,
"learning_rate": 1.170612776952168e-07,
"loss": 0.566,
"mean_token_accuracy": 0.8414452761411667,
"step": 1480
},
{
"epoch": 1.88212927756654,
"grad_norm": 0.46528025174750537,
"learning_rate": 1.0546207312107814e-07,
"loss": 0.5636,
"mean_token_accuracy": 0.8416185140609741,
"step": 1485
},
{
"epoch": 1.888466413181242,
"grad_norm": 0.47693097112640276,
"learning_rate": 9.44617906416101e-08,
"loss": 0.5727,
"mean_token_accuracy": 0.8405211389064788,
"step": 1490
},
{
"epoch": 1.8948035487959443,
"grad_norm": 0.4787485103517413,
"learning_rate": 8.406177631078594e-08,
"loss": 0.5708,
"mean_token_accuracy": 0.8403903424739838,
"step": 1495
},
{
"epoch": 1.9011406844106464,
"grad_norm": 0.45967120152380847,
"learning_rate": 7.426330273052618e-08,
"loss": 0.5496,
"mean_token_accuracy": 0.8449963420629502,
"step": 1500
},
{
"epoch": 1.9074778200253486,
"grad_norm": 0.46451147059266606,
"learning_rate": 6.506756889497756e-08,
"loss": 0.5608,
"mean_token_accuracy": 0.8425014033913613,
"step": 1505
},
{
"epoch": 1.9138149556400506,
"grad_norm": 0.5057760468937542,
"learning_rate": 5.647570004379432e-08,
"loss": 0.5602,
"mean_token_accuracy": 0.8427406966686248,
"step": 1510
},
{
"epoch": 1.920152091254753,
"grad_norm": 0.48061481353459495,
"learning_rate": 4.848874752445221e-08,
"loss": 0.5675,
"mean_token_accuracy": 0.8411912024021149,
"step": 1515
},
{
"epoch": 1.926489226869455,
"grad_norm": 0.4689935228428535,
"learning_rate": 4.110768866359638e-08,
"loss": 0.5631,
"mean_token_accuracy": 0.8418816044926644,
"step": 1520
},
{
"epoch": 1.9328263624841573,
"grad_norm": 0.4698265767310371,
"learning_rate": 3.43334266474521e-08,
"loss": 0.5635,
"mean_token_accuracy": 0.8423062637448311,
"step": 1525
},
{
"epoch": 1.9391634980988592,
"grad_norm": 0.49190745957035076,
"learning_rate": 2.8166790411304766e-08,
"loss": 0.5644,
"mean_token_accuracy": 0.8418506249785424,
"step": 1530
},
{
"epoch": 1.9455006337135616,
"grad_norm": 0.4676519055114557,
"learning_rate": 2.260853453806944e-08,
"loss": 0.5691,
"mean_token_accuracy": 0.8408907786011696,
"step": 1535
},
{
"epoch": 1.9518377693282636,
"grad_norm": 0.4857147511585138,
"learning_rate": 1.7659339165952417e-08,
"loss": 0.5699,
"mean_token_accuracy": 0.8406305849552155,
"step": 1540
},
{
"epoch": 1.9581749049429658,
"grad_norm": 0.48303973039403075,
"learning_rate": 1.3319809905228409e-08,
"loss": 0.5765,
"mean_token_accuracy": 0.8395203098654747,
"step": 1545
},
{
"epoch": 1.964512040557668,
"grad_norm": 0.47745966065458134,
"learning_rate": 9.590477764135353e-09,
"loss": 0.5641,
"mean_token_accuracy": 0.8417988792061806,
"step": 1550
},
{
"epoch": 1.97084917617237,
"grad_norm": 0.4675012014451023,
"learning_rate": 6.47179908389417e-09,
"loss": 0.5699,
"mean_token_accuracy": 0.8404615536332131,
"step": 1555
},
{
"epoch": 1.9771863117870723,
"grad_norm": 0.4956658011789385,
"learning_rate": 3.964155482871213e-09,
"loss": 0.5592,
"mean_token_accuracy": 0.842540180683136,
"step": 1560
},
{
"epoch": 1.9835234474017744,
"grad_norm": 0.4689038627708401,
"learning_rate": 2.0678538098806158e-09,
"loss": 0.5745,
"mean_token_accuracy": 0.8394525855779648,
"step": 1565
},
{
"epoch": 1.9898605830164766,
"grad_norm": 0.4660374899806601,
"learning_rate": 7.83126106637111e-10,
"loss": 0.5643,
"mean_token_accuracy": 0.8416339352726936,
"step": 1570
},
{
"epoch": 1.9961977186311786,
"grad_norm": 0.4729664926744204,
"learning_rate": 1.1012957935985224e-10,
"loss": 0.5636,
"mean_token_accuracy": 0.8414568796753883,
"step": 1575
},
{
"epoch": 2.0,
"mean_token_accuracy": 0.8406301041444143,
"step": 1578,
"total_flos": 827207983300608.0,
"train_loss": 0.6521280055868006,
"train_runtime": 235151.6683,
"train_samples_per_second": 1.718,
"train_steps_per_second": 0.007
}
],
"logging_steps": 5,
"max_steps": 1578,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 827207983300608.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}