ds_llama / trainer_state.json
mshayan38's picture
Upload 14 files
102fdb4 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9982238010657194,
"eval_steps": 500,
"global_step": 1266,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.011841326228537596,
"grad_norm": 0.6710847020149231,
"learning_rate": 4.9998075682257415e-05,
"loss": 3.876,
"step": 5
},
{
"epoch": 0.023682652457075192,
"grad_norm": 0.8348745107650757,
"learning_rate": 4.9992303025269555e-05,
"loss": 3.8538,
"step": 10
},
{
"epoch": 0.035523978685612786,
"grad_norm": 2.387429714202881,
"learning_rate": 4.9982682917710524e-05,
"loss": 3.5742,
"step": 15
},
{
"epoch": 0.047365304914150384,
"grad_norm": 1.0946344137191772,
"learning_rate": 4.996921684055182e-05,
"loss": 3.5193,
"step": 20
},
{
"epoch": 0.05920663114268798,
"grad_norm": 1.1773927211761475,
"learning_rate": 4.9951906866834316e-05,
"loss": 2.9465,
"step": 25
},
{
"epoch": 0.07104795737122557,
"grad_norm": 1.147207260131836,
"learning_rate": 4.993075566134921e-05,
"loss": 3.0208,
"step": 30
},
{
"epoch": 0.08288928359976318,
"grad_norm": 1.8781776428222656,
"learning_rate": 4.990576648022768e-05,
"loss": 2.7173,
"step": 35
},
{
"epoch": 0.09473060982830077,
"grad_norm": 1.009155511856079,
"learning_rate": 4.987694317043969e-05,
"loss": 2.6235,
"step": 40
},
{
"epoch": 0.10657193605683836,
"grad_norm": 0.9919832944869995,
"learning_rate": 4.984429016920178e-05,
"loss": 2.4021,
"step": 45
},
{
"epoch": 0.11841326228537596,
"grad_norm": 0.8563632965087891,
"learning_rate": 4.980781250329389e-05,
"loss": 2.1688,
"step": 50
},
{
"epoch": 0.13025458851391356,
"grad_norm": 0.7374653816223145,
"learning_rate": 4.976751578828562e-05,
"loss": 2.2943,
"step": 55
},
{
"epoch": 0.14209591474245115,
"grad_norm": 0.9438661336898804,
"learning_rate": 4.9723406227671643e-05,
"loss": 1.9571,
"step": 60
},
{
"epoch": 0.15393724097098876,
"grad_norm": 0.9622551202774048,
"learning_rate": 4.967549061191679e-05,
"loss": 2.1024,
"step": 65
},
{
"epoch": 0.16577856719952636,
"grad_norm": 0.6480829119682312,
"learning_rate": 4.96237763174106e-05,
"loss": 1.9145,
"step": 70
},
{
"epoch": 0.17761989342806395,
"grad_norm": 1.2615852355957031,
"learning_rate": 4.956827130533185e-05,
"loss": 2.1472,
"step": 75
},
{
"epoch": 0.18946121965660154,
"grad_norm": 1.0979194641113281,
"learning_rate": 4.95089841204229e-05,
"loss": 2.0441,
"step": 80
},
{
"epoch": 0.20130254588513913,
"grad_norm": 0.8610992431640625,
"learning_rate": 4.944592388967428e-05,
"loss": 2.1605,
"step": 85
},
{
"epoch": 0.21314387211367672,
"grad_norm": 1.0579572916030884,
"learning_rate": 4.937910032091968e-05,
"loss": 2.2801,
"step": 90
},
{
"epoch": 0.22498519834221434,
"grad_norm": 1.0034611225128174,
"learning_rate": 4.930852370134141e-05,
"loss": 2.2327,
"step": 95
},
{
"epoch": 0.23682652457075193,
"grad_norm": 0.8670480847358704,
"learning_rate": 4.923420489588677e-05,
"loss": 2.0031,
"step": 100
},
{
"epoch": 0.24866785079928952,
"grad_norm": 0.8441548347473145,
"learning_rate": 4.9156155345595445e-05,
"loss": 1.8637,
"step": 105
},
{
"epoch": 0.2605091770278271,
"grad_norm": 0.8399316072463989,
"learning_rate": 4.907438706583818e-05,
"loss": 1.9813,
"step": 110
},
{
"epoch": 0.27235050325636473,
"grad_norm": 0.9002792835235596,
"learning_rate": 4.898891264446709e-05,
"loss": 2.1698,
"step": 115
},
{
"epoch": 0.2841918294849023,
"grad_norm": 1.1735421419143677,
"learning_rate": 4.8899745239877845e-05,
"loss": 2.1691,
"step": 120
},
{
"epoch": 0.2960331557134399,
"grad_norm": 1.2177324295043945,
"learning_rate": 4.880689857898392e-05,
"loss": 2.1437,
"step": 125
},
{
"epoch": 0.30787448194197753,
"grad_norm": 1.062441349029541,
"learning_rate": 4.871038695510347e-05,
"loss": 2.078,
"step": 130
},
{
"epoch": 0.3197158081705151,
"grad_norm": 0.8719960451126099,
"learning_rate": 4.861022522575892e-05,
"loss": 2.0338,
"step": 135
},
{
"epoch": 0.3315571343990527,
"grad_norm": 1.042143702507019,
"learning_rate": 4.8506428810389696e-05,
"loss": 2.1387,
"step": 140
},
{
"epoch": 0.3433984606275903,
"grad_norm": 0.8554458022117615,
"learning_rate": 4.839901368797849e-05,
"loss": 2.0853,
"step": 145
},
{
"epoch": 0.3552397868561279,
"grad_norm": 1.033553957939148,
"learning_rate": 4.828799639459138e-05,
"loss": 1.9919,
"step": 150
},
{
"epoch": 0.36708111308466546,
"grad_norm": 0.7688170075416565,
"learning_rate": 4.8173394020832164e-05,
"loss": 2.1078,
"step": 155
},
{
"epoch": 0.3789224393132031,
"grad_norm": 0.9199577569961548,
"learning_rate": 4.8055224209211316e-05,
"loss": 1.9385,
"step": 160
},
{
"epoch": 0.3907637655417407,
"grad_norm": 1.0168030261993408,
"learning_rate": 4.793350515143007e-05,
"loss": 1.8268,
"step": 165
},
{
"epoch": 0.40260509177027826,
"grad_norm": 1.2735998630523682,
"learning_rate": 4.780825558557981e-05,
"loss": 1.8595,
"step": 170
},
{
"epoch": 0.4144464179988159,
"grad_norm": 0.7815925478935242,
"learning_rate": 4.767949479325748e-05,
"loss": 2.005,
"step": 175
},
{
"epoch": 0.42628774422735344,
"grad_norm": 0.9479308724403381,
"learning_rate": 4.7547242596597274e-05,
"loss": 2.0088,
"step": 180
},
{
"epoch": 0.43812907045589106,
"grad_norm": 1.5356885194778442,
"learning_rate": 4.7411519355219066e-05,
"loss": 2.1591,
"step": 185
},
{
"epoch": 0.4499703966844287,
"grad_norm": 1.3683360815048218,
"learning_rate": 4.727234596309417e-05,
"loss": 1.9772,
"step": 190
},
{
"epoch": 0.46181172291296624,
"grad_norm": 1.2764394283294678,
"learning_rate": 4.71297438453288e-05,
"loss": 1.9476,
"step": 195
},
{
"epoch": 0.47365304914150386,
"grad_norm": 1.4485411643981934,
"learning_rate": 4.698373495486579e-05,
"loss": 2.0495,
"step": 200
},
{
"epoch": 0.4854943753700414,
"grad_norm": 0.9207140207290649,
"learning_rate": 4.683434176910503e-05,
"loss": 2.0267,
"step": 205
},
{
"epoch": 0.49733570159857904,
"grad_norm": 1.0169957876205444,
"learning_rate": 4.6681587286443146e-05,
"loss": 1.8998,
"step": 210
},
{
"epoch": 0.5091770278271166,
"grad_norm": 1.0399326086044312,
"learning_rate": 4.652549502273304e-05,
"loss": 1.9095,
"step": 215
},
{
"epoch": 0.5210183540556542,
"grad_norm": 1.6732031106948853,
"learning_rate": 4.636608900766372e-05,
"loss": 1.935,
"step": 220
},
{
"epoch": 0.5328596802841918,
"grad_norm": 1.1295055150985718,
"learning_rate": 4.620339378106102e-05,
"loss": 2.084,
"step": 225
},
{
"epoch": 0.5447010065127295,
"grad_norm": 1.196055293083191,
"learning_rate": 4.603743438910986e-05,
"loss": 1.9387,
"step": 230
},
{
"epoch": 0.5565423327412671,
"grad_norm": 1.519239902496338,
"learning_rate": 4.586823638049841e-05,
"loss": 1.871,
"step": 235
},
{
"epoch": 0.5683836589698046,
"grad_norm": 1.1272022724151611,
"learning_rate": 4.5695825802485085e-05,
"loss": 1.9629,
"step": 240
},
{
"epoch": 0.5802249851983422,
"grad_norm": 1.135977864265442,
"learning_rate": 4.552022919688861e-05,
"loss": 2.1182,
"step": 245
},
{
"epoch": 0.5920663114268798,
"grad_norm": 1.3236933946609497,
"learning_rate": 4.53414735960021e-05,
"loss": 1.8874,
"step": 250
},
{
"epoch": 0.6039076376554174,
"grad_norm": 2.230280637741089,
"learning_rate": 4.51595865184315e-05,
"loss": 1.8281,
"step": 255
},
{
"epoch": 0.6157489638839551,
"grad_norm": 1.3642871379852295,
"learning_rate": 4.497459596485924e-05,
"loss": 2.0387,
"step": 260
},
{
"epoch": 0.6275902901124926,
"grad_norm": 0.9770427942276001,
"learning_rate": 4.47865304137337e-05,
"loss": 2.1307,
"step": 265
},
{
"epoch": 0.6394316163410302,
"grad_norm": 3.981346845626831,
"learning_rate": 4.4595418816885004e-05,
"loss": 2.0755,
"step": 270
},
{
"epoch": 0.6512729425695678,
"grad_norm": 1.1023027896881104,
"learning_rate": 4.440129059506808e-05,
"loss": 1.9149,
"step": 275
},
{
"epoch": 0.6631142687981054,
"grad_norm": 1.648698091506958,
"learning_rate": 4.420417563343346e-05,
"loss": 1.984,
"step": 280
},
{
"epoch": 0.6749555950266429,
"grad_norm": 1.157861590385437,
"learning_rate": 4.40041042769266e-05,
"loss": 1.8697,
"step": 285
},
{
"epoch": 0.6867969212551805,
"grad_norm": 1.0009181499481201,
"learning_rate": 4.380110732561637e-05,
"loss": 1.7951,
"step": 290
},
{
"epoch": 0.6986382474837182,
"grad_norm": 2.11307692527771,
"learning_rate": 4.3595216029953575e-05,
"loss": 1.6276,
"step": 295
},
{
"epoch": 0.7104795737122558,
"grad_norm": 1.0800515413284302,
"learning_rate": 4.3386462085960086e-05,
"loss": 1.8438,
"step": 300
},
{
"epoch": 0.7223208999407934,
"grad_norm": 0.9078517556190491,
"learning_rate": 4.3174877630349366e-05,
"loss": 1.7839,
"step": 305
},
{
"epoch": 0.7341622261693309,
"grad_norm": 1.0842117071151733,
"learning_rate": 4.296049523557917e-05,
"loss": 1.8604,
"step": 310
},
{
"epoch": 0.7460035523978685,
"grad_norm": 1.2695246934890747,
"learning_rate": 4.2743347904837176e-05,
"loss": 1.9514,
"step": 315
},
{
"epoch": 0.7578448786264061,
"grad_norm": 1.0501705408096313,
"learning_rate": 4.2523469066960295e-05,
"loss": 1.9439,
"step": 320
},
{
"epoch": 0.7696862048549438,
"grad_norm": 1.3608386516571045,
"learning_rate": 4.230089257128842e-05,
"loss": 2.0266,
"step": 325
},
{
"epoch": 0.7815275310834814,
"grad_norm": 1.1087582111358643,
"learning_rate": 4.2075652682453554e-05,
"loss": 1.9851,
"step": 330
},
{
"epoch": 0.7933688573120189,
"grad_norm": 1.4247711896896362,
"learning_rate": 4.184778407510484e-05,
"loss": 1.976,
"step": 335
},
{
"epoch": 0.8052101835405565,
"grad_norm": 1.5246973037719727,
"learning_rate": 4.16173218285706e-05,
"loss": 1.9039,
"step": 340
},
{
"epoch": 0.8170515097690941,
"grad_norm": 0.8650221824645996,
"learning_rate": 4.138430142145805e-05,
"loss": 1.9577,
"step": 345
},
{
"epoch": 0.8288928359976317,
"grad_norm": 1.5152937173843384,
"learning_rate": 4.114875872619147e-05,
"loss": 1.9182,
"step": 350
},
{
"epoch": 0.8407341622261694,
"grad_norm": 1.2927278280258179,
"learning_rate": 4.0910730003489894e-05,
"loss": 1.9574,
"step": 355
},
{
"epoch": 0.8525754884547069,
"grad_norm": 1.193464756011963,
"learning_rate": 4.067025189678485e-05,
"loss": 1.9129,
"step": 360
},
{
"epoch": 0.8644168146832445,
"grad_norm": 1.1596301794052124,
"learning_rate": 4.042736142657935e-05,
"loss": 2.0108,
"step": 365
},
{
"epoch": 0.8762581409117821,
"grad_norm": 1.278910756111145,
"learning_rate": 4.018209598474869e-05,
"loss": 2.1509,
"step": 370
},
{
"epoch": 0.8880994671403197,
"grad_norm": 0.9644595384597778,
"learning_rate": 3.993449332878418e-05,
"loss": 2.01,
"step": 375
},
{
"epoch": 0.8999407933688574,
"grad_norm": 1.2030296325683594,
"learning_rate": 3.9684591575980546e-05,
"loss": 2.0312,
"step": 380
},
{
"epoch": 0.9117821195973949,
"grad_norm": 1.4285073280334473,
"learning_rate": 3.943242919756792e-05,
"loss": 1.9705,
"step": 385
},
{
"epoch": 0.9236234458259325,
"grad_norm": 1.1423532962799072,
"learning_rate": 3.917804501278942e-05,
"loss": 2.1293,
"step": 390
},
{
"epoch": 0.9354647720544701,
"grad_norm": 1.3577874898910522,
"learning_rate": 3.8921478182925055e-05,
"loss": 2.1897,
"step": 395
},
{
"epoch": 0.9473060982830077,
"grad_norm": 1.166759967803955,
"learning_rate": 3.8662768205263044e-05,
"loss": 1.966,
"step": 400
},
{
"epoch": 0.9591474245115453,
"grad_norm": 0.8779304623603821,
"learning_rate": 3.8401954907019424e-05,
"loss": 1.898,
"step": 405
},
{
"epoch": 0.9709887507400828,
"grad_norm": 0.9446001648902893,
"learning_rate": 3.813907843920675e-05,
"loss": 2.1288,
"step": 410
},
{
"epoch": 0.9828300769686205,
"grad_norm": 1.2340161800384521,
"learning_rate": 3.787417927045315e-05,
"loss": 1.8449,
"step": 415
},
{
"epoch": 0.9946714031971581,
"grad_norm": 1.0638560056686401,
"learning_rate": 3.7607298180772236e-05,
"loss": 1.8785,
"step": 420
},
{
"epoch": 1.0065127294256957,
"grad_norm": 1.046241044998169,
"learning_rate": 3.733847625528529e-05,
"loss": 1.801,
"step": 425
},
{
"epoch": 1.0183540556542332,
"grad_norm": 0.9987891316413879,
"learning_rate": 3.706775487789639e-05,
"loss": 1.9475,
"step": 430
},
{
"epoch": 1.030195381882771,
"grad_norm": 1.4792598485946655,
"learning_rate": 3.679517572492151e-05,
"loss": 1.9996,
"step": 435
},
{
"epoch": 1.0420367081113084,
"grad_norm": 1.3502384424209595,
"learning_rate": 3.652078075867267e-05,
"loss": 1.9598,
"step": 440
},
{
"epoch": 1.0538780343398462,
"grad_norm": 0.9646159410476685,
"learning_rate": 3.624461222099804e-05,
"loss": 1.9598,
"step": 445
},
{
"epoch": 1.0657193605683837,
"grad_norm": 1.464591145515442,
"learning_rate": 3.596671262677898e-05,
"loss": 1.9089,
"step": 450
},
{
"epoch": 1.0775606867969212,
"grad_norm": 0.9556955099105835,
"learning_rate": 3.568712475738508e-05,
"loss": 1.9496,
"step": 455
},
{
"epoch": 1.089402013025459,
"grad_norm": 1.2917728424072266,
"learning_rate": 3.5405891654088154e-05,
"loss": 1.9467,
"step": 460
},
{
"epoch": 1.1012433392539964,
"grad_norm": 1.3475714921951294,
"learning_rate": 3.5123056611436224e-05,
"loss": 2.0235,
"step": 465
},
{
"epoch": 1.1130846654825342,
"grad_norm": 0.9648019671440125,
"learning_rate": 3.483866317058857e-05,
"loss": 1.8844,
"step": 470
},
{
"epoch": 1.1249259917110717,
"grad_norm": 1.3338149785995483,
"learning_rate": 3.4552755112612714e-05,
"loss": 1.8287,
"step": 475
},
{
"epoch": 1.1367673179396092,
"grad_norm": 1.1770730018615723,
"learning_rate": 3.4265376451744565e-05,
"loss": 2.0754,
"step": 480
},
{
"epoch": 1.148608644168147,
"grad_norm": 1.0296565294265747,
"learning_rate": 3.397657142861258e-05,
"loss": 1.9269,
"step": 485
},
{
"epoch": 1.1604499703966844,
"grad_norm": 1.1220016479492188,
"learning_rate": 3.3686384503427174e-05,
"loss": 2.0613,
"step": 490
},
{
"epoch": 1.1722912966252221,
"grad_norm": 1.0247845649719238,
"learning_rate": 3.339486034913627e-05,
"loss": 1.8047,
"step": 495
},
{
"epoch": 1.1841326228537596,
"grad_norm": 0.9551325440406799,
"learning_rate": 3.3102043844548044e-05,
"loss": 1.8639,
"step": 500
},
{
"epoch": 1.1959739490822971,
"grad_norm": 1.7456352710723877,
"learning_rate": 3.280798006742213e-05,
"loss": 1.9545,
"step": 505
},
{
"epoch": 1.2078152753108349,
"grad_norm": 1.260713815689087,
"learning_rate": 3.2512714287530006e-05,
"loss": 1.874,
"step": 510
},
{
"epoch": 1.2196566015393724,
"grad_norm": 1.4492847919464111,
"learning_rate": 3.2216291959686006e-05,
"loss": 1.9178,
"step": 515
},
{
"epoch": 1.2314979277679101,
"grad_norm": 1.4412329196929932,
"learning_rate": 3.191875871674971e-05,
"loss": 1.919,
"step": 520
},
{
"epoch": 1.2433392539964476,
"grad_norm": 1.2013176679611206,
"learning_rate": 3.1620160362600984e-05,
"loss": 1.9128,
"step": 525
},
{
"epoch": 1.2551805802249851,
"grad_norm": 1.0584838390350342,
"learning_rate": 3.1320542865088696e-05,
"loss": 2.0132,
"step": 530
},
{
"epoch": 1.2670219064535229,
"grad_norm": 0.9713219404220581,
"learning_rate": 3.101995234895416e-05,
"loss": 1.9014,
"step": 535
},
{
"epoch": 1.2788632326820604,
"grad_norm": 1.167724609375,
"learning_rate": 3.071843508873046e-05,
"loss": 1.8482,
"step": 540
},
{
"epoch": 1.290704558910598,
"grad_norm": 1.43012273311615,
"learning_rate": 3.0416037501618677e-05,
"loss": 1.7475,
"step": 545
},
{
"epoch": 1.3025458851391356,
"grad_norm": 1.8622283935546875,
"learning_rate": 3.0112806140342176e-05,
"loss": 2.0185,
"step": 550
},
{
"epoch": 1.3143872113676731,
"grad_norm": 1.513700008392334,
"learning_rate": 2.9808787685980054e-05,
"loss": 1.8825,
"step": 555
},
{
"epoch": 1.3262285375962108,
"grad_norm": 1.4935449361801147,
"learning_rate": 2.9504028940780776e-05,
"loss": 1.8319,
"step": 560
},
{
"epoch": 1.3380698638247484,
"grad_norm": 1.4678127765655518,
"learning_rate": 2.9198576820957187e-05,
"loss": 1.9613,
"step": 565
},
{
"epoch": 1.349911190053286,
"grad_norm": 1.2753045558929443,
"learning_rate": 2.8892478349463986e-05,
"loss": 1.9367,
"step": 570
},
{
"epoch": 1.3617525162818236,
"grad_norm": 1.5118064880371094,
"learning_rate": 2.858578064875874e-05,
"loss": 1.895,
"step": 575
},
{
"epoch": 1.373593842510361,
"grad_norm": 1.3416668176651,
"learning_rate": 2.8278530933547624e-05,
"loss": 2.0023,
"step": 580
},
{
"epoch": 1.3854351687388988,
"grad_norm": 1.6278949975967407,
"learning_rate": 2.79707765035169e-05,
"loss": 1.9469,
"step": 585
},
{
"epoch": 1.3972764949674363,
"grad_norm": 1.2708017826080322,
"learning_rate": 2.7662564736051377e-05,
"loss": 1.7915,
"step": 590
},
{
"epoch": 1.409117821195974,
"grad_norm": 1.0602233409881592,
"learning_rate": 2.7353943078940875e-05,
"loss": 1.8253,
"step": 595
},
{
"epoch": 1.4209591474245116,
"grad_norm": 1.3578535318374634,
"learning_rate": 2.7044959043075814e-05,
"loss": 2.0741,
"step": 600
},
{
"epoch": 1.432800473653049,
"grad_norm": 1.1489042043685913,
"learning_rate": 2.67356601951332e-05,
"loss": 1.901,
"step": 605
},
{
"epoch": 1.4446417998815868,
"grad_norm": 1.680655598640442,
"learning_rate": 2.64260941502539e-05,
"loss": 2.0099,
"step": 610
},
{
"epoch": 1.4564831261101243,
"grad_norm": 1.7020906209945679,
"learning_rate": 2.611630856471252e-05,
"loss": 1.8853,
"step": 615
},
{
"epoch": 1.468324452338662,
"grad_norm": 0.9535301923751831,
"learning_rate": 2.5806351128580964e-05,
"loss": 1.8205,
"step": 620
},
{
"epoch": 1.4801657785671996,
"grad_norm": 1.0446418523788452,
"learning_rate": 2.5496269558386725e-05,
"loss": 2.0851,
"step": 625
},
{
"epoch": 1.492007104795737,
"grad_norm": 1.1212449073791504,
"learning_rate": 2.5186111589767187e-05,
"loss": 2.0913,
"step": 630
},
{
"epoch": 1.5038484310242746,
"grad_norm": 1.5539659261703491,
"learning_rate": 2.487592497012089e-05,
"loss": 1.9521,
"step": 635
},
{
"epoch": 1.5156897572528123,
"grad_norm": 1.3954375982284546,
"learning_rate": 2.4565757451257128e-05,
"loss": 1.8525,
"step": 640
},
{
"epoch": 1.52753108348135,
"grad_norm": 1.5919753313064575,
"learning_rate": 2.4255656782044644e-05,
"loss": 1.8034,
"step": 645
},
{
"epoch": 1.5393724097098875,
"grad_norm": 1.420255184173584,
"learning_rate": 2.3945670701061033e-05,
"loss": 2.053,
"step": 650
},
{
"epoch": 1.551213735938425,
"grad_norm": 1.0267295837402344,
"learning_rate": 2.3635846929243537e-05,
"loss": 1.7877,
"step": 655
},
{
"epoch": 1.5630550621669625,
"grad_norm": 1.778398036956787,
"learning_rate": 2.3326233162542655e-05,
"loss": 1.9865,
"step": 660
},
{
"epoch": 1.5748963883955003,
"grad_norm": 1.0454083681106567,
"learning_rate": 2.3016877064579564e-05,
"loss": 1.8799,
"step": 665
},
{
"epoch": 1.586737714624038,
"grad_norm": 1.4005528688430786,
"learning_rate": 2.2707826259308492e-05,
"loss": 1.9329,
"step": 670
},
{
"epoch": 1.5985790408525755,
"grad_norm": 1.240485429763794,
"learning_rate": 2.2399128323685286e-05,
"loss": 1.7828,
"step": 675
},
{
"epoch": 1.610420367081113,
"grad_norm": 1.2078522443771362,
"learning_rate": 2.2090830780343113e-05,
"loss": 2.0114,
"step": 680
},
{
"epoch": 1.6222616933096505,
"grad_norm": 1.1760393381118774,
"learning_rate": 2.1782981090276585e-05,
"loss": 1.8671,
"step": 685
},
{
"epoch": 1.6341030195381883,
"grad_norm": 1.362109661102295,
"learning_rate": 2.147562664553537e-05,
"loss": 1.9453,
"step": 690
},
{
"epoch": 1.645944345766726,
"grad_norm": 2.057487964630127,
"learning_rate": 2.1168814761928336e-05,
"loss": 1.7151,
"step": 695
},
{
"epoch": 1.6577856719952635,
"grad_norm": 1.5289130210876465,
"learning_rate": 2.0862592671739608e-05,
"loss": 1.9591,
"step": 700
},
{
"epoch": 1.669626998223801,
"grad_norm": 1.42037832736969,
"learning_rate": 2.0557007516457288e-05,
"loss": 1.8999,
"step": 705
},
{
"epoch": 1.6814683244523385,
"grad_norm": 1.101355791091919,
"learning_rate": 2.0252106339516272e-05,
"loss": 2.0037,
"step": 710
},
{
"epoch": 1.6933096506808762,
"grad_norm": 5.125020980834961,
"learning_rate": 1.9947936079056117e-05,
"loss": 1.7028,
"step": 715
},
{
"epoch": 1.705150976909414,
"grad_norm": 1.3062840700149536,
"learning_rate": 1.964454356069514e-05,
"loss": 2.1283,
"step": 720
},
{
"epoch": 1.7169923031379515,
"grad_norm": 1.273743987083435,
"learning_rate": 1.9341975490321827e-05,
"loss": 2.0017,
"step": 725
},
{
"epoch": 1.728833629366489,
"grad_norm": 1.3251115083694458,
"learning_rate": 1.9040278446904677e-05,
"loss": 1.7329,
"step": 730
},
{
"epoch": 1.7406749555950265,
"grad_norm": 1.5570122003555298,
"learning_rate": 1.873949887532156e-05,
"loss": 1.9965,
"step": 735
},
{
"epoch": 1.7525162818235642,
"grad_norm": 1.3525424003601074,
"learning_rate": 1.8439683079209787e-05,
"loss": 1.9321,
"step": 740
},
{
"epoch": 1.764357608052102,
"grad_norm": 1.776936411857605,
"learning_rate": 1.8140877213837823e-05,
"loss": 1.9739,
"step": 745
},
{
"epoch": 1.7761989342806395,
"grad_norm": 1.317015290260315,
"learning_rate": 1.7843127278999943e-05,
"loss": 1.8864,
"step": 750
},
{
"epoch": 1.788040260509177,
"grad_norm": 1.6961033344268799,
"learning_rate": 1.754647911193473e-05,
"loss": 1.9227,
"step": 755
},
{
"epoch": 1.7998815867377145,
"grad_norm": 1.448065161705017,
"learning_rate": 1.7250978380268694e-05,
"loss": 2.0025,
"step": 760
},
{
"epoch": 1.8117229129662522,
"grad_norm": 1.4134864807128906,
"learning_rate": 1.6956670574985908e-05,
"loss": 1.8039,
"step": 765
},
{
"epoch": 1.82356423919479,
"grad_norm": 1.3354402780532837,
"learning_rate": 1.6663601003424883e-05,
"loss": 1.8008,
"step": 770
},
{
"epoch": 1.8354055654233274,
"grad_norm": 1.2575969696044922,
"learning_rate": 1.6371814782303722e-05,
"loss": 1.9609,
"step": 775
},
{
"epoch": 1.847246891651865,
"grad_norm": 1.4361883401870728,
"learning_rate": 1.6081356830774625e-05,
"loss": 2.0843,
"step": 780
},
{
"epoch": 1.8590882178804025,
"grad_norm": 1.320430874824524,
"learning_rate": 1.579227186350875e-05,
"loss": 1.8281,
"step": 785
},
{
"epoch": 1.8709295441089402,
"grad_norm": 1.2178723812103271,
"learning_rate": 1.5504604383812646e-05,
"loss": 1.9628,
"step": 790
},
{
"epoch": 1.882770870337478,
"grad_norm": 1.3220632076263428,
"learning_rate": 1.5218398676777102e-05,
"loss": 1.9867,
"step": 795
},
{
"epoch": 1.8946121965660154,
"grad_norm": 0.9402835965156555,
"learning_rate": 1.4933698802459731e-05,
"loss": 1.8819,
"step": 800
},
{
"epoch": 1.906453522794553,
"grad_norm": 1.3736234903335571,
"learning_rate": 1.4650548589102092e-05,
"loss": 1.8938,
"step": 805
},
{
"epoch": 1.9182948490230904,
"grad_norm": 0.9985255002975464,
"learning_rate": 1.436899162638255e-05,
"loss": 1.6936,
"step": 810
},
{
"epoch": 1.9301361752516282,
"grad_norm": 1.5471998453140259,
"learning_rate": 1.4089071258705783e-05,
"loss": 2.1922,
"step": 815
},
{
"epoch": 1.941977501480166,
"grad_norm": 1.515426754951477,
"learning_rate": 1.3810830578530225e-05,
"loss": 1.8547,
"step": 820
},
{
"epoch": 1.9538188277087034,
"grad_norm": 1.2444261312484741,
"learning_rate": 1.3534312419734066e-05,
"loss": 1.9437,
"step": 825
},
{
"epoch": 1.965660153937241,
"grad_norm": 1.3579230308532715,
"learning_rate": 1.3259559351021247e-05,
"loss": 1.7016,
"step": 830
},
{
"epoch": 1.9775014801657784,
"grad_norm": 1.436259388923645,
"learning_rate": 1.2986613669368158e-05,
"loss": 1.8801,
"step": 835
},
{
"epoch": 1.9893428063943162,
"grad_norm": 1.196303367614746,
"learning_rate": 1.271551739351224e-05,
"loss": 1.9408,
"step": 840
},
{
"epoch": 2.001184132622854,
"grad_norm": 1.1158932447433472,
"learning_rate": 1.2446312257483358e-05,
"loss": 1.6631,
"step": 845
},
{
"epoch": 2.0130254588513914,
"grad_norm": 1.18464195728302,
"learning_rate": 1.2179039704179118e-05,
"loss": 1.859,
"step": 850
},
{
"epoch": 2.024866785079929,
"grad_norm": 1.7019284963607788,
"learning_rate": 1.1913740878984816e-05,
"loss": 1.9671,
"step": 855
},
{
"epoch": 2.0367081113084664,
"grad_norm": 1.4283084869384766,
"learning_rate": 1.1650456623439367e-05,
"loss": 1.8109,
"step": 860
},
{
"epoch": 2.0485494375370044,
"grad_norm": 1.2812663316726685,
"learning_rate": 1.1389227468947906e-05,
"loss": 1.8024,
"step": 865
},
{
"epoch": 2.060390763765542,
"grad_norm": 1.4143946170806885,
"learning_rate": 1.1130093630542198e-05,
"loss": 1.8327,
"step": 870
},
{
"epoch": 2.0722320899940794,
"grad_norm": 1.2379989624023438,
"learning_rate": 1.0873095000689675e-05,
"loss": 1.9797,
"step": 875
},
{
"epoch": 2.084073416222617,
"grad_norm": 1.524370551109314,
"learning_rate": 1.0618271143152184e-05,
"loss": 1.8164,
"step": 880
},
{
"epoch": 2.0959147424511544,
"grad_norm": 1.7849695682525635,
"learning_rate": 1.0365661286895365e-05,
"loss": 1.7994,
"step": 885
},
{
"epoch": 2.1077560686796923,
"grad_norm": 2.3028218746185303,
"learning_rate": 1.0115304320049479e-05,
"loss": 1.9428,
"step": 890
},
{
"epoch": 2.11959739490823,
"grad_norm": 1.4404668807983398,
"learning_rate": 9.867238783922789e-06,
"loss": 1.9297,
"step": 895
},
{
"epoch": 2.1314387211367674,
"grad_norm": 2.096019744873047,
"learning_rate": 9.621502867068285e-06,
"loss": 1.8185,
"step": 900
},
{
"epoch": 2.143280047365305,
"grad_norm": 1.393315076828003,
"learning_rate": 9.378134399404767e-06,
"loss": 1.7902,
"step": 905
},
{
"epoch": 2.1551213735938424,
"grad_norm": 1.4423267841339111,
"learning_rate": 9.137170846393054e-06,
"loss": 1.7111,
"step": 910
},
{
"epoch": 2.1669626998223803,
"grad_norm": 1.1944553852081299,
"learning_rate": 8.898649303268372e-06,
"loss": 2.04,
"step": 915
},
{
"epoch": 2.178804026050918,
"grad_norm": 1.407127857208252,
"learning_rate": 8.662606489329711e-06,
"loss": 1.9046,
"step": 920
},
{
"epoch": 2.1906453522794553,
"grad_norm": 1.855188012123108,
"learning_rate": 8.429078742287073e-06,
"loss": 2.1426,
"step": 925
},
{
"epoch": 2.202486678507993,
"grad_norm": 1.5526237487792969,
"learning_rate": 8.198102012667407e-06,
"loss": 2.0743,
"step": 930
},
{
"epoch": 2.2143280047365304,
"grad_norm": 1.1409013271331787,
"learning_rate": 7.969711858280252e-06,
"loss": 1.7135,
"step": 935
},
{
"epoch": 2.2261693309650683,
"grad_norm": 1.7926199436187744,
"learning_rate": 7.743943438743676e-06,
"loss": 1.8054,
"step": 940
},
{
"epoch": 2.238010657193606,
"grad_norm": 2.43799090385437,
"learning_rate": 7.520831510071744e-06,
"loss": 1.8244,
"step": 945
},
{
"epoch": 2.2498519834221433,
"grad_norm": 1.3174521923065186,
"learning_rate": 7.300410419323869e-06,
"loss": 1.8097,
"step": 950
},
{
"epoch": 2.261693309650681,
"grad_norm": 1.5044059753417969,
"learning_rate": 7.082714099317334e-06,
"loss": 1.9919,
"step": 955
},
{
"epoch": 2.2735346358792183,
"grad_norm": 1.3440165519714355,
"learning_rate": 6.867776063403411e-06,
"loss": 1.9084,
"step": 960
},
{
"epoch": 2.2853759621077563,
"grad_norm": 1.5049198865890503,
"learning_rate": 6.6556294003081914e-06,
"loss": 1.7835,
"step": 965
},
{
"epoch": 2.297217288336294,
"grad_norm": 1.3639992475509644,
"learning_rate": 6.44630676903869e-06,
"loss": 1.7542,
"step": 970
},
{
"epoch": 2.3090586145648313,
"grad_norm": 1.5013291835784912,
"learning_rate": 6.239840393855184e-06,
"loss": 2.0351,
"step": 975
},
{
"epoch": 2.320899940793369,
"grad_norm": 1.4973437786102295,
"learning_rate": 6.036262059310382e-06,
"loss": 1.7908,
"step": 980
},
{
"epoch": 2.3327412670219063,
"grad_norm": 2.3206512928009033,
"learning_rate": 5.835603105356396e-06,
"loss": 1.7511,
"step": 985
},
{
"epoch": 2.3445825932504443,
"grad_norm": 1.5731827020645142,
"learning_rate": 5.637894422520027e-06,
"loss": 1.9621,
"step": 990
},
{
"epoch": 2.3564239194789818,
"grad_norm": 1.6597813367843628,
"learning_rate": 5.443166447147391e-06,
"loss": 2.0578,
"step": 995
},
{
"epoch": 2.3682652457075193,
"grad_norm": 1.659856915473938,
"learning_rate": 5.251449156718313e-06,
"loss": 1.9198,
"step": 1000
},
{
"epoch": 2.380106571936057,
"grad_norm": 1.1122225522994995,
"learning_rate": 5.062772065231491e-06,
"loss": 1.6089,
"step": 1005
},
{
"epoch": 2.3919478981645943,
"grad_norm": 1.6533218622207642,
"learning_rate": 4.877164218660901e-06,
"loss": 1.9499,
"step": 1010
},
{
"epoch": 2.4037892243931323,
"grad_norm": 1.2786110639572144,
"learning_rate": 4.694654190484327e-06,
"loss": 2.0044,
"step": 1015
},
{
"epoch": 2.4156305506216698,
"grad_norm": 1.542189359664917,
"learning_rate": 4.515270077284595e-06,
"loss": 2.0896,
"step": 1020
},
{
"epoch": 2.4274718768502073,
"grad_norm": 1.5293947458267212,
"learning_rate": 4.339039494424263e-06,
"loss": 1.8803,
"step": 1025
},
{
"epoch": 2.4393132030787448,
"grad_norm": 1.390453815460205,
"learning_rate": 4.16598957179431e-06,
"loss": 2.0795,
"step": 1030
},
{
"epoch": 2.4511545293072823,
"grad_norm": 2.7319912910461426,
"learning_rate": 3.996146949637658e-06,
"loss": 1.7221,
"step": 1035
},
{
"epoch": 2.4629958555358202,
"grad_norm": 1.080298900604248,
"learning_rate": 3.8295377744479995e-06,
"loss": 1.6881,
"step": 1040
},
{
"epoch": 2.4748371817643577,
"grad_norm": 1.4576321840286255,
"learning_rate": 3.6661876949447007e-06,
"loss": 1.8039,
"step": 1045
},
{
"epoch": 2.4866785079928952,
"grad_norm": 1.5691012144088745,
"learning_rate": 3.5061218581242535e-06,
"loss": 1.9333,
"step": 1050
},
{
"epoch": 2.4985198342214328,
"grad_norm": 1.2626398801803589,
"learning_rate": 3.3493649053890326e-06,
"loss": 1.6731,
"step": 1055
},
{
"epoch": 2.5103611604499703,
"grad_norm": 1.5094714164733887,
"learning_rate": 3.1959409687538853e-06,
"loss": 1.8437,
"step": 1060
},
{
"epoch": 2.522202486678508,
"grad_norm": 2.1249232292175293,
"learning_rate": 3.04587366713108e-06,
"loss": 1.9577,
"step": 1065
},
{
"epoch": 2.5340438129070457,
"grad_norm": 1.563010334968567,
"learning_rate": 2.8991861026943014e-06,
"loss": 2.0162,
"step": 1070
},
{
"epoch": 2.5458851391355832,
"grad_norm": 2.769664764404297,
"learning_rate": 2.7559008573221717e-06,
"loss": 1.8091,
"step": 1075
},
{
"epoch": 2.5577264653641207,
"grad_norm": 1.178745985031128,
"learning_rate": 2.6160399891218988e-06,
"loss": 1.7485,
"step": 1080
},
{
"epoch": 2.5695677915926582,
"grad_norm": 1.479054570198059,
"learning_rate": 2.4796250290334887e-06,
"loss": 1.9556,
"step": 1085
},
{
"epoch": 2.581409117821196,
"grad_norm": 1.1433284282684326,
"learning_rate": 2.346676977515189e-06,
"loss": 1.7762,
"step": 1090
},
{
"epoch": 2.5932504440497337,
"grad_norm": 1.6979899406433105,
"learning_rate": 2.21721630131054e-06,
"loss": 1.7463,
"step": 1095
},
{
"epoch": 2.605091770278271,
"grad_norm": 2.4911599159240723,
"learning_rate": 2.0912629302976493e-06,
"loss": 1.9825,
"step": 1100
},
{
"epoch": 2.6169330965068087,
"grad_norm": 1.3858684301376343,
"learning_rate": 1.968836254421036e-06,
"loss": 1.8317,
"step": 1105
},
{
"epoch": 2.6287744227353462,
"grad_norm": 1.304112195968628,
"learning_rate": 1.849955120706673e-06,
"loss": 1.7377,
"step": 1110
},
{
"epoch": 2.640615748963884,
"grad_norm": 1.253029465675354,
"learning_rate": 1.7346378303605359e-06,
"loss": 1.8165,
"step": 1115
},
{
"epoch": 2.6524570751924217,
"grad_norm": 1.166486382484436,
"learning_rate": 1.6229021359512624e-06,
"loss": 1.8233,
"step": 1120
},
{
"epoch": 2.664298401420959,
"grad_norm": 1.5361779928207397,
"learning_rate": 1.5147652386771848e-06,
"loss": 1.9029,
"step": 1125
},
{
"epoch": 2.6761397276494967,
"grad_norm": 1.3075627088546753,
"learning_rate": 1.4102437857183155e-06,
"loss": 1.8527,
"step": 1130
},
{
"epoch": 2.687981053878034,
"grad_norm": 1.133543610572815,
"learning_rate": 1.3093538676735601e-06,
"loss": 1.8855,
"step": 1135
},
{
"epoch": 2.699822380106572,
"grad_norm": 1.6549872159957886,
"learning_rate": 1.2121110160836696e-06,
"loss": 1.8746,
"step": 1140
},
{
"epoch": 2.7116637063351097,
"grad_norm": 1.2035839557647705,
"learning_rate": 1.1185302010402105e-06,
"loss": 1.696,
"step": 1145
},
{
"epoch": 2.723505032563647,
"grad_norm": 1.338542103767395,
"learning_rate": 1.0286258288810107e-06,
"loss": 1.7904,
"step": 1150
},
{
"epoch": 2.7353463587921847,
"grad_norm": 1.4907745122909546,
"learning_rate": 9.424117399723431e-07,
"loss": 1.962,
"step": 1155
},
{
"epoch": 2.747187685020722,
"grad_norm": 1.6321779489517212,
"learning_rate": 8.599012065782924e-07,
"loss": 1.9262,
"step": 1160
},
{
"epoch": 2.75902901124926,
"grad_norm": 1.2635369300842285,
"learning_rate": 7.811069308175156e-07,
"loss": 1.8789,
"step": 1165
},
{
"epoch": 2.7708703374777977,
"grad_norm": 1.5785208940505981,
"learning_rate": 7.060410427078473e-07,
"loss": 1.7546,
"step": 1170
},
{
"epoch": 2.782711663706335,
"grad_norm": 1.1907869577407837,
"learning_rate": 6.347150982989159e-07,
"loss": 1.7979,
"step": 1175
},
{
"epoch": 2.7945529899348727,
"grad_norm": 1.3581962585449219,
"learning_rate": 5.671400778931468e-07,
"loss": 1.7874,
"step": 1180
},
{
"epoch": 2.80639431616341,
"grad_norm": 1.0370872020721436,
"learning_rate": 5.033263843554015e-07,
"loss": 1.6197,
"step": 1185
},
{
"epoch": 2.818235642391948,
"grad_norm": 1.2103910446166992,
"learning_rate": 4.4328384151149095e-07,
"loss": 1.5225,
"step": 1190
},
{
"epoch": 2.8300769686204856,
"grad_norm": 1.4887464046478271,
"learning_rate": 3.8702169263585554e-07,
"loss": 1.9811,
"step": 1195
},
{
"epoch": 2.841918294849023,
"grad_norm": 1.4641401767730713,
"learning_rate": 3.345485990286029e-07,
"loss": 1.9999,
"step": 1200
},
{
"epoch": 2.8537596210775606,
"grad_norm": 1.529491901397705,
"learning_rate": 2.8587263868213585e-07,
"loss": 2.1399,
"step": 1205
},
{
"epoch": 2.865600947306098,
"grad_norm": 1.5479165315628052,
"learning_rate": 2.410013050375859e-07,
"loss": 1.8664,
"step": 1210
},
{
"epoch": 2.877442273534636,
"grad_norm": 1.1747933626174927,
"learning_rate": 1.999415058312276e-07,
"loss": 1.9633,
"step": 1215
},
{
"epoch": 2.8892835997631736,
"grad_norm": 1.358820915222168,
"learning_rate": 1.6269956203107117e-07,
"loss": 2.0106,
"step": 1220
},
{
"epoch": 2.901124925991711,
"grad_norm": 1.554287314414978,
"learning_rate": 1.2928120686377388e-07,
"loss": 1.7896,
"step": 1225
},
{
"epoch": 2.9129662522202486,
"grad_norm": 1.5915781259536743,
"learning_rate": 9.969158493204067e-08,
"loss": 1.8759,
"step": 1230
},
{
"epoch": 2.924807578448786,
"grad_norm": 1.6586300134658813,
"learning_rate": 7.393525142262991e-08,
"loss": 1.9413,
"step": 1235
},
{
"epoch": 2.936648904677324,
"grad_norm": 1.416338562965393,
"learning_rate": 5.2016171405103174e-08,
"loss": 1.8404,
"step": 1240
},
{
"epoch": 2.9484902309058616,
"grad_norm": 1.3145766258239746,
"learning_rate": 3.393771922142741e-08,
"loss": 2.0654,
"step": 1245
},
{
"epoch": 2.960331557134399,
"grad_norm": 1.3440285921096802,
"learning_rate": 1.9702677966507154e-08,
"loss": 1.8111,
"step": 1250
},
{
"epoch": 2.9721728833629366,
"grad_norm": 1.3247921466827393,
"learning_rate": 9.31323905974113e-09,
"loss": 1.8708,
"step": 1255
},
{
"epoch": 2.984014209591474,
"grad_norm": 1.7957427501678467,
"learning_rate": 2.771001907653226e-09,
"loss": 1.8842,
"step": 1260
},
{
"epoch": 2.995855535820012,
"grad_norm": 1.4069582223892212,
"learning_rate": 7.697365768943864e-11,
"loss": 1.9665,
"step": 1265
},
{
"epoch": 2.9982238010657194,
"step": 1266,
"total_flos": 6.019503790030848e+16,
"train_loss": 1.9697479162170988,
"train_runtime": 4932.4291,
"train_samples_per_second": 4.109,
"train_steps_per_second": 0.257
}
],
"logging_steps": 5,
"max_steps": 1266,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"total_flos": 6.019503790030848e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}