t5-small-scratch-custom-iwslt2017 / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 100.0,
"eval_steps": 500,
"global_step": 182300,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.27427317608337903,
"grad_norm": 2.1429903507232666,
"learning_rate": 4.986286341195831e-05,
"loss": 7.2532,
"step": 500
},
{
"epoch": 0.5485463521667581,
"grad_norm": 2.0887398719787598,
"learning_rate": 4.972572682391663e-05,
"loss": 6.4724,
"step": 1000
},
{
"epoch": 0.8228195282501372,
"grad_norm": 2.1614580154418945,
"learning_rate": 4.958859023587493e-05,
"loss": 6.0973,
"step": 1500
},
{
"epoch": 1.0970927043335161,
"grad_norm": 2.202652931213379,
"learning_rate": 4.9451453647833245e-05,
"loss": 5.7872,
"step": 2000
},
{
"epoch": 1.3713658804168953,
"grad_norm": 2.147414445877075,
"learning_rate": 4.931431705979155e-05,
"loss": 5.5399,
"step": 2500
},
{
"epoch": 1.6456390565002743,
"grad_norm": 2.1624412536621094,
"learning_rate": 4.917718047174987e-05,
"loss": 5.3515,
"step": 3000
},
{
"epoch": 1.9199122325836533,
"grad_norm": 2.155363082885742,
"learning_rate": 4.9040043883708175e-05,
"loss": 5.1762,
"step": 3500
},
{
"epoch": 2.1941854086670323,
"grad_norm": 2.2420756816864014,
"learning_rate": 4.890290729566648e-05,
"loss": 5.0364,
"step": 4000
},
{
"epoch": 2.4684585847504112,
"grad_norm": 2.4155259132385254,
"learning_rate": 4.87657707076248e-05,
"loss": 4.9178,
"step": 4500
},
{
"epoch": 2.7427317608337907,
"grad_norm": 2.2149574756622314,
"learning_rate": 4.8628634119583105e-05,
"loss": 4.8449,
"step": 5000
},
{
"epoch": 3.0170049369171696,
"grad_norm": 2.25925350189209,
"learning_rate": 4.849149753154142e-05,
"loss": 4.762,
"step": 5500
},
{
"epoch": 3.2912781130005486,
"grad_norm": 2.315990686416626,
"learning_rate": 4.835436094349973e-05,
"loss": 4.6713,
"step": 6000
},
{
"epoch": 3.5655512890839276,
"grad_norm": 2.425288677215576,
"learning_rate": 4.821722435545804e-05,
"loss": 4.6267,
"step": 6500
},
{
"epoch": 3.8398244651673066,
"grad_norm": 2.3451356887817383,
"learning_rate": 4.808008776741635e-05,
"loss": 4.579,
"step": 7000
},
{
"epoch": 4.1140976412506856,
"grad_norm": 2.306058645248413,
"learning_rate": 4.794295117937466e-05,
"loss": 4.5148,
"step": 7500
},
{
"epoch": 4.3883708173340645,
"grad_norm": 2.3386404514312744,
"learning_rate": 4.780581459133297e-05,
"loss": 4.4659,
"step": 8000
},
{
"epoch": 4.6626439934174435,
"grad_norm": 2.3117551803588867,
"learning_rate": 4.766867800329128e-05,
"loss": 4.4327,
"step": 8500
},
{
"epoch": 4.9369171695008225,
"grad_norm": 2.36466908454895,
"learning_rate": 4.753154141524959e-05,
"loss": 4.3947,
"step": 9000
},
{
"epoch": 5.2111903455842015,
"grad_norm": 2.348733901977539,
"learning_rate": 4.73944048272079e-05,
"loss": 4.3441,
"step": 9500
},
{
"epoch": 5.485463521667581,
"grad_norm": 2.9133706092834473,
"learning_rate": 4.7257268239166215e-05,
"loss": 4.3025,
"step": 10000
},
{
"epoch": 5.75973669775096,
"grad_norm": 2.6369545459747314,
"learning_rate": 4.712013165112452e-05,
"loss": 4.2785,
"step": 10500
},
{
"epoch": 6.034009873834339,
"grad_norm": 2.7040719985961914,
"learning_rate": 4.698299506308283e-05,
"loss": 4.2431,
"step": 11000
},
{
"epoch": 6.308283049917718,
"grad_norm": 2.7137389183044434,
"learning_rate": 4.6845858475041146e-05,
"loss": 4.1796,
"step": 11500
},
{
"epoch": 6.582556226001097,
"grad_norm": 2.963534355163574,
"learning_rate": 4.670872188699945e-05,
"loss": 4.1432,
"step": 12000
},
{
"epoch": 6.856829402084476,
"grad_norm": 2.8243420124053955,
"learning_rate": 4.6571585298957763e-05,
"loss": 4.1252,
"step": 12500
},
{
"epoch": 7.131102578167855,
"grad_norm": 3.03757905960083,
"learning_rate": 4.6434448710916076e-05,
"loss": 4.0713,
"step": 13000
},
{
"epoch": 7.405375754251234,
"grad_norm": 3.007908821105957,
"learning_rate": 4.629731212287439e-05,
"loss": 4.0248,
"step": 13500
},
{
"epoch": 7.679648930334613,
"grad_norm": 2.846788167953491,
"learning_rate": 4.6160175534832694e-05,
"loss": 4.0099,
"step": 14000
},
{
"epoch": 7.953922106417992,
"grad_norm": 2.961183786392212,
"learning_rate": 4.6023038946791006e-05,
"loss": 3.9728,
"step": 14500
},
{
"epoch": 8.228195282501371,
"grad_norm": 3.066025733947754,
"learning_rate": 4.588590235874932e-05,
"loss": 3.9118,
"step": 15000
},
{
"epoch": 8.502468458584751,
"grad_norm": 2.9394822120666504,
"learning_rate": 4.5748765770707624e-05,
"loss": 3.8828,
"step": 15500
},
{
"epoch": 8.776741634668129,
"grad_norm": 3.012153387069702,
"learning_rate": 4.5611629182665936e-05,
"loss": 3.8832,
"step": 16000
},
{
"epoch": 9.051014810751509,
"grad_norm": 2.899332046508789,
"learning_rate": 4.547449259462425e-05,
"loss": 3.8481,
"step": 16500
},
{
"epoch": 9.325287986834887,
"grad_norm": 3.164444923400879,
"learning_rate": 4.533735600658256e-05,
"loss": 3.7773,
"step": 17000
},
{
"epoch": 9.599561162918267,
"grad_norm": 3.017282009124756,
"learning_rate": 4.5200219418540867e-05,
"loss": 3.7689,
"step": 17500
},
{
"epoch": 9.873834339001645,
"grad_norm": 3.367647647857666,
"learning_rate": 4.506308283049918e-05,
"loss": 3.752,
"step": 18000
},
{
"epoch": 10.148107515085025,
"grad_norm": 2.9855947494506836,
"learning_rate": 4.492594624245749e-05,
"loss": 3.7182,
"step": 18500
},
{
"epoch": 10.422380691168403,
"grad_norm": 3.505870819091797,
"learning_rate": 4.47888096544158e-05,
"loss": 3.674,
"step": 19000
},
{
"epoch": 10.696653867251783,
"grad_norm": 3.438145160675049,
"learning_rate": 4.465167306637411e-05,
"loss": 3.651,
"step": 19500
},
{
"epoch": 10.970927043335163,
"grad_norm": 3.0687413215637207,
"learning_rate": 4.451453647833242e-05,
"loss": 3.6479,
"step": 20000
},
{
"epoch": 11.24520021941854,
"grad_norm": 3.2287588119506836,
"learning_rate": 4.4377399890290734e-05,
"loss": 3.5693,
"step": 20500
},
{
"epoch": 11.51947339550192,
"grad_norm": 3.3848471641540527,
"learning_rate": 4.424026330224904e-05,
"loss": 3.5667,
"step": 21000
},
{
"epoch": 11.793746571585299,
"grad_norm": 3.5464422702789307,
"learning_rate": 4.410312671420735e-05,
"loss": 3.5589,
"step": 21500
},
{
"epoch": 12.068019747668679,
"grad_norm": 3.6160085201263428,
"learning_rate": 4.3965990126165664e-05,
"loss": 3.5313,
"step": 22000
},
{
"epoch": 12.342292923752057,
"grad_norm": 3.6420817375183105,
"learning_rate": 4.3828853538123976e-05,
"loss": 3.4855,
"step": 22500
},
{
"epoch": 12.616566099835437,
"grad_norm": 3.583449363708496,
"learning_rate": 4.369171695008228e-05,
"loss": 3.4654,
"step": 23000
},
{
"epoch": 12.890839275918815,
"grad_norm": 3.5506091117858887,
"learning_rate": 4.3554580362040594e-05,
"loss": 3.4512,
"step": 23500
},
{
"epoch": 13.165112452002194,
"grad_norm": 4.101990699768066,
"learning_rate": 4.341744377399891e-05,
"loss": 3.4055,
"step": 24000
},
{
"epoch": 13.439385628085573,
"grad_norm": 4.144250392913818,
"learning_rate": 4.328030718595721e-05,
"loss": 3.3755,
"step": 24500
},
{
"epoch": 13.713658804168952,
"grad_norm": 3.6288070678710938,
"learning_rate": 4.3143170597915525e-05,
"loss": 3.3725,
"step": 25000
},
{
"epoch": 13.98793198025233,
"grad_norm": 3.5927882194519043,
"learning_rate": 4.300603400987384e-05,
"loss": 3.3693,
"step": 25500
},
{
"epoch": 14.26220515633571,
"grad_norm": 3.405404567718506,
"learning_rate": 4.286889742183215e-05,
"loss": 3.2955,
"step": 26000
},
{
"epoch": 14.53647833241909,
"grad_norm": 4.086198329925537,
"learning_rate": 4.2731760833790455e-05,
"loss": 3.3038,
"step": 26500
},
{
"epoch": 14.810751508502468,
"grad_norm": 3.3961052894592285,
"learning_rate": 4.259462424574877e-05,
"loss": 3.2561,
"step": 27000
},
{
"epoch": 15.085024684585848,
"grad_norm": 3.6080105304718018,
"learning_rate": 4.245748765770708e-05,
"loss": 3.2539,
"step": 27500
},
{
"epoch": 15.359297860669226,
"grad_norm": 3.597956657409668,
"learning_rate": 4.2320351069665385e-05,
"loss": 3.2148,
"step": 28000
},
{
"epoch": 15.633571036752606,
"grad_norm": 3.466057062149048,
"learning_rate": 4.21832144816237e-05,
"loss": 3.2031,
"step": 28500
},
{
"epoch": 15.907844212835984,
"grad_norm": 4.239918231964111,
"learning_rate": 4.204607789358201e-05,
"loss": 3.1768,
"step": 29000
},
{
"epoch": 16.182117388919362,
"grad_norm": 4.533541202545166,
"learning_rate": 4.190894130554032e-05,
"loss": 3.1258,
"step": 29500
},
{
"epoch": 16.456390565002742,
"grad_norm": 3.8643674850463867,
"learning_rate": 4.177180471749863e-05,
"loss": 3.1162,
"step": 30000
},
{
"epoch": 16.730663741086122,
"grad_norm": 3.710988998413086,
"learning_rate": 4.163466812945694e-05,
"loss": 3.1159,
"step": 30500
},
{
"epoch": 17.004936917169502,
"grad_norm": 3.6405742168426514,
"learning_rate": 4.149753154141525e-05,
"loss": 3.1137,
"step": 31000
},
{
"epoch": 17.27921009325288,
"grad_norm": 4.127532482147217,
"learning_rate": 4.136039495337356e-05,
"loss": 3.0327,
"step": 31500
},
{
"epoch": 17.553483269336258,
"grad_norm": 3.6551403999328613,
"learning_rate": 4.122325836533188e-05,
"loss": 3.0611,
"step": 32000
},
{
"epoch": 17.827756445419638,
"grad_norm": 3.6437666416168213,
"learning_rate": 4.108612177729018e-05,
"loss": 3.0233,
"step": 32500
},
{
"epoch": 18.102029621503018,
"grad_norm": 4.008886814117432,
"learning_rate": 4.0948985189248495e-05,
"loss": 3.0099,
"step": 33000
},
{
"epoch": 18.376302797586398,
"grad_norm": 3.779545783996582,
"learning_rate": 4.08118486012068e-05,
"loss": 2.9663,
"step": 33500
},
{
"epoch": 18.650575973669774,
"grad_norm": 3.7845826148986816,
"learning_rate": 4.067471201316512e-05,
"loss": 2.9628,
"step": 34000
},
{
"epoch": 18.924849149753154,
"grad_norm": 3.866852283477783,
"learning_rate": 4.0537575425123425e-05,
"loss": 2.9649,
"step": 34500
},
{
"epoch": 19.199122325836534,
"grad_norm": 3.9092442989349365,
"learning_rate": 4.040043883708173e-05,
"loss": 2.9199,
"step": 35000
},
{
"epoch": 19.473395501919914,
"grad_norm": 4.038732528686523,
"learning_rate": 4.026330224904005e-05,
"loss": 2.9078,
"step": 35500
},
{
"epoch": 19.74766867800329,
"grad_norm": 3.717470645904541,
"learning_rate": 4.0126165660998355e-05,
"loss": 2.8907,
"step": 36000
},
{
"epoch": 20.02194185408667,
"grad_norm": 3.9166011810302734,
"learning_rate": 3.998902907295667e-05,
"loss": 2.8892,
"step": 36500
},
{
"epoch": 20.29621503017005,
"grad_norm": 3.9473681449890137,
"learning_rate": 3.985189248491497e-05,
"loss": 2.8274,
"step": 37000
},
{
"epoch": 20.57048820625343,
"grad_norm": 4.139565467834473,
"learning_rate": 3.971475589687329e-05,
"loss": 2.8371,
"step": 37500
},
{
"epoch": 20.844761382336806,
"grad_norm": 3.7124762535095215,
"learning_rate": 3.95776193088316e-05,
"loss": 2.8401,
"step": 38000
},
{
"epoch": 21.119034558420186,
"grad_norm": 3.772149085998535,
"learning_rate": 3.9440482720789904e-05,
"loss": 2.8083,
"step": 38500
},
{
"epoch": 21.393307734503566,
"grad_norm": 4.185425758361816,
"learning_rate": 3.930334613274822e-05,
"loss": 2.7767,
"step": 39000
},
{
"epoch": 21.667580910586945,
"grad_norm": 3.800649881362915,
"learning_rate": 3.916620954470653e-05,
"loss": 2.7723,
"step": 39500
},
{
"epoch": 21.941854086670325,
"grad_norm": 3.800741195678711,
"learning_rate": 3.902907295666484e-05,
"loss": 2.774,
"step": 40000
},
{
"epoch": 22.2161272627537,
"grad_norm": 3.7834713459014893,
"learning_rate": 3.889193636862315e-05,
"loss": 2.7329,
"step": 40500
},
{
"epoch": 22.49040043883708,
"grad_norm": 4.18643045425415,
"learning_rate": 3.8754799780581465e-05,
"loss": 2.7181,
"step": 41000
},
{
"epoch": 22.76467361492046,
"grad_norm": 3.758415460586548,
"learning_rate": 3.861766319253977e-05,
"loss": 2.7148,
"step": 41500
},
{
"epoch": 23.03894679100384,
"grad_norm": 4.028139114379883,
"learning_rate": 3.8480526604498076e-05,
"loss": 2.7025,
"step": 42000
},
{
"epoch": 23.313219967087218,
"grad_norm": 3.779428243637085,
"learning_rate": 3.8343390016456395e-05,
"loss": 2.6542,
"step": 42500
},
{
"epoch": 23.587493143170597,
"grad_norm": 3.6896047592163086,
"learning_rate": 3.82062534284147e-05,
"loss": 2.6662,
"step": 43000
},
{
"epoch": 23.861766319253977,
"grad_norm": 3.527114152908325,
"learning_rate": 3.806911684037301e-05,
"loss": 2.6697,
"step": 43500
},
{
"epoch": 24.136039495337357,
"grad_norm": 3.832408905029297,
"learning_rate": 3.7931980252331326e-05,
"loss": 2.6443,
"step": 44000
},
{
"epoch": 24.410312671420733,
"grad_norm": 3.912022352218628,
"learning_rate": 3.779484366428964e-05,
"loss": 2.6098,
"step": 44500
},
{
"epoch": 24.684585847504113,
"grad_norm": 3.832465648651123,
"learning_rate": 3.7657707076247944e-05,
"loss": 2.6187,
"step": 45000
},
{
"epoch": 24.958859023587493,
"grad_norm": 4.292030334472656,
"learning_rate": 3.752057048820625e-05,
"loss": 2.6216,
"step": 45500
},
{
"epoch": 25.233132199670873,
"grad_norm": 3.9441139698028564,
"learning_rate": 3.738343390016457e-05,
"loss": 2.5674,
"step": 46000
},
{
"epoch": 25.507405375754253,
"grad_norm": 3.8073363304138184,
"learning_rate": 3.7246297312122874e-05,
"loss": 2.5613,
"step": 46500
},
{
"epoch": 25.78167855183763,
"grad_norm": 3.95381498336792,
"learning_rate": 3.7109160724081186e-05,
"loss": 2.5744,
"step": 47000
},
{
"epoch": 26.05595172792101,
"grad_norm": 4.265843391418457,
"learning_rate": 3.69720241360395e-05,
"loss": 2.5581,
"step": 47500
},
{
"epoch": 26.33022490400439,
"grad_norm": 3.7907886505126953,
"learning_rate": 3.683488754799781e-05,
"loss": 2.5085,
"step": 48000
},
{
"epoch": 26.60449808008777,
"grad_norm": 3.9580938816070557,
"learning_rate": 3.6697750959956116e-05,
"loss": 2.5244,
"step": 48500
},
{
"epoch": 26.878771256171145,
"grad_norm": 3.725271701812744,
"learning_rate": 3.656061437191443e-05,
"loss": 2.5283,
"step": 49000
},
{
"epoch": 27.153044432254525,
"grad_norm": 4.27789831161499,
"learning_rate": 3.642347778387274e-05,
"loss": 2.4978,
"step": 49500
},
{
"epoch": 27.427317608337905,
"grad_norm": 4.248454570770264,
"learning_rate": 3.628634119583105e-05,
"loss": 2.478,
"step": 50000
},
{
"epoch": 27.701590784421285,
"grad_norm": 3.7782256603240967,
"learning_rate": 3.614920460778936e-05,
"loss": 2.4797,
"step": 50500
},
{
"epoch": 27.97586396050466,
"grad_norm": 3.996277332305908,
"learning_rate": 3.601206801974767e-05,
"loss": 2.4872,
"step": 51000
},
{
"epoch": 28.25013713658804,
"grad_norm": 4.143040657043457,
"learning_rate": 3.5874931431705984e-05,
"loss": 2.4311,
"step": 51500
},
{
"epoch": 28.52441031267142,
"grad_norm": 3.6849589347839355,
"learning_rate": 3.573779484366429e-05,
"loss": 2.4377,
"step": 52000
},
{
"epoch": 28.7986834887548,
"grad_norm": 3.621760606765747,
"learning_rate": 3.56006582556226e-05,
"loss": 2.4425,
"step": 52500
},
{
"epoch": 29.07295666483818,
"grad_norm": 3.7394306659698486,
"learning_rate": 3.5463521667580914e-05,
"loss": 2.4287,
"step": 53000
},
{
"epoch": 29.347229840921557,
"grad_norm": 3.782111167907715,
"learning_rate": 3.532638507953922e-05,
"loss": 2.3911,
"step": 53500
},
{
"epoch": 29.621503017004937,
"grad_norm": 4.35050106048584,
"learning_rate": 3.518924849149753e-05,
"loss": 2.4084,
"step": 54000
},
{
"epoch": 29.895776193088317,
"grad_norm": 3.8727004528045654,
"learning_rate": 3.5052111903455844e-05,
"loss": 2.3928,
"step": 54500
},
{
"epoch": 30.170049369171696,
"grad_norm": 3.974501371383667,
"learning_rate": 3.4914975315414157e-05,
"loss": 2.3668,
"step": 55000
},
{
"epoch": 30.444322545255073,
"grad_norm": 3.7882275581359863,
"learning_rate": 3.477783872737246e-05,
"loss": 2.3686,
"step": 55500
},
{
"epoch": 30.718595721338453,
"grad_norm": 3.8313581943511963,
"learning_rate": 3.4640702139330774e-05,
"loss": 2.3707,
"step": 56000
},
{
"epoch": 30.992868897421832,
"grad_norm": 4.046344757080078,
"learning_rate": 3.450356555128909e-05,
"loss": 2.3632,
"step": 56500
},
{
"epoch": 31.267142073505212,
"grad_norm": 3.7165708541870117,
"learning_rate": 3.436642896324739e-05,
"loss": 2.3229,
"step": 57000
},
{
"epoch": 31.54141524958859,
"grad_norm": 3.8072948455810547,
"learning_rate": 3.4229292375205705e-05,
"loss": 2.3262,
"step": 57500
},
{
"epoch": 31.81568842567197,
"grad_norm": 3.8423380851745605,
"learning_rate": 3.409215578716402e-05,
"loss": 2.3311,
"step": 58000
},
{
"epoch": 32.08996160175535,
"grad_norm": 3.831343650817871,
"learning_rate": 3.395501919912233e-05,
"loss": 2.3213,
"step": 58500
},
{
"epoch": 32.364234777838725,
"grad_norm": 3.8060476779937744,
"learning_rate": 3.3817882611080635e-05,
"loss": 2.2849,
"step": 59000
},
{
"epoch": 32.63850795392211,
"grad_norm": 4.033987998962402,
"learning_rate": 3.368074602303895e-05,
"loss": 2.2979,
"step": 59500
},
{
"epoch": 32.912781130005484,
"grad_norm": 3.870171546936035,
"learning_rate": 3.354360943499726e-05,
"loss": 2.2982,
"step": 60000
},
{
"epoch": 33.18705430608887,
"grad_norm": 3.848620653152466,
"learning_rate": 3.340647284695557e-05,
"loss": 2.264,
"step": 60500
},
{
"epoch": 33.461327482172244,
"grad_norm": 4.048386573791504,
"learning_rate": 3.326933625891388e-05,
"loss": 2.2568,
"step": 61000
},
{
"epoch": 33.73560065825562,
"grad_norm": 4.029069900512695,
"learning_rate": 3.313219967087219e-05,
"loss": 2.2602,
"step": 61500
},
{
"epoch": 34.009873834339004,
"grad_norm": 3.759799003601074,
"learning_rate": 3.29950630828305e-05,
"loss": 2.2763,
"step": 62000
},
{
"epoch": 34.28414701042238,
"grad_norm": 4.440002918243408,
"learning_rate": 3.285792649478881e-05,
"loss": 2.2137,
"step": 62500
},
{
"epoch": 34.55842018650576,
"grad_norm": 3.961390972137451,
"learning_rate": 3.272078990674713e-05,
"loss": 2.233,
"step": 63000
},
{
"epoch": 34.83269336258914,
"grad_norm": 3.916156768798828,
"learning_rate": 3.258365331870543e-05,
"loss": 2.2414,
"step": 63500
},
{
"epoch": 35.106966538672516,
"grad_norm": 3.746572732925415,
"learning_rate": 3.2446516730663745e-05,
"loss": 2.2137,
"step": 64000
},
{
"epoch": 35.3812397147559,
"grad_norm": 3.8424971103668213,
"learning_rate": 3.230938014262205e-05,
"loss": 2.1907,
"step": 64500
},
{
"epoch": 35.655512890839276,
"grad_norm": 4.250007152557373,
"learning_rate": 3.217224355458036e-05,
"loss": 2.2104,
"step": 65000
},
{
"epoch": 35.92978606692265,
"grad_norm": 3.760779857635498,
"learning_rate": 3.2035106966538675e-05,
"loss": 2.2134,
"step": 65500
},
{
"epoch": 36.204059243006036,
"grad_norm": 4.189092636108398,
"learning_rate": 3.189797037849698e-05,
"loss": 2.1714,
"step": 66000
},
{
"epoch": 36.47833241908941,
"grad_norm": 3.6505000591278076,
"learning_rate": 3.17608337904553e-05,
"loss": 2.1635,
"step": 66500
},
{
"epoch": 36.752605595172795,
"grad_norm": 3.7696096897125244,
"learning_rate": 3.1623697202413605e-05,
"loss": 2.1832,
"step": 67000
},
{
"epoch": 37.02687877125617,
"grad_norm": 3.744976282119751,
"learning_rate": 3.148656061437192e-05,
"loss": 2.1753,
"step": 67500
},
{
"epoch": 37.30115194733955,
"grad_norm": 3.9759116172790527,
"learning_rate": 3.134942402633022e-05,
"loss": 2.1326,
"step": 68000
},
{
"epoch": 37.57542512342293,
"grad_norm": 3.719237804412842,
"learning_rate": 3.1212287438288536e-05,
"loss": 2.1423,
"step": 68500
},
{
"epoch": 37.84969829950631,
"grad_norm": 4.290117263793945,
"learning_rate": 3.107515085024685e-05,
"loss": 2.1602,
"step": 69000
},
{
"epoch": 38.123971475589684,
"grad_norm": 4.060342311859131,
"learning_rate": 3.0938014262205153e-05,
"loss": 2.1349,
"step": 69500
},
{
"epoch": 38.39824465167307,
"grad_norm": 4.048706531524658,
"learning_rate": 3.080087767416347e-05,
"loss": 2.1018,
"step": 70000
},
{
"epoch": 38.672517827756444,
"grad_norm": 4.130014896392822,
"learning_rate": 3.066374108612178e-05,
"loss": 2.1284,
"step": 70500
},
{
"epoch": 38.94679100383983,
"grad_norm": 3.893848419189453,
"learning_rate": 3.052660449808009e-05,
"loss": 2.1255,
"step": 71000
},
{
"epoch": 39.221064179923204,
"grad_norm": 3.6254563331604004,
"learning_rate": 3.03894679100384e-05,
"loss": 2.0848,
"step": 71500
},
{
"epoch": 39.49533735600658,
"grad_norm": 4.06374979019165,
"learning_rate": 3.025233132199671e-05,
"loss": 2.0933,
"step": 72000
},
{
"epoch": 39.76961053208996,
"grad_norm": 3.763274908065796,
"learning_rate": 3.011519473395502e-05,
"loss": 2.11,
"step": 72500
},
{
"epoch": 40.04388370817334,
"grad_norm": 4.014530181884766,
"learning_rate": 2.997805814591333e-05,
"loss": 2.0918,
"step": 73000
},
{
"epoch": 40.31815688425672,
"grad_norm": 3.6787962913513184,
"learning_rate": 2.9840921557871642e-05,
"loss": 2.0559,
"step": 73500
},
{
"epoch": 40.5924300603401,
"grad_norm": 3.752711057662964,
"learning_rate": 2.970378496982995e-05,
"loss": 2.067,
"step": 74000
},
{
"epoch": 40.866703236423476,
"grad_norm": 3.795217752456665,
"learning_rate": 2.9566648381788263e-05,
"loss": 2.0792,
"step": 74500
},
{
"epoch": 41.14097641250686,
"grad_norm": 3.7484569549560547,
"learning_rate": 2.9429511793746572e-05,
"loss": 2.0608,
"step": 75000
},
{
"epoch": 41.415249588590235,
"grad_norm": 3.601229190826416,
"learning_rate": 2.9292375205704885e-05,
"loss": 2.0337,
"step": 75500
},
{
"epoch": 41.68952276467361,
"grad_norm": 3.9707863330841064,
"learning_rate": 2.9155238617663194e-05,
"loss": 2.0426,
"step": 76000
},
{
"epoch": 41.963795940756995,
"grad_norm": 3.9523677825927734,
"learning_rate": 2.9018102029621502e-05,
"loss": 2.0571,
"step": 76500
},
{
"epoch": 42.23806911684037,
"grad_norm": 4.046602725982666,
"learning_rate": 2.8880965441579815e-05,
"loss": 2.0225,
"step": 77000
},
{
"epoch": 42.512342292923755,
"grad_norm": 4.059443950653076,
"learning_rate": 2.8743828853538124e-05,
"loss": 2.0185,
"step": 77500
},
{
"epoch": 42.78661546900713,
"grad_norm": 4.066934108734131,
"learning_rate": 2.8606692265496436e-05,
"loss": 2.0342,
"step": 78000
},
{
"epoch": 43.06088864509051,
"grad_norm": 3.811591386795044,
"learning_rate": 2.8469555677454745e-05,
"loss": 2.0216,
"step": 78500
},
{
"epoch": 43.33516182117389,
"grad_norm": 3.979374408721924,
"learning_rate": 2.833241908941306e-05,
"loss": 1.989,
"step": 79000
},
{
"epoch": 43.60943499725727,
"grad_norm": 3.67275333404541,
"learning_rate": 2.8195282501371366e-05,
"loss": 1.9958,
"step": 79500
},
{
"epoch": 43.88370817334065,
"grad_norm": 3.790217399597168,
"learning_rate": 2.8058145913329675e-05,
"loss": 2.0102,
"step": 80000
},
{
"epoch": 44.15798134942403,
"grad_norm": 3.9934496879577637,
"learning_rate": 2.7921009325287988e-05,
"loss": 1.9847,
"step": 80500
},
{
"epoch": 44.4322545255074,
"grad_norm": 4.339521408081055,
"learning_rate": 2.7783872737246297e-05,
"loss": 1.9752,
"step": 81000
},
{
"epoch": 44.70652770159079,
"grad_norm": 3.5851519107818604,
"learning_rate": 2.764673614920461e-05,
"loss": 1.9883,
"step": 81500
},
{
"epoch": 44.98080087767416,
"grad_norm": 4.129305362701416,
"learning_rate": 2.7509599561162918e-05,
"loss": 1.9896,
"step": 82000
},
{
"epoch": 45.25507405375754,
"grad_norm": 3.752852201461792,
"learning_rate": 2.7372462973121234e-05,
"loss": 1.9465,
"step": 82500
},
{
"epoch": 45.52934722984092,
"grad_norm": 3.923309087753296,
"learning_rate": 2.723532638507954e-05,
"loss": 1.9589,
"step": 83000
},
{
"epoch": 45.8036204059243,
"grad_norm": 4.141747951507568,
"learning_rate": 2.7098189797037848e-05,
"loss": 1.9662,
"step": 83500
},
{
"epoch": 46.07789358200768,
"grad_norm": 4.118216514587402,
"learning_rate": 2.696105320899616e-05,
"loss": 1.9518,
"step": 84000
},
{
"epoch": 46.35216675809106,
"grad_norm": 4.061371326446533,
"learning_rate": 2.682391662095447e-05,
"loss": 1.928,
"step": 84500
},
{
"epoch": 46.626439934174435,
"grad_norm": 4.138849258422852,
"learning_rate": 2.6686780032912785e-05,
"loss": 1.9456,
"step": 85000
},
{
"epoch": 46.90071311025782,
"grad_norm": 3.9675650596618652,
"learning_rate": 2.654964344487109e-05,
"loss": 1.9465,
"step": 85500
},
{
"epoch": 47.174986286341195,
"grad_norm": 3.745779514312744,
"learning_rate": 2.6412506856829406e-05,
"loss": 1.9293,
"step": 86000
},
{
"epoch": 47.44925946242458,
"grad_norm": 3.6988871097564697,
"learning_rate": 2.6275370268787712e-05,
"loss": 1.915,
"step": 86500
},
{
"epoch": 47.723532638507955,
"grad_norm": 3.7044730186462402,
"learning_rate": 2.613823368074602e-05,
"loss": 1.9199,
"step": 87000
},
{
"epoch": 47.99780581459133,
"grad_norm": 3.6700057983398438,
"learning_rate": 2.6001097092704337e-05,
"loss": 1.9243,
"step": 87500
},
{
"epoch": 48.272078990674714,
"grad_norm": 3.89973521232605,
"learning_rate": 2.5863960504662642e-05,
"loss": 1.8846,
"step": 88000
},
{
"epoch": 48.54635216675809,
"grad_norm": 4.041015625,
"learning_rate": 2.5726823916620958e-05,
"loss": 1.8999,
"step": 88500
},
{
"epoch": 48.82062534284147,
"grad_norm": 3.7937917709350586,
"learning_rate": 2.5589687328579264e-05,
"loss": 1.9085,
"step": 89000
},
{
"epoch": 49.09489851892485,
"grad_norm": 4.050382614135742,
"learning_rate": 2.545255074053758e-05,
"loss": 1.8934,
"step": 89500
},
{
"epoch": 49.36917169500823,
"grad_norm": 3.809558391571045,
"learning_rate": 2.5315414152495888e-05,
"loss": 1.8705,
"step": 90000
},
{
"epoch": 49.64344487109161,
"grad_norm": 3.6460201740264893,
"learning_rate": 2.51782775644542e-05,
"loss": 1.8904,
"step": 90500
},
{
"epoch": 49.917718047174986,
"grad_norm": 3.959718704223633,
"learning_rate": 2.504114097641251e-05,
"loss": 1.8936,
"step": 91000
},
{
"epoch": 50.19199122325836,
"grad_norm": 3.786888837814331,
"learning_rate": 2.490400438837082e-05,
"loss": 1.8683,
"step": 91500
},
{
"epoch": 50.466264399341746,
"grad_norm": 3.477952241897583,
"learning_rate": 2.476686780032913e-05,
"loss": 1.8634,
"step": 92000
},
{
"epoch": 50.74053757542512,
"grad_norm": 3.998764991760254,
"learning_rate": 2.462973121228744e-05,
"loss": 1.8637,
"step": 92500
},
{
"epoch": 51.014810751508506,
"grad_norm": 4.029101848602295,
"learning_rate": 2.449259462424575e-05,
"loss": 1.8749,
"step": 93000
},
{
"epoch": 51.28908392759188,
"grad_norm": 3.8711071014404297,
"learning_rate": 2.435545803620406e-05,
"loss": 1.8377,
"step": 93500
},
{
"epoch": 51.56335710367526,
"grad_norm": 3.922783136367798,
"learning_rate": 2.421832144816237e-05,
"loss": 1.8397,
"step": 94000
},
{
"epoch": 51.83763027975864,
"grad_norm": 4.025134086608887,
"learning_rate": 2.4081184860120682e-05,
"loss": 1.8609,
"step": 94500
},
{
"epoch": 52.11190345584202,
"grad_norm": 3.800508975982666,
"learning_rate": 2.394404827207899e-05,
"loss": 1.8407,
"step": 95000
},
{
"epoch": 52.386176631925395,
"grad_norm": 3.944465160369873,
"learning_rate": 2.3806911684037304e-05,
"loss": 1.8263,
"step": 95500
},
{
"epoch": 52.66044980800878,
"grad_norm": 4.014648914337158,
"learning_rate": 2.3669775095995613e-05,
"loss": 1.8374,
"step": 96000
},
{
"epoch": 52.934722984092154,
"grad_norm": 4.08259916305542,
"learning_rate": 2.353263850795392e-05,
"loss": 1.8448,
"step": 96500
},
{
"epoch": 53.20899616017554,
"grad_norm": 3.941981792449951,
"learning_rate": 2.3395501919912234e-05,
"loss": 1.8131,
"step": 97000
},
{
"epoch": 53.483269336258914,
"grad_norm": 3.8573715686798096,
"learning_rate": 2.3258365331870543e-05,
"loss": 1.8169,
"step": 97500
},
{
"epoch": 53.75754251234229,
"grad_norm": 3.987938165664673,
"learning_rate": 2.3121228743828855e-05,
"loss": 1.8145,
"step": 98000
},
{
"epoch": 54.031815688425674,
"grad_norm": 3.652238607406616,
"learning_rate": 2.2984092155787164e-05,
"loss": 1.8213,
"step": 98500
},
{
"epoch": 54.30608886450905,
"grad_norm": 3.640587568283081,
"learning_rate": 2.2846955567745476e-05,
"loss": 1.7938,
"step": 99000
},
{
"epoch": 54.58036204059243,
"grad_norm": 3.884443759918213,
"learning_rate": 2.2709818979703785e-05,
"loss": 1.7963,
"step": 99500
},
{
"epoch": 54.85463521667581,
"grad_norm": 4.245452880859375,
"learning_rate": 2.2572682391662098e-05,
"loss": 1.811,
"step": 100000
},
{
"epoch": 55.128908392759186,
"grad_norm": 3.97247576713562,
"learning_rate": 2.2435545803620407e-05,
"loss": 1.7964,
"step": 100500
},
{
"epoch": 55.40318156884257,
"grad_norm": 3.8827009201049805,
"learning_rate": 2.2298409215578716e-05,
"loss": 1.7774,
"step": 101000
},
{
"epoch": 55.677454744925946,
"grad_norm": 4.079446792602539,
"learning_rate": 2.2161272627537028e-05,
"loss": 1.7884,
"step": 101500
},
{
"epoch": 55.95172792100932,
"grad_norm": 4.093244552612305,
"learning_rate": 2.2024136039495337e-05,
"loss": 1.7904,
"step": 102000
},
{
"epoch": 56.226001097092706,
"grad_norm": 3.674686908721924,
"learning_rate": 2.188699945145365e-05,
"loss": 1.7655,
"step": 102500
},
{
"epoch": 56.50027427317608,
"grad_norm": 4.042862892150879,
"learning_rate": 2.174986286341196e-05,
"loss": 1.7708,
"step": 103000
},
{
"epoch": 56.774547449259465,
"grad_norm": 4.069617748260498,
"learning_rate": 2.161272627537027e-05,
"loss": 1.777,
"step": 103500
},
{
"epoch": 57.04882062534284,
"grad_norm": 3.68752384185791,
"learning_rate": 2.1475589687328583e-05,
"loss": 1.7702,
"step": 104000
},
{
"epoch": 57.32309380142622,
"grad_norm": 3.998215436935425,
"learning_rate": 2.133845309928689e-05,
"loss": 1.7508,
"step": 104500
},
{
"epoch": 57.5973669775096,
"grad_norm": 4.300554275512695,
"learning_rate": 2.12013165112452e-05,
"loss": 1.7634,
"step": 105000
},
{
"epoch": 57.87164015359298,
"grad_norm": 3.649411678314209,
"learning_rate": 2.106417992320351e-05,
"loss": 1.7603,
"step": 105500
},
{
"epoch": 58.14591332967636,
"grad_norm": 4.17492151260376,
"learning_rate": 2.0927043335161822e-05,
"loss": 1.7441,
"step": 106000
},
{
"epoch": 58.42018650575974,
"grad_norm": 3.8550057411193848,
"learning_rate": 2.0789906747120134e-05,
"loss": 1.7381,
"step": 106500
},
{
"epoch": 58.694459681843114,
"grad_norm": 4.004961967468262,
"learning_rate": 2.0652770159078443e-05,
"loss": 1.7475,
"step": 107000
},
{
"epoch": 58.9687328579265,
"grad_norm": 3.936483144760132,
"learning_rate": 2.0515633571036756e-05,
"loss": 1.7532,
"step": 107500
},
{
"epoch": 59.24300603400987,
"grad_norm": 3.812488317489624,
"learning_rate": 2.037849698299506e-05,
"loss": 1.7232,
"step": 108000
},
{
"epoch": 59.51727921009325,
"grad_norm": 4.185512542724609,
"learning_rate": 2.0241360394953374e-05,
"loss": 1.7196,
"step": 108500
},
{
"epoch": 59.79155238617663,
"grad_norm": 4.278858184814453,
"learning_rate": 2.0104223806911686e-05,
"loss": 1.7356,
"step": 109000
},
{
"epoch": 60.06582556226001,
"grad_norm": 4.104213714599609,
"learning_rate": 1.9967087218869995e-05,
"loss": 1.7328,
"step": 109500
},
{
"epoch": 60.34009873834339,
"grad_norm": 4.215428352355957,
"learning_rate": 1.9829950630828307e-05,
"loss": 1.7109,
"step": 110000
},
{
"epoch": 60.61437191442677,
"grad_norm": 4.020122528076172,
"learning_rate": 1.9692814042786616e-05,
"loss": 1.7144,
"step": 110500
},
{
"epoch": 60.888645090510146,
"grad_norm": 3.9703729152679443,
"learning_rate": 1.955567745474493e-05,
"loss": 1.7335,
"step": 111000
},
{
"epoch": 61.16291826659353,
"grad_norm": 3.900017023086548,
"learning_rate": 1.9418540866703238e-05,
"loss": 1.7048,
"step": 111500
},
{
"epoch": 61.437191442676905,
"grad_norm": 4.7137627601623535,
"learning_rate": 1.9281404278661547e-05,
"loss": 1.6902,
"step": 112000
},
{
"epoch": 61.71146461876029,
"grad_norm": 4.035908222198486,
"learning_rate": 1.914426769061986e-05,
"loss": 1.7104,
"step": 112500
},
{
"epoch": 61.985737794843665,
"grad_norm": 4.041805744171143,
"learning_rate": 1.9007131102578168e-05,
"loss": 1.7155,
"step": 113000
},
{
"epoch": 62.26001097092704,
"grad_norm": 3.817702054977417,
"learning_rate": 1.886999451453648e-05,
"loss": 1.6851,
"step": 113500
},
{
"epoch": 62.534284147010425,
"grad_norm": 3.8696234226226807,
"learning_rate": 1.873285792649479e-05,
"loss": 1.6857,
"step": 114000
},
{
"epoch": 62.8085573230938,
"grad_norm": 3.909179925918579,
"learning_rate": 1.85957213384531e-05,
"loss": 1.7037,
"step": 114500
},
{
"epoch": 63.08283049917718,
"grad_norm": 3.8557326793670654,
"learning_rate": 1.845858475041141e-05,
"loss": 1.6936,
"step": 115000
},
{
"epoch": 63.35710367526056,
"grad_norm": 4.332828044891357,
"learning_rate": 1.832144816236972e-05,
"loss": 1.6786,
"step": 115500
},
{
"epoch": 63.63137685134394,
"grad_norm": 4.454130172729492,
"learning_rate": 1.818431157432803e-05,
"loss": 1.6768,
"step": 116000
},
{
"epoch": 63.90565002742732,
"grad_norm": 3.943071126937866,
"learning_rate": 1.804717498628634e-05,
"loss": 1.6905,
"step": 116500
},
{
"epoch": 64.1799232035107,
"grad_norm": 4.255739688873291,
"learning_rate": 1.7910038398244653e-05,
"loss": 1.663,
"step": 117000
},
{
"epoch": 64.45419637959408,
"grad_norm": 4.027384281158447,
"learning_rate": 1.7772901810202962e-05,
"loss": 1.6603,
"step": 117500
},
{
"epoch": 64.72846955567745,
"grad_norm": 3.8232147693634033,
"learning_rate": 1.7635765222161274e-05,
"loss": 1.6701,
"step": 118000
},
{
"epoch": 65.00274273176083,
"grad_norm": 3.83734130859375,
"learning_rate": 1.7498628634119583e-05,
"loss": 1.6797,
"step": 118500
},
{
"epoch": 65.27701590784422,
"grad_norm": 3.9775922298431396,
"learning_rate": 1.7361492046077896e-05,
"loss": 1.6441,
"step": 119000
},
{
"epoch": 65.55128908392759,
"grad_norm": 3.810086250305176,
"learning_rate": 1.7224355458036205e-05,
"loss": 1.6534,
"step": 119500
},
{
"epoch": 65.82556226001097,
"grad_norm": 4.3292036056518555,
"learning_rate": 1.7087218869994513e-05,
"loss": 1.6651,
"step": 120000
},
{
"epoch": 66.09983543609435,
"grad_norm": 4.046462535858154,
"learning_rate": 1.6950082281952826e-05,
"loss": 1.654,
"step": 120500
},
{
"epoch": 66.37410861217774,
"grad_norm": 4.200257778167725,
"learning_rate": 1.6812945693911135e-05,
"loss": 1.6383,
"step": 121000
},
{
"epoch": 66.6483817882611,
"grad_norm": 3.98045015335083,
"learning_rate": 1.6675809105869447e-05,
"loss": 1.6571,
"step": 121500
},
{
"epoch": 66.92265496434449,
"grad_norm": 3.9323537349700928,
"learning_rate": 1.653867251782776e-05,
"loss": 1.6486,
"step": 122000
},
{
"epoch": 67.19692814042787,
"grad_norm": 3.898150682449341,
"learning_rate": 1.640153592978607e-05,
"loss": 1.6288,
"step": 122500
},
{
"epoch": 67.47120131651124,
"grad_norm": 3.8490869998931885,
"learning_rate": 1.6264399341744377e-05,
"loss": 1.6305,
"step": 123000
},
{
"epoch": 67.74547449259462,
"grad_norm": 4.125833034515381,
"learning_rate": 1.6127262753702686e-05,
"loss": 1.6393,
"step": 123500
},
{
"epoch": 68.01974766867801,
"grad_norm": 4.1837286949157715,
"learning_rate": 1.5990126165661e-05,
"loss": 1.6441,
"step": 124000
},
{
"epoch": 68.29402084476138,
"grad_norm": 4.150059700012207,
"learning_rate": 1.585298957761931e-05,
"loss": 1.6144,
"step": 124500
},
{
"epoch": 68.56829402084476,
"grad_norm": 4.325094223022461,
"learning_rate": 1.571585298957762e-05,
"loss": 1.6254,
"step": 125000
},
{
"epoch": 68.84256719692814,
"grad_norm": 3.9832139015197754,
"learning_rate": 1.5578716401535932e-05,
"loss": 1.636,
"step": 125500
},
{
"epoch": 69.11684037301151,
"grad_norm": 3.9516079425811768,
"learning_rate": 1.544157981349424e-05,
"loss": 1.6183,
"step": 126000
},
{
"epoch": 69.3911135490949,
"grad_norm": 3.982802391052246,
"learning_rate": 1.5304443225452554e-05,
"loss": 1.6116,
"step": 126500
},
{
"epoch": 69.66538672517828,
"grad_norm": 4.178645610809326,
"learning_rate": 1.516730663741086e-05,
"loss": 1.6183,
"step": 127000
},
{
"epoch": 69.93965990126166,
"grad_norm": 4.045616149902344,
"learning_rate": 1.5030170049369171e-05,
"loss": 1.6226,
"step": 127500
},
{
"epoch": 70.21393307734503,
"grad_norm": 4.098151206970215,
"learning_rate": 1.4893033461327482e-05,
"loss": 1.5999,
"step": 128000
},
{
"epoch": 70.48820625342842,
"grad_norm": 4.052021026611328,
"learning_rate": 1.4755896873285793e-05,
"loss": 1.5946,
"step": 128500
},
{
"epoch": 70.7624794295118,
"grad_norm": 3.7580652236938477,
"learning_rate": 1.4618760285244103e-05,
"loss": 1.6172,
"step": 129000
},
{
"epoch": 71.03675260559517,
"grad_norm": 4.1435866355896,
"learning_rate": 1.4481623697202416e-05,
"loss": 1.6121,
"step": 129500
},
{
"epoch": 71.31102578167855,
"grad_norm": 3.739654302597046,
"learning_rate": 1.4344487109160726e-05,
"loss": 1.588,
"step": 130000
},
{
"epoch": 71.58529895776194,
"grad_norm": 4.380291938781738,
"learning_rate": 1.4207350521119034e-05,
"loss": 1.5998,
"step": 130500
},
{
"epoch": 71.8595721338453,
"grad_norm": 3.7885782718658447,
"learning_rate": 1.4070213933077344e-05,
"loss": 1.5958,
"step": 131000
},
{
"epoch": 72.13384530992869,
"grad_norm": 4.236293792724609,
"learning_rate": 1.3933077345035655e-05,
"loss": 1.5956,
"step": 131500
},
{
"epoch": 72.40811848601207,
"grad_norm": 4.205173015594482,
"learning_rate": 1.3795940756993966e-05,
"loss": 1.5766,
"step": 132000
},
{
"epoch": 72.68239166209544,
"grad_norm": 4.034268379211426,
"learning_rate": 1.3658804168952278e-05,
"loss": 1.5912,
"step": 132500
},
{
"epoch": 72.95666483817882,
"grad_norm": 3.9170260429382324,
"learning_rate": 1.3521667580910589e-05,
"loss": 1.5897,
"step": 133000
},
{
"epoch": 73.23093801426221,
"grad_norm": 3.925799608230591,
"learning_rate": 1.33845309928689e-05,
"loss": 1.5765,
"step": 133500
},
{
"epoch": 73.50521119034559,
"grad_norm": 4.052227973937988,
"learning_rate": 1.324739440482721e-05,
"loss": 1.5775,
"step": 134000
},
{
"epoch": 73.77948436642896,
"grad_norm": 4.2378034591674805,
"learning_rate": 1.3110257816785517e-05,
"loss": 1.5834,
"step": 134500
},
{
"epoch": 74.05375754251234,
"grad_norm": 4.073320388793945,
"learning_rate": 1.297312122874383e-05,
"loss": 1.5812,
"step": 135000
},
{
"epoch": 74.32803071859573,
"grad_norm": 4.102873802185059,
"learning_rate": 1.283598464070214e-05,
"loss": 1.5588,
"step": 135500
},
{
"epoch": 74.6023038946791,
"grad_norm": 4.223252773284912,
"learning_rate": 1.269884805266045e-05,
"loss": 1.5785,
"step": 136000
},
{
"epoch": 74.87657707076248,
"grad_norm": 4.320130825042725,
"learning_rate": 1.2561711464618761e-05,
"loss": 1.5686,
"step": 136500
},
{
"epoch": 75.15085024684586,
"grad_norm": 4.706448078155518,
"learning_rate": 1.2424574876577072e-05,
"loss": 1.5688,
"step": 137000
},
{
"epoch": 75.42512342292923,
"grad_norm": 4.024387359619141,
"learning_rate": 1.2287438288535381e-05,
"loss": 1.5563,
"step": 137500
},
{
"epoch": 75.69939659901262,
"grad_norm": 3.9221880435943604,
"learning_rate": 1.2150301700493692e-05,
"loss": 1.5712,
"step": 138000
},
{
"epoch": 75.973669775096,
"grad_norm": 4.27291202545166,
"learning_rate": 1.2013165112452002e-05,
"loss": 1.5654,
"step": 138500
},
{
"epoch": 76.24794295117937,
"grad_norm": 4.373564720153809,
"learning_rate": 1.1876028524410313e-05,
"loss": 1.5449,
"step": 139000
},
{
"epoch": 76.52221612726275,
"grad_norm": 4.030310153961182,
"learning_rate": 1.1738891936368624e-05,
"loss": 1.5571,
"step": 139500
},
{
"epoch": 76.79648930334614,
"grad_norm": 4.002580165863037,
"learning_rate": 1.1601755348326934e-05,
"loss": 1.5568,
"step": 140000
},
{
"epoch": 77.07076247942952,
"grad_norm": 4.0623369216918945,
"learning_rate": 1.1464618760285245e-05,
"loss": 1.5545,
"step": 140500
},
{
"epoch": 77.34503565551289,
"grad_norm": 4.049304485321045,
"learning_rate": 1.1327482172243554e-05,
"loss": 1.5442,
"step": 141000
},
{
"epoch": 77.61930883159627,
"grad_norm": 3.891969680786133,
"learning_rate": 1.1190345584201866e-05,
"loss": 1.5464,
"step": 141500
},
{
"epoch": 77.89358200767965,
"grad_norm": 4.165316104888916,
"learning_rate": 1.1053208996160177e-05,
"loss": 1.5509,
"step": 142000
},
{
"epoch": 78.16785518376302,
"grad_norm": 4.1472249031066895,
"learning_rate": 1.0916072408118487e-05,
"loss": 1.5375,
"step": 142500
},
{
"epoch": 78.44212835984641,
"grad_norm": 4.173414707183838,
"learning_rate": 1.0778935820076796e-05,
"loss": 1.5345,
"step": 143000
},
{
"epoch": 78.71640153592979,
"grad_norm": 3.9279398918151855,
"learning_rate": 1.0641799232035107e-05,
"loss": 1.5381,
"step": 143500
},
{
"epoch": 78.99067471201316,
"grad_norm": 4.222446441650391,
"learning_rate": 1.0504662643993418e-05,
"loss": 1.552,
"step": 144000
},
{
"epoch": 79.26494788809654,
"grad_norm": 3.8020248413085938,
"learning_rate": 1.0367526055951728e-05,
"loss": 1.5217,
"step": 144500
},
{
"epoch": 79.53922106417993,
"grad_norm": 3.8790934085845947,
"learning_rate": 1.0230389467910039e-05,
"loss": 1.5346,
"step": 145000
},
{
"epoch": 79.8134942402633,
"grad_norm": 4.49297571182251,
"learning_rate": 1.009325287986835e-05,
"loss": 1.5354,
"step": 145500
},
{
"epoch": 80.08776741634668,
"grad_norm": 4.024161338806152,
"learning_rate": 9.95611629182666e-06,
"loss": 1.5326,
"step": 146000
},
{
"epoch": 80.36204059243006,
"grad_norm": 3.997326374053955,
"learning_rate": 9.818979703784971e-06,
"loss": 1.5133,
"step": 146500
},
{
"epoch": 80.63631376851345,
"grad_norm": 4.163906574249268,
"learning_rate": 9.68184311574328e-06,
"loss": 1.5252,
"step": 147000
},
{
"epoch": 80.91058694459682,
"grad_norm": 4.333358287811279,
"learning_rate": 9.54470652770159e-06,
"loss": 1.5356,
"step": 147500
},
{
"epoch": 81.1848601206802,
"grad_norm": 4.201995372772217,
"learning_rate": 9.407569939659903e-06,
"loss": 1.5191,
"step": 148000
},
{
"epoch": 81.45913329676358,
"grad_norm": 3.8196020126342773,
"learning_rate": 9.270433351618212e-06,
"loss": 1.5168,
"step": 148500
},
{
"epoch": 81.73340647284695,
"grad_norm": 4.32403039932251,
"learning_rate": 9.133296763576522e-06,
"loss": 1.5214,
"step": 149000
},
{
"epoch": 82.00767964893033,
"grad_norm": 4.165477752685547,
"learning_rate": 8.996160175534833e-06,
"loss": 1.5242,
"step": 149500
},
{
"epoch": 82.28195282501372,
"grad_norm": 4.155007362365723,
"learning_rate": 8.859023587493144e-06,
"loss": 1.5022,
"step": 150000
},
{
"epoch": 82.55622600109709,
"grad_norm": 4.049638748168945,
"learning_rate": 8.721886999451453e-06,
"loss": 1.5172,
"step": 150500
},
{
"epoch": 82.83049917718047,
"grad_norm": 4.376342296600342,
"learning_rate": 8.584750411409765e-06,
"loss": 1.5097,
"step": 151000
},
{
"epoch": 83.10477235326385,
"grad_norm": 4.413540363311768,
"learning_rate": 8.447613823368076e-06,
"loss": 1.5163,
"step": 151500
},
{
"epoch": 83.37904552934722,
"grad_norm": 3.754805326461792,
"learning_rate": 8.310477235326386e-06,
"loss": 1.5011,
"step": 152000
},
{
"epoch": 83.6533187054306,
"grad_norm": 4.167300224304199,
"learning_rate": 8.173340647284695e-06,
"loss": 1.5026,
"step": 152500
},
{
"epoch": 83.92759188151399,
"grad_norm": 4.1614861488342285,
"learning_rate": 8.036204059243006e-06,
"loss": 1.5101,
"step": 153000
},
{
"epoch": 84.20186505759737,
"grad_norm": 4.183162212371826,
"learning_rate": 7.899067471201317e-06,
"loss": 1.4981,
"step": 153500
},
{
"epoch": 84.47613823368074,
"grad_norm": 4.0559539794921875,
"learning_rate": 7.761930883159627e-06,
"loss": 1.4965,
"step": 154000
},
{
"epoch": 84.75041140976413,
"grad_norm": 4.252512454986572,
"learning_rate": 7.624794295117937e-06,
"loss": 1.5019,
"step": 154500
},
{
"epoch": 85.02468458584751,
"grad_norm": 3.9115328788757324,
"learning_rate": 7.4876577070762485e-06,
"loss": 1.5138,
"step": 155000
},
{
"epoch": 85.29895776193088,
"grad_norm": 4.217545032501221,
"learning_rate": 7.350521119034559e-06,
"loss": 1.492,
"step": 155500
},
{
"epoch": 85.57323093801426,
"grad_norm": 3.974954128265381,
"learning_rate": 7.21338453099287e-06,
"loss": 1.4897,
"step": 156000
},
{
"epoch": 85.84750411409765,
"grad_norm": 4.266519069671631,
"learning_rate": 7.07624794295118e-06,
"loss": 1.4996,
"step": 156500
},
{
"epoch": 86.12177729018102,
"grad_norm": 3.8507697582244873,
"learning_rate": 6.93911135490949e-06,
"loss": 1.4891,
"step": 157000
},
{
"epoch": 86.3960504662644,
"grad_norm": 4.050006866455078,
"learning_rate": 6.801974766867801e-06,
"loss": 1.4848,
"step": 157500
},
{
"epoch": 86.67032364234778,
"grad_norm": 4.006500720977783,
"learning_rate": 6.664838178826111e-06,
"loss": 1.4946,
"step": 158000
},
{
"epoch": 86.94459681843115,
"grad_norm": 4.2527289390563965,
"learning_rate": 6.527701590784421e-06,
"loss": 1.495,
"step": 158500
},
{
"epoch": 87.21886999451453,
"grad_norm": 4.087696552276611,
"learning_rate": 6.390565002742732e-06,
"loss": 1.4834,
"step": 159000
},
{
"epoch": 87.49314317059792,
"grad_norm": 3.9683475494384766,
"learning_rate": 6.253428414701043e-06,
"loss": 1.48,
"step": 159500
},
{
"epoch": 87.7674163466813,
"grad_norm": 4.009182453155518,
"learning_rate": 6.116291826659353e-06,
"loss": 1.4799,
"step": 160000
},
{
"epoch": 88.04168952276467,
"grad_norm": 3.9172310829162598,
"learning_rate": 5.979155238617663e-06,
"loss": 1.4827,
"step": 160500
},
{
"epoch": 88.31596269884805,
"grad_norm": 3.920940399169922,
"learning_rate": 5.842018650575974e-06,
"loss": 1.4772,
"step": 161000
},
{
"epoch": 88.59023587493144,
"grad_norm": 4.178516387939453,
"learning_rate": 5.704882062534284e-06,
"loss": 1.4831,
"step": 161500
},
{
"epoch": 88.8645090510148,
"grad_norm": 4.068806171417236,
"learning_rate": 5.567745474492595e-06,
"loss": 1.4796,
"step": 162000
},
{
"epoch": 89.13878222709819,
"grad_norm": 3.923023223876953,
"learning_rate": 5.430608886450905e-06,
"loss": 1.4734,
"step": 162500
},
{
"epoch": 89.41305540318157,
"grad_norm": 4.0538411140441895,
"learning_rate": 5.293472298409216e-06,
"loss": 1.4675,
"step": 163000
},
{
"epoch": 89.68732857926494,
"grad_norm": 4.289505481719971,
"learning_rate": 5.156335710367526e-06,
"loss": 1.4812,
"step": 163500
},
{
"epoch": 89.96160175534833,
"grad_norm": 4.2184247970581055,
"learning_rate": 5.019199122325837e-06,
"loss": 1.4761,
"step": 164000
},
{
"epoch": 90.23587493143171,
"grad_norm": 4.014777183532715,
"learning_rate": 4.8820625342841474e-06,
"loss": 1.4659,
"step": 164500
},
{
"epoch": 90.51014810751508,
"grad_norm": 4.025433540344238,
"learning_rate": 4.744925946242457e-06,
"loss": 1.4701,
"step": 165000
},
{
"epoch": 90.78442128359846,
"grad_norm": 4.117000102996826,
"learning_rate": 4.607789358200768e-06,
"loss": 1.4745,
"step": 165500
},
{
"epoch": 91.05869445968185,
"grad_norm": 4.047626495361328,
"learning_rate": 4.4706527701590785e-06,
"loss": 1.4693,
"step": 166000
},
{
"epoch": 91.33296763576523,
"grad_norm": 4.17887020111084,
"learning_rate": 4.333516182117389e-06,
"loss": 1.4624,
"step": 166500
},
{
"epoch": 91.6072408118486,
"grad_norm": 4.2437639236450195,
"learning_rate": 4.196379594075699e-06,
"loss": 1.4665,
"step": 167000
},
{
"epoch": 91.88151398793198,
"grad_norm": 3.7711315155029297,
"learning_rate": 4.0592430060340105e-06,
"loss": 1.4731,
"step": 167500
},
{
"epoch": 92.15578716401536,
"grad_norm": 4.002791404724121,
"learning_rate": 3.92210641799232e-06,
"loss": 1.4642,
"step": 168000
},
{
"epoch": 92.43006034009873,
"grad_norm": 4.0743231773376465,
"learning_rate": 3.7849698299506313e-06,
"loss": 1.4584,
"step": 168500
},
{
"epoch": 92.70433351618212,
"grad_norm": 4.080685138702393,
"learning_rate": 3.647833241908941e-06,
"loss": 1.4623,
"step": 169000
},
{
"epoch": 92.9786066922655,
"grad_norm": 4.304593563079834,
"learning_rate": 3.510696653867252e-06,
"loss": 1.4682,
"step": 169500
},
{
"epoch": 93.25287986834887,
"grad_norm": 4.447428226470947,
"learning_rate": 3.3735600658255624e-06,
"loss": 1.4535,
"step": 170000
},
{
"epoch": 93.52715304443225,
"grad_norm": 4.22756814956665,
"learning_rate": 3.236423477783873e-06,
"loss": 1.4592,
"step": 170500
},
{
"epoch": 93.80142622051564,
"grad_norm": 4.293380260467529,
"learning_rate": 3.0992868897421833e-06,
"loss": 1.4632,
"step": 171000
},
{
"epoch": 94.075699396599,
"grad_norm": 4.07041072845459,
"learning_rate": 2.962150301700494e-06,
"loss": 1.4605,
"step": 171500
},
{
"epoch": 94.34997257268239,
"grad_norm": 4.039161205291748,
"learning_rate": 2.825013713658804e-06,
"loss": 1.4551,
"step": 172000
},
{
"epoch": 94.62424574876577,
"grad_norm": 4.1246795654296875,
"learning_rate": 2.687877125617115e-06,
"loss": 1.456,
"step": 172500
},
{
"epoch": 94.89851892484916,
"grad_norm": 4.026761054992676,
"learning_rate": 2.550740537575425e-06,
"loss": 1.4512,
"step": 173000
},
{
"epoch": 95.17279210093253,
"grad_norm": 4.5864715576171875,
"learning_rate": 2.4136039495337357e-06,
"loss": 1.4575,
"step": 173500
},
{
"epoch": 95.44706527701591,
"grad_norm": 4.117992401123047,
"learning_rate": 2.2764673614920463e-06,
"loss": 1.4475,
"step": 174000
},
{
"epoch": 95.72133845309929,
"grad_norm": 4.155096530914307,
"learning_rate": 2.1393307734503565e-06,
"loss": 1.4581,
"step": 174500
},
{
"epoch": 95.99561162918266,
"grad_norm": 4.28767204284668,
"learning_rate": 2.002194185408667e-06,
"loss": 1.4521,
"step": 175000
},
{
"epoch": 96.26988480526605,
"grad_norm": 4.1511077880859375,
"learning_rate": 1.8650575973669776e-06,
"loss": 1.4488,
"step": 175500
},
{
"epoch": 96.54415798134943,
"grad_norm": 4.336985111236572,
"learning_rate": 1.727921009325288e-06,
"loss": 1.4534,
"step": 176000
},
{
"epoch": 96.8184311574328,
"grad_norm": 4.181045055389404,
"learning_rate": 1.5907844212835987e-06,
"loss": 1.45,
"step": 176500
},
{
"epoch": 97.09270433351618,
"grad_norm": 4.217624187469482,
"learning_rate": 1.453647833241909e-06,
"loss": 1.4498,
"step": 177000
},
{
"epoch": 97.36697750959956,
"grad_norm": 3.8873023986816406,
"learning_rate": 1.3165112452002194e-06,
"loss": 1.4507,
"step": 177500
},
{
"epoch": 97.64125068568293,
"grad_norm": 4.3951191902160645,
"learning_rate": 1.17937465715853e-06,
"loss": 1.451,
"step": 178000
},
{
"epoch": 97.91552386176632,
"grad_norm": 4.1204118728637695,
"learning_rate": 1.0422380691168404e-06,
"loss": 1.4484,
"step": 178500
},
{
"epoch": 98.1897970378497,
"grad_norm": 4.278495788574219,
"learning_rate": 9.05101481075151e-07,
"loss": 1.4431,
"step": 179000
},
{
"epoch": 98.46407021393308,
"grad_norm": 4.186399459838867,
"learning_rate": 7.679648930334613e-07,
"loss": 1.4493,
"step": 179500
},
{
"epoch": 98.73834339001645,
"grad_norm": 4.110637187957764,
"learning_rate": 6.308283049917719e-07,
"loss": 1.4418,
"step": 180000
},
{
"epoch": 99.01261656609984,
"grad_norm": 3.9559993743896484,
"learning_rate": 4.936917169500823e-07,
"loss": 1.4407,
"step": 180500
},
{
"epoch": 99.28688974218322,
"grad_norm": 4.4722418785095215,
"learning_rate": 3.565551289083928e-07,
"loss": 1.4421,
"step": 181000
},
{
"epoch": 99.56116291826659,
"grad_norm": 4.151792526245117,
"learning_rate": 2.1941854086670326e-07,
"loss": 1.4469,
"step": 181500
},
{
"epoch": 99.83543609434997,
"grad_norm": 4.128389835357666,
"learning_rate": 8.228195282501371e-08,
"loss": 1.4463,
"step": 182000
},
{
"epoch": 100.0,
"step": 182300,
"total_flos": 3.157662139522744e+17,
"train_loss": 2.261507166926833,
"train_runtime": 62730.7046,
"train_samples_per_second": 185.96,
"train_steps_per_second": 2.906
}
],
"logging_steps": 500,
"max_steps": 182300,
"num_input_tokens_seen": 0,
"num_train_epochs": 100,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.157662139522744e+17,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}
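
The JSON above is written automatically by the Hugging Face Transformers Trainer at the end of training. As a quick way to inspect it, the short Python sketch below loads the file, summarizes the logged loss curve, and checks that the logged learning rates follow a plain linear decay to zero over max_steps. The file path, the 5e-5 base learning rate, and the numeric tolerance are assumptions inferred from the log itself rather than anything stored in the checkpoint.

# Minimal sketch (not part of the original checkpoint): parse this
# trainer_state.json and summarize the training run. Assumes the file
# sits in the current directory under its usual name; adjust the path
# as needed.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Every entry in log_history except the final summary carries
# "step", "loss", "learning_rate", and "grad_norm".
history = [h for h in state["log_history"] if "loss" in h]

steps = [h["step"] for h in history]
losses = [h["loss"] for h in history]
lrs = [h["learning_rate"] for h in history]

print(f"logged points   : {len(history)}")
print(f"first loss      : {losses[0]:.4f} at step {steps[0]}")
print(f"last logged loss: {losses[-1]:.4f} at step {steps[-1]}")
print(f"train_loss      : {state['log_history'][-1]['train_loss']:.4f} (average over training)")

# The logged learning rates match a linear decay from 5e-5 to 0 over
# max_steps = 182300; the check below confirms this against the log.
max_steps = state["max_steps"]
base_lr = 5e-5  # assumption: the configured initial learning rate
for s, lr in zip(steps, lrs):
    expected = base_lr * (1 - s / max_steps)
    assert abs(expected - lr) < 1e-9, (s, lr, expected)
print("learning-rate schedule: linear decay to 0 over max_steps")

Passing the collected steps and losses to a plotting library such as matplotlib reproduces the training curve; the sketch prints a textual summary only so that it stays dependency-free.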