{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 87414,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005719907566293729,
"grad_norm": 7.0386528968811035,
"learning_rate": 4.971400462168532e-05,
"loss": 7.2889,
"step": 500
},
{
"epoch": 0.011439815132587457,
"grad_norm": 6.592615604400635,
"learning_rate": 4.942800924337063e-05,
"loss": 5.389,
"step": 1000
},
{
"epoch": 0.017159722698881188,
"grad_norm": 6.088059425354004,
"learning_rate": 4.914201386505594e-05,
"loss": 4.8994,
"step": 1500
},
{
"epoch": 0.022879630265174915,
"grad_norm": 6.355687618255615,
"learning_rate": 4.885601848674126e-05,
"loss": 4.6219,
"step": 2000
},
{
"epoch": 0.028599537831468645,
"grad_norm": 6.1159186363220215,
"learning_rate": 4.8570023108426574e-05,
"loss": 4.4233,
"step": 2500
},
{
"epoch": 0.034319445397762376,
"grad_norm": 6.636866092681885,
"learning_rate": 4.828402773011189e-05,
"loss": 4.2802,
"step": 3000
},
{
"epoch": 0.0400393529640561,
"grad_norm": 5.87605619430542,
"learning_rate": 4.7998032351797196e-05,
"loss": 4.1453,
"step": 3500
},
{
"epoch": 0.04575926053034983,
"grad_norm": 6.0966691970825195,
"learning_rate": 4.771203697348251e-05,
"loss": 4.0822,
"step": 4000
},
{
"epoch": 0.05147916809664356,
"grad_norm": 5.99679708480835,
"learning_rate": 4.7426041595167824e-05,
"loss": 3.9941,
"step": 4500
},
{
"epoch": 0.05719907566293729,
"grad_norm": 5.738614082336426,
"learning_rate": 4.714004621685314e-05,
"loss": 3.9104,
"step": 5000
},
{
"epoch": 0.06291898322923102,
"grad_norm": 6.038717746734619,
"learning_rate": 4.6854050838538446e-05,
"loss": 3.8528,
"step": 5500
},
{
"epoch": 0.06863889079552475,
"grad_norm": 5.92971134185791,
"learning_rate": 4.656805546022377e-05,
"loss": 3.78,
"step": 6000
},
{
"epoch": 0.07435879836181847,
"grad_norm": 5.791173458099365,
"learning_rate": 4.628206008190908e-05,
"loss": 3.7407,
"step": 6500
},
{
"epoch": 0.0800787059281122,
"grad_norm": 6.2113566398620605,
"learning_rate": 4.5996064703594396e-05,
"loss": 3.6834,
"step": 7000
},
{
"epoch": 0.08579861349440593,
"grad_norm": 5.970207214355469,
"learning_rate": 4.57100693252797e-05,
"loss": 3.663,
"step": 7500
},
{
"epoch": 0.09151852106069966,
"grad_norm": 5.8883514404296875,
"learning_rate": 4.542407394696502e-05,
"loss": 3.6161,
"step": 8000
},
{
"epoch": 0.09723842862699339,
"grad_norm": 6.351154804229736,
"learning_rate": 4.513807856865033e-05,
"loss": 3.6193,
"step": 8500
},
{
"epoch": 0.10295833619328712,
"grad_norm": 5.69071102142334,
"learning_rate": 4.4852083190335646e-05,
"loss": 3.545,
"step": 9000
},
{
"epoch": 0.10867824375958085,
"grad_norm": 5.988426208496094,
"learning_rate": 4.456608781202096e-05,
"loss": 3.5088,
"step": 9500
},
{
"epoch": 0.11439815132587458,
"grad_norm": 6.251087665557861,
"learning_rate": 4.4280092433706274e-05,
"loss": 3.5158,
"step": 10000
},
{
"epoch": 0.1201180588921683,
"grad_norm": 5.840632438659668,
"learning_rate": 4.399409705539159e-05,
"loss": 3.5045,
"step": 10500
},
{
"epoch": 0.12583796645846204,
"grad_norm": 5.9697723388671875,
"learning_rate": 4.37081016770769e-05,
"loss": 3.4466,
"step": 11000
},
{
"epoch": 0.13155787402475577,
"grad_norm": 6.149275779724121,
"learning_rate": 4.342210629876222e-05,
"loss": 3.4544,
"step": 11500
},
{
"epoch": 0.1372777815910495,
"grad_norm": 5.605130672454834,
"learning_rate": 4.3136110920447525e-05,
"loss": 3.4116,
"step": 12000
},
{
"epoch": 0.1429976891573432,
"grad_norm": 5.5137152671813965,
"learning_rate": 4.285011554213284e-05,
"loss": 3.3919,
"step": 12500
},
{
"epoch": 0.14871759672363694,
"grad_norm": 5.848428726196289,
"learning_rate": 4.256412016381815e-05,
"loss": 3.3695,
"step": 13000
},
{
"epoch": 0.15443750428993067,
"grad_norm": 5.794093132019043,
"learning_rate": 4.227812478550347e-05,
"loss": 3.3509,
"step": 13500
},
{
"epoch": 0.1601574118562244,
"grad_norm": 5.879131317138672,
"learning_rate": 4.199212940718878e-05,
"loss": 3.3551,
"step": 14000
},
{
"epoch": 0.16587731942251813,
"grad_norm": 5.5501179695129395,
"learning_rate": 4.1706134028874096e-05,
"loss": 3.3327,
"step": 14500
},
{
"epoch": 0.17159722698881186,
"grad_norm": 5.30273962020874,
"learning_rate": 4.142013865055941e-05,
"loss": 3.3045,
"step": 15000
},
{
"epoch": 0.1773171345551056,
"grad_norm": 6.049214839935303,
"learning_rate": 4.1134143272244725e-05,
"loss": 3.2868,
"step": 15500
},
{
"epoch": 0.18303704212139932,
"grad_norm": 5.499197483062744,
"learning_rate": 4.084814789393003e-05,
"loss": 3.2837,
"step": 16000
},
{
"epoch": 0.18875694968769305,
"grad_norm": 5.448641777038574,
"learning_rate": 4.0562152515615347e-05,
"loss": 3.2751,
"step": 16500
},
{
"epoch": 0.19447685725398678,
"grad_norm": 5.422529697418213,
"learning_rate": 4.027615713730066e-05,
"loss": 3.2585,
"step": 17000
},
{
"epoch": 0.2001967648202805,
"grad_norm": 5.639166831970215,
"learning_rate": 3.9990161758985975e-05,
"loss": 3.2587,
"step": 17500
},
{
"epoch": 0.20591667238657424,
"grad_norm": 5.255904674530029,
"learning_rate": 3.970416638067129e-05,
"loss": 3.2505,
"step": 18000
},
{
"epoch": 0.21163657995286797,
"grad_norm": 5.453869819641113,
"learning_rate": 3.9418171002356604e-05,
"loss": 3.2382,
"step": 18500
},
{
"epoch": 0.2173564875191617,
"grad_norm": 5.905749797821045,
"learning_rate": 3.913217562404192e-05,
"loss": 3.2309,
"step": 19000
},
{
"epoch": 0.22307639508545543,
"grad_norm": 5.28553581237793,
"learning_rate": 3.884618024572723e-05,
"loss": 3.2308,
"step": 19500
},
{
"epoch": 0.22879630265174916,
"grad_norm": 5.1942830085754395,
"learning_rate": 3.8560184867412547e-05,
"loss": 3.2163,
"step": 20000
},
{
"epoch": 0.23451621021804286,
"grad_norm": 6.12723970413208,
"learning_rate": 3.8274189489097854e-05,
"loss": 3.2082,
"step": 20500
},
{
"epoch": 0.2402361177843366,
"grad_norm": 5.664548873901367,
"learning_rate": 3.798819411078317e-05,
"loss": 3.1802,
"step": 21000
},
{
"epoch": 0.24595602535063033,
"grad_norm": 5.903208255767822,
"learning_rate": 3.770219873246848e-05,
"loss": 3.1946,
"step": 21500
},
{
"epoch": 0.2516759329169241,
"grad_norm": 5.729937553405762,
"learning_rate": 3.7416203354153804e-05,
"loss": 3.1646,
"step": 22000
},
{
"epoch": 0.2573958404832178,
"grad_norm": 6.068752765655518,
"learning_rate": 3.713020797583911e-05,
"loss": 3.2006,
"step": 22500
},
{
"epoch": 0.26311574804951154,
"grad_norm": 5.97099494934082,
"learning_rate": 3.6844212597524425e-05,
"loss": 3.1437,
"step": 23000
},
{
"epoch": 0.26883565561580525,
"grad_norm": 5.777164936065674,
"learning_rate": 3.655821721920974e-05,
"loss": 3.1407,
"step": 23500
},
{
"epoch": 0.274555563182099,
"grad_norm": 4.982606887817383,
"learning_rate": 3.6272221840895054e-05,
"loss": 3.1282,
"step": 24000
},
{
"epoch": 0.2802754707483927,
"grad_norm": 5.495816230773926,
"learning_rate": 3.598622646258037e-05,
"loss": 3.1434,
"step": 24500
},
{
"epoch": 0.2859953783146864,
"grad_norm": 5.898298263549805,
"learning_rate": 3.5700231084265676e-05,
"loss": 3.1446,
"step": 25000
},
{
"epoch": 0.29171528588098017,
"grad_norm": 5.729229927062988,
"learning_rate": 3.541423570595099e-05,
"loss": 3.1254,
"step": 25500
},
{
"epoch": 0.29743519344727387,
"grad_norm": 6.3333821296691895,
"learning_rate": 3.512824032763631e-05,
"loss": 3.1543,
"step": 26000
},
{
"epoch": 0.30315510101356763,
"grad_norm": 6.0027756690979,
"learning_rate": 3.4842244949321625e-05,
"loss": 3.1379,
"step": 26500
},
{
"epoch": 0.30887500857986133,
"grad_norm": 5.95717716217041,
"learning_rate": 3.455624957100693e-05,
"loss": 3.1188,
"step": 27000
},
{
"epoch": 0.3145949161461551,
"grad_norm": 6.262216567993164,
"learning_rate": 3.427025419269225e-05,
"loss": 3.1096,
"step": 27500
},
{
"epoch": 0.3203148237124488,
"grad_norm": 6.436416149139404,
"learning_rate": 3.398425881437756e-05,
"loss": 3.1294,
"step": 28000
},
{
"epoch": 0.32603473127874255,
"grad_norm": 5.524046421051025,
"learning_rate": 3.3698263436062876e-05,
"loss": 3.0965,
"step": 28500
},
{
"epoch": 0.33175463884503625,
"grad_norm": 5.237400531768799,
"learning_rate": 3.341226805774818e-05,
"loss": 3.0826,
"step": 29000
},
{
"epoch": 0.33747454641133,
"grad_norm": 5.551352500915527,
"learning_rate": 3.3126272679433504e-05,
"loss": 3.0666,
"step": 29500
},
{
"epoch": 0.3431944539776237,
"grad_norm": 5.407064914703369,
"learning_rate": 3.284027730111882e-05,
"loss": 3.0738,
"step": 30000
},
{
"epoch": 0.3489143615439175,
"grad_norm": 5.428358554840088,
"learning_rate": 3.255428192280413e-05,
"loss": 3.0628,
"step": 30500
},
{
"epoch": 0.3546342691102112,
"grad_norm": 6.178744792938232,
"learning_rate": 3.226828654448944e-05,
"loss": 3.0683,
"step": 31000
},
{
"epoch": 0.36035417667650493,
"grad_norm": 5.695249080657959,
"learning_rate": 3.1982291166174755e-05,
"loss": 3.0732,
"step": 31500
},
{
"epoch": 0.36607408424279864,
"grad_norm": 5.676379203796387,
"learning_rate": 3.169629578786007e-05,
"loss": 3.0531,
"step": 32000
},
{
"epoch": 0.37179399180909234,
"grad_norm": 5.420720100402832,
"learning_rate": 3.141030040954538e-05,
"loss": 3.0472,
"step": 32500
},
{
"epoch": 0.3775138993753861,
"grad_norm": 5.6645379066467285,
"learning_rate": 3.11243050312307e-05,
"loss": 3.0391,
"step": 33000
},
{
"epoch": 0.3832338069416798,
"grad_norm": 6.123884201049805,
"learning_rate": 3.083830965291601e-05,
"loss": 3.0527,
"step": 33500
},
{
"epoch": 0.38895371450797356,
"grad_norm": 5.331460475921631,
"learning_rate": 3.0552314274601326e-05,
"loss": 3.049,
"step": 34000
},
{
"epoch": 0.39467362207426726,
"grad_norm": 6.356675624847412,
"learning_rate": 3.0266318896286637e-05,
"loss": 3.0416,
"step": 34500
},
{
"epoch": 0.400393529640561,
"grad_norm": 6.067440986633301,
"learning_rate": 2.9980323517971955e-05,
"loss": 3.0303,
"step": 35000
},
{
"epoch": 0.4061134372068547,
"grad_norm": 5.3762030601501465,
"learning_rate": 2.9694328139657262e-05,
"loss": 3.0363,
"step": 35500
},
{
"epoch": 0.4118333447731485,
"grad_norm": 5.837817192077637,
"learning_rate": 2.9408332761342576e-05,
"loss": 3.0284,
"step": 36000
},
{
"epoch": 0.4175532523394422,
"grad_norm": 5.484166622161865,
"learning_rate": 2.912233738302789e-05,
"loss": 2.9946,
"step": 36500
},
{
"epoch": 0.42327315990573594,
"grad_norm": 5.776547908782959,
"learning_rate": 2.883634200471321e-05,
"loss": 3.037,
"step": 37000
},
{
"epoch": 0.42899306747202964,
"grad_norm": 5.481433868408203,
"learning_rate": 2.8550346626398516e-05,
"loss": 3.0289,
"step": 37500
},
{
"epoch": 0.4347129750383234,
"grad_norm": 5.784084320068359,
"learning_rate": 2.826435124808383e-05,
"loss": 2.9928,
"step": 38000
},
{
"epoch": 0.4404328826046171,
"grad_norm": 5.899621486663818,
"learning_rate": 2.7978355869769148e-05,
"loss": 3.0009,
"step": 38500
},
{
"epoch": 0.44615279017091086,
"grad_norm": 5.488452434539795,
"learning_rate": 2.7692360491454462e-05,
"loss": 3.0099,
"step": 39000
},
{
"epoch": 0.45187269773720457,
"grad_norm": 5.848759174346924,
"learning_rate": 2.740636511313977e-05,
"loss": 3.0009,
"step": 39500
},
{
"epoch": 0.4575926053034983,
"grad_norm": 5.612068176269531,
"learning_rate": 2.7120369734825084e-05,
"loss": 2.9828,
"step": 40000
},
{
"epoch": 0.463312512869792,
"grad_norm": 5.79826021194458,
"learning_rate": 2.68343743565104e-05,
"loss": 3.0152,
"step": 40500
},
{
"epoch": 0.46903242043608573,
"grad_norm": 5.842123508453369,
"learning_rate": 2.6548378978195716e-05,
"loss": 2.9719,
"step": 41000
},
{
"epoch": 0.4747523280023795,
"grad_norm": 5.69782018661499,
"learning_rate": 2.626238359988103e-05,
"loss": 2.975,
"step": 41500
},
{
"epoch": 0.4804722355686732,
"grad_norm": 6.189919948577881,
"learning_rate": 2.5976388221566338e-05,
"loss": 2.9709,
"step": 42000
},
{
"epoch": 0.48619214313496695,
"grad_norm": 5.761579513549805,
"learning_rate": 2.5690392843251655e-05,
"loss": 2.9583,
"step": 42500
},
{
"epoch": 0.49191205070126065,
"grad_norm": 6.164900779724121,
"learning_rate": 2.540439746493697e-05,
"loss": 2.9742,
"step": 43000
},
{
"epoch": 0.4976319582675544,
"grad_norm": 5.3809285163879395,
"learning_rate": 2.5118402086622284e-05,
"loss": 2.9849,
"step": 43500
},
{
"epoch": 0.5033518658338482,
"grad_norm": 5.787545680999756,
"learning_rate": 2.4832406708307595e-05,
"loss": 2.963,
"step": 44000
},
{
"epoch": 0.5090717734001419,
"grad_norm": 5.825649261474609,
"learning_rate": 2.454641132999291e-05,
"loss": 2.9634,
"step": 44500
},
{
"epoch": 0.5147916809664356,
"grad_norm": 5.936666488647461,
"learning_rate": 2.4260415951678223e-05,
"loss": 2.9678,
"step": 45000
},
{
"epoch": 0.5205115885327293,
"grad_norm": 5.980503082275391,
"learning_rate": 2.3974420573363534e-05,
"loss": 2.9675,
"step": 45500
},
{
"epoch": 0.5262314960990231,
"grad_norm": 5.755555629730225,
"learning_rate": 2.368842519504885e-05,
"loss": 2.9597,
"step": 46000
},
{
"epoch": 0.5319514036653168,
"grad_norm": 5.1978936195373535,
"learning_rate": 2.3402429816734163e-05,
"loss": 2.947,
"step": 46500
},
{
"epoch": 0.5376713112316105,
"grad_norm": 5.265974521636963,
"learning_rate": 2.3116434438419477e-05,
"loss": 2.9521,
"step": 47000
},
{
"epoch": 0.5433912187979042,
"grad_norm": 6.028165340423584,
"learning_rate": 2.2830439060104788e-05,
"loss": 2.9579,
"step": 47500
},
{
"epoch": 0.549111126364198,
"grad_norm": 5.533000946044922,
"learning_rate": 2.2544443681790102e-05,
"loss": 2.9541,
"step": 48000
},
{
"epoch": 0.5548310339304917,
"grad_norm": 5.428481101989746,
"learning_rate": 2.225844830347542e-05,
"loss": 2.9532,
"step": 48500
},
{
"epoch": 0.5605509414967854,
"grad_norm": 5.905336856842041,
"learning_rate": 2.197245292516073e-05,
"loss": 2.94,
"step": 49000
},
{
"epoch": 0.5662708490630791,
"grad_norm": 6.032477855682373,
"learning_rate": 2.1686457546846045e-05,
"loss": 2.9435,
"step": 49500
},
{
"epoch": 0.5719907566293728,
"grad_norm": 5.996410369873047,
"learning_rate": 2.1400462168531356e-05,
"loss": 2.9358,
"step": 50000
},
{
"epoch": 0.5777106641956666,
"grad_norm": 5.634001731872559,
"learning_rate": 2.1114466790216674e-05,
"loss": 2.9361,
"step": 50500
},
{
"epoch": 0.5834305717619603,
"grad_norm": 5.509332656860352,
"learning_rate": 2.0828471411901985e-05,
"loss": 2.9314,
"step": 51000
},
{
"epoch": 0.589150479328254,
"grad_norm": 6.294771194458008,
"learning_rate": 2.05424760335873e-05,
"loss": 2.9238,
"step": 51500
},
{
"epoch": 0.5948703868945477,
"grad_norm": 5.542776107788086,
"learning_rate": 2.025648065527261e-05,
"loss": 2.9226,
"step": 52000
},
{
"epoch": 0.6005902944608416,
"grad_norm": 5.870414733886719,
"learning_rate": 1.9970485276957927e-05,
"loss": 2.9202,
"step": 52500
},
{
"epoch": 0.6063102020271353,
"grad_norm": 6.047051429748535,
"learning_rate": 1.9684489898643238e-05,
"loss": 2.9285,
"step": 53000
},
{
"epoch": 0.612030109593429,
"grad_norm": 5.594234943389893,
"learning_rate": 1.9398494520328553e-05,
"loss": 2.9307,
"step": 53500
},
{
"epoch": 0.6177500171597227,
"grad_norm": 5.4298295974731445,
"learning_rate": 1.9112499142013863e-05,
"loss": 2.9382,
"step": 54000
},
{
"epoch": 0.6234699247260165,
"grad_norm": 6.184563636779785,
"learning_rate": 1.882650376369918e-05,
"loss": 2.9341,
"step": 54500
},
{
"epoch": 0.6291898322923102,
"grad_norm": 5.776815414428711,
"learning_rate": 1.8540508385384492e-05,
"loss": 2.9287,
"step": 55000
},
{
"epoch": 0.6349097398586039,
"grad_norm": 5.83139181137085,
"learning_rate": 1.8254513007069806e-05,
"loss": 2.9007,
"step": 55500
},
{
"epoch": 0.6406296474248976,
"grad_norm": 5.469008922576904,
"learning_rate": 1.7968517628755117e-05,
"loss": 2.904,
"step": 56000
},
{
"epoch": 0.6463495549911913,
"grad_norm": 6.898833751678467,
"learning_rate": 1.7682522250440435e-05,
"loss": 2.9105,
"step": 56500
},
{
"epoch": 0.6520694625574851,
"grad_norm": 5.798022747039795,
"learning_rate": 1.739652687212575e-05,
"loss": 2.8995,
"step": 57000
},
{
"epoch": 0.6577893701237788,
"grad_norm": 5.57025146484375,
"learning_rate": 1.711053149381106e-05,
"loss": 2.9093,
"step": 57500
},
{
"epoch": 0.6635092776900725,
"grad_norm": 5.779621124267578,
"learning_rate": 1.6824536115496374e-05,
"loss": 2.9118,
"step": 58000
},
{
"epoch": 0.6692291852563662,
"grad_norm": 5.683529853820801,
"learning_rate": 1.653854073718169e-05,
"loss": 2.9102,
"step": 58500
},
{
"epoch": 0.67494909282266,
"grad_norm": 6.1109538078308105,
"learning_rate": 1.6252545358867003e-05,
"loss": 2.9102,
"step": 59000
},
{
"epoch": 0.6806690003889537,
"grad_norm": 5.536868095397949,
"learning_rate": 1.5966549980552314e-05,
"loss": 2.8966,
"step": 59500
},
{
"epoch": 0.6863889079552474,
"grad_norm": 6.560556888580322,
"learning_rate": 1.5680554602237628e-05,
"loss": 2.8896,
"step": 60000
},
{
"epoch": 0.6921088155215411,
"grad_norm": 5.969814300537109,
"learning_rate": 1.5394559223922942e-05,
"loss": 2.8961,
"step": 60500
},
{
"epoch": 0.697828723087835,
"grad_norm": 5.238883018493652,
"learning_rate": 1.5108563845608257e-05,
"loss": 2.882,
"step": 61000
},
{
"epoch": 0.7035486306541286,
"grad_norm": 5.538156509399414,
"learning_rate": 1.4822568467293567e-05,
"loss": 2.8955,
"step": 61500
},
{
"epoch": 0.7092685382204224,
"grad_norm": 6.287049770355225,
"learning_rate": 1.4536573088978883e-05,
"loss": 2.8932,
"step": 62000
},
{
"epoch": 0.714988445786716,
"grad_norm": 6.05150032043457,
"learning_rate": 1.4250577710664196e-05,
"loss": 2.8868,
"step": 62500
},
{
"epoch": 0.7207083533530099,
"grad_norm": 5.857907295227051,
"learning_rate": 1.396458233234951e-05,
"loss": 2.8872,
"step": 63000
},
{
"epoch": 0.7264282609193036,
"grad_norm": 5.748091697692871,
"learning_rate": 1.3678586954034823e-05,
"loss": 2.8964,
"step": 63500
},
{
"epoch": 0.7321481684855973,
"grad_norm": 5.55509614944458,
"learning_rate": 1.3392591575720137e-05,
"loss": 2.8758,
"step": 64000
},
{
"epoch": 0.737868076051891,
"grad_norm": 6.135568618774414,
"learning_rate": 1.3106596197405451e-05,
"loss": 2.8685,
"step": 64500
},
{
"epoch": 0.7435879836181847,
"grad_norm": 5.816187381744385,
"learning_rate": 1.2820600819090764e-05,
"loss": 2.8802,
"step": 65000
},
{
"epoch": 0.7493078911844785,
"grad_norm": 6.309732437133789,
"learning_rate": 1.2534605440776078e-05,
"loss": 2.8859,
"step": 65500
},
{
"epoch": 0.7550277987507722,
"grad_norm": 5.721366882324219,
"learning_rate": 1.2248610062461391e-05,
"loss": 2.8702,
"step": 66000
},
{
"epoch": 0.7607477063170659,
"grad_norm": 5.648174285888672,
"learning_rate": 1.1962614684146704e-05,
"loss": 2.8603,
"step": 66500
},
{
"epoch": 0.7664676138833596,
"grad_norm": 6.4422688484191895,
"learning_rate": 1.1676619305832018e-05,
"loss": 2.875,
"step": 67000
},
{
"epoch": 0.7721875214496534,
"grad_norm": 5.673862934112549,
"learning_rate": 1.1390623927517332e-05,
"loss": 2.8743,
"step": 67500
},
{
"epoch": 0.7779074290159471,
"grad_norm": 5.87379789352417,
"learning_rate": 1.1104628549202645e-05,
"loss": 2.8678,
"step": 68000
},
{
"epoch": 0.7836273365822408,
"grad_norm": 5.968353271484375,
"learning_rate": 1.0818633170887959e-05,
"loss": 2.8797,
"step": 68500
},
{
"epoch": 0.7893472441485345,
"grad_norm": 5.516451835632324,
"learning_rate": 1.0532637792573273e-05,
"loss": 2.8716,
"step": 69000
},
{
"epoch": 0.7950671517148283,
"grad_norm": 6.277103900909424,
"learning_rate": 1.0246642414258586e-05,
"loss": 2.8483,
"step": 69500
},
{
"epoch": 0.800787059281122,
"grad_norm": 5.54793643951416,
"learning_rate": 9.9606470359439e-06,
"loss": 2.8462,
"step": 70000
},
{
"epoch": 0.8065069668474157,
"grad_norm": 5.989738464355469,
"learning_rate": 9.674651657629213e-06,
"loss": 2.8672,
"step": 70500
},
{
"epoch": 0.8122268744137094,
"grad_norm": 5.7795844078063965,
"learning_rate": 9.388656279314527e-06,
"loss": 2.8566,
"step": 71000
},
{
"epoch": 0.8179467819800033,
"grad_norm": 5.732882976531982,
"learning_rate": 9.10266090099984e-06,
"loss": 2.8497,
"step": 71500
},
{
"epoch": 0.823666689546297,
"grad_norm": 6.427890777587891,
"learning_rate": 8.816665522685154e-06,
"loss": 2.8656,
"step": 72000
},
{
"epoch": 0.8293865971125907,
"grad_norm": 6.1445746421813965,
"learning_rate": 8.530670144370468e-06,
"loss": 2.8425,
"step": 72500
},
{
"epoch": 0.8351065046788844,
"grad_norm": 6.342021465301514,
"learning_rate": 8.24467476605578e-06,
"loss": 2.859,
"step": 73000
},
{
"epoch": 0.8408264122451781,
"grad_norm": 6.117573261260986,
"learning_rate": 7.958679387741095e-06,
"loss": 2.8631,
"step": 73500
},
{
"epoch": 0.8465463198114719,
"grad_norm": 6.145172595977783,
"learning_rate": 7.672684009426408e-06,
"loss": 2.8536,
"step": 74000
},
{
"epoch": 0.8522662273777656,
"grad_norm": 5.709812641143799,
"learning_rate": 7.386688631111721e-06,
"loss": 2.8546,
"step": 74500
},
{
"epoch": 0.8579861349440593,
"grad_norm": 6.244381904602051,
"learning_rate": 7.100693252797034e-06,
"loss": 2.8277,
"step": 75000
},
{
"epoch": 0.863706042510353,
"grad_norm": 6.229698181152344,
"learning_rate": 6.814697874482348e-06,
"loss": 2.85,
"step": 75500
},
{
"epoch": 0.8694259500766468,
"grad_norm": 5.480854511260986,
"learning_rate": 6.528702496167661e-06,
"loss": 2.8576,
"step": 76000
},
{
"epoch": 0.8751458576429405,
"grad_norm": 6.088413715362549,
"learning_rate": 6.2427071178529756e-06,
"loss": 2.8618,
"step": 76500
},
{
"epoch": 0.8808657652092342,
"grad_norm": 6.159554481506348,
"learning_rate": 5.956711739538289e-06,
"loss": 2.8319,
"step": 77000
},
{
"epoch": 0.8865856727755279,
"grad_norm": 6.287491798400879,
"learning_rate": 5.6707163612236024e-06,
"loss": 2.8622,
"step": 77500
},
{
"epoch": 0.8923055803418217,
"grad_norm": 6.237947940826416,
"learning_rate": 5.384720982908916e-06,
"loss": 2.8539,
"step": 78000
},
{
"epoch": 0.8980254879081154,
"grad_norm": 6.096075057983398,
"learning_rate": 5.09872560459423e-06,
"loss": 2.8256,
"step": 78500
},
{
"epoch": 0.9037453954744091,
"grad_norm": 6.158285140991211,
"learning_rate": 4.812730226279544e-06,
"loss": 2.8476,
"step": 79000
},
{
"epoch": 0.9094653030407028,
"grad_norm": 6.239354133605957,
"learning_rate": 4.526734847964857e-06,
"loss": 2.8538,
"step": 79500
},
{
"epoch": 0.9151852106069966,
"grad_norm": 5.273146629333496,
"learning_rate": 4.2407394696501705e-06,
"loss": 2.849,
"step": 80000
},
{
"epoch": 0.9209051181732903,
"grad_norm": 6.121274948120117,
"learning_rate": 3.954744091335484e-06,
"loss": 2.8454,
"step": 80500
},
{
"epoch": 0.926625025739584,
"grad_norm": 6.213692665100098,
"learning_rate": 3.6687487130207977e-06,
"loss": 2.8431,
"step": 81000
},
{
"epoch": 0.9323449333058778,
"grad_norm": 6.369662761688232,
"learning_rate": 3.382753334706111e-06,
"loss": 2.8589,
"step": 81500
},
{
"epoch": 0.9380648408721715,
"grad_norm": 5.934210300445557,
"learning_rate": 3.096757956391425e-06,
"loss": 2.8371,
"step": 82000
},
{
"epoch": 0.9437847484384653,
"grad_norm": 6.598156452178955,
"learning_rate": 2.8107625780767385e-06,
"loss": 2.8282,
"step": 82500
},
{
"epoch": 0.949504656004759,
"grad_norm": 5.81234073638916,
"learning_rate": 2.524767199762052e-06,
"loss": 2.8244,
"step": 83000
},
{
"epoch": 0.9552245635710527,
"grad_norm": 6.145198822021484,
"learning_rate": 2.2387718214473658e-06,
"loss": 2.8294,
"step": 83500
},
{
"epoch": 0.9609444711373464,
"grad_norm": 5.321970462799072,
"learning_rate": 1.952776443132679e-06,
"loss": 2.8425,
"step": 84000
},
{
"epoch": 0.9666643787036402,
"grad_norm": 5.8561248779296875,
"learning_rate": 1.6667810648179926e-06,
"loss": 2.8618,
"step": 84500
},
{
"epoch": 0.9723842862699339,
"grad_norm": 6.001429080963135,
"learning_rate": 1.3807856865033063e-06,
"loss": 2.8456,
"step": 85000
},
{
"epoch": 0.9781041938362276,
"grad_norm": 6.074833393096924,
"learning_rate": 1.0947903081886197e-06,
"loss": 2.8408,
"step": 85500
},
{
"epoch": 0.9838241014025213,
"grad_norm": 5.579460144042969,
"learning_rate": 8.087949298739332e-07,
"loss": 2.8427,
"step": 86000
},
{
"epoch": 0.9895440089688151,
"grad_norm": 5.968313217163086,
"learning_rate": 5.227995515592468e-07,
"loss": 2.8345,
"step": 86500
},
{
"epoch": 0.9952639165351088,
"grad_norm": 5.920719146728516,
"learning_rate": 2.3680417324456038e-07,
"loss": 2.8337,
"step": 87000
},
{
"epoch": 1.0,
"step": 87414,
"total_flos": 4.634223291773338e+16,
"train_loss": 3.132497888326312,
"train_runtime": 20429.0413,
"train_samples_per_second": 34.231,
"train_steps_per_second": 4.279
}
],
"logging_steps": 500,
"max_steps": 87414,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.634223291773338e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}