|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 6.7809745229100065, |
|
"eval_steps": 1000000, |
|
"global_step": 140000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.024217766153250025, |
|
"grad_norm": 1.516142725944519, |
|
"learning_rate": 9.997578223384676e-06, |
|
"loss": 9.1668, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.04843553230650005, |
|
"grad_norm": 1.0418092012405396, |
|
"learning_rate": 9.995156446769351e-06, |
|
"loss": 7.7891, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.07265329845975008, |
|
"grad_norm": 0.9371763467788696, |
|
"learning_rate": 9.992734670154026e-06, |
|
"loss": 7.1939, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.0968710646130001, |
|
"grad_norm": 1.1670751571655273, |
|
"learning_rate": 9.990312893538701e-06, |
|
"loss": 6.9189, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.12108883076625013, |
|
"grad_norm": 1.3695101737976074, |
|
"learning_rate": 9.987891116923376e-06, |
|
"loss": 6.7278, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.14530659691950015, |
|
"grad_norm": 1.796486735343933, |
|
"learning_rate": 9.985469340308051e-06, |
|
"loss": 6.5867, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.16952436307275018, |
|
"grad_norm": 1.509717583656311, |
|
"learning_rate": 9.983047563692726e-06, |
|
"loss": 6.4508, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.1937421292260002, |
|
"grad_norm": 1.8329906463623047, |
|
"learning_rate": 9.9806257870774e-06, |
|
"loss": 6.3551, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.21795989537925023, |
|
"grad_norm": 1.5139986276626587, |
|
"learning_rate": 9.978204010462076e-06, |
|
"loss": 6.2743, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.24217766153250025, |
|
"grad_norm": 2.2407052516937256, |
|
"learning_rate": 9.97578223384675e-06, |
|
"loss": 6.2001, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.26639542768575025, |
|
"grad_norm": 2.087357521057129, |
|
"learning_rate": 9.973360457231426e-06, |
|
"loss": 6.1334, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.2906131938390003, |
|
"grad_norm": 2.0182762145996094, |
|
"learning_rate": 9.970938680616102e-06, |
|
"loss": 6.0656, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.3148309599922503, |
|
"grad_norm": 1.9544531106948853, |
|
"learning_rate": 9.968516904000775e-06, |
|
"loss": 6.0134, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.33904872614550036, |
|
"grad_norm": 2.3156166076660156, |
|
"learning_rate": 9.966095127385452e-06, |
|
"loss": 5.9546, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.36326649229875035, |
|
"grad_norm": 2.5564098358154297, |
|
"learning_rate": 9.963673350770125e-06, |
|
"loss": 5.9063, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.3874842584520004, |
|
"grad_norm": 2.191112518310547, |
|
"learning_rate": 9.961251574154802e-06, |
|
"loss": 5.8564, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.4117020246052504, |
|
"grad_norm": 2.1813371181488037, |
|
"learning_rate": 9.958829797539475e-06, |
|
"loss": 5.8053, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.43591979075850046, |
|
"grad_norm": 1.9756942987442017, |
|
"learning_rate": 9.95640802092415e-06, |
|
"loss": 5.7669, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.46013755691175046, |
|
"grad_norm": 2.2932822704315186, |
|
"learning_rate": 9.953986244308825e-06, |
|
"loss": 5.7218, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.4843553230650005, |
|
"grad_norm": 2.218536376953125, |
|
"learning_rate": 9.9515644676935e-06, |
|
"loss": 5.6818, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.5085730892182505, |
|
"grad_norm": 2.3896877765655518, |
|
"learning_rate": 9.949142691078175e-06, |
|
"loss": 5.6481, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.5327908553715005, |
|
"grad_norm": 2.5433712005615234, |
|
"learning_rate": 9.94672091446285e-06, |
|
"loss": 5.6124, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.5570086215247505, |
|
"grad_norm": 2.5442490577697754, |
|
"learning_rate": 9.944299137847525e-06, |
|
"loss": 5.5728, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.5812263876780006, |
|
"grad_norm": 2.327425241470337, |
|
"learning_rate": 9.9418773612322e-06, |
|
"loss": 5.5406, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.6054441538312506, |
|
"grad_norm": 2.290090799331665, |
|
"learning_rate": 9.939455584616876e-06, |
|
"loss": 5.5121, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.6296619199845006, |
|
"grad_norm": 3.161325216293335, |
|
"learning_rate": 9.93703380800155e-06, |
|
"loss": 5.4739, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.6538796861377506, |
|
"grad_norm": 2.6134533882141113, |
|
"learning_rate": 9.934612031386226e-06, |
|
"loss": 5.4384, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.6780974522910007, |
|
"grad_norm": 2.674760580062866, |
|
"learning_rate": 9.9321902547709e-06, |
|
"loss": 5.413, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.7023152184442507, |
|
"grad_norm": 2.431614398956299, |
|
"learning_rate": 9.929768478155576e-06, |
|
"loss": 5.3903, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.7265329845975007, |
|
"grad_norm": 2.4028687477111816, |
|
"learning_rate": 9.927346701540251e-06, |
|
"loss": 5.3637, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.7507507507507507, |
|
"grad_norm": 2.4807944297790527, |
|
"learning_rate": 9.924924924924926e-06, |
|
"loss": 5.3279, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.7749685169040008, |
|
"grad_norm": 2.9065611362457275, |
|
"learning_rate": 9.922503148309601e-06, |
|
"loss": 5.3098, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.7991862830572508, |
|
"grad_norm": 2.359736204147339, |
|
"learning_rate": 9.920081371694276e-06, |
|
"loss": 5.2858, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.8234040492105008, |
|
"grad_norm": 2.642854690551758, |
|
"learning_rate": 9.917659595078951e-06, |
|
"loss": 5.2518, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.8476218153637508, |
|
"grad_norm": 3.2326414585113525, |
|
"learning_rate": 9.915237818463626e-06, |
|
"loss": 5.2354, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 0.8718395815170009, |
|
"grad_norm": 2.285203218460083, |
|
"learning_rate": 9.912816041848301e-06, |
|
"loss": 5.2117, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.8960573476702509, |
|
"grad_norm": 2.551164388656616, |
|
"learning_rate": 9.910394265232976e-06, |
|
"loss": 5.1941, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 0.9202751138235009, |
|
"grad_norm": 2.678759813308716, |
|
"learning_rate": 9.907972488617651e-06, |
|
"loss": 5.1706, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.9444928799767509, |
|
"grad_norm": 2.6895062923431396, |
|
"learning_rate": 9.905550712002325e-06, |
|
"loss": 5.1499, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 0.968710646130001, |
|
"grad_norm": 2.554659128189087, |
|
"learning_rate": 9.903128935387001e-06, |
|
"loss": 5.1276, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.992928412283251, |
|
"grad_norm": 2.785282850265503, |
|
"learning_rate": 9.900707158771675e-06, |
|
"loss": 5.1079, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 1.017146178436501, |
|
"grad_norm": 2.7283270359039307, |
|
"learning_rate": 9.89828538215635e-06, |
|
"loss": 5.0726, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 1.0413639445897511, |
|
"grad_norm": 2.654245615005493, |
|
"learning_rate": 9.895863605541027e-06, |
|
"loss": 5.0654, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 1.065581710743001, |
|
"grad_norm": 2.563713550567627, |
|
"learning_rate": 9.8934418289257e-06, |
|
"loss": 5.0436, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 1.0897994768962511, |
|
"grad_norm": 2.6896631717681885, |
|
"learning_rate": 9.891020052310377e-06, |
|
"loss": 5.0161, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 1.114017243049501, |
|
"grad_norm": 2.8477983474731445, |
|
"learning_rate": 9.88859827569505e-06, |
|
"loss": 5.008, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 1.1382350092027511, |
|
"grad_norm": 2.6253600120544434, |
|
"learning_rate": 9.886176499079725e-06, |
|
"loss": 4.987, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 1.1624527753560012, |
|
"grad_norm": 2.7618229389190674, |
|
"learning_rate": 9.8837547224644e-06, |
|
"loss": 4.9655, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 1.186670541509251, |
|
"grad_norm": 2.7631571292877197, |
|
"learning_rate": 9.881332945849075e-06, |
|
"loss": 4.9426, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 1.2108883076625012, |
|
"grad_norm": 3.108574390411377, |
|
"learning_rate": 9.87891116923375e-06, |
|
"loss": 4.9264, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 1.2351060738157513, |
|
"grad_norm": 2.5930752754211426, |
|
"learning_rate": 9.876489392618425e-06, |
|
"loss": 4.9068, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 1.2593238399690012, |
|
"grad_norm": 2.4590559005737305, |
|
"learning_rate": 9.8740676160031e-06, |
|
"loss": 4.8908, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 1.2835416061222513, |
|
"grad_norm": 2.7004990577697754, |
|
"learning_rate": 9.871645839387776e-06, |
|
"loss": 4.8767, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 1.3077593722755014, |
|
"grad_norm": 2.5023412704467773, |
|
"learning_rate": 9.86922406277245e-06, |
|
"loss": 4.8543, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 1.3319771384287513, |
|
"grad_norm": 3.338123083114624, |
|
"learning_rate": 9.866802286157126e-06, |
|
"loss": 4.8324, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 1.3561949045820014, |
|
"grad_norm": 2.871856689453125, |
|
"learning_rate": 9.8643805095418e-06, |
|
"loss": 4.8138, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 1.3804126707352513, |
|
"grad_norm": 3.148714303970337, |
|
"learning_rate": 9.861958732926476e-06, |
|
"loss": 4.7991, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 1.4046304368885014, |
|
"grad_norm": 2.986448287963867, |
|
"learning_rate": 9.85953695631115e-06, |
|
"loss": 4.781, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 1.4288482030417513, |
|
"grad_norm": 2.5939040184020996, |
|
"learning_rate": 9.857115179695826e-06, |
|
"loss": 4.7634, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 1.4530659691950014, |
|
"grad_norm": 2.674027442932129, |
|
"learning_rate": 9.854693403080501e-06, |
|
"loss": 4.7446, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 1.4772837353482515, |
|
"grad_norm": 3.018937826156616, |
|
"learning_rate": 9.852271626465176e-06, |
|
"loss": 4.7332, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 1.5015015015015014, |
|
"grad_norm": 2.862410306930542, |
|
"learning_rate": 9.849849849849851e-06, |
|
"loss": 4.7151, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 1.5257192676547515, |
|
"grad_norm": 2.9605488777160645, |
|
"learning_rate": 9.847428073234524e-06, |
|
"loss": 4.7023, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 1.5499370338080016, |
|
"grad_norm": 3.116225242614746, |
|
"learning_rate": 9.845006296619201e-06, |
|
"loss": 4.6834, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 1.5741547999612515, |
|
"grad_norm": 3.074164390563965, |
|
"learning_rate": 9.842584520003876e-06, |
|
"loss": 4.6676, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 1.5983725661145016, |
|
"grad_norm": 2.677706003189087, |
|
"learning_rate": 9.840162743388551e-06, |
|
"loss": 4.6547, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 1.6225903322677517, |
|
"grad_norm": 2.832223653793335, |
|
"learning_rate": 9.837740966773226e-06, |
|
"loss": 4.6402, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 1.6468080984210016, |
|
"grad_norm": 3.1041297912597656, |
|
"learning_rate": 9.8353191901579e-06, |
|
"loss": 4.6271, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 1.6710258645742515, |
|
"grad_norm": 2.883216381072998, |
|
"learning_rate": 9.832897413542576e-06, |
|
"loss": 4.6141, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 1.6952436307275018, |
|
"grad_norm": 2.894000291824341, |
|
"learning_rate": 9.83047563692725e-06, |
|
"loss": 4.6031, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 1.7194613968807517, |
|
"grad_norm": 2.9335453510284424, |
|
"learning_rate": 9.828053860311927e-06, |
|
"loss": 4.5911, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 1.7436791630340016, |
|
"grad_norm": 2.7511613368988037, |
|
"learning_rate": 9.8256320836966e-06, |
|
"loss": 4.5824, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 1.7678969291872517, |
|
"grad_norm": 2.8148419857025146, |
|
"learning_rate": 9.823210307081275e-06, |
|
"loss": 4.5693, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 1.7921146953405018, |
|
"grad_norm": 2.8832480907440186, |
|
"learning_rate": 9.820788530465952e-06, |
|
"loss": 4.5622, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 1.8163324614937517, |
|
"grad_norm": 2.9674079418182373, |
|
"learning_rate": 9.818366753850625e-06, |
|
"loss": 4.5473, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 1.8405502276470018, |
|
"grad_norm": 2.971090793609619, |
|
"learning_rate": 9.815944977235302e-06, |
|
"loss": 4.538, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 1.864767993800252, |
|
"grad_norm": 2.785881996154785, |
|
"learning_rate": 9.813523200619975e-06, |
|
"loss": 4.5327, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 1.8889857599535018, |
|
"grad_norm": 2.9853248596191406, |
|
"learning_rate": 9.81110142400465e-06, |
|
"loss": 4.5078, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 1.913203526106752, |
|
"grad_norm": 2.899179697036743, |
|
"learning_rate": 9.808679647389325e-06, |
|
"loss": 4.5002, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 1.937421292260002, |
|
"grad_norm": 2.5843992233276367, |
|
"learning_rate": 9.806257870774e-06, |
|
"loss": 4.4928, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 1.961639058413252, |
|
"grad_norm": 2.8425755500793457, |
|
"learning_rate": 9.803836094158675e-06, |
|
"loss": 4.4881, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 1.985856824566502, |
|
"grad_norm": 2.87211275100708, |
|
"learning_rate": 9.80141431754335e-06, |
|
"loss": 4.4822, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 2.010074590719752, |
|
"grad_norm": 3.0297703742980957, |
|
"learning_rate": 9.798992540928026e-06, |
|
"loss": 4.4619, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 2.034292356873002, |
|
"grad_norm": 2.9869863986968994, |
|
"learning_rate": 9.7965707643127e-06, |
|
"loss": 4.4526, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 2.058510123026252, |
|
"grad_norm": 2.777209520339966, |
|
"learning_rate": 9.794148987697376e-06, |
|
"loss": 4.432, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 2.0827278891795022, |
|
"grad_norm": 3.0258235931396484, |
|
"learning_rate": 9.79172721108205e-06, |
|
"loss": 4.4386, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 2.106945655332752, |
|
"grad_norm": 2.8184220790863037, |
|
"learning_rate": 9.789305434466726e-06, |
|
"loss": 4.4254, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 2.131163421486002, |
|
"grad_norm": 2.9428908824920654, |
|
"learning_rate": 9.7868836578514e-06, |
|
"loss": 4.4172, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 2.1553811876392523, |
|
"grad_norm": 3.1215102672576904, |
|
"learning_rate": 9.784461881236076e-06, |
|
"loss": 4.4078, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 2.1795989537925022, |
|
"grad_norm": 3.032611846923828, |
|
"learning_rate": 9.782040104620751e-06, |
|
"loss": 4.4036, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 2.203816719945752, |
|
"grad_norm": 2.9431488513946533, |
|
"learning_rate": 9.779618328005426e-06, |
|
"loss": 4.3997, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 2.228034486099002, |
|
"grad_norm": 2.9058682918548584, |
|
"learning_rate": 9.7771965513901e-06, |
|
"loss": 4.389, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 2.2522522522522523, |
|
"grad_norm": 2.703967809677124, |
|
"learning_rate": 9.774774774774776e-06, |
|
"loss": 4.3753, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 2.2764700184055022, |
|
"grad_norm": 2.764721155166626, |
|
"learning_rate": 9.77235299815945e-06, |
|
"loss": 4.3658, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 2.300687784558752, |
|
"grad_norm": 2.834578514099121, |
|
"learning_rate": 9.769931221544126e-06, |
|
"loss": 4.3577, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 2.3249055507120024, |
|
"grad_norm": 2.9823198318481445, |
|
"learning_rate": 9.767509444928801e-06, |
|
"loss": 4.3531, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 2.3491233168652523, |
|
"grad_norm": 2.8373069763183594, |
|
"learning_rate": 9.765087668313475e-06, |
|
"loss": 4.3515, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 2.373341083018502, |
|
"grad_norm": 2.6971516609191895, |
|
"learning_rate": 9.762665891698151e-06, |
|
"loss": 4.3367, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 2.3975588491717525, |
|
"grad_norm": 2.8022115230560303, |
|
"learning_rate": 9.760244115082825e-06, |
|
"loss": 4.3302, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 2.4217766153250024, |
|
"grad_norm": 2.9047532081604004, |
|
"learning_rate": 9.757822338467502e-06, |
|
"loss": 4.3202, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 2.4459943814782523, |
|
"grad_norm": 2.81803297996521, |
|
"learning_rate": 9.755400561852175e-06, |
|
"loss": 4.3184, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 2.4702121476315027, |
|
"grad_norm": 2.9668848514556885, |
|
"learning_rate": 9.75297878523685e-06, |
|
"loss": 4.3093, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 2.4944299137847525, |
|
"grad_norm": 3.0008721351623535, |
|
"learning_rate": 9.750557008621525e-06, |
|
"loss": 4.3089, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 2.5186476799380024, |
|
"grad_norm": 2.76766300201416, |
|
"learning_rate": 9.7481352320062e-06, |
|
"loss": 4.2961, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 2.5428654460912528, |
|
"grad_norm": 2.961453914642334, |
|
"learning_rate": 9.745713455390875e-06, |
|
"loss": 4.284, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 2.5670832122445026, |
|
"grad_norm": 3.030158758163452, |
|
"learning_rate": 9.74329167877555e-06, |
|
"loss": 4.2849, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 2.5913009783977525, |
|
"grad_norm": 2.9656057357788086, |
|
"learning_rate": 9.740869902160225e-06, |
|
"loss": 4.2712, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 2.615518744551003, |
|
"grad_norm": 3.3482959270477295, |
|
"learning_rate": 9.7384481255449e-06, |
|
"loss": 4.2833, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 2.6397365107042527, |
|
"grad_norm": 2.8142096996307373, |
|
"learning_rate": 9.736026348929575e-06, |
|
"loss": 4.2652, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 2.6639542768575026, |
|
"grad_norm": 2.776679277420044, |
|
"learning_rate": 9.73360457231425e-06, |
|
"loss": 4.2653, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 2.688172043010753, |
|
"grad_norm": 2.7612788677215576, |
|
"learning_rate": 9.731182795698925e-06, |
|
"loss": 4.2562, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 2.712389809164003, |
|
"grad_norm": 2.959991931915283, |
|
"learning_rate": 9.7287610190836e-06, |
|
"loss": 4.2515, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 2.7366075753172527, |
|
"grad_norm": 2.969061851501465, |
|
"learning_rate": 9.726339242468276e-06, |
|
"loss": 4.2378, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 2.7608253414705026, |
|
"grad_norm": 3.1710784435272217, |
|
"learning_rate": 9.72391746585295e-06, |
|
"loss": 4.2408, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 2.7850431076237525, |
|
"grad_norm": 2.9343762397766113, |
|
"learning_rate": 9.721495689237626e-06, |
|
"loss": 4.2316, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 2.809260873777003, |
|
"grad_norm": 2.98744535446167, |
|
"learning_rate": 9.7190739126223e-06, |
|
"loss": 4.2302, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 2.8334786399302527, |
|
"grad_norm": 2.8376593589782715, |
|
"learning_rate": 9.716652136006976e-06, |
|
"loss": 4.2229, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 2.8576964060835026, |
|
"grad_norm": 2.7830283641815186, |
|
"learning_rate": 9.714230359391651e-06, |
|
"loss": 4.2138, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 2.881914172236753, |
|
"grad_norm": 2.824352741241455, |
|
"learning_rate": 9.711808582776326e-06, |
|
"loss": 4.2039, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 2.906131938390003, |
|
"grad_norm": 2.8537116050720215, |
|
"learning_rate": 9.709386806161001e-06, |
|
"loss": 4.2063, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 2.9303497045432527, |
|
"grad_norm": 3.004157543182373, |
|
"learning_rate": 9.706965029545674e-06, |
|
"loss": 4.1983, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 2.954567470696503, |
|
"grad_norm": 2.8163509368896484, |
|
"learning_rate": 9.704543252930351e-06, |
|
"loss": 4.1938, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 2.978785236849753, |
|
"grad_norm": 2.8276596069335938, |
|
"learning_rate": 9.702121476315024e-06, |
|
"loss": 4.1915, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 3.003003003003003, |
|
"grad_norm": 2.7849977016448975, |
|
"learning_rate": 9.699699699699701e-06, |
|
"loss": 4.1942, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 3.027220769156253, |
|
"grad_norm": 2.782846212387085, |
|
"learning_rate": 9.697277923084375e-06, |
|
"loss": 4.1741, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 3.051438535309503, |
|
"grad_norm": 2.906552314758301, |
|
"learning_rate": 9.69485614646905e-06, |
|
"loss": 4.1767, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 3.075656301462753, |
|
"grad_norm": 3.0256595611572266, |
|
"learning_rate": 9.692434369853726e-06, |
|
"loss": 4.1665, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 3.0998740676160033, |
|
"grad_norm": 2.847698450088501, |
|
"learning_rate": 9.6900125932384e-06, |
|
"loss": 4.1642, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 3.124091833769253, |
|
"grad_norm": 2.8021674156188965, |
|
"learning_rate": 9.687590816623077e-06, |
|
"loss": 4.1663, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 3.148309599922503, |
|
"grad_norm": 2.784911632537842, |
|
"learning_rate": 9.68516904000775e-06, |
|
"loss": 4.1546, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 3.1725273660757534, |
|
"grad_norm": 3.019435167312622, |
|
"learning_rate": 9.682747263392425e-06, |
|
"loss": 4.1443, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 3.1967451322290033, |
|
"grad_norm": 2.60965895652771, |
|
"learning_rate": 9.6803254867771e-06, |
|
"loss": 4.1465, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 3.220962898382253, |
|
"grad_norm": 2.740164041519165, |
|
"learning_rate": 9.677903710161775e-06, |
|
"loss": 4.1345, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 3.2451806645355035, |
|
"grad_norm": 2.862274646759033, |
|
"learning_rate": 9.67548193354645e-06, |
|
"loss": 4.1461, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 3.2693984306887534, |
|
"grad_norm": 2.8547213077545166, |
|
"learning_rate": 9.673060156931125e-06, |
|
"loss": 4.137, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 3.2936161968420032, |
|
"grad_norm": 3.0033137798309326, |
|
"learning_rate": 9.6706383803158e-06, |
|
"loss": 4.1253, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 3.317833962995253, |
|
"grad_norm": 2.795989513397217, |
|
"learning_rate": 9.668216603700475e-06, |
|
"loss": 4.1232, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 3.3420517291485035, |
|
"grad_norm": 2.8020830154418945, |
|
"learning_rate": 9.66579482708515e-06, |
|
"loss": 4.1238, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 3.3662694953017533, |
|
"grad_norm": 2.808565855026245, |
|
"learning_rate": 9.663373050469825e-06, |
|
"loss": 4.1155, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 3.3904872614550032, |
|
"grad_norm": 2.7904319763183594, |
|
"learning_rate": 9.6609512738545e-06, |
|
"loss": 4.1143, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 3.4147050276082536, |
|
"grad_norm": 2.7850215435028076, |
|
"learning_rate": 9.658529497239176e-06, |
|
"loss": 4.1102, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 3.4389227937615034, |
|
"grad_norm": 2.6868176460266113, |
|
"learning_rate": 9.65610772062385e-06, |
|
"loss": 4.0994, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 3.4631405599147533, |
|
"grad_norm": 2.862273931503296, |
|
"learning_rate": 9.653685944008526e-06, |
|
"loss": 4.1054, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 3.4873583260680037, |
|
"grad_norm": 3.01948881149292, |
|
"learning_rate": 9.6512641673932e-06, |
|
"loss": 4.0975, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 3.5115760922212536, |
|
"grad_norm": 2.945227861404419, |
|
"learning_rate": 9.648842390777876e-06, |
|
"loss": 4.0941, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 3.5357938583745034, |
|
"grad_norm": 3.265650987625122, |
|
"learning_rate": 9.64642061416255e-06, |
|
"loss": 4.0854, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 3.5600116245277533, |
|
"grad_norm": 2.839852809906006, |
|
"learning_rate": 9.643998837547224e-06, |
|
"loss": 4.0889, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 3.5842293906810037, |
|
"grad_norm": 3.0958175659179688, |
|
"learning_rate": 9.641577060931901e-06, |
|
"loss": 4.0793, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 3.6084471568342535, |
|
"grad_norm": 2.957026481628418, |
|
"learning_rate": 9.639155284316576e-06, |
|
"loss": 4.0764, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 3.6326649229875034, |
|
"grad_norm": 3.0738115310668945, |
|
"learning_rate": 9.636733507701251e-06, |
|
"loss": 4.0742, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 3.6568826891407538, |
|
"grad_norm": 2.877403736114502, |
|
"learning_rate": 9.634311731085926e-06, |
|
"loss": 4.0744, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 3.6811004552940036, |
|
"grad_norm": 3.0667495727539062, |
|
"learning_rate": 9.6318899544706e-06, |
|
"loss": 4.0715, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 3.7053182214472535, |
|
"grad_norm": 2.8147807121276855, |
|
"learning_rate": 9.629468177855276e-06, |
|
"loss": 4.0666, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 3.729535987600504, |
|
"grad_norm": 2.8717801570892334, |
|
"learning_rate": 9.62704640123995e-06, |
|
"loss": 4.064, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 3.7537537537537538, |
|
"grad_norm": 2.7591042518615723, |
|
"learning_rate": 9.624624624624626e-06, |
|
"loss": 4.0557, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 3.7779715199070036, |
|
"grad_norm": 2.843806743621826, |
|
"learning_rate": 9.6222028480093e-06, |
|
"loss": 4.0579, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 3.802189286060254, |
|
"grad_norm": 2.869080066680908, |
|
"learning_rate": 9.619781071393975e-06, |
|
"loss": 4.0537, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 3.826407052213504, |
|
"grad_norm": 2.792863607406616, |
|
"learning_rate": 9.617359294778652e-06, |
|
"loss": 4.0484, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 3.8506248183667537, |
|
"grad_norm": 2.991138458251953, |
|
"learning_rate": 9.614937518163325e-06, |
|
"loss": 4.039, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 3.874842584520004, |
|
"grad_norm": 2.7616770267486572, |
|
"learning_rate": 9.612515741548002e-06, |
|
"loss": 4.0438, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 3.899060350673254, |
|
"grad_norm": 2.718642234802246, |
|
"learning_rate": 9.610093964932675e-06, |
|
"loss": 4.0333, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 3.923278116826504, |
|
"grad_norm": 2.8432154655456543, |
|
"learning_rate": 9.60767218831735e-06, |
|
"loss": 4.0419, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 3.947495882979754, |
|
"grad_norm": 3.018446683883667, |
|
"learning_rate": 9.605250411702025e-06, |
|
"loss": 4.0374, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 3.971713649133004, |
|
"grad_norm": 2.909247636795044, |
|
"learning_rate": 9.6028286350867e-06, |
|
"loss": 4.0299, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 3.995931415286254, |
|
"grad_norm": 3.047041654586792, |
|
"learning_rate": 9.600406858471375e-06, |
|
"loss": 4.0195, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 4.020149181439504, |
|
"grad_norm": 2.8578057289123535, |
|
"learning_rate": 9.59798508185605e-06, |
|
"loss": 4.0212, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 4.044366947592754, |
|
"grad_norm": 2.8038136959075928, |
|
"learning_rate": 9.595563305240725e-06, |
|
"loss": 4.0196, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 4.068584713746004, |
|
"grad_norm": 2.879891872406006, |
|
"learning_rate": 9.5931415286254e-06, |
|
"loss": 4.0127, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 4.092802479899254, |
|
"grad_norm": 2.875603437423706, |
|
"learning_rate": 9.590719752010075e-06, |
|
"loss": 4.0142, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 4.117020246052504, |
|
"grad_norm": 2.975302219390869, |
|
"learning_rate": 9.58829797539475e-06, |
|
"loss": 4.0002, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 4.141238012205754, |
|
"grad_norm": 2.9974005222320557, |
|
"learning_rate": 9.585876198779426e-06, |
|
"loss": 4.0038, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 4.1654557783590045, |
|
"grad_norm": 2.8580379486083984, |
|
"learning_rate": 9.5834544221641e-06, |
|
"loss": 4.0003, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 4.189673544512254, |
|
"grad_norm": 2.987436056137085, |
|
"learning_rate": 9.581032645548776e-06, |
|
"loss": 4.005, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 4.213891310665504, |
|
"grad_norm": 2.6872076988220215, |
|
"learning_rate": 9.57861086893345e-06, |
|
"loss": 3.9909, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 4.238109076818755, |
|
"grad_norm": 2.991762638092041, |
|
"learning_rate": 9.576189092318126e-06, |
|
"loss": 3.9897, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 4.262326842972004, |
|
"grad_norm": 2.8275723457336426, |
|
"learning_rate": 9.5737673157028e-06, |
|
"loss": 3.996, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 4.286544609125254, |
|
"grad_norm": 2.892839193344116, |
|
"learning_rate": 9.571345539087476e-06, |
|
"loss": 3.9946, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 4.310762375278505, |
|
"grad_norm": 2.8410208225250244, |
|
"learning_rate": 9.56892376247215e-06, |
|
"loss": 3.9862, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 4.334980141431754, |
|
"grad_norm": 2.797422409057617, |
|
"learning_rate": 9.566501985856826e-06, |
|
"loss": 3.9843, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 4.3591979075850045, |
|
"grad_norm": 2.855832099914551, |
|
"learning_rate": 9.564080209241501e-06, |
|
"loss": 3.9796, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 4.383415673738254, |
|
"grad_norm": 3.0120160579681396, |
|
"learning_rate": 9.561658432626174e-06, |
|
"loss": 3.9768, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 4.407633439891504, |
|
"grad_norm": 2.7952980995178223, |
|
"learning_rate": 9.559236656010851e-06, |
|
"loss": 3.9742, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 4.431851206044755, |
|
"grad_norm": 2.8430566787719727, |
|
"learning_rate": 9.556814879395525e-06, |
|
"loss": 3.9741, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 4.456068972198004, |
|
"grad_norm": 2.9674031734466553, |
|
"learning_rate": 9.554393102780201e-06, |
|
"loss": 3.9697, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 4.480286738351254, |
|
"grad_norm": 3.0408644676208496, |
|
"learning_rate": 9.551971326164875e-06, |
|
"loss": 3.9624, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 4.504504504504505, |
|
"grad_norm": 2.9981327056884766, |
|
"learning_rate": 9.54954954954955e-06, |
|
"loss": 3.9652, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 4.528722270657754, |
|
"grad_norm": 2.7843706607818604, |
|
"learning_rate": 9.547127772934225e-06, |
|
"loss": 3.9722, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 4.5529400368110045, |
|
"grad_norm": 2.7166874408721924, |
|
"learning_rate": 9.5447059963189e-06, |
|
"loss": 3.9628, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 4.577157802964255, |
|
"grad_norm": 2.923854351043701, |
|
"learning_rate": 9.542284219703575e-06, |
|
"loss": 3.9594, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 4.601375569117504, |
|
"grad_norm": 2.915800094604492, |
|
"learning_rate": 9.53986244308825e-06, |
|
"loss": 3.9609, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 4.625593335270755, |
|
"grad_norm": 2.9524765014648438, |
|
"learning_rate": 9.537440666472925e-06, |
|
"loss": 3.9587, |
|
"step": 95500 |
|
}, |
|
{ |
|
"epoch": 4.649811101424005, |
|
"grad_norm": 2.898005723953247, |
|
"learning_rate": 9.5350188898576e-06, |
|
"loss": 3.9498, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 4.674028867577254, |
|
"grad_norm": 2.9840903282165527, |
|
"learning_rate": 9.532597113242275e-06, |
|
"loss": 3.9508, |
|
"step": 96500 |
|
}, |
|
{ |
|
"epoch": 4.698246633730505, |
|
"grad_norm": 2.7765541076660156, |
|
"learning_rate": 9.53017533662695e-06, |
|
"loss": 3.9481, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 4.722464399883755, |
|
"grad_norm": 2.8900692462921143, |
|
"learning_rate": 9.527753560011625e-06, |
|
"loss": 3.9365, |
|
"step": 97500 |
|
}, |
|
{ |
|
"epoch": 4.746682166037004, |
|
"grad_norm": 2.8892781734466553, |
|
"learning_rate": 9.5253317833963e-06, |
|
"loss": 3.9492, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 4.770899932190255, |
|
"grad_norm": 2.960374355316162, |
|
"learning_rate": 9.522910006780975e-06, |
|
"loss": 3.942, |
|
"step": 98500 |
|
}, |
|
{ |
|
"epoch": 4.795117698343505, |
|
"grad_norm": 2.7404415607452393, |
|
"learning_rate": 9.52048823016565e-06, |
|
"loss": 3.938, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 4.8193354644967545, |
|
"grad_norm": 3.024486780166626, |
|
"learning_rate": 9.518066453550326e-06, |
|
"loss": 3.9296, |
|
"step": 99500 |
|
}, |
|
{ |
|
"epoch": 4.843553230650005, |
|
"grad_norm": 2.8316361904144287, |
|
"learning_rate": 9.515644676935e-06, |
|
"loss": 3.9351, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 4.867770996803255, |
|
"grad_norm": 2.8669049739837646, |
|
"learning_rate": 9.513222900319676e-06, |
|
"loss": 3.9373, |
|
"step": 100500 |
|
}, |
|
{ |
|
"epoch": 4.891988762956505, |
|
"grad_norm": 2.7152950763702393, |
|
"learning_rate": 9.51080112370435e-06, |
|
"loss": 3.9163, |
|
"step": 101000 |
|
}, |
|
{ |
|
"epoch": 4.916206529109755, |
|
"grad_norm": 2.7430613040924072, |
|
"learning_rate": 9.508379347089026e-06, |
|
"loss": 3.9273, |
|
"step": 101500 |
|
}, |
|
{ |
|
"epoch": 4.940424295263005, |
|
"grad_norm": 3.0171566009521484, |
|
"learning_rate": 9.5059575704737e-06, |
|
"loss": 3.9247, |
|
"step": 102000 |
|
}, |
|
{ |
|
"epoch": 4.964642061416255, |
|
"grad_norm": 2.833829164505005, |
|
"learning_rate": 9.503535793858374e-06, |
|
"loss": 3.9307, |
|
"step": 102500 |
|
}, |
|
{ |
|
"epoch": 4.988859827569505, |
|
"grad_norm": 2.7739973068237305, |
|
"learning_rate": 9.501114017243051e-06, |
|
"loss": 3.9195, |
|
"step": 103000 |
|
}, |
|
{ |
|
"epoch": 5.013077593722755, |
|
"grad_norm": 2.774411201477051, |
|
"learning_rate": 9.498692240627724e-06, |
|
"loss": 3.9116, |
|
"step": 103500 |
|
}, |
|
{ |
|
"epoch": 5.037295359876005, |
|
"grad_norm": 2.851175546646118, |
|
"learning_rate": 9.496270464012401e-06, |
|
"loss": 3.9113, |
|
"step": 104000 |
|
}, |
|
{ |
|
"epoch": 5.061513126029255, |
|
"grad_norm": 2.8700265884399414, |
|
"learning_rate": 9.493848687397074e-06, |
|
"loss": 3.9058, |
|
"step": 104500 |
|
}, |
|
{ |
|
"epoch": 5.0857308921825055, |
|
"grad_norm": 2.8087737560272217, |
|
"learning_rate": 9.49142691078175e-06, |
|
"loss": 3.9087, |
|
"step": 105000 |
|
}, |
|
{ |
|
"epoch": 5.109948658335755, |
|
"grad_norm": 2.882826328277588, |
|
"learning_rate": 9.489005134166426e-06, |
|
"loss": 3.907, |
|
"step": 105500 |
|
}, |
|
{ |
|
"epoch": 5.134166424489005, |
|
"grad_norm": 2.900575637817383, |
|
"learning_rate": 9.4865833575511e-06, |
|
"loss": 3.9022, |
|
"step": 106000 |
|
}, |
|
{ |
|
"epoch": 5.158384190642255, |
|
"grad_norm": 2.7019128799438477, |
|
"learning_rate": 9.484161580935776e-06, |
|
"loss": 3.9017, |
|
"step": 106500 |
|
}, |
|
{ |
|
"epoch": 5.182601956795505, |
|
"grad_norm": 2.8361051082611084, |
|
"learning_rate": 9.48173980432045e-06, |
|
"loss": 3.9108, |
|
"step": 107000 |
|
}, |
|
{ |
|
"epoch": 5.206819722948755, |
|
"grad_norm": 2.741563558578491, |
|
"learning_rate": 9.479318027705125e-06, |
|
"loss": 3.8919, |
|
"step": 107500 |
|
}, |
|
{ |
|
"epoch": 5.231037489102005, |
|
"grad_norm": 2.967627763748169, |
|
"learning_rate": 9.4768962510898e-06, |
|
"loss": 3.8967, |
|
"step": 108000 |
|
}, |
|
{ |
|
"epoch": 5.255255255255255, |
|
"grad_norm": 2.8605451583862305, |
|
"learning_rate": 9.474474474474475e-06, |
|
"loss": 3.897, |
|
"step": 108500 |
|
}, |
|
{ |
|
"epoch": 5.2794730214085055, |
|
"grad_norm": 2.7184574604034424, |
|
"learning_rate": 9.47205269785915e-06, |
|
"loss": 3.8975, |
|
"step": 109000 |
|
}, |
|
{ |
|
"epoch": 5.303690787561755, |
|
"grad_norm": 2.7433297634124756, |
|
"learning_rate": 9.469630921243825e-06, |
|
"loss": 3.8954, |
|
"step": 109500 |
|
}, |
|
{ |
|
"epoch": 5.327908553715005, |
|
"grad_norm": 2.7750015258789062, |
|
"learning_rate": 9.4672091446285e-06, |
|
"loss": 3.8913, |
|
"step": 110000 |
|
}, |
|
{ |
|
"epoch": 5.352126319868256, |
|
"grad_norm": 2.9533851146698, |
|
"learning_rate": 9.464787368013175e-06, |
|
"loss": 3.8844, |
|
"step": 110500 |
|
}, |
|
{ |
|
"epoch": 5.376344086021505, |
|
"grad_norm": 2.8131632804870605, |
|
"learning_rate": 9.46236559139785e-06, |
|
"loss": 3.8809, |
|
"step": 111000 |
|
}, |
|
{ |
|
"epoch": 5.400561852174755, |
|
"grad_norm": 2.791193723678589, |
|
"learning_rate": 9.459943814782525e-06, |
|
"loss": 3.885, |
|
"step": 111500 |
|
}, |
|
{ |
|
"epoch": 5.424779618328006, |
|
"grad_norm": 2.869932174682617, |
|
"learning_rate": 9.4575220381672e-06, |
|
"loss": 3.8839, |
|
"step": 112000 |
|
}, |
|
{ |
|
"epoch": 5.448997384481255, |
|
"grad_norm": 2.906806707382202, |
|
"learning_rate": 9.455100261551875e-06, |
|
"loss": 3.8816, |
|
"step": 112500 |
|
}, |
|
{ |
|
"epoch": 5.4732151506345055, |
|
"grad_norm": 2.6837105751037598, |
|
"learning_rate": 9.45267848493655e-06, |
|
"loss": 3.88, |
|
"step": 113000 |
|
}, |
|
{ |
|
"epoch": 5.497432916787756, |
|
"grad_norm": 2.9571547508239746, |
|
"learning_rate": 9.450256708321225e-06, |
|
"loss": 3.877, |
|
"step": 113500 |
|
}, |
|
{ |
|
"epoch": 5.521650682941005, |
|
"grad_norm": 2.706204891204834, |
|
"learning_rate": 9.4478349317059e-06, |
|
"loss": 3.8772, |
|
"step": 114000 |
|
}, |
|
{ |
|
"epoch": 5.545868449094256, |
|
"grad_norm": 2.7605583667755127, |
|
"learning_rate": 9.445413155090576e-06, |
|
"loss": 3.875, |
|
"step": 114500 |
|
}, |
|
{ |
|
"epoch": 5.570086215247506, |
|
"grad_norm": 2.711221933364868, |
|
"learning_rate": 9.44299137847525e-06, |
|
"loss": 3.8712, |
|
"step": 115000 |
|
}, |
|
{ |
|
"epoch": 5.594303981400755, |
|
"grad_norm": 2.9056496620178223, |
|
"learning_rate": 9.440569601859924e-06, |
|
"loss": 3.8639, |
|
"step": 115500 |
|
}, |
|
{ |
|
"epoch": 5.618521747554006, |
|
"grad_norm": 2.7061548233032227, |
|
"learning_rate": 9.4381478252446e-06, |
|
"loss": 3.8677, |
|
"step": 116000 |
|
}, |
|
{ |
|
"epoch": 5.642739513707256, |
|
"grad_norm": 2.9951186180114746, |
|
"learning_rate": 9.435726048629276e-06, |
|
"loss": 3.8667, |
|
"step": 116500 |
|
}, |
|
{ |
|
"epoch": 5.6669572798605055, |
|
"grad_norm": 2.753833293914795, |
|
"learning_rate": 9.43330427201395e-06, |
|
"loss": 3.8669, |
|
"step": 117000 |
|
}, |
|
{ |
|
"epoch": 5.691175046013756, |
|
"grad_norm": 2.8989222049713135, |
|
"learning_rate": 9.430882495398626e-06, |
|
"loss": 3.8751, |
|
"step": 117500 |
|
}, |
|
{ |
|
"epoch": 5.715392812167005, |
|
"grad_norm": 3.0137453079223633, |
|
"learning_rate": 9.4284607187833e-06, |
|
"loss": 3.8706, |
|
"step": 118000 |
|
}, |
|
{ |
|
"epoch": 5.739610578320256, |
|
"grad_norm": 2.7698261737823486, |
|
"learning_rate": 9.426038942167976e-06, |
|
"loss": 3.857, |
|
"step": 118500 |
|
}, |
|
{ |
|
"epoch": 5.763828344473506, |
|
"grad_norm": 2.877211332321167, |
|
"learning_rate": 9.42361716555265e-06, |
|
"loss": 3.8541, |
|
"step": 119000 |
|
}, |
|
{ |
|
"epoch": 5.788046110626755, |
|
"grad_norm": 2.8494150638580322, |
|
"learning_rate": 9.421195388937326e-06, |
|
"loss": 3.8594, |
|
"step": 119500 |
|
}, |
|
{ |
|
"epoch": 5.812263876780006, |
|
"grad_norm": 2.72268009185791, |
|
"learning_rate": 9.418773612322e-06, |
|
"loss": 3.8526, |
|
"step": 120000 |
|
}, |
|
{ |
|
"epoch": 5.836481642933256, |
|
"grad_norm": 3.0423946380615234, |
|
"learning_rate": 9.416351835706675e-06, |
|
"loss": 3.8558, |
|
"step": 120500 |
|
}, |
|
{ |
|
"epoch": 5.860699409086505, |
|
"grad_norm": 2.7056820392608643, |
|
"learning_rate": 9.413930059091351e-06, |
|
"loss": 3.8494, |
|
"step": 121000 |
|
}, |
|
{ |
|
"epoch": 5.884917175239756, |
|
"grad_norm": 2.7295594215393066, |
|
"learning_rate": 9.411508282476025e-06, |
|
"loss": 3.8557, |
|
"step": 121500 |
|
}, |
|
{ |
|
"epoch": 5.909134941393006, |
|
"grad_norm": 2.8661701679229736, |
|
"learning_rate": 9.409086505860701e-06, |
|
"loss": 3.8528, |
|
"step": 122000 |
|
}, |
|
{ |
|
"epoch": 5.9333527075462555, |
|
"grad_norm": 2.8183608055114746, |
|
"learning_rate": 9.406664729245375e-06, |
|
"loss": 3.8511, |
|
"step": 122500 |
|
}, |
|
{ |
|
"epoch": 5.957570473699506, |
|
"grad_norm": 2.974858283996582, |
|
"learning_rate": 9.40424295263005e-06, |
|
"loss": 3.8438, |
|
"step": 123000 |
|
}, |
|
{ |
|
"epoch": 5.981788239852756, |
|
"grad_norm": 2.8071188926696777, |
|
"learning_rate": 9.401821176014725e-06, |
|
"loss": 3.8338, |
|
"step": 123500 |
|
}, |
|
{ |
|
"epoch": 6.006006006006006, |
|
"grad_norm": 2.679610252380371, |
|
"learning_rate": 9.3993993993994e-06, |
|
"loss": 3.8432, |
|
"step": 124000 |
|
}, |
|
{ |
|
"epoch": 6.030223772159256, |
|
"grad_norm": 2.7918217182159424, |
|
"learning_rate": 9.396977622784075e-06, |
|
"loss": 3.8359, |
|
"step": 124500 |
|
}, |
|
{ |
|
"epoch": 6.054441538312506, |
|
"grad_norm": 2.9353878498077393, |
|
"learning_rate": 9.39455584616875e-06, |
|
"loss": 3.8423, |
|
"step": 125000 |
|
}, |
|
{ |
|
"epoch": 6.078659304465756, |
|
"grad_norm": 2.7717785835266113, |
|
"learning_rate": 9.392134069553425e-06, |
|
"loss": 3.8362, |
|
"step": 125500 |
|
}, |
|
{ |
|
"epoch": 6.102877070619006, |
|
"grad_norm": 2.8372817039489746, |
|
"learning_rate": 9.3897122929381e-06, |
|
"loss": 3.8308, |
|
"step": 126000 |
|
}, |
|
{ |
|
"epoch": 6.127094836772256, |
|
"grad_norm": 2.823821544647217, |
|
"learning_rate": 9.387290516322775e-06, |
|
"loss": 3.8314, |
|
"step": 126500 |
|
}, |
|
{ |
|
"epoch": 6.151312602925506, |
|
"grad_norm": 2.7613956928253174, |
|
"learning_rate": 9.38486873970745e-06, |
|
"loss": 3.8317, |
|
"step": 127000 |
|
}, |
|
{ |
|
"epoch": 6.175530369078756, |
|
"grad_norm": 2.745297431945801, |
|
"learning_rate": 9.382446963092125e-06, |
|
"loss": 3.8274, |
|
"step": 127500 |
|
}, |
|
{ |
|
"epoch": 6.1997481352320065, |
|
"grad_norm": 2.7873809337615967, |
|
"learning_rate": 9.3800251864768e-06, |
|
"loss": 3.8203, |
|
"step": 128000 |
|
}, |
|
{ |
|
"epoch": 6.223965901385256, |
|
"grad_norm": 2.871760606765747, |
|
"learning_rate": 9.377603409861475e-06, |
|
"loss": 3.8356, |
|
"step": 128500 |
|
}, |
|
{ |
|
"epoch": 6.248183667538506, |
|
"grad_norm": 2.788484811782837, |
|
"learning_rate": 9.37518163324615e-06, |
|
"loss": 3.8239, |
|
"step": 129000 |
|
}, |
|
{ |
|
"epoch": 6.272401433691757, |
|
"grad_norm": 2.7170634269714355, |
|
"learning_rate": 9.372759856630826e-06, |
|
"loss": 3.8208, |
|
"step": 129500 |
|
}, |
|
{ |
|
"epoch": 6.296619199845006, |
|
"grad_norm": 2.8259615898132324, |
|
"learning_rate": 9.370338080015499e-06, |
|
"loss": 3.8241, |
|
"step": 130000 |
|
}, |
|
{ |
|
"epoch": 6.320836965998256, |
|
"grad_norm": 2.9876561164855957, |
|
"learning_rate": 9.367916303400176e-06, |
|
"loss": 3.8259, |
|
"step": 130500 |
|
}, |
|
{ |
|
"epoch": 6.345054732151507, |
|
"grad_norm": 2.844414710998535, |
|
"learning_rate": 9.365494526784849e-06, |
|
"loss": 3.8173, |
|
"step": 131000 |
|
}, |
|
{ |
|
"epoch": 6.369272498304756, |
|
"grad_norm": 2.8100152015686035, |
|
"learning_rate": 9.363072750169526e-06, |
|
"loss": 3.8245, |
|
"step": 131500 |
|
}, |
|
{ |
|
"epoch": 6.3934902644580065, |
|
"grad_norm": 2.8489105701446533, |
|
"learning_rate": 9.360650973554201e-06, |
|
"loss": 3.8147, |
|
"step": 132000 |
|
}, |
|
{ |
|
"epoch": 6.417708030611257, |
|
"grad_norm": 2.8502120971679688, |
|
"learning_rate": 9.358229196938874e-06, |
|
"loss": 3.8122, |
|
"step": 132500 |
|
}, |
|
{ |
|
"epoch": 6.441925796764506, |
|
"grad_norm": 2.9556784629821777, |
|
"learning_rate": 9.355807420323551e-06, |
|
"loss": 3.8131, |
|
"step": 133000 |
|
}, |
|
{ |
|
"epoch": 6.466143562917757, |
|
"grad_norm": 2.762270212173462, |
|
"learning_rate": 9.353385643708224e-06, |
|
"loss": 3.807, |
|
"step": 133500 |
|
}, |
|
{ |
|
"epoch": 6.490361329071007, |
|
"grad_norm": 2.7611629962921143, |
|
"learning_rate": 9.350963867092901e-06, |
|
"loss": 3.8113, |
|
"step": 134000 |
|
}, |
|
{ |
|
"epoch": 6.514579095224256, |
|
"grad_norm": 2.738227605819702, |
|
"learning_rate": 9.348542090477574e-06, |
|
"loss": 3.8137, |
|
"step": 134500 |
|
}, |
|
{ |
|
"epoch": 6.538796861377507, |
|
"grad_norm": 2.822857618331909, |
|
"learning_rate": 9.34612031386225e-06, |
|
"loss": 3.7986, |
|
"step": 135000 |
|
}, |
|
{ |
|
"epoch": 6.563014627530757, |
|
"grad_norm": 2.731264591217041, |
|
"learning_rate": 9.343698537246925e-06, |
|
"loss": 3.808, |
|
"step": 135500 |
|
}, |
|
{ |
|
"epoch": 6.5872323936840065, |
|
"grad_norm": 2.699312448501587, |
|
"learning_rate": 9.3412767606316e-06, |
|
"loss": 3.803, |
|
"step": 136000 |
|
}, |
|
{ |
|
"epoch": 6.611450159837257, |
|
"grad_norm": 2.8332924842834473, |
|
"learning_rate": 9.338854984016275e-06, |
|
"loss": 3.8153, |
|
"step": 136500 |
|
}, |
|
{ |
|
"epoch": 6.635667925990506, |
|
"grad_norm": 2.8680782318115234, |
|
"learning_rate": 9.33643320740095e-06, |
|
"loss": 3.7969, |
|
"step": 137000 |
|
}, |
|
{ |
|
"epoch": 6.659885692143757, |
|
"grad_norm": 2.844148635864258, |
|
"learning_rate": 9.334011430785625e-06, |
|
"loss": 3.8032, |
|
"step": 137500 |
|
}, |
|
{ |
|
"epoch": 6.684103458297007, |
|
"grad_norm": 2.7583229541778564, |
|
"learning_rate": 9.3315896541703e-06, |
|
"loss": 3.8099, |
|
"step": 138000 |
|
}, |
|
{ |
|
"epoch": 6.708321224450256, |
|
"grad_norm": 2.8120036125183105, |
|
"learning_rate": 9.329167877554975e-06, |
|
"loss": 3.799, |
|
"step": 138500 |
|
}, |
|
{ |
|
"epoch": 6.732538990603507, |
|
"grad_norm": 2.8804004192352295, |
|
"learning_rate": 9.32674610093965e-06, |
|
"loss": 3.7951, |
|
"step": 139000 |
|
}, |
|
{ |
|
"epoch": 6.756756756756757, |
|
"grad_norm": 2.8031482696533203, |
|
"learning_rate": 9.324324324324325e-06, |
|
"loss": 3.7966, |
|
"step": 139500 |
|
}, |
|
{ |
|
"epoch": 6.7809745229100065, |
|
"grad_norm": 2.7140092849731445, |
|
"learning_rate": 9.321902547709e-06, |
|
"loss": 3.7959, |
|
"step": 140000 |
|
}, |
|
{ |
|
"epoch": 6.7809745229100065, |
|
"step": 140000, |
|
"total_flos": 5.861064073050849e+17, |
|
"train_loss": 4.399306026785714, |
|
"train_runtime": 93598.1238, |
|
"train_samples_per_second": 705.829, |
|
"train_steps_per_second": 22.058 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 2064600, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 100, |
|
"save_steps": 1000000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.861064073050849e+17, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|