{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 15.0,
"global_step": 1329,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0007524454477050414,
"grad_norm": 17.096418545106147,
"learning_rate": 5.000000000000001e-07,
"loss": 2.0237,
"step": 1
},
{
"epoch": 0.0015048908954100827,
"grad_norm": 16.78961990783759,
"learning_rate": 1.0000000000000002e-06,
"loss": 2.051,
"step": 2
},
{
"epoch": 0.002257336343115124,
"grad_norm": 16.92710636567725,
"learning_rate": 1.5e-06,
"loss": 2.0056,
"step": 3
},
{
"epoch": 0.0030097817908201654,
"grad_norm": 15.800755993760415,
"learning_rate": 2.0000000000000003e-06,
"loss": 2.0404,
"step": 4
},
{
"epoch": 0.003762227238525207,
"grad_norm": 13.652117284716505,
"learning_rate": 2.5e-06,
"loss": 2.0583,
"step": 5
},
{
"epoch": 0.004514672686230248,
"grad_norm": 10.054709487467143,
"learning_rate": 3e-06,
"loss": 2.0356,
"step": 6
},
{
"epoch": 0.005267118133935289,
"grad_norm": 8.85935164861444,
"learning_rate": 3.5e-06,
"loss": 1.9996,
"step": 7
},
{
"epoch": 0.006019563581640331,
"grad_norm": 6.192373440285258,
"learning_rate": 4.000000000000001e-06,
"loss": 1.9559,
"step": 8
},
{
"epoch": 0.006772009029345372,
"grad_norm": 6.138606674037414,
"learning_rate": 4.5e-06,
"loss": 1.9717,
"step": 9
},
{
"epoch": 0.007524454477050414,
"grad_norm": 5.9344693580666865,
"learning_rate": 5e-06,
"loss": 1.944,
"step": 10
},
{
"epoch": 0.008276899924755455,
"grad_norm": 3.9721360906952476,
"learning_rate": 5.500000000000001e-06,
"loss": 1.9399,
"step": 11
},
{
"epoch": 0.009029345372460496,
"grad_norm": 2.0921084497961955,
"learning_rate": 6e-06,
"loss": 1.9453,
"step": 12
},
{
"epoch": 0.009781790820165538,
"grad_norm": 1.7100531308392566,
"learning_rate": 6.5000000000000004e-06,
"loss": 1.9063,
"step": 13
},
{
"epoch": 0.010534236267870579,
"grad_norm": 2.2394036296709325,
"learning_rate": 7e-06,
"loss": 1.9232,
"step": 14
},
{
"epoch": 0.011286681715575621,
"grad_norm": 1.7257578632049955,
"learning_rate": 7.500000000000001e-06,
"loss": 1.9155,
"step": 15
},
{
"epoch": 0.012039127163280662,
"grad_norm": 1.722804100477846,
"learning_rate": 8.000000000000001e-06,
"loss": 1.9451,
"step": 16
},
{
"epoch": 0.012791572610985704,
"grad_norm": 1.7344221548890328,
"learning_rate": 8.5e-06,
"loss": 1.908,
"step": 17
},
{
"epoch": 0.013544018058690745,
"grad_norm": 1.7978430571139923,
"learning_rate": 9e-06,
"loss": 1.9099,
"step": 18
},
{
"epoch": 0.014296463506395787,
"grad_norm": 1.9445916511349313,
"learning_rate": 9.5e-06,
"loss": 1.9487,
"step": 19
},
{
"epoch": 0.015048908954100828,
"grad_norm": 1.8679455392786308,
"learning_rate": 1e-05,
"loss": 1.9214,
"step": 20
},
{
"epoch": 0.01580135440180587,
"grad_norm": 1.7172456686085904,
"learning_rate": 1.0500000000000001e-05,
"loss": 1.9369,
"step": 21
},
{
"epoch": 0.01655379984951091,
"grad_norm": 1.8542854842097685,
"learning_rate": 1.1000000000000001e-05,
"loss": 1.9434,
"step": 22
},
{
"epoch": 0.01730624529721595,
"grad_norm": 1.7362799341435606,
"learning_rate": 1.15e-05,
"loss": 1.9698,
"step": 23
},
{
"epoch": 0.01805869074492099,
"grad_norm": 2.243932025815967,
"learning_rate": 1.2e-05,
"loss": 1.9407,
"step": 24
},
{
"epoch": 0.018811136192626036,
"grad_norm": 1.6852063818478258,
"learning_rate": 1.25e-05,
"loss": 1.9428,
"step": 25
},
{
"epoch": 0.019563581640331076,
"grad_norm": 1.7722956582976739,
"learning_rate": 1.3000000000000001e-05,
"loss": 1.9257,
"step": 26
},
{
"epoch": 0.020316027088036117,
"grad_norm": 1.8957074955371662,
"learning_rate": 1.3500000000000001e-05,
"loss": 1.9428,
"step": 27
},
{
"epoch": 0.021068472535741158,
"grad_norm": 1.8269875447052017,
"learning_rate": 1.4e-05,
"loss": 1.9796,
"step": 28
},
{
"epoch": 0.0218209179834462,
"grad_norm": 1.7797796080545987,
"learning_rate": 1.45e-05,
"loss": 1.9703,
"step": 29
},
{
"epoch": 0.022573363431151242,
"grad_norm": 1.7083061033037448,
"learning_rate": 1.5000000000000002e-05,
"loss": 1.9746,
"step": 30
},
{
"epoch": 0.023325808878856283,
"grad_norm": 1.7474103331726714,
"learning_rate": 1.55e-05,
"loss": 1.9411,
"step": 31
},
{
"epoch": 0.024078254326561323,
"grad_norm": 1.9610770194176417,
"learning_rate": 1.6000000000000003e-05,
"loss": 1.9565,
"step": 32
},
{
"epoch": 0.024830699774266364,
"grad_norm": 1.7400717189997106,
"learning_rate": 1.65e-05,
"loss": 1.906,
"step": 33
},
{
"epoch": 0.025583145221971408,
"grad_norm": 1.742030372598344,
"learning_rate": 1.7e-05,
"loss": 1.9221,
"step": 34
},
{
"epoch": 0.02633559066967645,
"grad_norm": 1.734995623073503,
"learning_rate": 1.7500000000000002e-05,
"loss": 1.9755,
"step": 35
},
{
"epoch": 0.02708803611738149,
"grad_norm": 1.7002834844499994,
"learning_rate": 1.8e-05,
"loss": 1.9627,
"step": 36
},
{
"epoch": 0.02784048156508653,
"grad_norm": 1.6938832869786309,
"learning_rate": 1.8500000000000002e-05,
"loss": 1.9745,
"step": 37
},
{
"epoch": 0.028592927012791574,
"grad_norm": 1.773813276072807,
"learning_rate": 1.9e-05,
"loss": 1.9644,
"step": 38
},
{
"epoch": 0.029345372460496615,
"grad_norm": 1.8240574364894557,
"learning_rate": 1.95e-05,
"loss": 1.9651,
"step": 39
},
{
"epoch": 0.030097817908201655,
"grad_norm": 1.7331896923527779,
"learning_rate": 2e-05,
"loss": 2.0067,
"step": 40
},
{
"epoch": 0.030850263355906696,
"grad_norm": 1.936921436358863,
"learning_rate": 1.9999970299504145e-05,
"loss": 1.9646,
"step": 41
},
{
"epoch": 0.03160270880361174,
"grad_norm": 1.8063495679903903,
"learning_rate": 1.9999881198192997e-05,
"loss": 1.9706,
"step": 42
},
{
"epoch": 0.03235515425131678,
"grad_norm": 1.7421551057437685,
"learning_rate": 1.9999732696595825e-05,
"loss": 1.9446,
"step": 43
},
{
"epoch": 0.03310759969902182,
"grad_norm": 1.7872066133263222,
"learning_rate": 1.999952479559475e-05,
"loss": 2.0199,
"step": 44
},
{
"epoch": 0.033860045146726865,
"grad_norm": 1.737195180633472,
"learning_rate": 1.999925749642472e-05,
"loss": 2.0073,
"step": 45
},
{
"epoch": 0.0346124905944319,
"grad_norm": 1.7272101739372974,
"learning_rate": 1.999893080067352e-05,
"loss": 1.9924,
"step": 46
},
{
"epoch": 0.035364936042136946,
"grad_norm": 1.9210426604174216,
"learning_rate": 1.9998544710281757e-05,
"loss": 2.0182,
"step": 47
},
{
"epoch": 0.03611738148984198,
"grad_norm": 1.8511645112169914,
"learning_rate": 1.9998099227542843e-05,
"loss": 1.9846,
"step": 48
},
{
"epoch": 0.03686982693754703,
"grad_norm": 1.6634001249944705,
"learning_rate": 1.9997594355102988e-05,
"loss": 1.9936,
"step": 49
},
{
"epoch": 0.03762227238525207,
"grad_norm": 1.8361184979560372,
"learning_rate": 1.999703009596119e-05,
"loss": 2.0103,
"step": 50
},
{
"epoch": 0.03837471783295711,
"grad_norm": 1.9510150288879684,
"learning_rate": 1.99964064534692e-05,
"loss": 1.9836,
"step": 51
},
{
"epoch": 0.03912716328066215,
"grad_norm": 1.9946843661345377,
"learning_rate": 1.9995723431331517e-05,
"loss": 1.9981,
"step": 52
},
{
"epoch": 0.0398796087283672,
"grad_norm": 1.6803458845725914,
"learning_rate": 1.9994981033605364e-05,
"loss": 2.0061,
"step": 53
},
{
"epoch": 0.040632054176072234,
"grad_norm": 1.7086096802441109,
"learning_rate": 1.999417926470065e-05,
"loss": 1.9819,
"step": 54
},
{
"epoch": 0.04138449962377728,
"grad_norm": 1.7364747891419154,
"learning_rate": 1.999331812937997e-05,
"loss": 1.9859,
"step": 55
},
{
"epoch": 0.042136945071482315,
"grad_norm": 1.6739307036412872,
"learning_rate": 1.9992397632758545e-05,
"loss": 1.9483,
"step": 56
},
{
"epoch": 0.04288939051918736,
"grad_norm": 1.5964718102972715,
"learning_rate": 1.999141778030422e-05,
"loss": 1.9758,
"step": 57
},
{
"epoch": 0.0436418359668924,
"grad_norm": 1.7561444246344466,
"learning_rate": 1.999037857783742e-05,
"loss": 2.0112,
"step": 58
},
{
"epoch": 0.04439428141459744,
"grad_norm": 1.6059265072926614,
"learning_rate": 1.9989280031531103e-05,
"loss": 1.9615,
"step": 59
},
{
"epoch": 0.045146726862302484,
"grad_norm": 1.6510903645571706,
"learning_rate": 1.998812214791075e-05,
"loss": 2.0045,
"step": 60
},
{
"epoch": 0.04589917231000752,
"grad_norm": 1.6066769997363797,
"learning_rate": 1.99869049338543e-05,
"loss": 1.9756,
"step": 61
},
{
"epoch": 0.046651617757712566,
"grad_norm": 1.5655663748016024,
"learning_rate": 1.9985628396592122e-05,
"loss": 1.9609,
"step": 62
},
{
"epoch": 0.04740406320541761,
"grad_norm": 1.8178377686365406,
"learning_rate": 1.9984292543706982e-05,
"loss": 1.9856,
"step": 63
},
{
"epoch": 0.04815650865312265,
"grad_norm": 1.692291421012987,
"learning_rate": 1.9982897383133978e-05,
"loss": 2.0037,
"step": 64
},
{
"epoch": 0.04890895410082769,
"grad_norm": 1.5517921983349234,
"learning_rate": 1.9981442923160494e-05,
"loss": 1.9674,
"step": 65
},
{
"epoch": 0.04966139954853273,
"grad_norm": 1.5178168813515758,
"learning_rate": 1.9979929172426175e-05,
"loss": 2.0073,
"step": 66
},
{
"epoch": 0.05041384499623777,
"grad_norm": 1.6763818599312603,
"learning_rate": 1.9978356139922844e-05,
"loss": 2.0179,
"step": 67
},
{
"epoch": 0.051166290443942816,
"grad_norm": 1.6635928541331586,
"learning_rate": 1.9976723834994475e-05,
"loss": 1.9751,
"step": 68
},
{
"epoch": 0.05191873589164785,
"grad_norm": 1.6018201029874186,
"learning_rate": 1.9975032267337122e-05,
"loss": 2.0422,
"step": 69
},
{
"epoch": 0.0526711813393529,
"grad_norm": 2.084362693963691,
"learning_rate": 1.997328144699886e-05,
"loss": 2.0094,
"step": 70
},
{
"epoch": 0.05342362678705794,
"grad_norm": 1.6182691555653421,
"learning_rate": 1.9971471384379737e-05,
"loss": 2.0021,
"step": 71
},
{
"epoch": 0.05417607223476298,
"grad_norm": 1.698517543296973,
"learning_rate": 1.9969602090231704e-05,
"loss": 1.9741,
"step": 72
},
{
"epoch": 0.05492851768246802,
"grad_norm": 1.6510010650550127,
"learning_rate": 1.9967673575658554e-05,
"loss": 1.9967,
"step": 73
},
{
"epoch": 0.05568096313017306,
"grad_norm": 1.6647498522166857,
"learning_rate": 1.996568585211586e-05,
"loss": 2.0084,
"step": 74
},
{
"epoch": 0.056433408577878104,
"grad_norm": 1.6224171047011524,
"learning_rate": 1.9963638931410887e-05,
"loss": 2.0298,
"step": 75
},
{
"epoch": 0.05718585402558315,
"grad_norm": 1.6196376738772422,
"learning_rate": 1.9961532825702553e-05,
"loss": 1.9836,
"step": 76
},
{
"epoch": 0.057938299473288185,
"grad_norm": 1.570626143828964,
"learning_rate": 1.9959367547501335e-05,
"loss": 1.9736,
"step": 77
},
{
"epoch": 0.05869074492099323,
"grad_norm": 1.5895794355418542,
"learning_rate": 1.99571431096692e-05,
"loss": 1.9949,
"step": 78
},
{
"epoch": 0.059443190368698266,
"grad_norm": 1.724690999705577,
"learning_rate": 1.995485952541953e-05,
"loss": 1.9561,
"step": 79
},
{
"epoch": 0.06019563581640331,
"grad_norm": 1.5239592845360483,
"learning_rate": 1.9952516808317036e-05,
"loss": 1.9842,
"step": 80
},
{
"epoch": 0.060948081264108354,
"grad_norm": 1.5253553584853936,
"learning_rate": 1.9950114972277698e-05,
"loss": 1.9917,
"step": 81
},
{
"epoch": 0.06170052671181339,
"grad_norm": 1.5473528562515397,
"learning_rate": 1.9947654031568657e-05,
"loss": 1.9787,
"step": 82
},
{
"epoch": 0.062452972159518436,
"grad_norm": 1.4905031642703375,
"learning_rate": 1.9945134000808143e-05,
"loss": 1.9658,
"step": 83
},
{
"epoch": 0.06320541760722348,
"grad_norm": 1.4632928104263483,
"learning_rate": 1.9942554894965392e-05,
"loss": 1.9866,
"step": 84
},
{
"epoch": 0.06395786305492852,
"grad_norm": 1.4530754111337434,
"learning_rate": 1.9939916729360544e-05,
"loss": 1.9933,
"step": 85
},
{
"epoch": 0.06471030850263355,
"grad_norm": 1.6122191862509654,
"learning_rate": 1.9937219519664567e-05,
"loss": 1.9725,
"step": 86
},
{
"epoch": 0.0654627539503386,
"grad_norm": 1.5679483619635757,
"learning_rate": 1.9934463281899157e-05,
"loss": 1.9809,
"step": 87
},
{
"epoch": 0.06621519939804364,
"grad_norm": 1.510234924913074,
"learning_rate": 1.9931648032436634e-05,
"loss": 1.9635,
"step": 88
},
{
"epoch": 0.06696764484574869,
"grad_norm": 1.6168210779508898,
"learning_rate": 1.992877378799986e-05,
"loss": 1.968,
"step": 89
},
{
"epoch": 0.06772009029345373,
"grad_norm": 1.5652681455207142,
"learning_rate": 1.992584056566214e-05,
"loss": 1.9896,
"step": 90
},
{
"epoch": 0.06847253574115876,
"grad_norm": 1.5418722830557785,
"learning_rate": 1.9922848382847094e-05,
"loss": 1.9777,
"step": 91
},
{
"epoch": 0.0692249811888638,
"grad_norm": 1.6970590522239763,
"learning_rate": 1.9919797257328596e-05,
"loss": 1.9578,
"step": 92
},
{
"epoch": 0.06997742663656885,
"grad_norm": 1.489723924916067,
"learning_rate": 1.9916687207230622e-05,
"loss": 1.9854,
"step": 93
},
{
"epoch": 0.07072987208427389,
"grad_norm": 1.421588923209003,
"learning_rate": 1.9913518251027187e-05,
"loss": 1.9844,
"step": 94
},
{
"epoch": 0.07148231753197894,
"grad_norm": 1.5243917870009003,
"learning_rate": 1.9910290407542202e-05,
"loss": 1.9851,
"step": 95
},
{
"epoch": 0.07223476297968397,
"grad_norm": 1.3738547884528647,
"learning_rate": 1.9907003695949377e-05,
"loss": 1.9773,
"step": 96
},
{
"epoch": 0.07298720842738901,
"grad_norm": 1.5344896015266904,
"learning_rate": 1.9903658135772106e-05,
"loss": 2.0057,
"step": 97
},
{
"epoch": 0.07373965387509406,
"grad_norm": 1.5075974294817263,
"learning_rate": 1.9900253746883347e-05,
"loss": 1.9773,
"step": 98
},
{
"epoch": 0.0744920993227991,
"grad_norm": 1.4473399711974544,
"learning_rate": 1.9896790549505508e-05,
"loss": 1.9667,
"step": 99
},
{
"epoch": 0.07524454477050414,
"grad_norm": 1.4630022776666067,
"learning_rate": 1.9893268564210327e-05,
"loss": 1.9915,
"step": 100
},
{
"epoch": 0.07599699021820917,
"grad_norm": 1.4655014132926005,
"learning_rate": 1.9889687811918744e-05,
"loss": 1.9565,
"step": 101
},
{
"epoch": 0.07674943566591422,
"grad_norm": 1.4409960507792667,
"learning_rate": 1.988604831390078e-05,
"loss": 1.9953,
"step": 102
},
{
"epoch": 0.07750188111361926,
"grad_norm": 1.724010171862991,
"learning_rate": 1.988235009177542e-05,
"loss": 1.9797,
"step": 103
},
{
"epoch": 0.0782543265613243,
"grad_norm": 1.6139029299127305,
"learning_rate": 1.9878593167510466e-05,
"loss": 2.0093,
"step": 104
},
{
"epoch": 0.07900677200902935,
"grad_norm": 1.4506443096043256,
"learning_rate": 1.9874777563422425e-05,
"loss": 1.9918,
"step": 105
},
{
"epoch": 0.0797592174567344,
"grad_norm": 1.6075602372808435,
"learning_rate": 1.987090330217636e-05,
"loss": 1.9468,
"step": 106
},
{
"epoch": 0.08051166290443942,
"grad_norm": 1.4459560012881987,
"learning_rate": 1.9866970406785763e-05,
"loss": 1.9906,
"step": 107
},
{
"epoch": 0.08126410835214447,
"grad_norm": 1.43370819670338,
"learning_rate": 1.9862978900612432e-05,
"loss": 1.9389,
"step": 108
},
{
"epoch": 0.08201655379984951,
"grad_norm": 1.543252465187717,
"learning_rate": 1.9858928807366303e-05,
"loss": 1.9804,
"step": 109
},
{
"epoch": 0.08276899924755456,
"grad_norm": 1.7504351823326842,
"learning_rate": 1.985482015110533e-05,
"loss": 1.9443,
"step": 110
},
{
"epoch": 0.0835214446952596,
"grad_norm": 1.4330770465365406,
"learning_rate": 1.9850652956235347e-05,
"loss": 1.9623,
"step": 111
},
{
"epoch": 0.08427389014296463,
"grad_norm": 1.6952578341057976,
"learning_rate": 1.98464272475099e-05,
"loss": 1.967,
"step": 112
},
{
"epoch": 0.08502633559066967,
"grad_norm": 1.4806228304470834,
"learning_rate": 1.9842143050030115e-05,
"loss": 1.973,
"step": 113
},
{
"epoch": 0.08577878103837472,
"grad_norm": 1.5034957256153032,
"learning_rate": 1.9837800389244553e-05,
"loss": 2.0,
"step": 114
},
{
"epoch": 0.08653122648607976,
"grad_norm": 1.4997131954653153,
"learning_rate": 1.983339929094905e-05,
"loss": 1.9665,
"step": 115
},
{
"epoch": 0.0872836719337848,
"grad_norm": 1.507230600984627,
"learning_rate": 1.9828939781286564e-05,
"loss": 2.0074,
"step": 116
},
{
"epoch": 0.08803611738148984,
"grad_norm": 1.5030928149719422,
"learning_rate": 1.982442188674703e-05,
"loss": 1.9907,
"step": 117
},
{
"epoch": 0.08878856282919488,
"grad_norm": 1.6536434349173452,
"learning_rate": 1.981984563416718e-05,
"loss": 1.9926,
"step": 118
},
{
"epoch": 0.08954100827689992,
"grad_norm": 1.446082778369442,
"learning_rate": 1.981521105073042e-05,
"loss": 1.9679,
"step": 119
},
{
"epoch": 0.09029345372460497,
"grad_norm": 1.4596364425379227,
"learning_rate": 1.9810518163966627e-05,
"loss": 1.9426,
"step": 120
},
{
"epoch": 0.09104589917231001,
"grad_norm": 1.5118653456755002,
"learning_rate": 1.9805767001752016e-05,
"loss": 2.0246,
"step": 121
},
{
"epoch": 0.09179834462001504,
"grad_norm": 1.3973395827138186,
"learning_rate": 1.980095759230896e-05,
"loss": 2.004,
"step": 122
},
{
"epoch": 0.09255079006772009,
"grad_norm": 1.53697983569752,
"learning_rate": 1.9796089964205832e-05,
"loss": 1.9921,
"step": 123
},
{
"epoch": 0.09330323551542513,
"grad_norm": 1.473390484487182,
"learning_rate": 1.9791164146356823e-05,
"loss": 1.9991,
"step": 124
},
{
"epoch": 0.09405568096313018,
"grad_norm": 1.38802323562396,
"learning_rate": 1.978618016802178e-05,
"loss": 1.9636,
"step": 125
},
{
"epoch": 0.09480812641083522,
"grad_norm": 1.4753301278510884,
"learning_rate": 1.978113805880603e-05,
"loss": 1.9927,
"step": 126
},
{
"epoch": 0.09556057185854025,
"grad_norm": 1.4133915863089033,
"learning_rate": 1.9776037848660202e-05,
"loss": 2.0054,
"step": 127
},
{
"epoch": 0.0963130173062453,
"grad_norm": 1.4871247211879552,
"learning_rate": 1.9770879567880046e-05,
"loss": 2.0302,
"step": 128
},
{
"epoch": 0.09706546275395034,
"grad_norm": 1.4746631460143926,
"learning_rate": 1.9765663247106265e-05,
"loss": 1.9814,
"step": 129
},
{
"epoch": 0.09781790820165538,
"grad_norm": 1.393645999910181,
"learning_rate": 1.9760388917324317e-05,
"loss": 1.9873,
"step": 130
},
{
"epoch": 0.09857035364936043,
"grad_norm": 1.4818000799300188,
"learning_rate": 1.975505660986425e-05,
"loss": 1.9508,
"step": 131
},
{
"epoch": 0.09932279909706546,
"grad_norm": 1.517411141800739,
"learning_rate": 1.97496663564005e-05,
"loss": 1.9773,
"step": 132
},
{
"epoch": 0.1000752445447705,
"grad_norm": 1.353023590996782,
"learning_rate": 1.9744218188951698e-05,
"loss": 1.9868,
"step": 133
},
{
"epoch": 0.10082768999247554,
"grad_norm": 1.426306082202656,
"learning_rate": 1.973871213988051e-05,
"loss": 1.9925,
"step": 134
},
{
"epoch": 0.10158013544018059,
"grad_norm": 1.3010698696938292,
"learning_rate": 1.9733148241893403e-05,
"loss": 1.956,
"step": 135
},
{
"epoch": 0.10233258088788563,
"grad_norm": 1.719691530723257,
"learning_rate": 1.972752652804049e-05,
"loss": 1.9853,
"step": 136
},
{
"epoch": 0.10308502633559068,
"grad_norm": 1.4460724387319388,
"learning_rate": 1.972184703171531e-05,
"loss": 2.0284,
"step": 137
},
{
"epoch": 0.1038374717832957,
"grad_norm": 1.4593267990081857,
"learning_rate": 1.9716109786654627e-05,
"loss": 1.9645,
"step": 138
},
{
"epoch": 0.10458991723100075,
"grad_norm": 1.6763547358427662,
"learning_rate": 1.9710314826938254e-05,
"loss": 1.9967,
"step": 139
},
{
"epoch": 0.1053423626787058,
"grad_norm": 1.3203767546242118,
"learning_rate": 1.970446218698882e-05,
"loss": 1.9755,
"step": 140
},
{
"epoch": 0.10609480812641084,
"grad_norm": 1.7712614335540453,
"learning_rate": 1.969855190157159e-05,
"loss": 1.9548,
"step": 141
},
{
"epoch": 0.10684725357411588,
"grad_norm": 1.420530068603303,
"learning_rate": 1.9692584005794245e-05,
"loss": 1.9995,
"step": 142
},
{
"epoch": 0.10759969902182091,
"grad_norm": 1.624567509602698,
"learning_rate": 1.9686558535106675e-05,
"loss": 1.965,
"step": 143
},
{
"epoch": 0.10835214446952596,
"grad_norm": 1.4508278830996981,
"learning_rate": 1.9680475525300778e-05,
"loss": 1.9862,
"step": 144
},
{
"epoch": 0.109104589917231,
"grad_norm": 1.7991217889697975,
"learning_rate": 1.967433501251023e-05,
"loss": 1.9246,
"step": 145
},
{
"epoch": 0.10985703536493605,
"grad_norm": 1.4636775064716372,
"learning_rate": 1.9668137033210292e-05,
"loss": 1.9678,
"step": 146
},
{
"epoch": 0.11060948081264109,
"grad_norm": 1.4173782410652398,
"learning_rate": 1.9661881624217573e-05,
"loss": 1.9717,
"step": 147
},
{
"epoch": 0.11136192626034612,
"grad_norm": 1.6077867999863473,
"learning_rate": 1.9655568822689825e-05,
"loss": 1.9576,
"step": 148
},
{
"epoch": 0.11211437170805116,
"grad_norm": 1.3538835889395469,
"learning_rate": 1.964919866612571e-05,
"loss": 1.9949,
"step": 149
},
{
"epoch": 0.11286681715575621,
"grad_norm": 1.5350926403645768,
"learning_rate": 1.9642771192364593e-05,
"loss": 1.968,
"step": 150
},
{
"epoch": 0.11361926260346125,
"grad_norm": 1.4779818342434305,
"learning_rate": 1.9636286439586303e-05,
"loss": 1.9418,
"step": 151
},
{
"epoch": 0.1143717080511663,
"grad_norm": 1.4297054967030913,
"learning_rate": 1.962974444631092e-05,
"loss": 1.996,
"step": 152
},
{
"epoch": 0.11512415349887133,
"grad_norm": 1.7477944077220016,
"learning_rate": 1.9623145251398527e-05,
"loss": 1.9806,
"step": 153
},
{
"epoch": 0.11587659894657637,
"grad_norm": 1.4271326001994253,
"learning_rate": 1.9616488894049e-05,
"loss": 1.9737,
"step": 154
},
{
"epoch": 0.11662904439428141,
"grad_norm": 1.6919046010850303,
"learning_rate": 1.9609775413801763e-05,
"loss": 1.9659,
"step": 155
},
{
"epoch": 0.11738148984198646,
"grad_norm": 1.5166891890055354,
"learning_rate": 1.9603004850535547e-05,
"loss": 1.9896,
"step": 156
},
{
"epoch": 0.1181339352896915,
"grad_norm": 1.4162493374760687,
"learning_rate": 1.9596177244468177e-05,
"loss": 1.9533,
"step": 157
},
{
"epoch": 0.11888638073739653,
"grad_norm": 1.5932466516182726,
"learning_rate": 1.9589292636156306e-05,
"loss": 1.9971,
"step": 158
},
{
"epoch": 0.11963882618510158,
"grad_norm": 1.4311081193079844,
"learning_rate": 1.9582351066495193e-05,
"loss": 1.9907,
"step": 159
},
{
"epoch": 0.12039127163280662,
"grad_norm": 1.6200215222027239,
"learning_rate": 1.957535257671845e-05,
"loss": 1.9715,
"step": 160
},
{
"epoch": 0.12114371708051166,
"grad_norm": 1.418048709798436,
"learning_rate": 1.95682972083978e-05,
"loss": 2.003,
"step": 161
},
{
"epoch": 0.12189616252821671,
"grad_norm": 1.3587237207905725,
"learning_rate": 1.9561185003442827e-05,
"loss": 1.9517,
"step": 162
},
{
"epoch": 0.12264860797592174,
"grad_norm": 1.5231409653128272,
"learning_rate": 1.9554016004100734e-05,
"loss": 1.9692,
"step": 163
},
{
"epoch": 0.12340105342362678,
"grad_norm": 1.3360980130507736,
"learning_rate": 1.9546790252956093e-05,
"loss": 1.9653,
"step": 164
},
{
"epoch": 0.12415349887133183,
"grad_norm": 1.4347304810397616,
"learning_rate": 1.9539507792930582e-05,
"loss": 1.9584,
"step": 165
},
{
"epoch": 0.12490594431903687,
"grad_norm": 1.307902604997509,
"learning_rate": 1.9532168667282732e-05,
"loss": 2.0186,
"step": 166
},
{
"epoch": 0.1256583897667419,
"grad_norm": 1.4036550292733128,
"learning_rate": 1.952477291960768e-05,
"loss": 1.9539,
"step": 167
},
{
"epoch": 0.12641083521444696,
"grad_norm": 1.3433691460739932,
"learning_rate": 1.9517320593836895e-05,
"loss": 1.9862,
"step": 168
},
{
"epoch": 0.127163280662152,
"grad_norm": 1.3464769845536433,
"learning_rate": 1.9509811734237938e-05,
"loss": 1.9712,
"step": 169
},
{
"epoch": 0.12791572610985705,
"grad_norm": 1.2610468906674772,
"learning_rate": 1.9502246385414177e-05,
"loss": 1.9326,
"step": 170
},
{
"epoch": 0.12866817155756208,
"grad_norm": 1.2662792900134903,
"learning_rate": 1.9494624592304536e-05,
"loss": 1.9328,
"step": 171
},
{
"epoch": 0.1294206170052671,
"grad_norm": 1.3762307251278898,
"learning_rate": 1.948694640018322e-05,
"loss": 1.9786,
"step": 172
},
{
"epoch": 0.13017306245297217,
"grad_norm": 1.3594099609716863,
"learning_rate": 1.947921185465945e-05,
"loss": 1.9499,
"step": 173
},
{
"epoch": 0.1309255079006772,
"grad_norm": 1.4142393193371317,
"learning_rate": 1.94714210016772e-05,
"loss": 1.9854,
"step": 174
},
{
"epoch": 0.13167795334838225,
"grad_norm": 1.373292466735073,
"learning_rate": 1.9463573887514902e-05,
"loss": 1.9798,
"step": 175
},
{
"epoch": 0.13243039879608728,
"grad_norm": 1.3592294507253506,
"learning_rate": 1.9455670558785195e-05,
"loss": 1.9703,
"step": 176
},
{
"epoch": 0.13318284424379231,
"grad_norm": 1.4627280891447714,
"learning_rate": 1.9447711062434633e-05,
"loss": 1.9847,
"step": 177
},
{
"epoch": 0.13393528969149737,
"grad_norm": 1.3716160171894403,
"learning_rate": 1.943969544574342e-05,
"loss": 2.0015,
"step": 178
},
{
"epoch": 0.1346877351392024,
"grad_norm": 1.4484715232722631,
"learning_rate": 1.9431623756325112e-05,
"loss": 1.9459,
"step": 179
},
{
"epoch": 0.13544018058690746,
"grad_norm": 1.3840022801127387,
"learning_rate": 1.942349604212634e-05,
"loss": 2.0248,
"step": 180
},
{
"epoch": 0.1361926260346125,
"grad_norm": 1.5293566822957931,
"learning_rate": 1.9415312351426533e-05,
"loss": 1.9755,
"step": 181
},
{
"epoch": 0.13694507148231752,
"grad_norm": 1.326871914981708,
"learning_rate": 1.940707273283763e-05,
"loss": 1.9425,
"step": 182
},
{
"epoch": 0.13769751693002258,
"grad_norm": 1.366448795054707,
"learning_rate": 1.9398777235303783e-05,
"loss": 2.0004,
"step": 183
},
{
"epoch": 0.1384499623777276,
"grad_norm": 1.3718393929467805,
"learning_rate": 1.9390425908101063e-05,
"loss": 1.9602,
"step": 184
},
{
"epoch": 0.13920240782543267,
"grad_norm": 1.4846607216778775,
"learning_rate": 1.938201880083719e-05,
"loss": 1.9339,
"step": 185
},
{
"epoch": 0.1399548532731377,
"grad_norm": 1.3668331563239702,
"learning_rate": 1.9373555963451213e-05,
"loss": 1.957,
"step": 186
},
{
"epoch": 0.14070729872084273,
"grad_norm": 1.387556800457941,
"learning_rate": 1.9365037446213216e-05,
"loss": 1.9595,
"step": 187
},
{
"epoch": 0.14145974416854779,
"grad_norm": 1.3155571655692533,
"learning_rate": 1.9356463299724047e-05,
"loss": 2.0023,
"step": 188
},
{
"epoch": 0.14221218961625282,
"grad_norm": 1.332446834708362,
"learning_rate": 1.9347833574914985e-05,
"loss": 1.9358,
"step": 189
},
{
"epoch": 0.14296463506395787,
"grad_norm": 1.3118920943387449,
"learning_rate": 1.9339148323047447e-05,
"loss": 1.9729,
"step": 190
},
{
"epoch": 0.1437170805116629,
"grad_norm": 1.2748859965634367,
"learning_rate": 1.933040759571269e-05,
"loss": 1.95,
"step": 191
},
{
"epoch": 0.14446952595936793,
"grad_norm": 1.418032984331189,
"learning_rate": 1.932161144483151e-05,
"loss": 1.9824,
"step": 192
},
{
"epoch": 0.145221971407073,
"grad_norm": 1.3589133412689693,
"learning_rate": 1.9312759922653908e-05,
"loss": 1.9673,
"step": 193
},
{
"epoch": 0.14597441685477802,
"grad_norm": 1.4020108300799787,
"learning_rate": 1.9303853081758803e-05,
"loss": 1.9509,
"step": 194
},
{
"epoch": 0.14672686230248308,
"grad_norm": 1.2663387127270673,
"learning_rate": 1.9294890975053713e-05,
"loss": 1.9625,
"step": 195
},
{
"epoch": 0.1474793077501881,
"grad_norm": 1.3499575309398055,
"learning_rate": 1.9285873655774447e-05,
"loss": 1.9512,
"step": 196
},
{
"epoch": 0.14823175319789314,
"grad_norm": 1.393307418562387,
"learning_rate": 1.927680117748477e-05,
"loss": 1.9705,
"step": 197
},
{
"epoch": 0.1489841986455982,
"grad_norm": 1.2762238398109587,
"learning_rate": 1.9267673594076103e-05,
"loss": 1.9382,
"step": 198
},
{
"epoch": 0.14973664409330323,
"grad_norm": 1.632268152977179,
"learning_rate": 1.92584909597672e-05,
"loss": 1.964,
"step": 199
},
{
"epoch": 0.1504890895410083,
"grad_norm": 1.3595752942104353,
"learning_rate": 1.9249253329103817e-05,
"loss": 1.9949,
"step": 200
},
{
"epoch": 0.15124153498871332,
"grad_norm": 1.35780526534179,
"learning_rate": 1.92399607569584e-05,
"loss": 1.9746,
"step": 201
},
{
"epoch": 0.15199398043641835,
"grad_norm": 1.3661250778688059,
"learning_rate": 1.923061329852974e-05,
"loss": 1.9495,
"step": 202
},
{
"epoch": 0.1527464258841234,
"grad_norm": 1.3966743552588,
"learning_rate": 1.9221211009342677e-05,
"loss": 1.9697,
"step": 203
},
{
"epoch": 0.15349887133182843,
"grad_norm": 1.460303781755008,
"learning_rate": 1.921175394524773e-05,
"loss": 1.9464,
"step": 204
},
{
"epoch": 0.1542513167795335,
"grad_norm": 1.2452806215453123,
"learning_rate": 1.920224216242081e-05,
"loss": 1.9743,
"step": 205
},
{
"epoch": 0.15500376222723852,
"grad_norm": 1.3890371419429213,
"learning_rate": 1.9192675717362847e-05,
"loss": 1.9865,
"step": 206
},
{
"epoch": 0.15575620767494355,
"grad_norm": 1.323269308523132,
"learning_rate": 1.918305466689947e-05,
"loss": 1.9594,
"step": 207
},
{
"epoch": 0.1565086531226486,
"grad_norm": 1.3384928844831772,
"learning_rate": 1.917337906818067e-05,
"loss": 1.9769,
"step": 208
},
{
"epoch": 0.15726109857035364,
"grad_norm": 1.384565925877069,
"learning_rate": 1.916364897868047e-05,
"loss": 1.9958,
"step": 209
},
{
"epoch": 0.1580135440180587,
"grad_norm": 1.459761434339266,
"learning_rate": 1.9153864456196565e-05,
"loss": 1.9396,
"step": 210
},
{
"epoch": 0.15876598946576373,
"grad_norm": 1.3331685781473661,
"learning_rate": 1.9144025558849987e-05,
"loss": 1.9991,
"step": 211
},
{
"epoch": 0.1595184349134688,
"grad_norm": 1.4210079121447194,
"learning_rate": 1.913413234508476e-05,
"loss": 1.9761,
"step": 212
},
{
"epoch": 0.16027088036117382,
"grad_norm": 1.4252252661077878,
"learning_rate": 1.912418487366756e-05,
"loss": 1.9372,
"step": 213
},
{
"epoch": 0.16102332580887885,
"grad_norm": 1.296248823579948,
"learning_rate": 1.9114183203687352e-05,
"loss": 1.9458,
"step": 214
},
{
"epoch": 0.1617757712565839,
"grad_norm": 1.4165301057522213,
"learning_rate": 1.9104127394555044e-05,
"loss": 1.9714,
"step": 215
},
{
"epoch": 0.16252821670428894,
"grad_norm": 1.423492160289639,
"learning_rate": 1.9094017506003144e-05,
"loss": 1.9651,
"step": 216
},
{
"epoch": 0.163280662151994,
"grad_norm": 1.2792606672593925,
"learning_rate": 1.908385359808539e-05,
"loss": 1.9521,
"step": 217
},
{
"epoch": 0.16403310759969902,
"grad_norm": 1.3766953881913706,
"learning_rate": 1.9073635731176406e-05,
"loss": 1.9434,
"step": 218
},
{
"epoch": 0.16478555304740405,
"grad_norm": 1.2920855336517059,
"learning_rate": 1.906336396597133e-05,
"loss": 1.9361,
"step": 219
},
{
"epoch": 0.1655379984951091,
"grad_norm": 1.4153757635924926,
"learning_rate": 1.905303836348547e-05,
"loss": 1.9791,
"step": 220
},
{
"epoch": 0.16629044394281414,
"grad_norm": 1.4107290451749257,
"learning_rate": 1.904265898505393e-05,
"loss": 1.965,
"step": 221
},
{
"epoch": 0.1670428893905192,
"grad_norm": 1.4240861145992867,
"learning_rate": 1.9032225892331238e-05,
"loss": 1.9617,
"step": 222
},
{
"epoch": 0.16779533483822423,
"grad_norm": 1.3760759342375255,
"learning_rate": 1.902173914729101e-05,
"loss": 1.9406,
"step": 223
},
{
"epoch": 0.16854778028592926,
"grad_norm": 1.3864951118357907,
"learning_rate": 1.9011198812225548e-05,
"loss": 2.0006,
"step": 224
},
{
"epoch": 0.16930022573363432,
"grad_norm": 1.3437298356753506,
"learning_rate": 1.9000604949745484e-05,
"loss": 1.9626,
"step": 225
},
{
"epoch": 0.17005267118133935,
"grad_norm": 1.340034285680908,
"learning_rate": 1.898995762277942e-05,
"loss": 1.9527,
"step": 226
},
{
"epoch": 0.1708051166290444,
"grad_norm": 1.3748766841864157,
"learning_rate": 1.8979256894573525e-05,
"loss": 1.9518,
"step": 227
},
{
"epoch": 0.17155756207674944,
"grad_norm": 1.2660589514505463,
"learning_rate": 1.896850282869119e-05,
"loss": 1.9573,
"step": 228
},
{
"epoch": 0.17231000752445447,
"grad_norm": 1.3681973319531922,
"learning_rate": 1.8957695489012635e-05,
"loss": 1.9401,
"step": 229
},
{
"epoch": 0.17306245297215953,
"grad_norm": 1.2831175622200304,
"learning_rate": 1.8946834939734526e-05,
"loss": 1.9693,
"step": 230
},
{
"epoch": 0.17381489841986456,
"grad_norm": 1.2861704316680282,
"learning_rate": 1.8935921245369606e-05,
"loss": 1.9679,
"step": 231
},
{
"epoch": 0.1745673438675696,
"grad_norm": 1.3549676877355488,
"learning_rate": 1.8924954470746296e-05,
"loss": 1.975,
"step": 232
},
{
"epoch": 0.17531978931527464,
"grad_norm": 1.320189966915083,
"learning_rate": 1.8913934681008328e-05,
"loss": 1.9555,
"step": 233
},
{
"epoch": 0.17607223476297967,
"grad_norm": 1.3238385143038385,
"learning_rate": 1.890286194161435e-05,
"loss": 1.9665,
"step": 234
},
{
"epoch": 0.17682468021068473,
"grad_norm": 1.281857600947946,
"learning_rate": 1.8891736318337525e-05,
"loss": 1.9601,
"step": 235
},
{
"epoch": 0.17757712565838976,
"grad_norm": 1.3494601456589077,
"learning_rate": 1.8880557877265165e-05,
"loss": 1.9993,
"step": 236
},
{
"epoch": 0.17832957110609482,
"grad_norm": 1.3216996781353922,
"learning_rate": 1.8869326684798315e-05,
"loss": 1.9762,
"step": 237
},
{
"epoch": 0.17908201655379985,
"grad_norm": 1.33448049656044,
"learning_rate": 1.885804280765137e-05,
"loss": 1.9437,
"step": 238
},
{
"epoch": 0.17983446200150488,
"grad_norm": 1.3394485079126583,
"learning_rate": 1.8846706312851687e-05,
"loss": 1.9729,
"step": 239
},
{
"epoch": 0.18058690744920994,
"grad_norm": 1.3004755169724092,
"learning_rate": 1.8835317267739158e-05,
"loss": 1.9729,
"step": 240
},
{
"epoch": 0.18133935289691497,
"grad_norm": 1.461123883389385,
"learning_rate": 1.882387573996585e-05,
"loss": 1.9704,
"step": 241
},
{
"epoch": 0.18209179834462003,
"grad_norm": 1.2991306164664274,
"learning_rate": 1.881238179749557e-05,
"loss": 1.9723,
"step": 242
},
{
"epoch": 0.18284424379232506,
"grad_norm": 1.315953898249,
"learning_rate": 1.8800835508603478e-05,
"loss": 1.9506,
"step": 243
},
{
"epoch": 0.1835966892400301,
"grad_norm": 1.327714953953187,
"learning_rate": 1.878923694187567e-05,
"loss": 1.9413,
"step": 244
},
{
"epoch": 0.18434913468773514,
"grad_norm": 1.3498393780924753,
"learning_rate": 1.8777586166208786e-05,
"loss": 1.9278,
"step": 245
},
{
"epoch": 0.18510158013544017,
"grad_norm": 1.3716018880467373,
"learning_rate": 1.8765883250809586e-05,
"loss": 2.0071,
"step": 246
},
{
"epoch": 0.18585402558314523,
"grad_norm": 1.4448584035733611,
"learning_rate": 1.8754128265194554e-05,
"loss": 1.9275,
"step": 247
},
{
"epoch": 0.18660647103085026,
"grad_norm": 1.3921477626217262,
"learning_rate": 1.8742321279189465e-05,
"loss": 1.9996,
"step": 248
},
{
"epoch": 0.1873589164785553,
"grad_norm": 1.364450633083321,
"learning_rate": 1.873046236292899e-05,
"loss": 1.9688,
"step": 249
},
{
"epoch": 0.18811136192626035,
"grad_norm": 1.4280902502030084,
"learning_rate": 1.871855158685626e-05,
"loss": 1.9456,
"step": 250
},
{
"epoch": 0.18886380737396538,
"grad_norm": 1.3573333141884782,
"learning_rate": 1.870658902172248e-05,
"loss": 1.9568,
"step": 251
},
{
"epoch": 0.18961625282167044,
"grad_norm": 1.3545988893629721,
"learning_rate": 1.869457473858646e-05,
"loss": 1.9699,
"step": 252
},
{
"epoch": 0.19036869826937547,
"grad_norm": 1.3292266931234964,
"learning_rate": 1.868250880881424e-05,
"loss": 1.9843,
"step": 253
},
{
"epoch": 0.1911211437170805,
"grad_norm": 1.3002459253975567,
"learning_rate": 1.867039130407864e-05,
"loss": 1.9417,
"step": 254
},
{
"epoch": 0.19187358916478556,
"grad_norm": 1.4899267775067526,
"learning_rate": 1.8658222296358834e-05,
"loss": 1.9271,
"step": 255
},
{
"epoch": 0.1926260346124906,
"grad_norm": 1.3207143351158235,
"learning_rate": 1.864600185793994e-05,
"loss": 1.9753,
"step": 256
},
{
"epoch": 0.19337848006019565,
"grad_norm": 1.2787262608127998,
"learning_rate": 1.8633730061412575e-05,
"loss": 1.9514,
"step": 257
},
{
"epoch": 0.19413092550790068,
"grad_norm": 1.2930330417800022,
"learning_rate": 1.8621406979672422e-05,
"loss": 1.976,
"step": 258
},
{
"epoch": 0.1948833709556057,
"grad_norm": 1.3752952420928894,
"learning_rate": 1.8609032685919815e-05,
"loss": 2.0012,
"step": 259
},
{
"epoch": 0.19563581640331076,
"grad_norm": 1.3555254434813793,
"learning_rate": 1.8596607253659283e-05,
"loss": 1.8956,
"step": 260
},
{
"epoch": 0.1963882618510158,
"grad_norm": 1.2317803627439476,
"learning_rate": 1.8584130756699122e-05,
"loss": 1.9731,
"step": 261
},
{
"epoch": 0.19714070729872085,
"grad_norm": 1.2449720211347859,
"learning_rate": 1.857160326915097e-05,
"loss": 1.9472,
"step": 262
},
{
"epoch": 0.19789315274642588,
"grad_norm": 1.3126585601535266,
"learning_rate": 1.8559024865429336e-05,
"loss": 1.9198,
"step": 263
},
{
"epoch": 0.1986455981941309,
"grad_norm": 1.3348867900532828,
"learning_rate": 1.854639562025119e-05,
"loss": 1.983,
"step": 264
},
{
"epoch": 0.19939804364183597,
"grad_norm": 1.266325420705586,
"learning_rate": 1.85337156086355e-05,
"loss": 1.9564,
"step": 265
},
{
"epoch": 0.200150489089541,
"grad_norm": 1.2090376605707414,
"learning_rate": 1.8520984905902798e-05,
"loss": 1.9579,
"step": 266
},
{
"epoch": 0.20090293453724606,
"grad_norm": 1.2923682057430215,
"learning_rate": 1.8508203587674713e-05,
"loss": 1.9345,
"step": 267
},
{
"epoch": 0.2016553799849511,
"grad_norm": 1.522859291241985,
"learning_rate": 1.8495371729873545e-05,
"loss": 1.9725,
"step": 268
},
{
"epoch": 0.20240782543265612,
"grad_norm": 1.312711748287889,
"learning_rate": 1.8482489408721804e-05,
"loss": 1.9848,
"step": 269
},
{
"epoch": 0.20316027088036118,
"grad_norm": 1.3132635623231081,
"learning_rate": 1.8469556700741755e-05,
"loss": 1.9479,
"step": 270
},
{
"epoch": 0.2039127163280662,
"grad_norm": 1.3221986830633619,
"learning_rate": 1.845657368275496e-05,
"loss": 1.9309,
"step": 271
},
{
"epoch": 0.20466516177577126,
"grad_norm": 1.244194299321954,
"learning_rate": 1.8443540431881842e-05,
"loss": 1.9317,
"step": 272
},
{
"epoch": 0.2054176072234763,
"grad_norm": 1.4715910791859022,
"learning_rate": 1.8430457025541203e-05,
"loss": 1.9505,
"step": 273
},
{
"epoch": 0.20617005267118135,
"grad_norm": 1.2612257292372675,
"learning_rate": 1.841732354144977e-05,
"loss": 1.9429,
"step": 274
},
{
"epoch": 0.20692249811888638,
"grad_norm": 1.377695965427535,
"learning_rate": 1.8404140057621735e-05,
"loss": 1.9736,
"step": 275
},
{
"epoch": 0.2076749435665914,
"grad_norm": 1.2559142967446475,
"learning_rate": 1.8390906652368313e-05,
"loss": 1.978,
"step": 276
},
{
"epoch": 0.20842738901429647,
"grad_norm": 1.4140381666883666,
"learning_rate": 1.8377623404297236e-05,
"loss": 1.951,
"step": 277
},
{
"epoch": 0.2091798344620015,
"grad_norm": 1.365019370426324,
"learning_rate": 1.8364290392312318e-05,
"loss": 1.9298,
"step": 278
},
{
"epoch": 0.20993227990970656,
"grad_norm": 1.2445598925870334,
"learning_rate": 1.8350907695612963e-05,
"loss": 1.9482,
"step": 279
},
{
"epoch": 0.2106847253574116,
"grad_norm": 1.23747230832893,
"learning_rate": 1.833747539369373e-05,
"loss": 1.9587,
"step": 280
},
{
"epoch": 0.21143717080511662,
"grad_norm": 1.271259198996052,
"learning_rate": 1.8323993566343817e-05,
"loss": 1.9559,
"step": 281
},
{
"epoch": 0.21218961625282168,
"grad_norm": 1.2428268642896483,
"learning_rate": 1.8310462293646617e-05,
"loss": 1.9618,
"step": 282
},
{
"epoch": 0.2129420617005267,
"grad_norm": 1.2181992804364723,
"learning_rate": 1.829688165597923e-05,
"loss": 1.9586,
"step": 283
},
{
"epoch": 0.21369450714823177,
"grad_norm": 1.3137573309258301,
"learning_rate": 1.8283251734011994e-05,
"loss": 1.9574,
"step": 284
},
{
"epoch": 0.2144469525959368,
"grad_norm": 1.2817918023787924,
"learning_rate": 1.8269572608707995e-05,
"loss": 1.9759,
"step": 285
},
{
"epoch": 0.21519939804364183,
"grad_norm": 1.2398310606282485,
"learning_rate": 1.8255844361322594e-05,
"loss": 1.9714,
"step": 286
},
{
"epoch": 0.21595184349134688,
"grad_norm": 1.2474818854161847,
"learning_rate": 1.8242067073402943e-05,
"loss": 1.9257,
"step": 287
},
{
"epoch": 0.21670428893905191,
"grad_norm": 1.267500764627139,
"learning_rate": 1.8228240826787497e-05,
"loss": 1.9289,
"step": 288
},
{
"epoch": 0.21745673438675697,
"grad_norm": 1.2937076406106591,
"learning_rate": 1.821436570360553e-05,
"loss": 1.9376,
"step": 289
},
{
"epoch": 0.218209179834462,
"grad_norm": 1.381344300266341,
"learning_rate": 1.8200441786276655e-05,
"loss": 1.9614,
"step": 290
},
{
"epoch": 0.21896162528216703,
"grad_norm": 1.3292835031009747,
"learning_rate": 1.818646915751032e-05,
"loss": 1.9527,
"step": 291
},
{
"epoch": 0.2197140707298721,
"grad_norm": 1.4862295273448796,
"learning_rate": 1.8172447900305327e-05,
"loss": 1.9242,
"step": 292
},
{
"epoch": 0.22046651617757712,
"grad_norm": 1.255910434100239,
"learning_rate": 1.8158378097949327e-05,
"loss": 1.9511,
"step": 293
},
{
"epoch": 0.22121896162528218,
"grad_norm": 1.3833877789938933,
"learning_rate": 1.814425983401835e-05,
"loss": 1.9582,
"step": 294
},
{
"epoch": 0.2219714070729872,
"grad_norm": 1.3270707623485452,
"learning_rate": 1.813009319237628e-05,
"loss": 1.9513,
"step": 295
},
{
"epoch": 0.22272385252069224,
"grad_norm": 1.344077280090676,
"learning_rate": 1.8115878257174372e-05,
"loss": 1.9561,
"step": 296
},
{
"epoch": 0.2234762979683973,
"grad_norm": 1.4849174595312744,
"learning_rate": 1.8101615112850752e-05,
"loss": 1.9603,
"step": 297
},
{
"epoch": 0.22422874341610233,
"grad_norm": 1.2374589107637808,
"learning_rate": 1.8087303844129915e-05,
"loss": 1.9213,
"step": 298
},
{
"epoch": 0.22498118886380739,
"grad_norm": 1.4205139369103519,
"learning_rate": 1.8072944536022213e-05,
"loss": 1.9418,
"step": 299
},
{
"epoch": 0.22573363431151242,
"grad_norm": 1.369836371776786,
"learning_rate": 1.805853727382336e-05,
"loss": 1.9757,
"step": 300
},
{
"epoch": 0.22648607975921745,
"grad_norm": 1.401869147648205,
"learning_rate": 1.8044082143113924e-05,
"loss": 1.9753,
"step": 301
},
{
"epoch": 0.2272385252069225,
"grad_norm": 1.2735801265219953,
"learning_rate": 1.8029579229758812e-05,
"loss": 1.9312,
"step": 302
},
{
"epoch": 0.22799097065462753,
"grad_norm": 1.2623222549351667,
"learning_rate": 1.8015028619906774e-05,
"loss": 1.9143,
"step": 303
},
{
"epoch": 0.2287434161023326,
"grad_norm": 1.3462626773361908,
"learning_rate": 1.8000430399989866e-05,
"loss": 1.9341,
"step": 304
},
{
"epoch": 0.22949586155003762,
"grad_norm": 1.4397841087420657,
"learning_rate": 1.798578465672297e-05,
"loss": 1.9592,
"step": 305
},
{
"epoch": 0.23024830699774265,
"grad_norm": 1.3503911703867,
"learning_rate": 1.797109147710325e-05,
"loss": 1.9369,
"step": 306
},
{
"epoch": 0.2310007524454477,
"grad_norm": 1.397792890821999,
"learning_rate": 1.7956350948409655e-05,
"loss": 1.9321,
"step": 307
},
{
"epoch": 0.23175319789315274,
"grad_norm": 1.2691980019923077,
"learning_rate": 1.7941563158202376e-05,
"loss": 1.9346,
"step": 308
},
{
"epoch": 0.2325056433408578,
"grad_norm": 1.3083945925685703,
"learning_rate": 1.7926728194322364e-05,
"loss": 1.9404,
"step": 309
},
{
"epoch": 0.23325808878856283,
"grad_norm": 1.3815189471454836,
"learning_rate": 1.7911846144890772e-05,
"loss": 1.8911,
"step": 310
},
{
"epoch": 0.23401053423626786,
"grad_norm": 1.2751085156167634,
"learning_rate": 1.7896917098308448e-05,
"loss": 1.9424,
"step": 311
},
{
"epoch": 0.23476297968397292,
"grad_norm": 1.4034284554651721,
"learning_rate": 1.7881941143255414e-05,
"loss": 1.9349,
"step": 312
},
{
"epoch": 0.23551542513167795,
"grad_norm": 1.183656645638489,
"learning_rate": 1.7866918368690324e-05,
"loss": 1.9201,
"step": 313
},
{
"epoch": 0.236267870579383,
"grad_norm": 1.4035327567401452,
"learning_rate": 1.7851848863849948e-05,
"loss": 1.9269,
"step": 314
},
{
"epoch": 0.23702031602708803,
"grad_norm": 1.4146981077419156,
"learning_rate": 1.7836732718248644e-05,
"loss": 1.9547,
"step": 315
},
{
"epoch": 0.23777276147479307,
"grad_norm": 1.2606770091287702,
"learning_rate": 1.782157002167781e-05,
"loss": 1.9114,
"step": 316
},
{
"epoch": 0.23852520692249812,
"grad_norm": 1.264942150087034,
"learning_rate": 1.780636086420537e-05,
"loss": 1.957,
"step": 317
},
{
"epoch": 0.23927765237020315,
"grad_norm": 1.2832089391613668,
"learning_rate": 1.779110533617523e-05,
"loss": 1.9352,
"step": 318
},
{
"epoch": 0.2400300978179082,
"grad_norm": 1.2438669676175231,
"learning_rate": 1.7775803528206736e-05,
"loss": 1.9316,
"step": 319
},
{
"epoch": 0.24078254326561324,
"grad_norm": 1.28050205514043,
"learning_rate": 1.776045553119415e-05,
"loss": 1.9358,
"step": 320
},
{
"epoch": 0.24153498871331827,
"grad_norm": 1.4850152024836958,
"learning_rate": 1.774506143630609e-05,
"loss": 1.9448,
"step": 321
},
{
"epoch": 0.24228743416102333,
"grad_norm": 1.2336418678705168,
"learning_rate": 1.7729621334985005e-05,
"loss": 1.9548,
"step": 322
},
{
"epoch": 0.24303987960872836,
"grad_norm": 1.3187274349326452,
"learning_rate": 1.7714135318946637e-05,
"loss": 1.9141,
"step": 323
},
{
"epoch": 0.24379232505643342,
"grad_norm": 1.3362302789595277,
"learning_rate": 1.769860348017945e-05,
"loss": 1.9906,
"step": 324
},
{
"epoch": 0.24454477050413845,
"grad_norm": 1.2917223785235106,
"learning_rate": 1.768302591094411e-05,
"loss": 1.9704,
"step": 325
},
{
"epoch": 0.24529721595184348,
"grad_norm": 1.2410353365947815,
"learning_rate": 1.766740270377292e-05,
"loss": 1.9598,
"step": 326
},
{
"epoch": 0.24604966139954854,
"grad_norm": 1.209844953184704,
"learning_rate": 1.7651733951469283e-05,
"loss": 1.9899,
"step": 327
},
{
"epoch": 0.24680210684725357,
"grad_norm": 1.2455171915167857,
"learning_rate": 1.763601974710714e-05,
"loss": 1.953,
"step": 328
},
{
"epoch": 0.24755455229495862,
"grad_norm": 1.3300216273055823,
"learning_rate": 1.7620260184030422e-05,
"loss": 1.9345,
"step": 329
},
{
"epoch": 0.24830699774266365,
"grad_norm": 1.264403174801815,
"learning_rate": 1.7604455355852498e-05,
"loss": 1.9846,
"step": 330
},
{
"epoch": 0.24905944319036868,
"grad_norm": 1.2969384094885095,
"learning_rate": 1.7588605356455618e-05,
"loss": 1.9591,
"step": 331
},
{
"epoch": 0.24981188863807374,
"grad_norm": 1.1993983238693904,
"learning_rate": 1.7572710279990345e-05,
"loss": 1.9271,
"step": 332
},
{
"epoch": 0.2505643340857788,
"grad_norm": 1.2126500502323312,
"learning_rate": 1.7556770220875014e-05,
"loss": 1.9127,
"step": 333
},
{
"epoch": 0.2513167795334838,
"grad_norm": 1.2559825437661283,
"learning_rate": 1.7540785273795152e-05,
"loss": 1.9608,
"step": 334
},
{
"epoch": 0.2520692249811889,
"grad_norm": 1.229176059592469,
"learning_rate": 1.7524755533702933e-05,
"loss": 1.9236,
"step": 335
},
{
"epoch": 0.2528216704288939,
"grad_norm": 1.220864897022238,
"learning_rate": 1.7508681095816603e-05,
"loss": 1.957,
"step": 336
},
{
"epoch": 0.25357411587659895,
"grad_norm": 1.170735746454973,
"learning_rate": 1.7492562055619916e-05,
"loss": 1.9161,
"step": 337
},
{
"epoch": 0.254326561324304,
"grad_norm": 1.2658827515167421,
"learning_rate": 1.747639850886157e-05,
"loss": 1.9188,
"step": 338
},
{
"epoch": 0.255079006772009,
"grad_norm": 1.2481272580196758,
"learning_rate": 1.7460190551554633e-05,
"loss": 1.9358,
"step": 339
},
{
"epoch": 0.2558314522197141,
"grad_norm": 1.3270227389917226,
"learning_rate": 1.7443938279975988e-05,
"loss": 1.9528,
"step": 340
},
{
"epoch": 0.2565838976674191,
"grad_norm": 1.3496429757298112,
"learning_rate": 1.7427641790665728e-05,
"loss": 1.931,
"step": 341
},
{
"epoch": 0.25733634311512416,
"grad_norm": 1.192231518380529,
"learning_rate": 1.741130118042662e-05,
"loss": 1.9121,
"step": 342
},
{
"epoch": 0.2580887885628292,
"grad_norm": 1.26602125913812,
"learning_rate": 1.7394916546323514e-05,
"loss": 1.9619,
"step": 343
},
{
"epoch": 0.2588412340105342,
"grad_norm": 1.2735271723964803,
"learning_rate": 1.7378487985682758e-05,
"loss": 1.9467,
"step": 344
},
{
"epoch": 0.2595936794582393,
"grad_norm": 1.2684494780483646,
"learning_rate": 1.736201559609163e-05,
"loss": 1.9482,
"step": 345
},
{
"epoch": 0.26034612490594433,
"grad_norm": 1.3352324631540693,
"learning_rate": 1.7345499475397756e-05,
"loss": 1.9047,
"step": 346
},
{
"epoch": 0.26109857035364936,
"grad_norm": 1.3673432683764037,
"learning_rate": 1.732893972170854e-05,
"loss": 1.9469,
"step": 347
},
{
"epoch": 0.2618510158013544,
"grad_norm": 1.345369015580567,
"learning_rate": 1.7312336433390552e-05,
"loss": 1.981,
"step": 348
},
{
"epoch": 0.2626034612490594,
"grad_norm": 1.2313146585214643,
"learning_rate": 1.7295689709068974e-05,
"loss": 1.9607,
"step": 349
},
{
"epoch": 0.2633559066967645,
"grad_norm": 1.282750924087256,
"learning_rate": 1.7278999647626998e-05,
"loss": 1.929,
"step": 350
},
{
"epoch": 0.26410835214446954,
"grad_norm": 1.2187521492995663,
"learning_rate": 1.7262266348205246e-05,
"loss": 1.9244,
"step": 351
},
{
"epoch": 0.26486079759217457,
"grad_norm": 1.257585976375792,
"learning_rate": 1.7245489910201177e-05,
"loss": 1.9198,
"step": 352
},
{
"epoch": 0.2656132430398796,
"grad_norm": 1.3269365164188605,
"learning_rate": 1.7228670433268494e-05,
"loss": 1.9372,
"step": 353
},
{
"epoch": 0.26636568848758463,
"grad_norm": 1.2627398762579862,
"learning_rate": 1.721180801731656e-05,
"loss": 1.9477,
"step": 354
},
{
"epoch": 0.2671181339352897,
"grad_norm": 1.2606831334730368,
"learning_rate": 1.71949027625098e-05,
"loss": 1.8927,
"step": 355
},
{
"epoch": 0.26787057938299474,
"grad_norm": 1.317457726427343,
"learning_rate": 1.7177954769267098e-05,
"loss": 1.9414,
"step": 356
},
{
"epoch": 0.2686230248306998,
"grad_norm": 1.3118666543646553,
"learning_rate": 1.7160964138261217e-05,
"loss": 1.9188,
"step": 357
},
{
"epoch": 0.2693754702784048,
"grad_norm": 1.1987507431409556,
"learning_rate": 1.7143930970418196e-05,
"loss": 1.9029,
"step": 358
},
{
"epoch": 0.27012791572610984,
"grad_norm": 1.2772312759138085,
"learning_rate": 1.712685536691673e-05,
"loss": 1.9252,
"step": 359
},
{
"epoch": 0.2708803611738149,
"grad_norm": 1.3728512617682893,
"learning_rate": 1.7109737429187604e-05,
"loss": 1.9537,
"step": 360
},
{
"epoch": 0.27163280662151995,
"grad_norm": 1.2566222308905282,
"learning_rate": 1.709257725891307e-05,
"loss": 1.9386,
"step": 361
},
{
"epoch": 0.272385252069225,
"grad_norm": 1.2441244159399403,
"learning_rate": 1.7075374958026235e-05,
"loss": 1.9245,
"step": 362
},
{
"epoch": 0.27313769751693,
"grad_norm": 1.1613266389487966,
"learning_rate": 1.7058130628710473e-05,
"loss": 1.8907,
"step": 363
},
{
"epoch": 0.27389014296463504,
"grad_norm": 1.221997630665815,
"learning_rate": 1.704084437339881e-05,
"loss": 1.951,
"step": 364
},
{
"epoch": 0.2746425884123401,
"grad_norm": 1.2394734646185215,
"learning_rate": 1.7023516294773318e-05,
"loss": 1.9132,
"step": 365
},
{
"epoch": 0.27539503386004516,
"grad_norm": 1.2432498691103655,
"learning_rate": 1.7006146495764503e-05,
"loss": 1.9644,
"step": 366
},
{
"epoch": 0.2761474793077502,
"grad_norm": 1.2035998593214576,
"learning_rate": 1.698873507955069e-05,
"loss": 1.9361,
"step": 367
},
{
"epoch": 0.2768999247554552,
"grad_norm": 1.2047819015682062,
"learning_rate": 1.6971282149557428e-05,
"loss": 1.9543,
"step": 368
},
{
"epoch": 0.27765237020316025,
"grad_norm": 1.1803662633584004,
"learning_rate": 1.695378780945684e-05,
"loss": 1.9239,
"step": 369
},
{
"epoch": 0.27840481565086533,
"grad_norm": 1.2141508976530713,
"learning_rate": 1.6936252163167048e-05,
"loss": 1.9304,
"step": 370
},
{
"epoch": 0.27915726109857036,
"grad_norm": 1.18522978218327,
"learning_rate": 1.6918675314851524e-05,
"loss": 1.9789,
"step": 371
},
{
"epoch": 0.2799097065462754,
"grad_norm": 1.2117062325093526,
"learning_rate": 1.6901057368918497e-05,
"loss": 1.8881,
"step": 372
},
{
"epoch": 0.2806621519939804,
"grad_norm": 1.1997218487955565,
"learning_rate": 1.6883398430020314e-05,
"loss": 1.9579,
"step": 373
},
{
"epoch": 0.28141459744168545,
"grad_norm": 1.1930865509627973,
"learning_rate": 1.6865698603052813e-05,
"loss": 1.9575,
"step": 374
},
{
"epoch": 0.28216704288939054,
"grad_norm": 1.1844110272957733,
"learning_rate": 1.6847957993154734e-05,
"loss": 1.9382,
"step": 375
},
{
"epoch": 0.28291948833709557,
"grad_norm": 1.236585138372541,
"learning_rate": 1.683017670570705e-05,
"loss": 1.9102,
"step": 376
},
{
"epoch": 0.2836719337848006,
"grad_norm": 1.2307325112629608,
"learning_rate": 1.6812354846332376e-05,
"loss": 1.9454,
"step": 377
},
{
"epoch": 0.28442437923250563,
"grad_norm": 1.2371400040080145,
"learning_rate": 1.6794492520894324e-05,
"loss": 1.9385,
"step": 378
},
{
"epoch": 0.28517682468021066,
"grad_norm": 1.2289333864143954,
"learning_rate": 1.6776589835496878e-05,
"loss": 1.9311,
"step": 379
},
{
"epoch": 0.28592927012791575,
"grad_norm": 1.382504970155628,
"learning_rate": 1.6758646896483762e-05,
"loss": 1.9347,
"step": 380
},
{
"epoch": 0.2866817155756208,
"grad_norm": 1.161140094896464,
"learning_rate": 1.674066381043782e-05,
"loss": 1.9228,
"step": 381
},
{
"epoch": 0.2874341610233258,
"grad_norm": 1.208942783530805,
"learning_rate": 1.6722640684180354e-05,
"loss": 1.9153,
"step": 382
},
{
"epoch": 0.28818660647103084,
"grad_norm": 1.244997870160472,
"learning_rate": 1.6704577624770536e-05,
"loss": 1.9276,
"step": 383
},
{
"epoch": 0.28893905191873587,
"grad_norm": 1.4380224578435883,
"learning_rate": 1.6686474739504723e-05,
"loss": 1.9464,
"step": 384
},
{
"epoch": 0.28969149736644095,
"grad_norm": 1.2512095596003332,
"learning_rate": 1.666833213591585e-05,
"loss": 1.9381,
"step": 385
},
{
"epoch": 0.290443942814146,
"grad_norm": 1.4550539328336975,
"learning_rate": 1.665014992177278e-05,
"loss": 1.9481,
"step": 386
},
{
"epoch": 0.291196388261851,
"grad_norm": 1.2841302617428239,
"learning_rate": 1.663192820507967e-05,
"loss": 1.9166,
"step": 387
},
{
"epoch": 0.29194883370955604,
"grad_norm": 1.449176133819647,
"learning_rate": 1.6613667094075324e-05,
"loss": 1.9312,
"step": 388
},
{
"epoch": 0.2927012791572611,
"grad_norm": 1.2049959477280525,
"learning_rate": 1.659536669723255e-05,
"loss": 1.9317,
"step": 389
},
{
"epoch": 0.29345372460496616,
"grad_norm": 1.452106637808468,
"learning_rate": 1.6577027123257522e-05,
"loss": 1.96,
"step": 390
},
{
"epoch": 0.2942061700526712,
"grad_norm": 1.3250810901439314,
"learning_rate": 1.655864848108913e-05,
"loss": 1.9102,
"step": 391
},
{
"epoch": 0.2949586155003762,
"grad_norm": 1.4705472292983508,
"learning_rate": 1.6540230879898327e-05,
"loss": 1.931,
"step": 392
},
{
"epoch": 0.29571106094808125,
"grad_norm": 1.3494385667796422,
"learning_rate": 1.6521774429087495e-05,
"loss": 1.9381,
"step": 393
},
{
"epoch": 0.2964635063957863,
"grad_norm": 1.4931579384512899,
"learning_rate": 1.6503279238289776e-05,
"loss": 1.9463,
"step": 394
},
{
"epoch": 0.29721595184349137,
"grad_norm": 1.4608046868459137,
"learning_rate": 1.6484745417368446e-05,
"loss": 1.9306,
"step": 395
},
{
"epoch": 0.2979683972911964,
"grad_norm": 1.5434524233760434,
"learning_rate": 1.646617307641623e-05,
"loss": 1.9475,
"step": 396
},
{
"epoch": 0.2987208427389014,
"grad_norm": 1.3751781232087623,
"learning_rate": 1.6447562325754683e-05,
"loss": 1.9592,
"step": 397
},
{
"epoch": 0.29947328818660646,
"grad_norm": 1.329922382511447,
"learning_rate": 1.642891327593351e-05,
"loss": 1.9154,
"step": 398
},
{
"epoch": 0.3002257336343115,
"grad_norm": 1.2765785646419983,
"learning_rate": 1.641022603772991e-05,
"loss": 1.9332,
"step": 399
},
{
"epoch": 0.3009781790820166,
"grad_norm": 1.2951366277411263,
"learning_rate": 1.639150072214793e-05,
"loss": 1.9146,
"step": 400
},
{
"epoch": 0.3017306245297216,
"grad_norm": 1.2056388239356284,
"learning_rate": 1.637273744041781e-05,
"loss": 1.9589,
"step": 401
},
{
"epoch": 0.30248306997742663,
"grad_norm": 1.3592990117168904,
"learning_rate": 1.63539363039953e-05,
"loss": 1.9468,
"step": 402
},
{
"epoch": 0.30323551542513166,
"grad_norm": 1.2730306029413752,
"learning_rate": 1.6335097424561015e-05,
"loss": 1.9771,
"step": 403
},
{
"epoch": 0.3039879608728367,
"grad_norm": 1.3109360721376324,
"learning_rate": 1.6316220914019765e-05,
"loss": 1.939,
"step": 404
},
{
"epoch": 0.3047404063205418,
"grad_norm": 1.19286267304542,
"learning_rate": 1.6297306884499898e-05,
"loss": 1.9123,
"step": 405
},
{
"epoch": 0.3054928517682468,
"grad_norm": 1.2529230194739442,
"learning_rate": 1.627835544835262e-05,
"loss": 1.9207,
"step": 406
},
{
"epoch": 0.30624529721595184,
"grad_norm": 1.2228972499595365,
"learning_rate": 1.625936671815135e-05,
"loss": 1.9228,
"step": 407
},
{
"epoch": 0.30699774266365687,
"grad_norm": 1.3246438968924625,
"learning_rate": 1.624034080669102e-05,
"loss": 1.9568,
"step": 408
},
{
"epoch": 0.3077501881113619,
"grad_norm": 1.2477031723368133,
"learning_rate": 1.6221277826987435e-05,
"loss": 1.9284,
"step": 409
},
{
"epoch": 0.308502633559067,
"grad_norm": 1.4319900758526833,
"learning_rate": 1.6202177892276588e-05,
"loss": 1.9314,
"step": 410
},
{
"epoch": 0.309255079006772,
"grad_norm": 1.2149258628526967,
"learning_rate": 1.6183041116013976e-05,
"loss": 1.9563,
"step": 411
},
{
"epoch": 0.31000752445447705,
"grad_norm": 1.3472989067268997,
"learning_rate": 1.6163867611873954e-05,
"loss": 1.9329,
"step": 412
},
{
"epoch": 0.3107599699021821,
"grad_norm": 1.184001601771353,
"learning_rate": 1.614465749374904e-05,
"loss": 1.9234,
"step": 413
},
{
"epoch": 0.3115124153498871,
"grad_norm": 1.1695879937323999,
"learning_rate": 1.612541087574924e-05,
"loss": 1.9404,
"step": 414
},
{
"epoch": 0.3122648607975922,
"grad_norm": 1.1830486389687247,
"learning_rate": 1.6106127872201364e-05,
"loss": 1.9303,
"step": 415
},
{
"epoch": 0.3130173062452972,
"grad_norm": 1.234433222556476,
"learning_rate": 1.6086808597648377e-05,
"loss": 1.9695,
"step": 416
},
{
"epoch": 0.31376975169300225,
"grad_norm": 1.2981913572420718,
"learning_rate": 1.6067453166848682e-05,
"loss": 1.923,
"step": 417
},
{
"epoch": 0.3145221971407073,
"grad_norm": 1.1912750523668338,
"learning_rate": 1.6048061694775458e-05,
"loss": 1.9089,
"step": 418
},
{
"epoch": 0.3152746425884123,
"grad_norm": 1.2730608984599712,
"learning_rate": 1.6028634296615973e-05,
"loss": 1.9042,
"step": 419
},
{
"epoch": 0.3160270880361174,
"grad_norm": 1.2583410793582406,
"learning_rate": 1.6009171087770895e-05,
"loss": 1.9184,
"step": 420
},
{
"epoch": 0.31677953348382243,
"grad_norm": 1.1374313434283714,
"learning_rate": 1.598967218385362e-05,
"loss": 1.9347,
"step": 421
},
{
"epoch": 0.31753197893152746,
"grad_norm": 1.1650519642611141,
"learning_rate": 1.5970137700689567e-05,
"loss": 1.9168,
"step": 422
},
{
"epoch": 0.3182844243792325,
"grad_norm": 1.1796961782168611,
"learning_rate": 1.5950567754315504e-05,
"loss": 1.9316,
"step": 423
},
{
"epoch": 0.3190368698269376,
"grad_norm": 1.216091335683399,
"learning_rate": 1.593096246097885e-05,
"loss": 1.8918,
"step": 424
},
{
"epoch": 0.3197893152746426,
"grad_norm": 1.2561937810385562,
"learning_rate": 1.5911321937136997e-05,
"loss": 1.9269,
"step": 425
},
{
"epoch": 0.32054176072234764,
"grad_norm": 1.2327808638330076,
"learning_rate": 1.5891646299456607e-05,
"loss": 1.9492,
"step": 426
},
{
"epoch": 0.32129420617005267,
"grad_norm": 1.2378447647844628,
"learning_rate": 1.5871935664812913e-05,
"loss": 1.9274,
"step": 427
},
{
"epoch": 0.3220466516177577,
"grad_norm": 1.3110147557633791,
"learning_rate": 1.585219015028904e-05,
"loss": 1.961,
"step": 428
},
{
"epoch": 0.3227990970654628,
"grad_norm": 1.2478198832749914,
"learning_rate": 1.58324098731753e-05,
"loss": 1.9139,
"step": 429
},
{
"epoch": 0.3235515425131678,
"grad_norm": 1.2306405628686876,
"learning_rate": 1.581259495096851e-05,
"loss": 1.9254,
"step": 430
},
{
"epoch": 0.32430398796087284,
"grad_norm": 1.2196028569752424,
"learning_rate": 1.5792745501371265e-05,
"loss": 1.9252,
"step": 431
},
{
"epoch": 0.32505643340857787,
"grad_norm": 1.2464265518149333,
"learning_rate": 1.5772861642291266e-05,
"loss": 1.9128,
"step": 432
},
{
"epoch": 0.3258088788562829,
"grad_norm": 1.2256161677006212,
"learning_rate": 1.5752943491840608e-05,
"loss": 1.9319,
"step": 433
},
{
"epoch": 0.326561324303988,
"grad_norm": 1.2208354474352308,
"learning_rate": 1.5732991168335085e-05,
"loss": 1.959,
"step": 434
},
{
"epoch": 0.327313769751693,
"grad_norm": 1.2161944966378884,
"learning_rate": 1.571300479029347e-05,
"loss": 1.9211,
"step": 435
},
{
"epoch": 0.32806621519939805,
"grad_norm": 1.2289538308870511,
"learning_rate": 1.569298447643683e-05,
"loss": 1.9085,
"step": 436
},
{
"epoch": 0.3288186606471031,
"grad_norm": 1.1689771067885344,
"learning_rate": 1.567293034568782e-05,
"loss": 1.9329,
"step": 437
},
{
"epoch": 0.3295711060948081,
"grad_norm": 1.292372851427419,
"learning_rate": 1.5652842517169968e-05,
"loss": 1.9277,
"step": 438
},
{
"epoch": 0.3303235515425132,
"grad_norm": 1.2254367963694681,
"learning_rate": 1.563272111020696e-05,
"loss": 1.9246,
"step": 439
},
{
"epoch": 0.3310759969902182,
"grad_norm": 1.2813814483936088,
"learning_rate": 1.5612566244321948e-05,
"loss": 1.8959,
"step": 440
},
{
"epoch": 0.33182844243792325,
"grad_norm": 1.2155279602305278,
"learning_rate": 1.5592378039236843e-05,
"loss": 1.9575,
"step": 441
},
{
"epoch": 0.3325808878856283,
"grad_norm": 1.1758340190257308,
"learning_rate": 1.5572156614871577e-05,
"loss": 1.9359,
"step": 442
},
{
"epoch": 0.3333333333333333,
"grad_norm": 1.2128031499657863,
"learning_rate": 1.555190209134342e-05,
"loss": 1.8846,
"step": 443
},
{
"epoch": 0.3340857787810384,
"grad_norm": 1.223738253378193,
"learning_rate": 1.553161458896625e-05,
"loss": 1.9364,
"step": 444
},
{
"epoch": 0.33483822422874343,
"grad_norm": 1.231725948585103,
"learning_rate": 1.5511294228249845e-05,
"loss": 1.9297,
"step": 445
},
{
"epoch": 0.33559066967644846,
"grad_norm": 1.23220655079934,
"learning_rate": 1.549094112989916e-05,
"loss": 1.9398,
"step": 446
},
{
"epoch": 0.3363431151241535,
"grad_norm": 1.1785654441011295,
"learning_rate": 1.547055541481362e-05,
"loss": 1.9469,
"step": 447
},
{
"epoch": 0.3370955605718585,
"grad_norm": 1.153157424971619,
"learning_rate": 1.545013720408639e-05,
"loss": 1.9445,
"step": 448
},
{
"epoch": 0.3378480060195636,
"grad_norm": 1.1602723025668076,
"learning_rate": 1.5429686619003672e-05,
"loss": 1.9148,
"step": 449
},
{
"epoch": 0.33860045146726864,
"grad_norm": 1.2002456646135142,
"learning_rate": 1.5409203781043964e-05,
"loss": 1.8902,
"step": 450
},
{
"epoch": 0.33935289691497367,
"grad_norm": 1.1314720195866792,
"learning_rate": 1.5388688811877357e-05,
"loss": 1.9243,
"step": 451
},
{
"epoch": 0.3401053423626787,
"grad_norm": 1.2269156773825378,
"learning_rate": 1.5368141833364805e-05,
"loss": 1.9215,
"step": 452
},
{
"epoch": 0.34085778781038373,
"grad_norm": 1.1803943189430242,
"learning_rate": 1.5347562967557395e-05,
"loss": 1.88,
"step": 453
},
{
"epoch": 0.3416102332580888,
"grad_norm": 1.2545321625875356,
"learning_rate": 1.5326952336695637e-05,
"loss": 1.8933,
"step": 454
},
{
"epoch": 0.34236267870579384,
"grad_norm": 1.1531848505417033,
"learning_rate": 1.5306310063208712e-05,
"loss": 1.9067,
"step": 455
},
{
"epoch": 0.3431151241534989,
"grad_norm": 1.2055939670357985,
"learning_rate": 1.5285636269713776e-05,
"loss": 1.9563,
"step": 456
},
{
"epoch": 0.3438675696012039,
"grad_norm": 1.1851560932863339,
"learning_rate": 1.5264931079015216e-05,
"loss": 1.9105,
"step": 457
},
{
"epoch": 0.34462001504890893,
"grad_norm": 1.211011703129878,
"learning_rate": 1.5244194614103914e-05,
"loss": 1.9332,
"step": 458
},
{
"epoch": 0.345372460496614,
"grad_norm": 1.2232298027036999,
"learning_rate": 1.522342699815653e-05,
"loss": 1.9358,
"step": 459
},
{
"epoch": 0.34612490594431905,
"grad_norm": 1.2051141630040154,
"learning_rate": 1.5202628354534762e-05,
"loss": 1.9296,
"step": 460
},
{
"epoch": 0.3468773513920241,
"grad_norm": 1.1875805335113838,
"learning_rate": 1.5181798806784614e-05,
"loss": 1.9067,
"step": 461
},
{
"epoch": 0.3476297968397291,
"grad_norm": 1.261590489970739,
"learning_rate": 1.5160938478635667e-05,
"loss": 1.9436,
"step": 462
},
{
"epoch": 0.34838224228743414,
"grad_norm": 1.1788033022657898,
"learning_rate": 1.5140047494000341e-05,
"loss": 1.9413,
"step": 463
},
{
"epoch": 0.3491346877351392,
"grad_norm": 1.395419647062517,
"learning_rate": 1.5119125976973152e-05,
"loss": 1.921,
"step": 464
},
{
"epoch": 0.34988713318284426,
"grad_norm": 1.1817503662912374,
"learning_rate": 1.509817405182999e-05,
"loss": 1.9493,
"step": 465
},
{
"epoch": 0.3506395786305493,
"grad_norm": 1.329104538612201,
"learning_rate": 1.5077191843027366e-05,
"loss": 1.9459,
"step": 466
},
{
"epoch": 0.3513920240782543,
"grad_norm": 1.16151896786367,
"learning_rate": 1.5056179475201683e-05,
"loss": 1.8871,
"step": 467
},
{
"epoch": 0.35214446952595935,
"grad_norm": 1.3045386473786302,
"learning_rate": 1.5035137073168487e-05,
"loss": 1.9201,
"step": 468
},
{
"epoch": 0.35289691497366443,
"grad_norm": 1.1486744655968801,
"learning_rate": 1.5014064761921736e-05,
"loss": 1.9123,
"step": 469
},
{
"epoch": 0.35364936042136946,
"grad_norm": 1.294338542458702,
"learning_rate": 1.4992962666633044e-05,
"loss": 1.9338,
"step": 470
},
{
"epoch": 0.3544018058690745,
"grad_norm": 1.1345951475012919,
"learning_rate": 1.4971830912650953e-05,
"loss": 1.9121,
"step": 471
},
{
"epoch": 0.3551542513167795,
"grad_norm": 1.3467039727807595,
"learning_rate": 1.4950669625500178e-05,
"loss": 1.949,
"step": 472
},
{
"epoch": 0.35590669676448455,
"grad_norm": 1.1257771848617852,
"learning_rate": 1.4929478930880862e-05,
"loss": 1.9071,
"step": 473
},
{
"epoch": 0.35665914221218964,
"grad_norm": 1.3975418375298359,
"learning_rate": 1.4908258954667832e-05,
"loss": 1.9305,
"step": 474
},
{
"epoch": 0.35741158765989467,
"grad_norm": 1.1600047409292655,
"learning_rate": 1.4887009822909853e-05,
"loss": 1.924,
"step": 475
},
{
"epoch": 0.3581640331075997,
"grad_norm": 1.2411271191905437,
"learning_rate": 1.486573166182887e-05,
"loss": 1.9003,
"step": 476
},
{
"epoch": 0.35891647855530473,
"grad_norm": 1.149875087931339,
"learning_rate": 1.4844424597819276e-05,
"loss": 1.9317,
"step": 477
},
{
"epoch": 0.35966892400300976,
"grad_norm": 1.2715149265455479,
"learning_rate": 1.4823088757447144e-05,
"loss": 1.9274,
"step": 478
},
{
"epoch": 0.36042136945071485,
"grad_norm": 1.1770661807987435,
"learning_rate": 1.4801724267449477e-05,
"loss": 1.9004,
"step": 479
},
{
"epoch": 0.3611738148984199,
"grad_norm": 1.2411166989321967,
"learning_rate": 1.478033125473347e-05,
"loss": 1.9477,
"step": 480
},
{
"epoch": 0.3619262603461249,
"grad_norm": 1.2489825836964896,
"learning_rate": 1.4758909846375736e-05,
"loss": 1.932,
"step": 481
},
{
"epoch": 0.36267870579382994,
"grad_norm": 1.1946174392377489,
"learning_rate": 1.4737460169621564e-05,
"loss": 1.9302,
"step": 482
},
{
"epoch": 0.36343115124153497,
"grad_norm": 1.1693974978644097,
"learning_rate": 1.4715982351884166e-05,
"loss": 1.9158,
"step": 483
},
{
"epoch": 0.36418359668924005,
"grad_norm": 1.2955915318322662,
"learning_rate": 1.4694476520743908e-05,
"loss": 1.8983,
"step": 484
},
{
"epoch": 0.3649360421369451,
"grad_norm": 1.1646234606109815,
"learning_rate": 1.4672942803947556e-05,
"loss": 1.9376,
"step": 485
},
{
"epoch": 0.3656884875846501,
"grad_norm": 1.316734864131483,
"learning_rate": 1.4651381329407527e-05,
"loss": 1.9187,
"step": 486
},
{
"epoch": 0.36644093303235514,
"grad_norm": 1.2087161866532892,
"learning_rate": 1.4629792225201115e-05,
"loss": 1.9213,
"step": 487
},
{
"epoch": 0.3671933784800602,
"grad_norm": 1.244117973812367,
"learning_rate": 1.460817561956974e-05,
"loss": 1.9275,
"step": 488
},
{
"epoch": 0.36794582392776526,
"grad_norm": 1.2808403733781915,
"learning_rate": 1.458653164091819e-05,
"loss": 1.9174,
"step": 489
},
{
"epoch": 0.3686982693754703,
"grad_norm": 1.3368736812018596,
"learning_rate": 1.4564860417813837e-05,
"loss": 1.9248,
"step": 490
},
{
"epoch": 0.3694507148231753,
"grad_norm": 1.1553155407132074,
"learning_rate": 1.4543162078985898e-05,
"loss": 1.8925,
"step": 491
},
{
"epoch": 0.37020316027088035,
"grad_norm": 1.1938225430010063,
"learning_rate": 1.4521436753324659e-05,
"loss": 1.8915,
"step": 492
},
{
"epoch": 0.3709556057185854,
"grad_norm": 1.2429685820092986,
"learning_rate": 1.4499684569880705e-05,
"loss": 1.9373,
"step": 493
},
{
"epoch": 0.37170805116629047,
"grad_norm": 1.222677775561422,
"learning_rate": 1.4477905657864169e-05,
"loss": 1.9098,
"step": 494
},
{
"epoch": 0.3724604966139955,
"grad_norm": 1.1961579016873296,
"learning_rate": 1.4456100146643941e-05,
"loss": 1.9022,
"step": 495
},
{
"epoch": 0.3732129420617005,
"grad_norm": 1.1449105895685063,
"learning_rate": 1.4434268165746925e-05,
"loss": 1.9161,
"step": 496
},
{
"epoch": 0.37396538750940556,
"grad_norm": 1.1300565714324697,
"learning_rate": 1.441240984485725e-05,
"loss": 1.913,
"step": 497
},
{
"epoch": 0.3747178329571106,
"grad_norm": 1.1866959145757736,
"learning_rate": 1.4390525313815516e-05,
"loss": 1.8881,
"step": 498
},
{
"epoch": 0.37547027840481567,
"grad_norm": 1.2850006320546294,
"learning_rate": 1.4368614702617997e-05,
"loss": 1.9567,
"step": 499
},
{
"epoch": 0.3762227238525207,
"grad_norm": 1.1708222489757714,
"learning_rate": 1.4346678141415905e-05,
"loss": 1.9166,
"step": 500
},
{
"epoch": 0.37697516930022573,
"grad_norm": 1.1496458322530194,
"learning_rate": 1.4324715760514588e-05,
"loss": 1.9145,
"step": 501
},
{
"epoch": 0.37772761474793076,
"grad_norm": 1.1918092392991508,
"learning_rate": 1.4302727690372764e-05,
"loss": 1.905,
"step": 502
},
{
"epoch": 0.3784800601956358,
"grad_norm": 1.1249579756898291,
"learning_rate": 1.428071406160175e-05,
"loss": 1.9351,
"step": 503
},
{
"epoch": 0.3792325056433409,
"grad_norm": 1.1917923009198421,
"learning_rate": 1.4258675004964687e-05,
"loss": 1.9062,
"step": 504
},
{
"epoch": 0.3799849510910459,
"grad_norm": 1.1266437209653588,
"learning_rate": 1.4236610651375752e-05,
"loss": 1.9303,
"step": 505
},
{
"epoch": 0.38073739653875094,
"grad_norm": 1.2348580309393102,
"learning_rate": 1.42145211318994e-05,
"loss": 1.9315,
"step": 506
},
{
"epoch": 0.38148984198645597,
"grad_norm": 1.2403659977632822,
"learning_rate": 1.4192406577749562e-05,
"loss": 1.9228,
"step": 507
},
{
"epoch": 0.382242287434161,
"grad_norm": 1.1694994716515152,
"learning_rate": 1.4170267120288885e-05,
"loss": 1.8931,
"step": 508
},
{
"epoch": 0.3829947328818661,
"grad_norm": 1.2846317300647594,
"learning_rate": 1.4148102891027943e-05,
"loss": 1.9534,
"step": 509
},
{
"epoch": 0.3837471783295711,
"grad_norm": 1.180871929155164,
"learning_rate": 1.4125914021624454e-05,
"loss": 1.9314,
"step": 510
},
{
"epoch": 0.38449962377727614,
"grad_norm": 1.1690089466187774,
"learning_rate": 1.4103700643882503e-05,
"loss": 1.9118,
"step": 511
},
{
"epoch": 0.3852520692249812,
"grad_norm": 1.1076465518114502,
"learning_rate": 1.4081462889751756e-05,
"loss": 1.8899,
"step": 512
},
{
"epoch": 0.3860045146726862,
"grad_norm": 1.1810648376867858,
"learning_rate": 1.4059200891326683e-05,
"loss": 1.915,
"step": 513
},
{
"epoch": 0.3867569601203913,
"grad_norm": 1.1337568740717412,
"learning_rate": 1.4036914780845757e-05,
"loss": 1.9231,
"step": 514
},
{
"epoch": 0.3875094055680963,
"grad_norm": 1.1813963921411506,
"learning_rate": 1.4014604690690683e-05,
"loss": 1.9368,
"step": 515
},
{
"epoch": 0.38826185101580135,
"grad_norm": 1.1278242894104809,
"learning_rate": 1.3992270753385614e-05,
"loss": 1.9452,
"step": 516
},
{
"epoch": 0.3890142964635064,
"grad_norm": 1.3714651354703575,
"learning_rate": 1.3969913101596351e-05,
"loss": 1.9269,
"step": 517
},
{
"epoch": 0.3897667419112114,
"grad_norm": 1.2633982097474674,
"learning_rate": 1.394753186812956e-05,
"loss": 1.9427,
"step": 518
},
{
"epoch": 0.3905191873589165,
"grad_norm": 1.3644841229332958,
"learning_rate": 1.3925127185931993e-05,
"loss": 1.8866,
"step": 519
},
{
"epoch": 0.3912716328066215,
"grad_norm": 1.1702313785436145,
"learning_rate": 1.3902699188089679e-05,
"loss": 1.8798,
"step": 520
},
{
"epoch": 0.39202407825432656,
"grad_norm": 1.1462509576257305,
"learning_rate": 1.3880248007827151e-05,
"loss": 1.8936,
"step": 521
},
{
"epoch": 0.3927765237020316,
"grad_norm": 1.2951445113040818,
"learning_rate": 1.3857773778506643e-05,
"loss": 1.9326,
"step": 522
},
{
"epoch": 0.3935289691497366,
"grad_norm": 1.1650743795903011,
"learning_rate": 1.3835276633627313e-05,
"loss": 1.9076,
"step": 523
},
{
"epoch": 0.3942814145974417,
"grad_norm": 1.2278743936835155,
"learning_rate": 1.3812756706824428e-05,
"loss": 1.9053,
"step": 524
},
{
"epoch": 0.39503386004514673,
"grad_norm": 1.1046765671156833,
"learning_rate": 1.3790214131868588e-05,
"loss": 1.969,
"step": 525
},
{
"epoch": 0.39578630549285176,
"grad_norm": 1.32484940887904,
"learning_rate": 1.3767649042664925e-05,
"loss": 1.9302,
"step": 526
},
{
"epoch": 0.3965387509405568,
"grad_norm": 1.1461927443712088,
"learning_rate": 1.3745061573252305e-05,
"loss": 1.9234,
"step": 527
},
{
"epoch": 0.3972911963882618,
"grad_norm": 1.1795973562739899,
"learning_rate": 1.3722451857802535e-05,
"loss": 1.8713,
"step": 528
},
{
"epoch": 0.3980436418359669,
"grad_norm": 1.1990138900736815,
"learning_rate": 1.3699820030619569e-05,
"loss": 1.943,
"step": 529
},
{
"epoch": 0.39879608728367194,
"grad_norm": 1.1955875835188636,
"learning_rate": 1.3677166226138705e-05,
"loss": 1.92,
"step": 530
},
{
"epoch": 0.39954853273137697,
"grad_norm": 1.2033435420140173,
"learning_rate": 1.3654490578925788e-05,
"loss": 1.9624,
"step": 531
},
{
"epoch": 0.400300978179082,
"grad_norm": 1.1141918460712792,
"learning_rate": 1.3631793223676408e-05,
"loss": 1.8939,
"step": 532
},
{
"epoch": 0.40105342362678703,
"grad_norm": 1.1715660266064452,
"learning_rate": 1.3609074295215113e-05,
"loss": 1.9396,
"step": 533
},
{
"epoch": 0.4018058690744921,
"grad_norm": 1.1650220083521712,
"learning_rate": 1.3586333928494582e-05,
"loss": 1.9518,
"step": 534
},
{
"epoch": 0.40255831452219715,
"grad_norm": 1.1790274887436696,
"learning_rate": 1.3563572258594854e-05,
"loss": 1.8972,
"step": 535
},
{
"epoch": 0.4033107599699022,
"grad_norm": 1.116016076479536,
"learning_rate": 1.3540789420722509e-05,
"loss": 1.8797,
"step": 536
},
{
"epoch": 0.4040632054176072,
"grad_norm": 1.1300714713631574,
"learning_rate": 1.3517985550209859e-05,
"loss": 1.9086,
"step": 537
},
{
"epoch": 0.40481565086531224,
"grad_norm": 1.1096054607697965,
"learning_rate": 1.3495160782514154e-05,
"loss": 1.9097,
"step": 538
},
{
"epoch": 0.4055680963130173,
"grad_norm": 1.171868559452081,
"learning_rate": 1.3472315253216782e-05,
"loss": 1.9259,
"step": 539
},
{
"epoch": 0.40632054176072235,
"grad_norm": 1.1755913459985676,
"learning_rate": 1.3449449098022452e-05,
"loss": 1.9101,
"step": 540
},
{
"epoch": 0.4070729872084274,
"grad_norm": 1.2095838496313647,
"learning_rate": 1.3426562452758391e-05,
"loss": 1.9261,
"step": 541
},
{
"epoch": 0.4078254326561324,
"grad_norm": 1.1829872465381066,
"learning_rate": 1.3403655453373545e-05,
"loss": 1.8972,
"step": 542
},
{
"epoch": 0.40857787810383744,
"grad_norm": 1.2664733117194062,
"learning_rate": 1.3380728235937758e-05,
"loss": 1.9181,
"step": 543
},
{
"epoch": 0.40933032355154253,
"grad_norm": 1.1614497912502462,
"learning_rate": 1.3357780936640981e-05,
"loss": 1.9275,
"step": 544
},
{
"epoch": 0.41008276899924756,
"grad_norm": 1.130327945239977,
"learning_rate": 1.333481369179244e-05,
"loss": 1.9163,
"step": 545
},
{
"epoch": 0.4108352144469526,
"grad_norm": 1.1736561330155733,
"learning_rate": 1.3311826637819856e-05,
"loss": 1.8975,
"step": 546
},
{
"epoch": 0.4115876598946576,
"grad_norm": 1.1095343471879087,
"learning_rate": 1.32888199112686e-05,
"loss": 1.9445,
"step": 547
},
{
"epoch": 0.4123401053423627,
"grad_norm": 1.1490338206460358,
"learning_rate": 1.3265793648800915e-05,
"loss": 1.8861,
"step": 548
},
{
"epoch": 0.41309255079006774,
"grad_norm": 1.2214722655482622,
"learning_rate": 1.3242747987195084e-05,
"loss": 1.8867,
"step": 549
},
{
"epoch": 0.41384499623777277,
"grad_norm": 1.1604382697178512,
"learning_rate": 1.3219683063344619e-05,
"loss": 1.9066,
"step": 550
},
{
"epoch": 0.4145974416854778,
"grad_norm": 1.1749456043961222,
"learning_rate": 1.3196599014257459e-05,
"loss": 1.9074,
"step": 551
},
{
"epoch": 0.4153498871331828,
"grad_norm": 1.1292931398938084,
"learning_rate": 1.3173495977055142e-05,
"loss": 1.9096,
"step": 552
},
{
"epoch": 0.4161023325808879,
"grad_norm": 1.2052090683244063,
"learning_rate": 1.3150374088972e-05,
"loss": 1.9237,
"step": 553
},
{
"epoch": 0.41685477802859294,
"grad_norm": 1.156212002094206,
"learning_rate": 1.3127233487354342e-05,
"loss": 1.9329,
"step": 554
},
{
"epoch": 0.417607223476298,
"grad_norm": 1.1167620190649576,
"learning_rate": 1.3104074309659637e-05,
"loss": 1.9256,
"step": 555
},
{
"epoch": 0.418359668924003,
"grad_norm": 1.1600444369849605,
"learning_rate": 1.3080896693455699e-05,
"loss": 1.9397,
"step": 556
},
{
"epoch": 0.41911211437170803,
"grad_norm": 1.1488125080959068,
"learning_rate": 1.305770077641986e-05,
"loss": 1.9156,
"step": 557
},
{
"epoch": 0.4198645598194131,
"grad_norm": 1.1408993569353663,
"learning_rate": 1.3034486696338173e-05,
"loss": 1.9053,
"step": 558
},
{
"epoch": 0.42061700526711815,
"grad_norm": 1.1289548102415197,
"learning_rate": 1.3011254591104578e-05,
"loss": 1.8761,
"step": 559
},
{
"epoch": 0.4213694507148232,
"grad_norm": 1.1128979472977316,
"learning_rate": 1.2988004598720083e-05,
"loss": 1.9092,
"step": 560
},
{
"epoch": 0.4221218961625282,
"grad_norm": 1.1066070086385047,
"learning_rate": 1.2964736857291944e-05,
"loss": 1.9005,
"step": 561
},
{
"epoch": 0.42287434161023324,
"grad_norm": 1.1769627931986593,
"learning_rate": 1.2941451505032857e-05,
"loss": 1.9466,
"step": 562
},
{
"epoch": 0.4236267870579383,
"grad_norm": 1.1372477835129797,
"learning_rate": 1.291814868026012e-05,
"loss": 1.9082,
"step": 563
},
{
"epoch": 0.42437923250564336,
"grad_norm": 1.113596957285773,
"learning_rate": 1.2894828521394824e-05,
"loss": 1.9113,
"step": 564
},
{
"epoch": 0.4251316779533484,
"grad_norm": 1.12947013317858,
"learning_rate": 1.2871491166961028e-05,
"loss": 1.9378,
"step": 565
},
{
"epoch": 0.4258841234010534,
"grad_norm": 1.0906063586545516,
"learning_rate": 1.284813675558493e-05,
"loss": 1.8756,
"step": 566
},
{
"epoch": 0.42663656884875845,
"grad_norm": 1.130975672969097,
"learning_rate": 1.2824765425994047e-05,
"loss": 1.9312,
"step": 567
},
{
"epoch": 0.42738901429646353,
"grad_norm": 1.1231932755082066,
"learning_rate": 1.2801377317016402e-05,
"loss": 1.894,
"step": 568
},
{
"epoch": 0.42814145974416856,
"grad_norm": 1.2113771102569506,
"learning_rate": 1.2777972567579673e-05,
"loss": 1.9295,
"step": 569
},
{
"epoch": 0.4288939051918736,
"grad_norm": 1.1227488786862374,
"learning_rate": 1.2754551316710397e-05,
"loss": 1.9027,
"step": 570
},
{
"epoch": 0.4296463506395786,
"grad_norm": 1.1197744229765851,
"learning_rate": 1.273111370353313e-05,
"loss": 1.9086,
"step": 571
},
{
"epoch": 0.43039879608728365,
"grad_norm": 1.2021741508430692,
"learning_rate": 1.2707659867269613e-05,
"loss": 1.8885,
"step": 572
},
{
"epoch": 0.43115124153498874,
"grad_norm": 1.1189373910115483,
"learning_rate": 1.2684189947237964e-05,
"loss": 1.9396,
"step": 573
},
{
"epoch": 0.43190368698269377,
"grad_norm": 1.2266146716522572,
"learning_rate": 1.2660704082851831e-05,
"loss": 1.9152,
"step": 574
},
{
"epoch": 0.4326561324303988,
"grad_norm": 1.21252052807106,
"learning_rate": 1.263720241361958e-05,
"loss": 1.8852,
"step": 575
},
{
"epoch": 0.43340857787810383,
"grad_norm": 1.2040632349915654,
"learning_rate": 1.2613685079143458e-05,
"loss": 1.9086,
"step": 576
},
{
"epoch": 0.43416102332580886,
"grad_norm": 1.0936752285795388,
"learning_rate": 1.2590152219118762e-05,
"loss": 1.9225,
"step": 577
},
{
"epoch": 0.43491346877351394,
"grad_norm": 1.2682710502833725,
"learning_rate": 1.2566603973333016e-05,
"loss": 1.8746,
"step": 578
},
{
"epoch": 0.435665914221219,
"grad_norm": 1.2281943737085654,
"learning_rate": 1.2543040481665134e-05,
"loss": 1.8924,
"step": 579
},
{
"epoch": 0.436418359668924,
"grad_norm": 1.2764517397481905,
"learning_rate": 1.2519461884084592e-05,
"loss": 1.8911,
"step": 580
},
{
"epoch": 0.43717080511662904,
"grad_norm": 1.1188373353915282,
"learning_rate": 1.24958683206506e-05,
"loss": 1.8946,
"step": 581
},
{
"epoch": 0.43792325056433407,
"grad_norm": 1.1952313691406464,
"learning_rate": 1.2472259931511265e-05,
"loss": 1.907,
"step": 582
},
{
"epoch": 0.43867569601203915,
"grad_norm": 1.2085182339152578,
"learning_rate": 1.244863685690276e-05,
"loss": 1.9029,
"step": 583
},
{
"epoch": 0.4394281414597442,
"grad_norm": 1.132753176461499,
"learning_rate": 1.242499923714849e-05,
"loss": 1.9059,
"step": 584
},
{
"epoch": 0.4401805869074492,
"grad_norm": 1.224570671200772,
"learning_rate": 1.240134721265826e-05,
"loss": 1.9044,
"step": 585
},
{
"epoch": 0.44093303235515424,
"grad_norm": 1.1160656856879785,
"learning_rate": 1.237768092392744e-05,
"loss": 1.8958,
"step": 586
},
{
"epoch": 0.44168547780285927,
"grad_norm": 1.1752413930341374,
"learning_rate": 1.2354000511536135e-05,
"loss": 1.9032,
"step": 587
},
{
"epoch": 0.44243792325056436,
"grad_norm": 1.1516249590971352,
"learning_rate": 1.2330306116148344e-05,
"loss": 1.8777,
"step": 588
},
{
"epoch": 0.4431903686982694,
"grad_norm": 1.1662781399314968,
"learning_rate": 1.230659787851112e-05,
"loss": 1.9088,
"step": 589
},
{
"epoch": 0.4439428141459744,
"grad_norm": 1.1114970973978882,
"learning_rate": 1.228287593945375e-05,
"loss": 1.889,
"step": 590
},
{
"epoch": 0.44469525959367945,
"grad_norm": 1.159900138552732,
"learning_rate": 1.22591404398869e-05,
"loss": 1.8956,
"step": 591
},
{
"epoch": 0.4454477050413845,
"grad_norm": 1.1712851331230487,
"learning_rate": 1.2235391520801801e-05,
"loss": 1.8949,
"step": 592
},
{
"epoch": 0.44620015048908956,
"grad_norm": 1.2135890107666567,
"learning_rate": 1.2211629323269377e-05,
"loss": 1.8964,
"step": 593
},
{
"epoch": 0.4469525959367946,
"grad_norm": 1.1221840457042225,
"learning_rate": 1.2187853988439442e-05,
"loss": 1.8948,
"step": 594
},
{
"epoch": 0.4477050413844996,
"grad_norm": 1.1072589441512435,
"learning_rate": 1.2164065657539846e-05,
"loss": 1.9313,
"step": 595
},
{
"epoch": 0.44845748683220465,
"grad_norm": 1.147108020712539,
"learning_rate": 1.2140264471875627e-05,
"loss": 1.8867,
"step": 596
},
{
"epoch": 0.4492099322799097,
"grad_norm": 1.0898108757266811,
"learning_rate": 1.2116450572828194e-05,
"loss": 1.8705,
"step": 597
},
{
"epoch": 0.44996237772761477,
"grad_norm": 1.1309052465866392,
"learning_rate": 1.2092624101854466e-05,
"loss": 1.9054,
"step": 598
},
{
"epoch": 0.4507148231753198,
"grad_norm": 1.2390550101012947,
"learning_rate": 1.2068785200486044e-05,
"loss": 1.8729,
"step": 599
},
{
"epoch": 0.45146726862302483,
"grad_norm": 1.0898013897152719,
"learning_rate": 1.204493401032837e-05,
"loss": 1.9296,
"step": 600
},
{
"epoch": 0.45221971407072986,
"grad_norm": 1.1478603507427294,
"learning_rate": 1.202107067305987e-05,
"loss": 1.9593,
"step": 601
},
{
"epoch": 0.4529721595184349,
"grad_norm": 1.1081871155284837,
"learning_rate": 1.1997195330431141e-05,
"loss": 1.9262,
"step": 602
},
{
"epoch": 0.45372460496614,
"grad_norm": 1.1513336894045574,
"learning_rate": 1.1973308124264087e-05,
"loss": 1.9394,
"step": 603
},
{
"epoch": 0.454477050413845,
"grad_norm": 1.0941648180673882,
"learning_rate": 1.1949409196451073e-05,
"loss": 1.9098,
"step": 604
},
{
"epoch": 0.45522949586155004,
"grad_norm": 1.216991747736067,
"learning_rate": 1.1925498688954111e-05,
"loss": 1.8955,
"step": 605
},
{
"epoch": 0.45598194130925507,
"grad_norm": 1.1501318960614881,
"learning_rate": 1.1901576743803984e-05,
"loss": 1.9218,
"step": 606
},
{
"epoch": 0.4567343867569601,
"grad_norm": 1.227336652337294,
"learning_rate": 1.1877643503099414e-05,
"loss": 1.9228,
"step": 607
},
{
"epoch": 0.4574868322046652,
"grad_norm": 1.225160884147312,
"learning_rate": 1.1853699109006227e-05,
"loss": 1.9058,
"step": 608
},
{
"epoch": 0.4582392776523702,
"grad_norm": 1.1319989178296463,
"learning_rate": 1.1829743703756498e-05,
"loss": 1.8873,
"step": 609
},
{
"epoch": 0.45899172310007524,
"grad_norm": 1.1169787039662507,
"learning_rate": 1.1805777429647712e-05,
"loss": 1.907,
"step": 610
},
{
"epoch": 0.4597441685477803,
"grad_norm": 1.1924873190509993,
"learning_rate": 1.178180042904191e-05,
"loss": 1.9238,
"step": 611
},
{
"epoch": 0.4604966139954853,
"grad_norm": 1.1007458389053224,
"learning_rate": 1.1757812844364855e-05,
"loss": 1.924,
"step": 612
},
{
"epoch": 0.4612490594431904,
"grad_norm": 1.2317425336449068,
"learning_rate": 1.173381481810518e-05,
"loss": 1.8983,
"step": 613
},
{
"epoch": 0.4620015048908954,
"grad_norm": 1.1383407046368126,
"learning_rate": 1.1709806492813542e-05,
"loss": 1.8862,
"step": 614
},
{
"epoch": 0.46275395033860045,
"grad_norm": 1.1393957190868131,
"learning_rate": 1.168578801110177e-05,
"loss": 1.8733,
"step": 615
},
{
"epoch": 0.4635063957863055,
"grad_norm": 1.1578387041736653,
"learning_rate": 1.166175951564203e-05,
"loss": 1.8871,
"step": 616
},
{
"epoch": 0.4642588412340105,
"grad_norm": 1.1442121745613545,
"learning_rate": 1.1637721149165971e-05,
"loss": 1.8952,
"step": 617
},
{
"epoch": 0.4650112866817156,
"grad_norm": 1.1286978510186116,
"learning_rate": 1.161367305446387e-05,
"loss": 1.8836,
"step": 618
},
{
"epoch": 0.4657637321294206,
"grad_norm": 1.1144198432332384,
"learning_rate": 1.1589615374383793e-05,
"loss": 1.9021,
"step": 619
},
{
"epoch": 0.46651617757712566,
"grad_norm": 1.1656849073047229,
"learning_rate": 1.156554825183075e-05,
"loss": 1.8915,
"step": 620
},
{
"epoch": 0.4672686230248307,
"grad_norm": 1.1070432836307438,
"learning_rate": 1.1541471829765832e-05,
"loss": 1.8659,
"step": 621
},
{
"epoch": 0.4680210684725357,
"grad_norm": 1.1476741623097366,
"learning_rate": 1.1517386251205375e-05,
"loss": 1.9063,
"step": 622
},
{
"epoch": 0.4687735139202408,
"grad_norm": 1.2425410058956965,
"learning_rate": 1.1493291659220104e-05,
"loss": 1.862,
"step": 623
},
{
"epoch": 0.46952595936794583,
"grad_norm": 1.0898714571019255,
"learning_rate": 1.1469188196934289e-05,
"loss": 1.9243,
"step": 624
},
{
"epoch": 0.47027840481565086,
"grad_norm": 1.3005317768798952,
"learning_rate": 1.1445076007524877e-05,
"loss": 1.8881,
"step": 625
},
{
"epoch": 0.4710308502633559,
"grad_norm": 1.0928953630783256,
"learning_rate": 1.1420955234220675e-05,
"loss": 1.8784,
"step": 626
},
{
"epoch": 0.4717832957110609,
"grad_norm": 1.234578915827647,
"learning_rate": 1.1396826020301457e-05,
"loss": 1.893,
"step": 627
},
{
"epoch": 0.472535741158766,
"grad_norm": 1.23834691104855,
"learning_rate": 1.1372688509097158e-05,
"loss": 1.8951,
"step": 628
},
{
"epoch": 0.47328818660647104,
"grad_norm": 1.2195428302530935,
"learning_rate": 1.1348542843986983e-05,
"loss": 1.9252,
"step": 629
},
{
"epoch": 0.47404063205417607,
"grad_norm": 1.304315989532201,
"learning_rate": 1.1324389168398576e-05,
"loss": 1.8774,
"step": 630
},
{
"epoch": 0.4747930775018811,
"grad_norm": 1.1356632289988222,
"learning_rate": 1.1300227625807167e-05,
"loss": 1.8852,
"step": 631
},
{
"epoch": 0.47554552294958613,
"grad_norm": 1.1498242072773486,
"learning_rate": 1.1276058359734719e-05,
"loss": 1.9032,
"step": 632
},
{
"epoch": 0.4762979683972912,
"grad_norm": 1.1287999315504542,
"learning_rate": 1.1251881513749062e-05,
"loss": 1.89,
"step": 633
},
{
"epoch": 0.47705041384499625,
"grad_norm": 1.0784757132137233,
"learning_rate": 1.1227697231463062e-05,
"loss": 1.9437,
"step": 634
},
{
"epoch": 0.4778028592927013,
"grad_norm": 1.2084302997981904,
"learning_rate": 1.1203505656533756e-05,
"loss": 1.8735,
"step": 635
},
{
"epoch": 0.4785553047404063,
"grad_norm": 1.1579974338723793,
"learning_rate": 1.1179306932661496e-05,
"loss": 1.9078,
"step": 636
},
{
"epoch": 0.47930775018811134,
"grad_norm": 1.1963129178412915,
"learning_rate": 1.1155101203589102e-05,
"loss": 1.8955,
"step": 637
},
{
"epoch": 0.4800601956358164,
"grad_norm": 1.2600006035920857,
"learning_rate": 1.1130888613101007e-05,
"loss": 1.9206,
"step": 638
},
{
"epoch": 0.48081264108352145,
"grad_norm": 1.1585936490519133,
"learning_rate": 1.1106669305022397e-05,
"loss": 1.9132,
"step": 639
},
{
"epoch": 0.4815650865312265,
"grad_norm": 1.1795395026465916,
"learning_rate": 1.1082443423218366e-05,
"loss": 1.9059,
"step": 640
},
{
"epoch": 0.4823175319789315,
"grad_norm": 1.2057307091986988,
"learning_rate": 1.1058211111593054e-05,
"loss": 1.9062,
"step": 641
},
{
"epoch": 0.48306997742663654,
"grad_norm": 1.2093716057686956,
"learning_rate": 1.1033972514088793e-05,
"loss": 1.9084,
"step": 642
},
{
"epoch": 0.48382242287434163,
"grad_norm": 1.21922887222546,
"learning_rate": 1.1009727774685257e-05,
"loss": 1.914,
"step": 643
},
{
"epoch": 0.48457486832204666,
"grad_norm": 1.1017227116584083,
"learning_rate": 1.0985477037398606e-05,
"loss": 1.8853,
"step": 644
},
{
"epoch": 0.4853273137697517,
"grad_norm": 1.243030857383578,
"learning_rate": 1.096122044628062e-05,
"loss": 1.8779,
"step": 645
},
{
"epoch": 0.4860797592174567,
"grad_norm": 1.1688522153007397,
"learning_rate": 1.0936958145417858e-05,
"loss": 1.8715,
"step": 646
},
{
"epoch": 0.48683220466516175,
"grad_norm": 1.2084196282287973,
"learning_rate": 1.0912690278930791e-05,
"loss": 1.8975,
"step": 647
},
{
"epoch": 0.48758465011286684,
"grad_norm": 1.1144276846993084,
"learning_rate": 1.0888416990972957e-05,
"loss": 1.8722,
"step": 648
},
{
"epoch": 0.48833709556057187,
"grad_norm": 1.1159549517744598,
"learning_rate": 1.0864138425730088e-05,
"loss": 1.8937,
"step": 649
},
{
"epoch": 0.4890895410082769,
"grad_norm": 1.1528136304096126,
"learning_rate": 1.0839854727419273e-05,
"loss": 1.8959,
"step": 650
},
{
"epoch": 0.4898419864559819,
"grad_norm": 1.098994503962126,
"learning_rate": 1.0815566040288088e-05,
"loss": 1.9224,
"step": 651
},
{
"epoch": 0.49059443190368696,
"grad_norm": 1.1692367182422247,
"learning_rate": 1.0791272508613742e-05,
"loss": 1.9096,
"step": 652
},
{
"epoch": 0.49134687735139204,
"grad_norm": 1.0858510329449407,
"learning_rate": 1.0766974276702227e-05,
"loss": 1.9208,
"step": 653
},
{
"epoch": 0.49209932279909707,
"grad_norm": 1.0920245446839867,
"learning_rate": 1.0742671488887444e-05,
"loss": 1.8748,
"step": 654
},
{
"epoch": 0.4928517682468021,
"grad_norm": 1.039950515079351,
"learning_rate": 1.0718364289530363e-05,
"loss": 1.883,
"step": 655
},
{
"epoch": 0.49360421369450713,
"grad_norm": 1.0867885019727714,
"learning_rate": 1.0694052823018164e-05,
"loss": 1.8566,
"step": 656
},
{
"epoch": 0.49435665914221216,
"grad_norm": 1.130513170862527,
"learning_rate": 1.0669737233763363e-05,
"loss": 1.8725,
"step": 657
},
{
"epoch": 0.49510910458991725,
"grad_norm": 1.0508775184750423,
"learning_rate": 1.0645417666202978e-05,
"loss": 1.9208,
"step": 658
},
{
"epoch": 0.4958615500376223,
"grad_norm": 1.085211801025369,
"learning_rate": 1.0621094264797647e-05,
"loss": 1.9182,
"step": 659
},
{
"epoch": 0.4966139954853273,
"grad_norm": 1.1177303673818486,
"learning_rate": 1.0596767174030786e-05,
"loss": 1.8714,
"step": 660
},
{
"epoch": 0.49736644093303234,
"grad_norm": 1.0723598916819437,
"learning_rate": 1.0572436538407734e-05,
"loss": 1.8481,
"step": 661
},
{
"epoch": 0.49811888638073737,
"grad_norm": 1.0674243832308286,
"learning_rate": 1.054810250245487e-05,
"loss": 1.8906,
"step": 662
},
{
"epoch": 0.49887133182844245,
"grad_norm": 1.1329374377540429,
"learning_rate": 1.0523765210718783e-05,
"loss": 1.8936,
"step": 663
},
{
"epoch": 0.4996237772761475,
"grad_norm": 1.1021991553744763,
"learning_rate": 1.0499424807765408e-05,
"loss": 1.9226,
"step": 664
},
{
"epoch": 0.5003762227238525,
"grad_norm": 1.088496537481319,
"learning_rate": 1.0475081438179143e-05,
"loss": 1.9084,
"step": 665
},
{
"epoch": 0.5011286681715575,
"grad_norm": 1.1049385460563579,
"learning_rate": 1.045073524656202e-05,
"loss": 1.8967,
"step": 666
},
{
"epoch": 0.5018811136192626,
"grad_norm": 1.1299651634387053,
"learning_rate": 1.0426386377532836e-05,
"loss": 1.9249,
"step": 667
},
{
"epoch": 0.5026335590669676,
"grad_norm": 1.1535836021912884,
"learning_rate": 1.040203497572628e-05,
"loss": 1.8832,
"step": 668
},
{
"epoch": 0.5033860045146726,
"grad_norm": 1.163124019940607,
"learning_rate": 1.0377681185792102e-05,
"loss": 1.8719,
"step": 669
},
{
"epoch": 0.5041384499623778,
"grad_norm": 1.204505313618055,
"learning_rate": 1.0353325152394222e-05,
"loss": 1.8955,
"step": 670
},
{
"epoch": 0.5048908954100828,
"grad_norm": 1.121753952744389,
"learning_rate": 1.03289670202099e-05,
"loss": 1.8811,
"step": 671
},
{
"epoch": 0.5056433408577878,
"grad_norm": 1.1521793819043014,
"learning_rate": 1.030460693392885e-05,
"loss": 1.9098,
"step": 672
},
{
"epoch": 0.5063957863054929,
"grad_norm": 1.1762633994525786,
"learning_rate": 1.0280245038252403e-05,
"loss": 1.8782,
"step": 673
},
{
"epoch": 0.5071482317531979,
"grad_norm": 1.068204106148586,
"learning_rate": 1.0255881477892639e-05,
"loss": 1.9166,
"step": 674
},
{
"epoch": 0.5079006772009029,
"grad_norm": 1.109449060181758,
"learning_rate": 1.0231516397571521e-05,
"loss": 1.9115,
"step": 675
},
{
"epoch": 0.508653122648608,
"grad_norm": 1.1554160306209136,
"learning_rate": 1.020714994202004e-05,
"loss": 1.9027,
"step": 676
},
{
"epoch": 0.509405568096313,
"grad_norm": 1.0580861469041871,
"learning_rate": 1.018278225597736e-05,
"loss": 1.8885,
"step": 677
},
{
"epoch": 0.510158013544018,
"grad_norm": 1.1133809784298703,
"learning_rate": 1.0158413484189955e-05,
"loss": 1.8984,
"step": 678
},
{
"epoch": 0.510910458991723,
"grad_norm": 1.144705726755765,
"learning_rate": 1.0134043771410744e-05,
"loss": 1.9138,
"step": 679
},
{
"epoch": 0.5116629044394282,
"grad_norm": 1.2121017240253011,
"learning_rate": 1.0109673262398234e-05,
"loss": 1.8729,
"step": 680
},
{
"epoch": 0.5124153498871332,
"grad_norm": 1.1665475810913808,
"learning_rate": 1.0085302101915672e-05,
"loss": 1.8766,
"step": 681
},
{
"epoch": 0.5131677953348383,
"grad_norm": 1.11785953661259,
"learning_rate": 1.0060930434730162e-05,
"loss": 1.8933,
"step": 682
},
{
"epoch": 0.5139202407825433,
"grad_norm": 1.1250896450737091,
"learning_rate": 1.0036558405611832e-05,
"loss": 1.8886,
"step": 683
},
{
"epoch": 0.5146726862302483,
"grad_norm": 1.1599891870739656,
"learning_rate": 1.0012186159332944e-05,
"loss": 1.912,
"step": 684
},
{
"epoch": 0.5154251316779533,
"grad_norm": 1.1531253900509526,
"learning_rate": 9.98781384066706e-06,
"loss": 1.8899,
"step": 685
},
{
"epoch": 0.5161775771256584,
"grad_norm": 1.062817870478141,
"learning_rate": 9.963441594388172e-06,
"loss": 1.8746,
"step": 686
},
{
"epoch": 0.5169300225733634,
"grad_norm": 1.1189856102530844,
"learning_rate": 9.939069565269841e-06,
"loss": 1.8633,
"step": 687
},
{
"epoch": 0.5176824680210684,
"grad_norm": 1.147123577308819,
"learning_rate": 9.914697898084331e-06,
"loss": 1.8647,
"step": 688
},
{
"epoch": 0.5184349134687735,
"grad_norm": 1.120493259193293,
"learning_rate": 9.89032673760177e-06,
"loss": 1.8816,
"step": 689
},
{
"epoch": 0.5191873589164786,
"grad_norm": 1.1417755962249414,
"learning_rate": 9.865956228589259e-06,
"loss": 1.9022,
"step": 690
},
{
"epoch": 0.5199398043641836,
"grad_norm": 1.0959500520755308,
"learning_rate": 9.841586515810045e-06,
"loss": 1.91,
"step": 691
},
{
"epoch": 0.5206922498118887,
"grad_norm": 1.195232500098137,
"learning_rate": 9.817217744022641e-06,
"loss": 1.8794,
"step": 692
},
{
"epoch": 0.5214446952595937,
"grad_norm": 1.1402216362012891,
"learning_rate": 9.79285005797996e-06,
"loss": 1.8815,
"step": 693
},
{
"epoch": 0.5221971407072987,
"grad_norm": 1.1426612568021763,
"learning_rate": 9.768483602428482e-06,
"loss": 1.9119,
"step": 694
},
{
"epoch": 0.5229495861550038,
"grad_norm": 1.1112919218165922,
"learning_rate": 9.744118522107361e-06,
"loss": 1.896,
"step": 695
},
{
"epoch": 0.5237020316027088,
"grad_norm": 1.116695717577581,
"learning_rate": 9.719754961747599e-06,
"loss": 1.8433,
"step": 696
},
{
"epoch": 0.5244544770504138,
"grad_norm": 1.1802226338123998,
"learning_rate": 9.695393066071153e-06,
"loss": 1.9078,
"step": 697
},
{
"epoch": 0.5252069224981188,
"grad_norm": 1.092864745897081,
"learning_rate": 9.671032979790105e-06,
"loss": 1.865,
"step": 698
},
{
"epoch": 0.5259593679458239,
"grad_norm": 1.0680925642060044,
"learning_rate": 9.64667484760578e-06,
"loss": 1.8825,
"step": 699
},
{
"epoch": 0.526711813393529,
"grad_norm": 1.0893412359040773,
"learning_rate": 9.622318814207903e-06,
"loss": 1.8647,
"step": 700
},
{
"epoch": 0.527464258841234,
"grad_norm": 1.0847165970391075,
"learning_rate": 9.597965024273723e-06,
"loss": 1.902,
"step": 701
},
{
"epoch": 0.5282167042889391,
"grad_norm": 1.1569036394608034,
"learning_rate": 9.573613622467166e-06,
"loss": 1.9017,
"step": 702
},
{
"epoch": 0.5289691497366441,
"grad_norm": 1.113636140662339,
"learning_rate": 9.549264753437982e-06,
"loss": 1.8987,
"step": 703
},
{
"epoch": 0.5297215951843491,
"grad_norm": 1.1030560255859905,
"learning_rate": 9.524918561820857e-06,
"loss": 1.9042,
"step": 704
},
{
"epoch": 0.5304740406320542,
"grad_norm": 1.118395786423614,
"learning_rate": 9.500575192234595e-06,
"loss": 1.885,
"step": 705
},
{
"epoch": 0.5312264860797592,
"grad_norm": 1.1032008753423126,
"learning_rate": 9.476234789281215e-06,
"loss": 1.9095,
"step": 706
},
{
"epoch": 0.5319789315274642,
"grad_norm": 1.1984041099434453,
"learning_rate": 9.451897497545136e-06,
"loss": 1.9282,
"step": 707
},
{
"epoch": 0.5327313769751693,
"grad_norm": 1.0981284439472005,
"learning_rate": 9.427563461592271e-06,
"loss": 1.882,
"step": 708
},
{
"epoch": 0.5334838224228743,
"grad_norm": 1.073475056620095,
"learning_rate": 9.403232825969217e-06,
"loss": 1.8683,
"step": 709
},
{
"epoch": 0.5342362678705794,
"grad_norm": 1.1258299640952105,
"learning_rate": 9.378905735202356e-06,
"loss": 1.8933,
"step": 710
},
{
"epoch": 0.5349887133182845,
"grad_norm": 1.1356708255574468,
"learning_rate": 9.354582333797027e-06,
"loss": 1.8711,
"step": 711
},
{
"epoch": 0.5357411587659895,
"grad_norm": 1.1432882270023195,
"learning_rate": 9.330262766236638e-06,
"loss": 1.879,
"step": 712
},
{
"epoch": 0.5364936042136945,
"grad_norm": 1.7362507532960176,
"learning_rate": 9.305947176981843e-06,
"loss": 1.8804,
"step": 713
},
{
"epoch": 0.5372460496613995,
"grad_norm": 1.2221969928907117,
"learning_rate": 9.281635710469639e-06,
"loss": 1.9103,
"step": 714
},
{
"epoch": 0.5379984951091046,
"grad_norm": 1.0665877958349086,
"learning_rate": 9.25732851111256e-06,
"loss": 1.894,
"step": 715
},
{
"epoch": 0.5387509405568096,
"grad_norm": 1.0747197911177933,
"learning_rate": 9.233025723297776e-06,
"loss": 1.9136,
"step": 716
},
{
"epoch": 0.5395033860045146,
"grad_norm": 1.1126858970229665,
"learning_rate": 9.208727491386258e-06,
"loss": 1.8848,
"step": 717
},
{
"epoch": 0.5402558314522197,
"grad_norm": 1.081270348841783,
"learning_rate": 9.184433959711916e-06,
"loss": 1.8757,
"step": 718
},
{
"epoch": 0.5410082768999247,
"grad_norm": 1.1147773519142443,
"learning_rate": 9.160145272580729e-06,
"loss": 1.92,
"step": 719
},
{
"epoch": 0.5417607223476298,
"grad_norm": 1.0680444709497847,
"learning_rate": 9.135861574269917e-06,
"loss": 1.8958,
"step": 720
},
{
"epoch": 0.5425131677953349,
"grad_norm": 1.115208078916154,
"learning_rate": 9.111583009027048e-06,
"loss": 1.8995,
"step": 721
},
{
"epoch": 0.5432656132430399,
"grad_norm": 1.1102606043406795,
"learning_rate": 9.087309721069214e-06,
"loss": 1.8913,
"step": 722
},
{
"epoch": 0.5440180586907449,
"grad_norm": 1.1112655401939273,
"learning_rate": 9.063041854582145e-06,
"loss": 1.9015,
"step": 723
},
{
"epoch": 0.54477050413845,
"grad_norm": 1.135076975109515,
"learning_rate": 9.038779553719386e-06,
"loss": 1.8825,
"step": 724
},
{
"epoch": 0.545522949586155,
"grad_norm": 1.0931111788383723,
"learning_rate": 9.014522962601398e-06,
"loss": 1.8576,
"step": 725
},
{
"epoch": 0.54627539503386,
"grad_norm": 1.10930274293935,
"learning_rate": 8.990272225314743e-06,
"loss": 1.8819,
"step": 726
},
{
"epoch": 0.547027840481565,
"grad_norm": 1.088798229317648,
"learning_rate": 8.96602748591121e-06,
"loss": 1.8752,
"step": 727
},
{
"epoch": 0.5477802859292701,
"grad_norm": 1.0972073968721745,
"learning_rate": 8.941788888406948e-06,
"loss": 1.8284,
"step": 728
},
{
"epoch": 0.5485327313769752,
"grad_norm": 1.081280728240555,
"learning_rate": 8.917556576781638e-06,
"loss": 1.8781,
"step": 729
},
{
"epoch": 0.5492851768246803,
"grad_norm": 1.0607585073434993,
"learning_rate": 8.893330694977606e-06,
"loss": 1.8805,
"step": 730
},
{
"epoch": 0.5500376222723853,
"grad_norm": 1.103092240375503,
"learning_rate": 8.869111386898997e-06,
"loss": 1.8727,
"step": 731
},
{
"epoch": 0.5507900677200903,
"grad_norm": 1.1072208953139326,
"learning_rate": 8.844898796410901e-06,
"loss": 1.8962,
"step": 732
},
{
"epoch": 0.5515425131677953,
"grad_norm": 1.0816873098353128,
"learning_rate": 8.820693067338507e-06,
"loss": 1.8606,
"step": 733
},
{
"epoch": 0.5522949586155004,
"grad_norm": 1.0648510597859977,
"learning_rate": 8.796494343466247e-06,
"loss": 1.8902,
"step": 734
},
{
"epoch": 0.5530474040632054,
"grad_norm": 1.146842548525928,
"learning_rate": 8.772302768536943e-06,
"loss": 1.915,
"step": 735
},
{
"epoch": 0.5537998495109104,
"grad_norm": 1.1465613740505685,
"learning_rate": 8.748118486250942e-06,
"loss": 1.8951,
"step": 736
},
{
"epoch": 0.5545522949586155,
"grad_norm": 1.093964552351514,
"learning_rate": 8.723941640265283e-06,
"loss": 1.8838,
"step": 737
},
{
"epoch": 0.5553047404063205,
"grad_norm": 1.116061516697543,
"learning_rate": 8.699772374192835e-06,
"loss": 1.885,
"step": 738
},
{
"epoch": 0.5560571858540256,
"grad_norm": 1.0985197893921672,
"learning_rate": 8.675610831601424e-06,
"loss": 1.8882,
"step": 739
},
{
"epoch": 0.5568096313017307,
"grad_norm": 1.1484046699918495,
"learning_rate": 8.65145715601302e-06,
"loss": 1.8949,
"step": 740
},
{
"epoch": 0.5575620767494357,
"grad_norm": 1.135920825638118,
"learning_rate": 8.627311490902843e-06,
"loss": 1.8746,
"step": 741
},
{
"epoch": 0.5583145221971407,
"grad_norm": 1.082262942097127,
"learning_rate": 8.603173979698544e-06,
"loss": 1.8595,
"step": 742
},
{
"epoch": 0.5590669676448458,
"grad_norm": 1.0995492221575693,
"learning_rate": 8.579044765779329e-06,
"loss": 1.8887,
"step": 743
},
{
"epoch": 0.5598194130925508,
"grad_norm": 1.0300135714419392,
"learning_rate": 8.554923992475126e-06,
"loss": 1.8774,
"step": 744
},
{
"epoch": 0.5605718585402558,
"grad_norm": 1.0811930693480996,
"learning_rate": 8.530811803065715e-06,
"loss": 1.9057,
"step": 745
},
{
"epoch": 0.5613243039879608,
"grad_norm": 1.121110234768637,
"learning_rate": 8.5067083407799e-06,
"loss": 1.9023,
"step": 746
},
{
"epoch": 0.5620767494356659,
"grad_norm": 1.106803540080784,
"learning_rate": 8.482613748794628e-06,
"loss": 1.9079,
"step": 747
},
{
"epoch": 0.5628291948833709,
"grad_norm": 1.0582586304621557,
"learning_rate": 8.458528170234171e-06,
"loss": 1.8791,
"step": 748
},
{
"epoch": 0.563581640331076,
"grad_norm": 1.075880008538793,
"learning_rate": 8.434451748169255e-06,
"loss": 1.8817,
"step": 749
},
{
"epoch": 0.5643340857787811,
"grad_norm": 1.0752574996218844,
"learning_rate": 8.410384625616208e-06,
"loss": 1.8713,
"step": 750
},
{
"epoch": 0.5650865312264861,
"grad_norm": 1.0657448451640463,
"learning_rate": 8.386326945536134e-06,
"loss": 1.8891,
"step": 751
},
{
"epoch": 0.5658389766741911,
"grad_norm": 1.0696496973223852,
"learning_rate": 8.36227885083403e-06,
"loss": 1.8647,
"step": 752
},
{
"epoch": 0.5665914221218962,
"grad_norm": 1.0759312819478026,
"learning_rate": 8.338240484357971e-06,
"loss": 1.882,
"step": 753
},
{
"epoch": 0.5673438675696012,
"grad_norm": 1.064206035000726,
"learning_rate": 8.31421198889823e-06,
"loss": 1.8846,
"step": 754
},
{
"epoch": 0.5680963130173062,
"grad_norm": 1.0466413284676448,
"learning_rate": 8.290193507186464e-06,
"loss": 1.8739,
"step": 755
},
{
"epoch": 0.5688487584650113,
"grad_norm": 1.085335176042622,
"learning_rate": 8.266185181894821e-06,
"loss": 1.8753,
"step": 756
},
{
"epoch": 0.5696012039127163,
"grad_norm": 1.0643101282268683,
"learning_rate": 8.24218715563515e-06,
"loss": 1.871,
"step": 757
},
{
"epoch": 0.5703536493604213,
"grad_norm": 1.0834453992837438,
"learning_rate": 8.218199570958094e-06,
"loss": 1.8884,
"step": 758
},
{
"epoch": 0.5711060948081265,
"grad_norm": 1.0821042807623824,
"learning_rate": 8.194222570352295e-06,
"loss": 1.892,
"step": 759
},
{
"epoch": 0.5718585402558315,
"grad_norm": 1.0748421078468517,
"learning_rate": 8.170256296243505e-06,
"loss": 1.89,
"step": 760
},
{
"epoch": 0.5726109857035365,
"grad_norm": 1.1102151774384843,
"learning_rate": 8.146300890993776e-06,
"loss": 1.8943,
"step": 761
},
{
"epoch": 0.5733634311512416,
"grad_norm": 1.1746792828903914,
"learning_rate": 8.12235649690059e-06,
"loss": 1.8991,
"step": 762
},
{
"epoch": 0.5741158765989466,
"grad_norm": 1.0842197587398976,
"learning_rate": 8.098423256196018e-06,
"loss": 1.8804,
"step": 763
},
{
"epoch": 0.5748683220466516,
"grad_norm": 1.0702520842016043,
"learning_rate": 8.074501311045892e-06,
"loss": 1.8774,
"step": 764
},
{
"epoch": 0.5756207674943566,
"grad_norm": 1.0693456442492546,
"learning_rate": 8.050590803548927e-06,
"loss": 1.8699,
"step": 765
},
{
"epoch": 0.5763732129420617,
"grad_norm": 1.0994793788492523,
"learning_rate": 8.026691875735918e-06,
"loss": 1.8726,
"step": 766
},
{
"epoch": 0.5771256583897667,
"grad_norm": 1.1377783027758315,
"learning_rate": 8.00280466956886e-06,
"loss": 1.8777,
"step": 767
},
{
"epoch": 0.5778781038374717,
"grad_norm": 1.0434648434735232,
"learning_rate": 7.978929326940135e-06,
"loss": 1.8767,
"step": 768
},
{
"epoch": 0.5786305492851769,
"grad_norm": 1.139732504886554,
"learning_rate": 7.955065989671636e-06,
"loss": 1.8927,
"step": 769
},
{
"epoch": 0.5793829947328819,
"grad_norm": 1.1081392751420618,
"learning_rate": 7.93121479951396e-06,
"loss": 1.8867,
"step": 770
},
{
"epoch": 0.5801354401805869,
"grad_norm": 1.0920534874359324,
"learning_rate": 7.907375898145538e-06,
"loss": 1.8562,
"step": 771
},
{
"epoch": 0.580887885628292,
"grad_norm": 1.053434184714466,
"learning_rate": 7.883549427171806e-06,
"loss": 1.8967,
"step": 772
},
{
"epoch": 0.581640331075997,
"grad_norm": 1.1026958023882896,
"learning_rate": 7.859735528124375e-06,
"loss": 1.8572,
"step": 773
},
{
"epoch": 0.582392776523702,
"grad_norm": 1.0836945065636798,
"learning_rate": 7.835934342460156e-06,
"loss": 1.8508,
"step": 774
},
{
"epoch": 0.5831452219714071,
"grad_norm": 1.0342001755072274,
"learning_rate": 7.81214601156056e-06,
"loss": 1.869,
"step": 775
},
{
"epoch": 0.5838976674191121,
"grad_norm": 1.0576579840286333,
"learning_rate": 7.788370676730625e-06,
"loss": 1.9003,
"step": 776
},
{
"epoch": 0.5846501128668171,
"grad_norm": 1.0459117857943212,
"learning_rate": 7.764608479198204e-06,
"loss": 1.8917,
"step": 777
},
{
"epoch": 0.5854025583145221,
"grad_norm": 1.0587258910785786,
"learning_rate": 7.740859560113101e-06,
"loss": 1.8724,
"step": 778
},
{
"epoch": 0.5861550037622273,
"grad_norm": 1.1043700099821472,
"learning_rate": 7.717124060546254e-06,
"loss": 1.9001,
"step": 779
},
{
"epoch": 0.5869074492099323,
"grad_norm": 1.0373401986358386,
"learning_rate": 7.693402121488884e-06,
"loss": 1.8792,
"step": 780
},
{
"epoch": 0.5876598946576373,
"grad_norm": 1.078222662916046,
"learning_rate": 7.669693883851663e-06,
"loss": 1.8774,
"step": 781
},
{
"epoch": 0.5884123401053424,
"grad_norm": 1.1004587622411446,
"learning_rate": 7.645999488463867e-06,
"loss": 1.8825,
"step": 782
},
{
"epoch": 0.5891647855530474,
"grad_norm": 1.03948179824543,
"learning_rate": 7.622319076072564e-06,
"loss": 1.8709,
"step": 783
},
{
"epoch": 0.5899172310007524,
"grad_norm": 1.0409662005331402,
"learning_rate": 7.598652787341744e-06,
"loss": 1.9015,
"step": 784
},
{
"epoch": 0.5906696764484575,
"grad_norm": 1.048861728744586,
"learning_rate": 7.575000762851511e-06,
"loss": 1.8375,
"step": 785
},
{
"epoch": 0.5914221218961625,
"grad_norm": 1.0427458613752496,
"learning_rate": 7.551363143097244e-06,
"loss": 1.8533,
"step": 786
},
{
"epoch": 0.5921745673438675,
"grad_norm": 1.0560205662208115,
"learning_rate": 7.527740068488735e-06,
"loss": 1.8801,
"step": 787
},
{
"epoch": 0.5929270127915726,
"grad_norm": 1.0284376139487794,
"learning_rate": 7.504131679349402e-06,
"loss": 1.9211,
"step": 788
},
{
"epoch": 0.5936794582392777,
"grad_norm": 1.10045531344144,
"learning_rate": 7.48053811591541e-06,
"loss": 1.8868,
"step": 789
},
{
"epoch": 0.5944319036869827,
"grad_norm": 1.0707064415323388,
"learning_rate": 7.456959518334871e-06,
"loss": 1.8675,
"step": 790
},
{
"epoch": 0.5951843491346878,
"grad_norm": 1.0678604180332616,
"learning_rate": 7.4333960266669855e-06,
"loss": 1.8869,
"step": 791
},
{
"epoch": 0.5959367945823928,
"grad_norm": 1.0335979975813494,
"learning_rate": 7.409847780881241e-06,
"loss": 1.868,
"step": 792
},
{
"epoch": 0.5966892400300978,
"grad_norm": 1.0768514093790162,
"learning_rate": 7.386314920856546e-06,
"loss": 1.8905,
"step": 793
},
{
"epoch": 0.5974416854778029,
"grad_norm": 1.04450964575544,
"learning_rate": 7.362797586380423e-06,
"loss": 1.8627,
"step": 794
},
{
"epoch": 0.5981941309255079,
"grad_norm": 1.1060259185740902,
"learning_rate": 7.339295917148173e-06,
"loss": 1.8864,
"step": 795
},
{
"epoch": 0.5989465763732129,
"grad_norm": 1.1120615052858698,
"learning_rate": 7.315810052762038e-06,
"loss": 1.8676,
"step": 796
},
{
"epoch": 0.5996990218209179,
"grad_norm": 1.0753811359289978,
"learning_rate": 7.292340132730389e-06,
"loss": 1.8951,
"step": 797
},
{
"epoch": 0.600451467268623,
"grad_norm": 1.0644335012475332,
"learning_rate": 7.268886296466871e-06,
"loss": 1.9045,
"step": 798
},
{
"epoch": 0.6012039127163281,
"grad_norm": 1.1095391720208168,
"learning_rate": 7.245448683289605e-06,
"loss": 1.8685,
"step": 799
},
{
"epoch": 0.6019563581640331,
"grad_norm": 1.0893258646896942,
"learning_rate": 7.222027432420329e-06,
"loss": 1.8843,
"step": 800
},
{
"epoch": 0.6027088036117382,
"grad_norm": 1.0699625038783227,
"learning_rate": 7.198622682983603e-06,
"loss": 1.8948,
"step": 801
},
{
"epoch": 0.6034612490594432,
"grad_norm": 1.0648616152152532,
"learning_rate": 7.1752345740059536e-06,
"loss": 1.867,
"step": 802
},
{
"epoch": 0.6042136945071482,
"grad_norm": 1.0824479797332096,
"learning_rate": 7.151863244415076e-06,
"loss": 1.8857,
"step": 803
},
{
"epoch": 0.6049661399548533,
"grad_norm": 1.0882658999777497,
"learning_rate": 7.128508833038976e-06,
"loss": 1.8616,
"step": 804
},
{
"epoch": 0.6057185854025583,
"grad_norm": 1.0346524470102498,
"learning_rate": 7.105171478605182e-06,
"loss": 1.9004,
"step": 805
},
{
"epoch": 0.6064710308502633,
"grad_norm": 1.1148746521459307,
"learning_rate": 7.081851319739884e-06,
"loss": 1.9037,
"step": 806
},
{
"epoch": 0.6072234762979684,
"grad_norm": 1.104795271634037,
"learning_rate": 7.0585484949671475e-06,
"loss": 1.8581,
"step": 807
},
{
"epoch": 0.6079759217456734,
"grad_norm": 1.1287536280744406,
"learning_rate": 7.035263142708058e-06,
"loss": 1.8719,
"step": 808
},
{
"epoch": 0.6087283671933785,
"grad_norm": 1.065962601783126,
"learning_rate": 7.0119954012799195e-06,
"loss": 1.8816,
"step": 809
},
{
"epoch": 0.6094808126410836,
"grad_norm": 1.0420686856614378,
"learning_rate": 6.988745408895424e-06,
"loss": 1.8833,
"step": 810
},
{
"epoch": 0.6102332580887886,
"grad_norm": 1.1389179070263824,
"learning_rate": 6.965513303661826e-06,
"loss": 1.8627,
"step": 811
},
{
"epoch": 0.6109857035364936,
"grad_norm": 1.0899571897340397,
"learning_rate": 6.942299223580144e-06,
"loss": 1.862,
"step": 812
},
{
"epoch": 0.6117381489841986,
"grad_norm": 1.094079979917299,
"learning_rate": 6.9191033065443045e-06,
"loss": 1.8353,
"step": 813
},
{
"epoch": 0.6124905944319037,
"grad_norm": 1.2384822137857412,
"learning_rate": 6.895925690340367e-06,
"loss": 1.8737,
"step": 814
},
{
"epoch": 0.6132430398796087,
"grad_norm": 1.1178163868195434,
"learning_rate": 6.872766512645661e-06,
"loss": 1.8989,
"step": 815
},
{
"epoch": 0.6139954853273137,
"grad_norm": 1.0834192704436556,
"learning_rate": 6.849625911028005e-06,
"loss": 1.886,
"step": 816
},
{
"epoch": 0.6147479307750188,
"grad_norm": 1.18098704995797,
"learning_rate": 6.826504022944862e-06,
"loss": 1.8845,
"step": 817
},
{
"epoch": 0.6155003762227238,
"grad_norm": 1.084160480633442,
"learning_rate": 6.803400985742545e-06,
"loss": 1.8831,
"step": 818
},
{
"epoch": 0.6162528216704289,
"grad_norm": 1.0424871195462626,
"learning_rate": 6.780316936655382e-06,
"loss": 1.8569,
"step": 819
},
{
"epoch": 0.617005267118134,
"grad_norm": 1.120406511405237,
"learning_rate": 6.7572520128049164e-06,
"loss": 1.8348,
"step": 820
},
{
"epoch": 0.617757712565839,
"grad_norm": 1.1095836180486962,
"learning_rate": 6.734206351199086e-06,
"loss": 1.8549,
"step": 821
},
{
"epoch": 0.618510158013544,
"grad_norm": 1.0363101716143148,
"learning_rate": 6.7111800887314e-06,
"loss": 1.8466,
"step": 822
},
{
"epoch": 0.6192626034612491,
"grad_norm": 1.2180940023345548,
"learning_rate": 6.688173362180148e-06,
"loss": 1.8787,
"step": 823
},
{
"epoch": 0.6200150489089541,
"grad_norm": 1.14763630470459,
"learning_rate": 6.665186308207562e-06,
"loss": 1.8503,
"step": 824
},
{
"epoch": 0.6207674943566591,
"grad_norm": 1.0369034496090916,
"learning_rate": 6.642219063359023e-06,
"loss": 1.8796,
"step": 825
},
{
"epoch": 0.6215199398043642,
"grad_norm": 1.0905320538315506,
"learning_rate": 6.619271764062244e-06,
"loss": 1.8565,
"step": 826
},
{
"epoch": 0.6222723852520692,
"grad_norm": 1.1322470515207577,
"learning_rate": 6.596344546626461e-06,
"loss": 1.8824,
"step": 827
},
{
"epoch": 0.6230248306997742,
"grad_norm": 1.0718317317034252,
"learning_rate": 6.5734375472416115e-06,
"loss": 1.8731,
"step": 828
},
{
"epoch": 0.6237772761474794,
"grad_norm": 1.0874087841372506,
"learning_rate": 6.550550901977552e-06,
"loss": 1.8818,
"step": 829
},
{
"epoch": 0.6245297215951844,
"grad_norm": 1.127242196846014,
"learning_rate": 6.527684746783221e-06,
"loss": 1.8704,
"step": 830
},
{
"epoch": 0.6252821670428894,
"grad_norm": 1.0942518078833112,
"learning_rate": 6.5048392174858465e-06,
"loss": 1.8605,
"step": 831
},
{
"epoch": 0.6260346124905944,
"grad_norm": 1.0639590580353147,
"learning_rate": 6.482014449790145e-06,
"loss": 1.8858,
"step": 832
},
{
"epoch": 0.6267870579382995,
"grad_norm": 1.0734319890028652,
"learning_rate": 6.459210579277492e-06,
"loss": 1.9042,
"step": 833
},
{
"epoch": 0.6275395033860045,
"grad_norm": 1.0272161152972843,
"learning_rate": 6.4364277414051465e-06,
"loss": 1.8535,
"step": 834
},
{
"epoch": 0.6282919488337095,
"grad_norm": 1.1019719052408967,
"learning_rate": 6.41366607150542e-06,
"loss": 1.8581,
"step": 835
},
{
"epoch": 0.6290443942814146,
"grad_norm": 1.066530671040146,
"learning_rate": 6.390925704784894e-06,
"loss": 1.8616,
"step": 836
},
{
"epoch": 0.6297968397291196,
"grad_norm": 1.0489151200874183,
"learning_rate": 6.368206776323593e-06,
"loss": 1.8662,
"step": 837
},
{
"epoch": 0.6305492851768246,
"grad_norm": 1.1038234385827237,
"learning_rate": 6.345509421074218e-06,
"loss": 1.8763,
"step": 838
},
{
"epoch": 0.6313017306245298,
"grad_norm": 1.1035165845816162,
"learning_rate": 6.322833773861296e-06,
"loss": 1.8889,
"step": 839
},
{
"epoch": 0.6320541760722348,
"grad_norm": 1.056796594189293,
"learning_rate": 6.300179969380435e-06,
"loss": 1.8894,
"step": 840
},
{
"epoch": 0.6328066215199398,
"grad_norm": 1.1262551747815883,
"learning_rate": 6.277548142197468e-06,
"loss": 1.8652,
"step": 841
},
{
"epoch": 0.6335590669676449,
"grad_norm": 1.1226312501088387,
"learning_rate": 6.254938426747697e-06,
"loss": 1.8603,
"step": 842
},
{
"epoch": 0.6343115124153499,
"grad_norm": 1.0628514257337833,
"learning_rate": 6.232350957335078e-06,
"loss": 1.8661,
"step": 843
},
{
"epoch": 0.6350639578630549,
"grad_norm": 1.0614321342821789,
"learning_rate": 6.2097858681314115e-06,
"loss": 1.8483,
"step": 844
},
{
"epoch": 0.63581640331076,
"grad_norm": 1.0956810703157411,
"learning_rate": 6.187243293175573e-06,
"loss": 1.8522,
"step": 845
},
{
"epoch": 0.636568848758465,
"grad_norm": 1.0316647820147749,
"learning_rate": 6.164723366372688e-06,
"loss": 1.8246,
"step": 846
},
{
"epoch": 0.63732129420617,
"grad_norm": 1.059185688849693,
"learning_rate": 6.142226221493359e-06,
"loss": 1.879,
"step": 847
},
{
"epoch": 0.6380737396538751,
"grad_norm": 1.1116975998613274,
"learning_rate": 6.119751992172853e-06,
"loss": 1.9026,
"step": 848
},
{
"epoch": 0.6388261851015802,
"grad_norm": 1.053606056443976,
"learning_rate": 6.097300811910327e-06,
"loss": 1.8927,
"step": 849
},
{
"epoch": 0.6395786305492852,
"grad_norm": 1.0128831623077768,
"learning_rate": 6.07487281406801e-06,
"loss": 1.8792,
"step": 850
},
{
"epoch": 0.6403310759969902,
"grad_norm": 1.0373070563762181,
"learning_rate": 6.052468131870444e-06,
"loss": 1.9109,
"step": 851
},
{
"epoch": 0.6410835214446953,
"grad_norm": 1.0938469620283464,
"learning_rate": 6.030086898403652e-06,
"loss": 1.8867,
"step": 852
},
{
"epoch": 0.6418359668924003,
"grad_norm": 1.0718195776015125,
"learning_rate": 6.007729246614387e-06,
"loss": 1.829,
"step": 853
},
{
"epoch": 0.6425884123401053,
"grad_norm": 1.0695852387237827,
"learning_rate": 5.985395309309319e-06,
"loss": 1.859,
"step": 854
},
{
"epoch": 0.6433408577878104,
"grad_norm": 1.0800909238629528,
"learning_rate": 5.963085219154247e-06,
"loss": 1.8781,
"step": 855
},
{
"epoch": 0.6440933032355154,
"grad_norm": 1.0633913537814792,
"learning_rate": 5.94079910867332e-06,
"loss": 1.897,
"step": 856
},
{
"epoch": 0.6448457486832204,
"grad_norm": 1.046832444949393,
"learning_rate": 5.918537110248244e-06,
"loss": 1.8757,
"step": 857
},
{
"epoch": 0.6455981941309256,
"grad_norm": 1.125937864462823,
"learning_rate": 5.896299356117501e-06,
"loss": 1.8509,
"step": 858
},
{
"epoch": 0.6463506395786306,
"grad_norm": 1.068935112590435,
"learning_rate": 5.874085978375548e-06,
"loss": 1.8652,
"step": 859
},
{
"epoch": 0.6471030850263356,
"grad_norm": 1.0454577081772816,
"learning_rate": 5.8518971089720626e-06,
"loss": 1.8545,
"step": 860
},
{
"epoch": 0.6478555304740407,
"grad_norm": 1.066166566570173,
"learning_rate": 5.829732879711116e-06,
"loss": 1.8513,
"step": 861
},
{
"epoch": 0.6486079759217457,
"grad_norm": 1.08756382287716,
"learning_rate": 5.807593422250441e-06,
"loss": 1.8762,
"step": 862
},
{
"epoch": 0.6493604213694507,
"grad_norm": 1.0583008995308651,
"learning_rate": 5.785478868100604e-06,
"loss": 1.8671,
"step": 863
},
{
"epoch": 0.6501128668171557,
"grad_norm": 1.061436568318655,
"learning_rate": 5.763389348624251e-06,
"loss": 1.8756,
"step": 864
},
{
"epoch": 0.6508653122648608,
"grad_norm": 1.0421654680383785,
"learning_rate": 5.741324995035318e-06,
"loss": 1.8487,
"step": 865
},
{
"epoch": 0.6516177577125658,
"grad_norm": 1.0610240436422747,
"learning_rate": 5.719285938398254e-06,
"loss": 1.8559,
"step": 866
},
{
"epoch": 0.6523702031602708,
"grad_norm": 1.10634321610701,
"learning_rate": 5.69727230962724e-06,
"loss": 1.8823,
"step": 867
},
{
"epoch": 0.653122648607976,
"grad_norm": 1.0685330592231643,
"learning_rate": 5.675284239485415e-06,
"loss": 1.8663,
"step": 868
},
{
"epoch": 0.653875094055681,
"grad_norm": 1.083642864531283,
"learning_rate": 5.653321858584095e-06,
"loss": 1.8466,
"step": 869
},
{
"epoch": 0.654627539503386,
"grad_norm": 1.0450582392175871,
"learning_rate": 5.631385297382004e-06,
"loss": 1.8813,
"step": 870
},
{
"epoch": 0.6553799849510911,
"grad_norm": 1.063990757121348,
"learning_rate": 5.609474686184488e-06,
"loss": 1.8672,
"step": 871
},
{
"epoch": 0.6561324303987961,
"grad_norm": 1.0673884066784576,
"learning_rate": 5.58759015514275e-06,
"loss": 1.891,
"step": 872
},
{
"epoch": 0.6568848758465011,
"grad_norm": 1.0476852428703887,
"learning_rate": 5.565731834253077e-06,
"loss": 1.8932,
"step": 873
},
{
"epoch": 0.6576373212942062,
"grad_norm": 1.0657447611086608,
"learning_rate": 5.543899853356062e-06,
"loss": 1.8678,
"step": 874
},
{
"epoch": 0.6583897667419112,
"grad_norm": 1.0299069267659997,
"learning_rate": 5.522094342135835e-06,
"loss": 1.8575,
"step": 875
},
{
"epoch": 0.6591422121896162,
"grad_norm": 1.0216200871570382,
"learning_rate": 5.500315430119298e-06,
"loss": 1.8882,
"step": 876
},
{
"epoch": 0.6598946576373212,
"grad_norm": 1.0101377471451225,
"learning_rate": 5.478563246675345e-06,
"loss": 1.8788,
"step": 877
},
{
"epoch": 0.6606471030850264,
"grad_norm": 1.03088011820034,
"learning_rate": 5.456837921014105e-06,
"loss": 1.8731,
"step": 878
},
{
"epoch": 0.6613995485327314,
"grad_norm": 1.032314642198546,
"learning_rate": 5.4351395821861665e-06,
"loss": 1.8403,
"step": 879
},
{
"epoch": 0.6621519939804364,
"grad_norm": 1.0501227263040016,
"learning_rate": 5.413468359081814e-06,
"loss": 1.8573,
"step": 880
},
{
"epoch": 0.6629044394281415,
"grad_norm": 1.045156869690123,
"learning_rate": 5.391824380430262e-06,
"loss": 1.8559,
"step": 881
},
{
"epoch": 0.6636568848758465,
"grad_norm": 1.0291749582131042,
"learning_rate": 5.3702077747988904e-06,
"loss": 1.8707,
"step": 882
},
{
"epoch": 0.6644093303235515,
"grad_norm": 1.0592190385996014,
"learning_rate": 5.3486186705924785e-06,
"loss": 1.8623,
"step": 883
},
{
"epoch": 0.6651617757712566,
"grad_norm": 1.1414274311523724,
"learning_rate": 5.327057196052449e-06,
"loss": 1.8527,
"step": 884
},
{
"epoch": 0.6659142212189616,
"grad_norm": 1.0380358643535323,
"learning_rate": 5.305523479256096e-06,
"loss": 1.8267,
"step": 885
},
{
"epoch": 0.6666666666666666,
"grad_norm": 1.1355095130539972,
"learning_rate": 5.284017648115837e-06,
"loss": 1.84,
"step": 886
},
{
"epoch": 0.6674191121143717,
"grad_norm": 1.0486771971224826,
"learning_rate": 5.262539830378438e-06,
"loss": 1.8217,
"step": 887
},
{
"epoch": 0.6681715575620768,
"grad_norm": 1.0136027505778908,
"learning_rate": 5.241090153624264e-06,
"loss": 1.8779,
"step": 888
},
{
"epoch": 0.6689240030097818,
"grad_norm": 1.058914963648435,
"learning_rate": 5.219668745266533e-06,
"loss": 1.8936,
"step": 889
},
{
"epoch": 0.6696764484574869,
"grad_norm": 1.0646997899559358,
"learning_rate": 5.198275732550522e-06,
"loss": 1.8627,
"step": 890
},
{
"epoch": 0.6704288939051919,
"grad_norm": 1.03999716636285,
"learning_rate": 5.17691124255286e-06,
"loss": 1.8391,
"step": 891
},
{
"epoch": 0.6711813393528969,
"grad_norm": 1.029504367840649,
"learning_rate": 5.155575402180721e-06,
"loss": 1.8265,
"step": 892
},
{
"epoch": 0.671933784800602,
"grad_norm": 1.037070735541169,
"learning_rate": 5.134268338171133e-06,
"loss": 1.8645,
"step": 893
},
{
"epoch": 0.672686230248307,
"grad_norm": 1.0300807634865214,
"learning_rate": 5.1129901770901525e-06,
"loss": 1.8881,
"step": 894
},
{
"epoch": 0.673438675696012,
"grad_norm": 1.0101680459853548,
"learning_rate": 5.091741045332173e-06,
"loss": 1.8287,
"step": 895
},
{
"epoch": 0.674191121143717,
"grad_norm": 1.0377526220275621,
"learning_rate": 5.070521069119143e-06,
"loss": 1.8574,
"step": 896
},
{
"epoch": 0.6749435665914221,
"grad_norm": 1.098563839124832,
"learning_rate": 5.049330374499826e-06,
"loss": 1.855,
"step": 897
},
{
"epoch": 0.6756960120391272,
"grad_norm": 1.053244681331422,
"learning_rate": 5.028169087349051e-06,
"loss": 1.8632,
"step": 898
},
{
"epoch": 0.6764484574868322,
"grad_norm": 1.062194461309525,
"learning_rate": 5.0070373333669595e-06,
"loss": 1.867,
"step": 899
},
{
"epoch": 0.6772009029345373,
"grad_norm": 1.0634769068276029,
"learning_rate": 4.98593523807827e-06,
"loss": 1.8698,
"step": 900
},
{
"epoch": 0.6779533483822423,
"grad_norm": 1.064534835156436,
"learning_rate": 4.964862926831513e-06,
"loss": 1.8487,
"step": 901
},
{
"epoch": 0.6787057938299473,
"grad_norm": 1.0965470149807914,
"learning_rate": 4.94382052479832e-06,
"loss": 1.8903,
"step": 902
},
{
"epoch": 0.6794582392776524,
"grad_norm": 1.0474009117789016,
"learning_rate": 4.922808156972633e-06,
"loss": 1.8768,
"step": 903
},
{
"epoch": 0.6802106847253574,
"grad_norm": 1.0797473026706037,
"learning_rate": 4.901825948170013e-06,
"loss": 1.8515,
"step": 904
},
{
"epoch": 0.6809631301730624,
"grad_norm": 1.0813921908340174,
"learning_rate": 4.880874023026847e-06,
"loss": 1.8091,
"step": 905
},
{
"epoch": 0.6817155756207675,
"grad_norm": 1.0571610160922023,
"learning_rate": 4.859952505999663e-06,
"loss": 1.8747,
"step": 906
},
{
"epoch": 0.6824680210684725,
"grad_norm": 1.0276296079598137,
"learning_rate": 4.839061521364332e-06,
"loss": 1.8537,
"step": 907
},
{
"epoch": 0.6832204665161776,
"grad_norm": 1.0623850760569649,
"learning_rate": 4.81820119321539e-06,
"loss": 1.8546,
"step": 908
},
{
"epoch": 0.6839729119638827,
"grad_norm": 1.0605487774621996,
"learning_rate": 4.79737164546524e-06,
"loss": 1.9073,
"step": 909
},
{
"epoch": 0.6847253574115877,
"grad_norm": 1.055775521933287,
"learning_rate": 4.776573001843475e-06,
"loss": 1.8941,
"step": 910
},
{
"epoch": 0.6854778028592927,
"grad_norm": 1.0161717838590547,
"learning_rate": 4.75580538589609e-06,
"loss": 1.836,
"step": 911
},
{
"epoch": 0.6862302483069977,
"grad_norm": 1.005689943207561,
"learning_rate": 4.735068920984786e-06,
"loss": 1.8764,
"step": 912
},
{
"epoch": 0.6869826937547028,
"grad_norm": 1.0316928323984398,
"learning_rate": 4.714363730286227e-06,
"loss": 1.8451,
"step": 913
},
{
"epoch": 0.6877351392024078,
"grad_norm": 1.0668836355198565,
"learning_rate": 4.69368993679129e-06,
"loss": 1.8866,
"step": 914
},
{
"epoch": 0.6884875846501128,
"grad_norm": 1.0438489771358255,
"learning_rate": 4.67304766330437e-06,
"loss": 1.85,
"step": 915
},
{
"epoch": 0.6892400300978179,
"grad_norm": 1.0099907929872505,
"learning_rate": 4.652437032442604e-06,
"loss": 1.8202,
"step": 916
},
{
"epoch": 0.6899924755455229,
"grad_norm": 1.0825561360561284,
"learning_rate": 4.631858166635198e-06,
"loss": 1.9002,
"step": 917
},
{
"epoch": 0.690744920993228,
"grad_norm": 1.0338959329395747,
"learning_rate": 4.6113111881226425e-06,
"loss": 1.8533,
"step": 918
},
{
"epoch": 0.6914973664409331,
"grad_norm": 1.013280662201295,
"learning_rate": 4.590796218956041e-06,
"loss": 1.8485,
"step": 919
},
{
"epoch": 0.6922498118886381,
"grad_norm": 1.0162508021718046,
"learning_rate": 4.570313380996331e-06,
"loss": 1.8678,
"step": 920
},
{
"epoch": 0.6930022573363431,
"grad_norm": 1.0778884165424412,
"learning_rate": 4.549862795913614e-06,
"loss": 1.8308,
"step": 921
},
{
"epoch": 0.6937547027840482,
"grad_norm": 1.0312971152947497,
"learning_rate": 4.5294445851863824e-06,
"loss": 1.8584,
"step": 922
},
{
"epoch": 0.6945071482317532,
"grad_norm": 1.077471155872955,
"learning_rate": 4.50905887010084e-06,
"loss": 1.8573,
"step": 923
},
{
"epoch": 0.6952595936794582,
"grad_norm": 1.043342749750788,
"learning_rate": 4.488705771750155e-06,
"loss": 1.856,
"step": 924
},
{
"epoch": 0.6960120391271633,
"grad_norm": 1.0342741892733396,
"learning_rate": 4.468385411033749e-06,
"loss": 1.8593,
"step": 925
},
{
"epoch": 0.6967644845748683,
"grad_norm": 1.0255967068778735,
"learning_rate": 4.44809790865658e-06,
"loss": 1.8451,
"step": 926
},
{
"epoch": 0.6975169300225733,
"grad_norm": 1.0546707056505376,
"learning_rate": 4.427843385128424e-06,
"loss": 1.8768,
"step": 927
},
{
"epoch": 0.6982693754702785,
"grad_norm": 1.0504688408199025,
"learning_rate": 4.407621960763163e-06,
"loss": 1.8378,
"step": 928
},
{
"epoch": 0.6990218209179835,
"grad_norm": 1.0212676369827438,
"learning_rate": 4.3874337556780535e-06,
"loss": 1.8574,
"step": 929
},
{
"epoch": 0.6997742663656885,
"grad_norm": 1.0383600963448383,
"learning_rate": 4.367278889793049e-06,
"loss": 1.8648,
"step": 930
},
{
"epoch": 0.7005267118133935,
"grad_norm": 1.019758513181163,
"learning_rate": 4.347157482830036e-06,
"loss": 1.858,
"step": 931
},
{
"epoch": 0.7012791572610986,
"grad_norm": 1.0208996886767867,
"learning_rate": 4.327069654312184e-06,
"loss": 1.8423,
"step": 932
},
{
"epoch": 0.7020316027088036,
"grad_norm": 1.0252268870077097,
"learning_rate": 4.30701552356317e-06,
"loss": 1.8336,
"step": 933
},
{
"epoch": 0.7027840481565086,
"grad_norm": 1.029668938552875,
"learning_rate": 4.286995209706537e-06,
"loss": 1.8652,
"step": 934
},
{
"epoch": 0.7035364936042137,
"grad_norm": 1.018511117658671,
"learning_rate": 4.267008831664919e-06,
"loss": 1.8698,
"step": 935
},
{
"epoch": 0.7042889390519187,
"grad_norm": 1.0834729399319512,
"learning_rate": 4.247056508159392e-06,
"loss": 1.8638,
"step": 936
},
{
"epoch": 0.7050413844996237,
"grad_norm": 1.0630611912469907,
"learning_rate": 4.227138357708735e-06,
"loss": 1.8952,
"step": 937
},
{
"epoch": 0.7057938299473289,
"grad_norm": 1.0226011425196277,
"learning_rate": 4.207254498628737e-06,
"loss": 1.8595,
"step": 938
},
{
"epoch": 0.7065462753950339,
"grad_norm": 1.058444306922959,
"learning_rate": 4.187405049031492e-06,
"loss": 1.842,
"step": 939
},
{
"epoch": 0.7072987208427389,
"grad_norm": 1.063239796401595,
"learning_rate": 4.167590126824701e-06,
"loss": 1.8477,
"step": 940
},
{
"epoch": 0.708051166290444,
"grad_norm": 1.0358820665410977,
"learning_rate": 4.147809849710964e-06,
"loss": 1.8425,
"step": 941
},
{
"epoch": 0.708803611738149,
"grad_norm": 1.0840123991307846,
"learning_rate": 4.128064335187091e-06,
"loss": 1.8406,
"step": 942
},
{
"epoch": 0.709556057185854,
"grad_norm": 1.0299200339717463,
"learning_rate": 4.108353700543396e-06,
"loss": 1.8801,
"step": 943
},
{
"epoch": 0.710308502633559,
"grad_norm": 1.0229780426095583,
"learning_rate": 4.088678062863003e-06,
"loss": 1.8516,
"step": 944
},
{
"epoch": 0.7110609480812641,
"grad_norm": 1.0582788531202727,
"learning_rate": 4.069037539021155e-06,
"loss": 1.8418,
"step": 945
},
{
"epoch": 0.7118133935289691,
"grad_norm": 1.0427782470958744,
"learning_rate": 4.0494322456845006e-06,
"loss": 1.814,
"step": 946
},
{
"epoch": 0.7125658389766741,
"grad_norm": 1.0199607961006796,
"learning_rate": 4.029862299310437e-06,
"loss": 1.8699,
"step": 947
},
{
"epoch": 0.7133182844243793,
"grad_norm": 1.044915823272757,
"learning_rate": 4.010327816146382e-06,
"loss": 1.8682,
"step": 948
},
{
"epoch": 0.7140707298720843,
"grad_norm": 1.0638120243828764,
"learning_rate": 3.990828912229105e-06,
"loss": 1.8586,
"step": 949
},
{
"epoch": 0.7148231753197893,
"grad_norm": 1.0468000298516114,
"learning_rate": 3.971365703384028e-06,
"loss": 1.8792,
"step": 950
},
{
"epoch": 0.7155756207674944,
"grad_norm": 1.0757916490545916,
"learning_rate": 3.951938305224542e-06,
"loss": 1.8364,
"step": 951
},
{
"epoch": 0.7163280662151994,
"grad_norm": 1.0096656097938363,
"learning_rate": 3.932546833151318e-06,
"loss": 1.8556,
"step": 952
},
{
"epoch": 0.7170805116629044,
"grad_norm": 1.0344476559221756,
"learning_rate": 3.913191402351624e-06,
"loss": 1.853,
"step": 953
},
{
"epoch": 0.7178329571106095,
"grad_norm": 1.0491607925285744,
"learning_rate": 3.893872127798638e-06,
"loss": 1.8928,
"step": 954
},
{
"epoch": 0.7185854025583145,
"grad_norm": 1.0398129268562113,
"learning_rate": 3.874589124250766e-06,
"loss": 1.8549,
"step": 955
},
{
"epoch": 0.7193378480060195,
"grad_norm": 1.0552743212203994,
"learning_rate": 3.855342506250963e-06,
"loss": 1.8355,
"step": 956
},
{
"epoch": 0.7200902934537246,
"grad_norm": 1.062056199052133,
"learning_rate": 3.836132388126048e-06,
"loss": 1.8583,
"step": 957
},
{
"epoch": 0.7208427389014297,
"grad_norm": 1.022813760203401,
"learning_rate": 3.816958883986027e-06,
"loss": 1.8442,
"step": 958
},
{
"epoch": 0.7215951843491347,
"grad_norm": 1.0771658967910391,
"learning_rate": 3.7978221077234167e-06,
"loss": 1.8618,
"step": 959
},
{
"epoch": 0.7223476297968398,
"grad_norm": 1.0583646537279077,
"learning_rate": 3.7787221730125668e-06,
"loss": 1.8691,
"step": 960
},
{
"epoch": 0.7231000752445448,
"grad_norm": 1.0467775992275208,
"learning_rate": 3.759659193308981e-06,
"loss": 1.8739,
"step": 961
},
{
"epoch": 0.7238525206922498,
"grad_norm": 1.0731067901272393,
"learning_rate": 3.740633281848652e-06,
"loss": 1.8889,
"step": 962
},
{
"epoch": 0.7246049661399548,
"grad_norm": 1.0677899424993107,
"learning_rate": 3.7216445516473797e-06,
"loss": 1.8778,
"step": 963
},
{
"epoch": 0.7253574115876599,
"grad_norm": 1.0381025247289344,
"learning_rate": 3.7026931155001055e-06,
"loss": 1.8399,
"step": 964
},
{
"epoch": 0.7261098570353649,
"grad_norm": 1.024603439436162,
"learning_rate": 3.6837790859802382e-06,
"loss": 1.817,
"step": 965
},
{
"epoch": 0.7268623024830699,
"grad_norm": 1.0186767492700006,
"learning_rate": 3.664902575438988e-06,
"loss": 1.8565,
"step": 966
},
{
"epoch": 0.7276147479307751,
"grad_norm": 1.0337978048349519,
"learning_rate": 3.6460636960047024e-06,
"loss": 1.8591,
"step": 967
},
{
"epoch": 0.7283671933784801,
"grad_norm": 1.0499068745514537,
"learning_rate": 3.627262559582191e-06,
"loss": 1.8291,
"step": 968
},
{
"epoch": 0.7291196388261851,
"grad_norm": 1.1000079989565783,
"learning_rate": 3.60849927785207e-06,
"loss": 1.8532,
"step": 969
},
{
"epoch": 0.7298720842738902,
"grad_norm": 1.1204764625886554,
"learning_rate": 3.5897739622700944e-06,
"loss": 1.891,
"step": 970
},
{
"epoch": 0.7306245297215952,
"grad_norm": 1.030202312522904,
"learning_rate": 3.571086724066494e-06,
"loss": 1.8148,
"step": 971
},
{
"epoch": 0.7313769751693002,
"grad_norm": 1.0252227638425289,
"learning_rate": 3.552437674245317e-06,
"loss": 1.8315,
"step": 972
},
{
"epoch": 0.7321294206170053,
"grad_norm": 0.9974517874786354,
"learning_rate": 3.5338269235837695e-06,
"loss": 1.8313,
"step": 973
},
{
"epoch": 0.7328818660647103,
"grad_norm": 1.0526353047370969,
"learning_rate": 3.5152545826315578e-06,
"loss": 1.8662,
"step": 974
},
{
"epoch": 0.7336343115124153,
"grad_norm": 1.0576734422765772,
"learning_rate": 3.4967207617102263e-06,
"loss": 1.8627,
"step": 975
},
{
"epoch": 0.7343867569601203,
"grad_norm": 1.0308083834152681,
"learning_rate": 3.478225570912509e-06,
"loss": 1.8696,
"step": 976
},
{
"epoch": 0.7351392024078255,
"grad_norm": 1.0250776240341724,
"learning_rate": 3.459769120101676e-06,
"loss": 1.8308,
"step": 977
},
{
"epoch": 0.7358916478555305,
"grad_norm": 1.0327836549212666,
"learning_rate": 3.441351518910875e-06,
"loss": 1.8655,
"step": 978
},
{
"epoch": 0.7366440933032355,
"grad_norm": 1.0449871364550598,
"learning_rate": 3.4229728767424807e-06,
"loss": 1.8859,
"step": 979
},
{
"epoch": 0.7373965387509406,
"grad_norm": 1.0318237845332718,
"learning_rate": 3.4046333027674536e-06,
"loss": 1.824,
"step": 980
},
{
"epoch": 0.7381489841986456,
"grad_norm": 1.0702108655026827,
"learning_rate": 3.386332905924681e-06,
"loss": 1.8221,
"step": 981
},
{
"epoch": 0.7389014296463506,
"grad_norm": 1.0562154482204267,
"learning_rate": 3.36807179492033e-06,
"loss": 1.8718,
"step": 982
},
{
"epoch": 0.7396538750940557,
"grad_norm": 1.0324456217101385,
"learning_rate": 3.3498500782272224e-06,
"loss": 1.8255,
"step": 983
},
{
"epoch": 0.7404063205417607,
"grad_norm": 1.0580825223632866,
"learning_rate": 3.3316678640841503e-06,
"loss": 1.8652,
"step": 984
},
{
"epoch": 0.7411587659894657,
"grad_norm": 1.061773069714642,
"learning_rate": 3.3135252604952795e-06,
"loss": 1.8477,
"step": 985
},
{
"epoch": 0.7419112114371708,
"grad_norm": 1.0872431620569398,
"learning_rate": 3.2954223752294657e-06,
"loss": 1.856,
"step": 986
},
{
"epoch": 0.7426636568848759,
"grad_norm": 1.011281136494092,
"learning_rate": 3.277359315819647e-06,
"loss": 1.86,
"step": 987
},
{
"epoch": 0.7434161023325809,
"grad_norm": 1.0173225026847934,
"learning_rate": 3.2593361895621865e-06,
"loss": 1.8734,
"step": 988
},
{
"epoch": 0.744168547780286,
"grad_norm": 1.046300761733739,
"learning_rate": 3.2413531035162414e-06,
"loss": 1.8812,
"step": 989
},
{
"epoch": 0.744920993227991,
"grad_norm": 1.001233626479689,
"learning_rate": 3.223410164503127e-06,
"loss": 1.8339,
"step": 990
},
{
"epoch": 0.745673438675696,
"grad_norm": 1.0251781121358305,
"learning_rate": 3.2055074791056807e-06,
"loss": 1.8964,
"step": 991
},
{
"epoch": 0.746425884123401,
"grad_norm": 1.0339028204655578,
"learning_rate": 3.187645153667628e-06,
"loss": 1.8442,
"step": 992
},
{
"epoch": 0.7471783295711061,
"grad_norm": 1.0075070470915974,
"learning_rate": 3.16982329429295e-06,
"loss": 1.834,
"step": 993
},
{
"epoch": 0.7479307750188111,
"grad_norm": 1.027280129873711,
"learning_rate": 3.1520420068452705e-06,
"loss": 1.8347,
"step": 994
},
{
"epoch": 0.7486832204665161,
"grad_norm": 1.049092979317654,
"learning_rate": 3.134301396947186e-06,
"loss": 1.8443,
"step": 995
},
{
"epoch": 0.7494356659142212,
"grad_norm": 1.0344039885772653,
"learning_rate": 3.1166015699796915e-06,
"loss": 1.8694,
"step": 996
},
{
"epoch": 0.7501881113619263,
"grad_norm": 1.0228421584827532,
"learning_rate": 3.0989426310815018e-06,
"loss": 1.8506,
"step": 997
},
{
"epoch": 0.7509405568096313,
"grad_norm": 1.0278633709793787,
"learning_rate": 3.081324685148479e-06,
"loss": 1.8359,
"step": 998
},
{
"epoch": 0.7516930022573364,
"grad_norm": 1.039840084504926,
"learning_rate": 3.0637478368329543e-06,
"loss": 1.8413,
"step": 999
},
{
"epoch": 0.7524454477050414,
"grad_norm": 1.0069147469028807,
"learning_rate": 3.046212190543165e-06,
"loss": 1.8215,
"step": 1000
},
{
"epoch": 0.7531978931527464,
"grad_norm": 0.9856582124035167,
"learning_rate": 3.028717850442575e-06,
"loss": 1.8599,
"step": 1001
},
{
"epoch": 0.7539503386004515,
"grad_norm": 1.028167100507457,
"learning_rate": 3.0112649204493117e-06,
"loss": 1.8613,
"step": 1002
},
{
"epoch": 0.7547027840481565,
"grad_norm": 1.0479093922691505,
"learning_rate": 2.993853504235501e-06,
"loss": 1.8437,
"step": 1003
},
{
"epoch": 0.7554552294958615,
"grad_norm": 0.984731405108161,
"learning_rate": 2.976483705226683e-06,
"loss": 1.8765,
"step": 1004
},
{
"epoch": 0.7562076749435666,
"grad_norm": 1.0309416224517076,
"learning_rate": 2.9591556266011945e-06,
"loss": 1.8451,
"step": 1005
},
{
"epoch": 0.7569601203912716,
"grad_norm": 1.0059763429510242,
"learning_rate": 2.9418693712895295e-06,
"loss": 1.842,
"step": 1006
},
{
"epoch": 0.7577125658389767,
"grad_norm": 1.0452737631696916,
"learning_rate": 2.92462504197377e-06,
"loss": 1.8509,
"step": 1007
},
{
"epoch": 0.7584650112866818,
"grad_norm": 0.9965397748955215,
"learning_rate": 2.9074227410869315e-06,
"loss": 1.8787,
"step": 1008
},
{
"epoch": 0.7592174567343868,
"grad_norm": 1.052273008512367,
"learning_rate": 2.890262570812398e-06,
"loss": 1.8527,
"step": 1009
},
{
"epoch": 0.7599699021820918,
"grad_norm": 1.012024732674179,
"learning_rate": 2.8731446330832715e-06,
"loss": 1.821,
"step": 1010
},
{
"epoch": 0.7607223476297968,
"grad_norm": 1.0262753273006846,
"learning_rate": 2.8560690295818115e-06,
"loss": 1.8685,
"step": 1011
},
{
"epoch": 0.7614747930775019,
"grad_norm": 1.0312480980399954,
"learning_rate": 2.8390358617387836e-06,
"loss": 1.8339,
"step": 1012
},
{
"epoch": 0.7622272385252069,
"grad_norm": 1.0494255440844618,
"learning_rate": 2.8220452307329073e-06,
"loss": 1.8755,
"step": 1013
},
{
"epoch": 0.7629796839729119,
"grad_norm": 1.0301298180643048,
"learning_rate": 2.805097237490203e-06,
"loss": 1.836,
"step": 1014
},
{
"epoch": 0.763732129420617,
"grad_norm": 1.0687281970466598,
"learning_rate": 2.7881919826834435e-06,
"loss": 1.891,
"step": 1015
},
{
"epoch": 0.764484574868322,
"grad_norm": 1.0485763759081654,
"learning_rate": 2.7713295667315065e-06,
"loss": 1.8551,
"step": 1016
},
{
"epoch": 0.7652370203160271,
"grad_norm": 1.0562777594944632,
"learning_rate": 2.754510089798824e-06,
"loss": 1.8357,
"step": 1017
},
{
"epoch": 0.7659894657637322,
"grad_norm": 1.0446870214589272,
"learning_rate": 2.737733651794755e-06,
"loss": 1.8642,
"step": 1018
},
{
"epoch": 0.7667419112114372,
"grad_norm": 1.0531350381713522,
"learning_rate": 2.7210003523730044e-06,
"loss": 1.8543,
"step": 1019
},
{
"epoch": 0.7674943566591422,
"grad_norm": 1.04010343649036,
"learning_rate": 2.7043102909310327e-06,
"loss": 1.8727,
"step": 1020
},
{
"epoch": 0.7682468021068473,
"grad_norm": 1.042300679105017,
"learning_rate": 2.687663566609452e-06,
"loss": 1.8449,
"step": 1021
},
{
"epoch": 0.7689992475545523,
"grad_norm": 1.043225727207654,
"learning_rate": 2.6710602782914664e-06,
"loss": 1.8775,
"step": 1022
},
{
"epoch": 0.7697516930022573,
"grad_norm": 1.0466767527217815,
"learning_rate": 2.6545005246022438e-06,
"loss": 1.8634,
"step": 1023
},
{
"epoch": 0.7705041384499624,
"grad_norm": 1.065644717911311,
"learning_rate": 2.6379844039083758e-06,
"loss": 1.8688,
"step": 1024
},
{
"epoch": 0.7712565838976674,
"grad_norm": 1.0201706344384631,
"learning_rate": 2.6215120143172447e-06,
"loss": 1.8438,
"step": 1025
},
{
"epoch": 0.7720090293453724,
"grad_norm": 1.057107159471271,
"learning_rate": 2.6050834536764903e-06,
"loss": 1.8798,
"step": 1026
},
{
"epoch": 0.7727614747930776,
"grad_norm": 1.0168029361720792,
"learning_rate": 2.58869881957338e-06,
"loss": 1.847,
"step": 1027
},
{
"epoch": 0.7735139202407826,
"grad_norm": 0.9980575209626507,
"learning_rate": 2.5723582093342736e-06,
"loss": 1.8094,
"step": 1028
},
{
"epoch": 0.7742663656884876,
"grad_norm": 1.0309742813583669,
"learning_rate": 2.5560617200240155e-06,
"loss": 1.8545,
"step": 1029
},
{
"epoch": 0.7750188111361926,
"grad_norm": 1.0519619171861982,
"learning_rate": 2.5398094484453663e-06,
"loss": 1.8662,
"step": 1030
},
{
"epoch": 0.7757712565838977,
"grad_norm": 1.0732357376506185,
"learning_rate": 2.523601491138432e-06,
"loss": 1.7983,
"step": 1031
},
{
"epoch": 0.7765237020316027,
"grad_norm": 1.045715664450681,
"learning_rate": 2.507437944380087e-06,
"loss": 1.851,
"step": 1032
},
{
"epoch": 0.7772761474793077,
"grad_norm": 1.0247925234216377,
"learning_rate": 2.4913189041833997e-06,
"loss": 1.8516,
"step": 1033
},
{
"epoch": 0.7780285929270128,
"grad_norm": 1.034259709324356,
"learning_rate": 2.47524446629707e-06,
"loss": 1.8856,
"step": 1034
},
{
"epoch": 0.7787810383747178,
"grad_norm": 1.0180382317022318,
"learning_rate": 2.4592147262048506e-06,
"loss": 1.8534,
"step": 1035
},
{
"epoch": 0.7795334838224228,
"grad_norm": 1.0490238452562832,
"learning_rate": 2.4432297791249893e-06,
"loss": 1.8643,
"step": 1036
},
{
"epoch": 0.780285929270128,
"grad_norm": 1.0100685579239412,
"learning_rate": 2.42728972000966e-06,
"loss": 1.814,
"step": 1037
},
{
"epoch": 0.781038374717833,
"grad_norm": 1.057197898838983,
"learning_rate": 2.4113946435443847e-06,
"loss": 1.8393,
"step": 1038
},
{
"epoch": 0.781790820165538,
"grad_norm": 1.0409810882322574,
"learning_rate": 2.3955446441475027e-06,
"loss": 1.8916,
"step": 1039
},
{
"epoch": 0.782543265613243,
"grad_norm": 1.02902836640672,
"learning_rate": 2.3797398159695795e-06,
"loss": 1.882,
"step": 1040
},
{
"epoch": 0.7832957110609481,
"grad_norm": 1.0355431962042332,
"learning_rate": 2.363980252892862e-06,
"loss": 1.8453,
"step": 1041
},
{
"epoch": 0.7840481565086531,
"grad_norm": 0.9768319390054508,
"learning_rate": 2.3482660485307196e-06,
"loss": 1.81,
"step": 1042
},
{
"epoch": 0.7848006019563581,
"grad_norm": 1.020987904962542,
"learning_rate": 2.3325972962270813e-06,
"loss": 1.8482,
"step": 1043
},
{
"epoch": 0.7855530474040632,
"grad_norm": 1.0475616171974942,
"learning_rate": 2.3169740890558922e-06,
"loss": 1.9014,
"step": 1044
},
{
"epoch": 0.7863054928517682,
"grad_norm": 1.0419130599999848,
"learning_rate": 2.301396519820551e-06,
"loss": 1.7988,
"step": 1045
},
{
"epoch": 0.7870579382994732,
"grad_norm": 1.0393855911960832,
"learning_rate": 2.285864681053365e-06,
"loss": 1.8326,
"step": 1046
},
{
"epoch": 0.7878103837471784,
"grad_norm": 1.0263025867355982,
"learning_rate": 2.270378665014995e-06,
"loss": 1.8875,
"step": 1047
},
{
"epoch": 0.7885628291948834,
"grad_norm": 1.0177886911430776,
"learning_rate": 2.2549385636939136e-06,
"loss": 1.8545,
"step": 1048
},
{
"epoch": 0.7893152746425884,
"grad_norm": 1.0659138440623757,
"learning_rate": 2.239544468805853e-06,
"loss": 1.8498,
"step": 1049
},
{
"epoch": 0.7900677200902935,
"grad_norm": 1.025257688715242,
"learning_rate": 2.2241964717932652e-06,
"loss": 1.8306,
"step": 1050
},
{
"epoch": 0.7908201655379985,
"grad_norm": 1.0149741675676625,
"learning_rate": 2.208894663824772e-06,
"loss": 1.8499,
"step": 1051
},
{
"epoch": 0.7915726109857035,
"grad_norm": 1.013347270625162,
"learning_rate": 2.1936391357946307e-06,
"loss": 1.8447,
"step": 1052
},
{
"epoch": 0.7923250564334086,
"grad_norm": 1.0329931792656077,
"learning_rate": 2.178429978322193e-06,
"loss": 1.8505,
"step": 1053
},
{
"epoch": 0.7930775018811136,
"grad_norm": 1.0065027101119342,
"learning_rate": 2.16326728175136e-06,
"loss": 1.8391,
"step": 1054
},
{
"epoch": 0.7938299473288186,
"grad_norm": 1.031821724857908,
"learning_rate": 2.148151136150054e-06,
"loss": 1.8717,
"step": 1055
},
{
"epoch": 0.7945823927765236,
"grad_norm": 1.0286792910140437,
"learning_rate": 2.133081631309679e-06,
"loss": 1.8304,
"step": 1056
},
{
"epoch": 0.7953348382242288,
"grad_norm": 0.9969913317872536,
"learning_rate": 2.118058856744588e-06,
"loss": 1.8402,
"step": 1057
},
{
"epoch": 0.7960872836719338,
"grad_norm": 1.4904329019576084,
"learning_rate": 2.103082901691552e-06,
"loss": 1.8127,
"step": 1058
},
{
"epoch": 0.7968397291196389,
"grad_norm": 1.0205166085961555,
"learning_rate": 2.0881538551092306e-06,
"loss": 1.859,
"step": 1059
},
{
"epoch": 0.7975921745673439,
"grad_norm": 1.0072082423817648,
"learning_rate": 2.073271805677638e-06,
"loss": 1.8826,
"step": 1060
},
{
"epoch": 0.7983446200150489,
"grad_norm": 1.0092626270915435,
"learning_rate": 2.0584368417976266e-06,
"loss": 1.8443,
"step": 1061
},
{
"epoch": 0.7990970654627539,
"grad_norm": 1.0478228265845229,
"learning_rate": 2.0436490515903506e-06,
"loss": 1.8525,
"step": 1062
},
{
"epoch": 0.799849510910459,
"grad_norm": 1.0105538676247696,
"learning_rate": 2.028908522896752e-06,
"loss": 1.8793,
"step": 1063
},
{
"epoch": 0.800601956358164,
"grad_norm": 0.9973684975047772,
"learning_rate": 2.014215343277032e-06,
"loss": 1.8324,
"step": 1064
},
{
"epoch": 0.801354401805869,
"grad_norm": 0.9989732096085077,
"learning_rate": 1.999569600010136e-06,
"loss": 1.8656,
"step": 1065
},
{
"epoch": 0.8021068472535741,
"grad_norm": 1.02452133814114,
"learning_rate": 1.9849713800932304e-06,
"loss": 1.8385,
"step": 1066
},
{
"epoch": 0.8028592927012792,
"grad_norm": 1.0198675609830914,
"learning_rate": 1.9704207702411892e-06,
"loss": 1.8581,
"step": 1067
},
{
"epoch": 0.8036117381489842,
"grad_norm": 1.1001376037851702,
"learning_rate": 1.9559178568860792e-06,
"loss": 1.8563,
"step": 1068
},
{
"epoch": 0.8043641835966893,
"grad_norm": 1.0221648341410818,
"learning_rate": 1.941462726176643e-06,
"loss": 1.8523,
"step": 1069
},
{
"epoch": 0.8051166290443943,
"grad_norm": 1.0196196729103861,
"learning_rate": 1.9270554639777903e-06,
"loss": 1.8205,
"step": 1070
},
{
"epoch": 0.8058690744920993,
"grad_norm": 1.0032683190132314,
"learning_rate": 1.9126961558700875e-06,
"loss": 1.8509,
"step": 1071
},
{
"epoch": 0.8066215199398044,
"grad_norm": 1.022089034769349,
"learning_rate": 1.8983848871492494e-06,
"loss": 1.859,
"step": 1072
},
{
"epoch": 0.8073739653875094,
"grad_norm": 1.0164380629727434,
"learning_rate": 1.884121742825631e-06,
"loss": 1.8737,
"step": 1073
},
{
"epoch": 0.8081264108352144,
"grad_norm": 1.0132734321066406,
"learning_rate": 1.8699068076237215e-06,
"loss": 1.829,
"step": 1074
},
{
"epoch": 0.8088788562829194,
"grad_norm": 1.033980568414524,
"learning_rate": 1.8557401659816531e-06,
"loss": 1.8687,
"step": 1075
},
{
"epoch": 0.8096313017306245,
"grad_norm": 0.9963526835379817,
"learning_rate": 1.8416219020506732e-06,
"loss": 1.8265,
"step": 1076
},
{
"epoch": 0.8103837471783296,
"grad_norm": 1.0054248651966842,
"learning_rate": 1.8275520996946783e-06,
"loss": 1.8115,
"step": 1077
},
{
"epoch": 0.8111361926260346,
"grad_norm": 0.976451075038402,
"learning_rate": 1.8135308424896792e-06,
"loss": 1.8693,
"step": 1078
},
{
"epoch": 0.8118886380737397,
"grad_norm": 0.9872180061310946,
"learning_rate": 1.799558213723347e-06,
"loss": 1.8394,
"step": 1079
},
{
"epoch": 0.8126410835214447,
"grad_norm": 0.9912126246917158,
"learning_rate": 1.7856342963944717e-06,
"loss": 1.829,
"step": 1080
},
{
"epoch": 0.8133935289691497,
"grad_norm": 1.0071994836707505,
"learning_rate": 1.7717591732125072e-06,
"loss": 1.8241,
"step": 1081
},
{
"epoch": 0.8141459744168548,
"grad_norm": 1.0449844371982129,
"learning_rate": 1.7579329265970612e-06,
"loss": 1.849,
"step": 1082
},
{
"epoch": 0.8148984198645598,
"grad_norm": 1.0019387794651853,
"learning_rate": 1.7441556386774095e-06,
"loss": 1.8212,
"step": 1083
},
{
"epoch": 0.8156508653122648,
"grad_norm": 1.0201821875740784,
"learning_rate": 1.7304273912920088e-06,
"loss": 1.8589,
"step": 1084
},
{
"epoch": 0.8164033107599699,
"grad_norm": 0.987870815064525,
"learning_rate": 1.7167482659880098e-06,
"loss": 1.8456,
"step": 1085
},
{
"epoch": 0.8171557562076749,
"grad_norm": 0.9991300646940607,
"learning_rate": 1.7031183440207732e-06,
"loss": 1.8242,
"step": 1086
},
{
"epoch": 0.81790820165538,
"grad_norm": 1.0260149036020485,
"learning_rate": 1.6895377063533848e-06,
"loss": 1.8672,
"step": 1087
},
{
"epoch": 0.8186606471030851,
"grad_norm": 1.0260219565305344,
"learning_rate": 1.6760064336561876e-06,
"loss": 1.8764,
"step": 1088
},
{
"epoch": 0.8194130925507901,
"grad_norm": 1.009970250415965,
"learning_rate": 1.6625246063062717e-06,
"loss": 1.8616,
"step": 1089
},
{
"epoch": 0.8201655379984951,
"grad_norm": 1.0074059635646477,
"learning_rate": 1.64909230438704e-06,
"loss": 1.8845,
"step": 1090
},
{
"epoch": 0.8209179834462002,
"grad_norm": 0.9917144604591168,
"learning_rate": 1.6357096076876867e-06,
"loss": 1.8735,
"step": 1091
},
{
"epoch": 0.8216704288939052,
"grad_norm": 1.0265707122204901,
"learning_rate": 1.6223765957027682e-06,
"loss": 1.8322,
"step": 1092
},
{
"epoch": 0.8224228743416102,
"grad_norm": 1.0197591464961497,
"learning_rate": 1.6090933476316882e-06,
"loss": 1.8558,
"step": 1093
},
{
"epoch": 0.8231753197893152,
"grad_norm": 1.0553201823632523,
"learning_rate": 1.595859942378266e-06,
"loss": 1.8264,
"step": 1094
},
{
"epoch": 0.8239277652370203,
"grad_norm": 1.0292103793427896,
"learning_rate": 1.5826764585502341e-06,
"loss": 1.8454,
"step": 1095
},
{
"epoch": 0.8246802106847254,
"grad_norm": 0.9945598121473408,
"learning_rate": 1.569542974458801e-06,
"loss": 1.8563,
"step": 1096
},
{
"epoch": 0.8254326561324304,
"grad_norm": 0.9910954899689373,
"learning_rate": 1.5564595681181593e-06,
"loss": 1.8204,
"step": 1097
},
{
"epoch": 0.8261851015801355,
"grad_norm": 0.9991520869493447,
"learning_rate": 1.5434263172450381e-06,
"loss": 1.8479,
"step": 1098
},
{
"epoch": 0.8269375470278405,
"grad_norm": 1.00925580706731,
"learning_rate": 1.5304432992582485e-06,
"loss": 1.8737,
"step": 1099
},
{
"epoch": 0.8276899924755455,
"grad_norm": 1.004491999963813,
"learning_rate": 1.5175105912781962e-06,
"loss": 1.8219,
"step": 1100
},
{
"epoch": 0.8284424379232506,
"grad_norm": 1.0249742345962538,
"learning_rate": 1.504628270126457e-06,
"loss": 1.8443,
"step": 1101
},
{
"epoch": 0.8291948833709556,
"grad_norm": 1.0146515769503766,
"learning_rate": 1.4917964123252881e-06,
"loss": 1.8268,
"step": 1102
},
{
"epoch": 0.8299473288186606,
"grad_norm": 1.002462647256442,
"learning_rate": 1.479015094097206e-06,
"loss": 1.8156,
"step": 1103
},
{
"epoch": 0.8306997742663657,
"grad_norm": 0.9900401634932451,
"learning_rate": 1.4662843913644987e-06,
"loss": 1.8151,
"step": 1104
},
{
"epoch": 0.8314522197140707,
"grad_norm": 1.0270026457084644,
"learning_rate": 1.4536043797488132e-06,
"loss": 1.8606,
"step": 1105
},
{
"epoch": 0.8322046651617758,
"grad_norm": 1.0118137810868357,
"learning_rate": 1.440975134570667e-06,
"loss": 1.8641,
"step": 1106
},
{
"epoch": 0.8329571106094809,
"grad_norm": 0.9880428169119571,
"learning_rate": 1.4283967308490366e-06,
"loss": 1.8102,
"step": 1107
},
{
"epoch": 0.8337095560571859,
"grad_norm": 1.0037059813493723,
"learning_rate": 1.4158692433008792e-06,
"loss": 1.8521,
"step": 1108
},
{
"epoch": 0.8344620015048909,
"grad_norm": 1.0214672859318756,
"learning_rate": 1.4033927463407204e-06,
"loss": 1.8315,
"step": 1109
},
{
"epoch": 0.835214446952596,
"grad_norm": 1.0156706637252109,
"learning_rate": 1.390967314080186e-06,
"loss": 1.8703,
"step": 1110
},
{
"epoch": 0.835966892400301,
"grad_norm": 1.018620844478463,
"learning_rate": 1.3785930203275776e-06,
"loss": 1.8233,
"step": 1111
},
{
"epoch": 0.836719337848006,
"grad_norm": 0.9874572172983379,
"learning_rate": 1.3662699385874268e-06,
"loss": 1.8413,
"step": 1112
},
{
"epoch": 0.837471783295711,
"grad_norm": 1.0108118345387729,
"learning_rate": 1.353998142060061e-06,
"loss": 1.8325,
"step": 1113
},
{
"epoch": 0.8382242287434161,
"grad_norm": 1.0225491830082007,
"learning_rate": 1.3417777036411693e-06,
"loss": 1.8365,
"step": 1114
},
{
"epoch": 0.8389766741911211,
"grad_norm": 1.0190388498379541,
"learning_rate": 1.329608695921364e-06,
"loss": 1.8722,
"step": 1115
},
{
"epoch": 0.8397291196388262,
"grad_norm": 1.0020409030532436,
"learning_rate": 1.3174911911857647e-06,
"loss": 1.8481,
"step": 1116
},
{
"epoch": 0.8404815650865313,
"grad_norm": 1.0004330558915364,
"learning_rate": 1.3054252614135432e-06,
"loss": 1.8474,
"step": 1117
},
{
"epoch": 0.8412340105342363,
"grad_norm": 1.0016734119877326,
"learning_rate": 1.293410978277526e-06,
"loss": 1.8058,
"step": 1118
},
{
"epoch": 0.8419864559819413,
"grad_norm": 1.0165468869372531,
"learning_rate": 1.281448413143741e-06,
"loss": 1.8515,
"step": 1119
},
{
"epoch": 0.8427389014296464,
"grad_norm": 1.002100192821327,
"learning_rate": 1.2695376370710143e-06,
"loss": 1.855,
"step": 1120
},
{
"epoch": 0.8434913468773514,
"grad_norm": 0.9911876954973505,
"learning_rate": 1.2576787208105378e-06,
"loss": 1.8461,
"step": 1121
},
{
"epoch": 0.8442437923250564,
"grad_norm": 0.9992578682438206,
"learning_rate": 1.2458717348054483e-06,
"loss": 1.8348,
"step": 1122
},
{
"epoch": 0.8449962377727614,
"grad_norm": 1.0199442498314335,
"learning_rate": 1.234116749190415e-06,
"loss": 1.8314,
"step": 1123
},
{
"epoch": 0.8457486832204665,
"grad_norm": 0.9958994540059071,
"learning_rate": 1.222413833791216e-06,
"loss": 1.8904,
"step": 1124
},
{
"epoch": 0.8465011286681715,
"grad_norm": 0.9956244712689866,
"learning_rate": 1.2107630581243323e-06,
"loss": 1.8365,
"step": 1125
},
{
"epoch": 0.8472535741158767,
"grad_norm": 1.0072998042413934,
"learning_rate": 1.199164491396525e-06,
"loss": 1.8402,
"step": 1126
},
{
"epoch": 0.8480060195635817,
"grad_norm": 1.010287129286621,
"learning_rate": 1.1876182025044302e-06,
"loss": 1.818,
"step": 1127
},
{
"epoch": 0.8487584650112867,
"grad_norm": 1.0188978935555666,
"learning_rate": 1.1761242600341504e-06,
"loss": 1.846,
"step": 1128
},
{
"epoch": 0.8495109104589917,
"grad_norm": 0.9799196940944139,
"learning_rate": 1.1646827322608422e-06,
"loss": 1.8315,
"step": 1129
},
{
"epoch": 0.8502633559066968,
"grad_norm": 1.0110310408969876,
"learning_rate": 1.1532936871483169e-06,
"loss": 1.8492,
"step": 1130
},
{
"epoch": 0.8510158013544018,
"grad_norm": 1.027913629063318,
"learning_rate": 1.1419571923486339e-06,
"loss": 1.8564,
"step": 1131
},
{
"epoch": 0.8517682468021068,
"grad_norm": 1.0125707650743552,
"learning_rate": 1.130673315201689e-06,
"loss": 1.8248,
"step": 1132
},
{
"epoch": 0.8525206922498119,
"grad_norm": 1.024996944393088,
"learning_rate": 1.1194421227348385e-06,
"loss": 1.8551,
"step": 1133
},
{
"epoch": 0.8532731376975169,
"grad_norm": 0.9949857133714178,
"learning_rate": 1.108263681662477e-06,
"loss": 1.8558,
"step": 1134
},
{
"epoch": 0.8540255831452219,
"grad_norm": 1.0207395646356938,
"learning_rate": 1.097138058385654e-06,
"loss": 1.8634,
"step": 1135
},
{
"epoch": 0.8547780285929271,
"grad_norm": 1.00205145389641,
"learning_rate": 1.0860653189916736e-06,
"loss": 1.8529,
"step": 1136
},
{
"epoch": 0.8555304740406321,
"grad_norm": 1.018740671757251,
"learning_rate": 1.0750455292537077e-06,
"loss": 1.8045,
"step": 1137
},
{
"epoch": 0.8562829194883371,
"grad_norm": 1.0055198107846395,
"learning_rate": 1.0640787546303987e-06,
"loss": 1.8679,
"step": 1138
},
{
"epoch": 0.8570353649360422,
"grad_norm": 1.0304125672959479,
"learning_rate": 1.0531650602654752e-06,
"loss": 1.8751,
"step": 1139
},
{
"epoch": 0.8577878103837472,
"grad_norm": 0.9800176515276247,
"learning_rate": 1.0423045109873664e-06,
"loss": 1.8428,
"step": 1140
},
{
"epoch": 0.8585402558314522,
"grad_norm": 0.9996913535404124,
"learning_rate": 1.0314971713088096e-06,
"loss": 1.868,
"step": 1141
},
{
"epoch": 0.8592927012791572,
"grad_norm": 1.0222150182150487,
"learning_rate": 1.020743105426476e-06,
"loss": 1.8486,
"step": 1142
},
{
"epoch": 0.8600451467268623,
"grad_norm": 1.0035873571181235,
"learning_rate": 1.0100423772205826e-06,
"loss": 1.874,
"step": 1143
},
{
"epoch": 0.8607975921745673,
"grad_norm": 1.0062069170371268,
"learning_rate": 9.993950502545158e-07,
"loss": 1.8591,
"step": 1144
},
{
"epoch": 0.8615500376222723,
"grad_norm": 0.9878088893375802,
"learning_rate": 9.88801187774454e-07,
"loss": 1.8325,
"step": 1145
},
{
"epoch": 0.8623024830699775,
"grad_norm": 1.0225192192758739,
"learning_rate": 9.78260852708991e-07,
"loss": 1.8625,
"step": 1146
},
{
"epoch": 0.8630549285176825,
"grad_norm": 0.9974653844482616,
"learning_rate": 9.67774107668763e-07,
"loss": 1.8207,
"step": 1147
},
{
"epoch": 0.8638073739653875,
"grad_norm": 1.0009393457291873,
"learning_rate": 9.573410149460749e-07,
"loss": 1.832,
"step": 1148
},
{
"epoch": 0.8645598194130926,
"grad_norm": 1.0009759679652983,
"learning_rate": 9.469616365145318e-07,
"loss": 1.8844,
"step": 1149
},
{
"epoch": 0.8653122648607976,
"grad_norm": 0.9977099867831,
"learning_rate": 9.366360340286718e-07,
"loss": 1.8504,
"step": 1150
},
{
"epoch": 0.8660647103085026,
"grad_norm": 1.0168294541536407,
"learning_rate": 9.263642688235963e-07,
"loss": 1.8359,
"step": 1151
},
{
"epoch": 0.8668171557562077,
"grad_norm": 1.0042064302960472,
"learning_rate": 9.161464019146115e-07,
"loss": 1.8581,
"step": 1152
},
{
"epoch": 0.8675696012039127,
"grad_norm": 0.9904945060249274,
"learning_rate": 9.059824939968575e-07,
"loss": 1.8269,
"step": 1153
},
{
"epoch": 0.8683220466516177,
"grad_norm": 1.0229138746619746,
"learning_rate": 8.958726054449573e-07,
"loss": 1.8104,
"step": 1154
},
{
"epoch": 0.8690744920993227,
"grad_norm": 1.036994532492036,
"learning_rate": 8.858167963126508e-07,
"loss": 1.8612,
"step": 1155
},
{
"epoch": 0.8698269375470279,
"grad_norm": 1.0162563610196287,
"learning_rate": 8.75815126332441e-07,
"loss": 1.8342,
"step": 1156
},
{
"epoch": 0.8705793829947329,
"grad_norm": 1.0087227120971793,
"learning_rate": 8.658676549152411e-07,
"loss": 1.8397,
"step": 1157
},
{
"epoch": 0.871331828442438,
"grad_norm": 1.0019428761148148,
"learning_rate": 8.55974441150016e-07,
"loss": 1.8381,
"step": 1158
},
{
"epoch": 0.872084273890143,
"grad_norm": 1.019449586810325,
"learning_rate": 8.46135543803438e-07,
"loss": 1.8324,
"step": 1159
},
{
"epoch": 0.872836719337848,
"grad_norm": 1.0136398523581656,
"learning_rate": 8.363510213195314e-07,
"loss": 1.8766,
"step": 1160
},
{
"epoch": 0.873589164785553,
"grad_norm": 1.0225873408055568,
"learning_rate": 8.266209318193319e-07,
"loss": 1.8773,
"step": 1161
},
{
"epoch": 0.8743416102332581,
"grad_norm": 1.0105201480759105,
"learning_rate": 8.169453331005351e-07,
"loss": 1.823,
"step": 1162
},
{
"epoch": 0.8750940556809631,
"grad_norm": 1.0109149830651798,
"learning_rate": 8.073242826371564e-07,
"loss": 1.8269,
"step": 1163
},
{
"epoch": 0.8758465011286681,
"grad_norm": 1.0081575070784234,
"learning_rate": 7.977578375791906e-07,
"loss": 1.8619,
"step": 1164
},
{
"epoch": 0.8765989465763732,
"grad_norm": 0.998072141071612,
"learning_rate": 7.882460547522708e-07,
"loss": 1.8714,
"step": 1165
},
{
"epoch": 0.8773513920240783,
"grad_norm": 0.9980412643179496,
"learning_rate": 7.787889906573287e-07,
"loss": 1.8372,
"step": 1166
},
{
"epoch": 0.8781038374717833,
"grad_norm": 0.9908090565389136,
"learning_rate": 7.693867014702638e-07,
"loss": 1.8447,
"step": 1167
},
{
"epoch": 0.8788562829194884,
"grad_norm": 0.9952421488346186,
"learning_rate": 7.600392430416037e-07,
"loss": 1.8354,
"step": 1168
},
{
"epoch": 0.8796087283671934,
"grad_norm": 1.007561259190393,
"learning_rate": 7.507466708961853e-07,
"loss": 1.8535,
"step": 1169
},
{
"epoch": 0.8803611738148984,
"grad_norm": 0.9920622847399143,
"learning_rate": 7.415090402327996e-07,
"loss": 1.8289,
"step": 1170
},
{
"epoch": 0.8811136192626035,
"grad_norm": 1.0123462781342452,
"learning_rate": 7.323264059238977e-07,
"loss": 1.8619,
"step": 1171
},
{
"epoch": 0.8818660647103085,
"grad_norm": 1.007212185373345,
"learning_rate": 7.23198822515232e-07,
"loss": 1.8291,
"step": 1172
},
{
"epoch": 0.8826185101580135,
"grad_norm": 1.0088293569450928,
"learning_rate": 7.141263442255553e-07,
"loss": 1.8337,
"step": 1173
},
{
"epoch": 0.8833709556057185,
"grad_norm": 0.9771417291177641,
"learning_rate": 7.051090249462878e-07,
"loss": 1.8618,
"step": 1174
},
{
"epoch": 0.8841234010534236,
"grad_norm": 1.044470421749025,
"learning_rate": 6.961469182411996e-07,
"loss": 1.837,
"step": 1175
},
{
"epoch": 0.8848758465011287,
"grad_norm": 1.0316765631807912,
"learning_rate": 6.872400773460952e-07,
"loss": 1.8386,
"step": 1176
},
{
"epoch": 0.8856282919488337,
"grad_norm": 0.9786963735789368,
"learning_rate": 6.783885551684921e-07,
"loss": 1.8326,
"step": 1177
},
{
"epoch": 0.8863807373965388,
"grad_norm": 0.9988943488925585,
"learning_rate": 6.695924042873092e-07,
"loss": 1.8223,
"step": 1178
},
{
"epoch": 0.8871331828442438,
"grad_norm": 1.0051283319791353,
"learning_rate": 6.608516769525531e-07,
"loss": 1.8041,
"step": 1179
},
{
"epoch": 0.8878856282919488,
"grad_norm": 0.9767004791475964,
"learning_rate": 6.521664250850179e-07,
"loss": 1.8774,
"step": 1180
},
{
"epoch": 0.8886380737396539,
"grad_norm": 1.012652973378157,
"learning_rate": 6.43536700275953e-07,
"loss": 1.8323,
"step": 1181
},
{
"epoch": 0.8893905191873589,
"grad_norm": 0.9738531936975727,
"learning_rate": 6.349625537867854e-07,
"loss": 1.8409,
"step": 1182
},
{
"epoch": 0.8901429646350639,
"grad_norm": 0.9859086858328941,
"learning_rate": 6.264440365487912e-07,
"loss": 1.8437,
"step": 1183
},
{
"epoch": 0.890895410082769,
"grad_norm": 0.9993204101223951,
"learning_rate": 6.179811991628115e-07,
"loss": 1.8473,
"step": 1184
},
{
"epoch": 0.891647855530474,
"grad_norm": 1.0122748104166368,
"learning_rate": 6.095740918989357e-07,
"loss": 1.8377,
"step": 1185
},
{
"epoch": 0.8924003009781791,
"grad_norm": 1.005993506280934,
"learning_rate": 6.012227646962198e-07,
"loss": 1.8347,
"step": 1186
},
{
"epoch": 0.8931527464258842,
"grad_norm": 0.9949378300707312,
"learning_rate": 5.929272671623687e-07,
"loss": 1.8497,
"step": 1187
},
{
"epoch": 0.8939051918735892,
"grad_norm": 1.0105052218144452,
"learning_rate": 5.846876485734687e-07,
"loss": 1.8509,
"step": 1188
},
{
"epoch": 0.8946576373212942,
"grad_norm": 0.9867818438870452,
"learning_rate": 5.765039578736631e-07,
"loss": 1.8513,
"step": 1189
},
{
"epoch": 0.8954100827689992,
"grad_norm": 0.987759894262447,
"learning_rate": 5.683762436748919e-07,
"loss": 1.8391,
"step": 1190
},
{
"epoch": 0.8961625282167043,
"grad_norm": 0.983835103441717,
"learning_rate": 5.603045542565821e-07,
"loss": 1.7756,
"step": 1191
},
{
"epoch": 0.8969149736644093,
"grad_norm": 1.001764463620661,
"learning_rate": 5.522889375653673e-07,
"loss": 1.8375,
"step": 1192
},
{
"epoch": 0.8976674191121143,
"grad_norm": 0.9858310047164821,
"learning_rate": 5.443294412148092e-07,
"loss": 1.8371,
"step": 1193
},
{
"epoch": 0.8984198645598194,
"grad_norm": 0.9986066724861258,
"learning_rate": 5.364261124851011e-07,
"loss": 1.864,
"step": 1194
},
{
"epoch": 0.8991723100075244,
"grad_norm": 0.991930757203584,
"learning_rate": 5.28578998322804e-07,
"loss": 1.8432,
"step": 1195
},
{
"epoch": 0.8999247554552295,
"grad_norm": 1.027519467992599,
"learning_rate": 5.207881453405494e-07,
"loss": 1.8317,
"step": 1196
},
{
"epoch": 0.9006772009029346,
"grad_norm": 0.990768101619544,
"learning_rate": 5.130535998167829e-07,
"loss": 1.8368,
"step": 1197
},
{
"epoch": 0.9014296463506396,
"grad_norm": 1.0114135177038623,
"learning_rate": 5.053754076954653e-07,
"loss": 1.8712,
"step": 1198
},
{
"epoch": 0.9021820917983446,
"grad_norm": 0.9946899930633637,
"learning_rate": 4.977536145858242e-07,
"loss": 1.8314,
"step": 1199
},
{
"epoch": 0.9029345372460497,
"grad_norm": 1.001217888374902,
"learning_rate": 4.901882657620627e-07,
"loss": 1.8638,
"step": 1200
},
{
"epoch": 0.9036869826937547,
"grad_norm": 0.9875817679462767,
"learning_rate": 4.826794061631068e-07,
"loss": 1.8688,
"step": 1201
},
{
"epoch": 0.9044394281414597,
"grad_norm": 0.9967413790483429,
"learning_rate": 4.752270803923231e-07,
"loss": 1.8133,
"step": 1202
},
{
"epoch": 0.9051918735891648,
"grad_norm": 0.9921613700004421,
"learning_rate": 4.678313327172701e-07,
"loss": 1.8223,
"step": 1203
},
{
"epoch": 0.9059443190368698,
"grad_norm": 1.015193148502593,
"learning_rate": 4.6049220706941957e-07,
"loss": 1.8663,
"step": 1204
},
{
"epoch": 0.9066967644845748,
"grad_norm": 1.0281879277511876,
"learning_rate": 4.5320974704390675e-07,
"loss": 1.8687,
"step": 1205
},
{
"epoch": 0.90744920993228,
"grad_norm": 1.0039197356342466,
"learning_rate": 4.459839958992662e-07,
"loss": 1.8578,
"step": 1206
},
{
"epoch": 0.908201655379985,
"grad_norm": 1.0343792655116228,
"learning_rate": 4.388149965571753e-07,
"loss": 1.8606,
"step": 1207
},
{
"epoch": 0.90895410082769,
"grad_norm": 0.9907087180655133,
"learning_rate": 4.317027916022043e-07,
"loss": 1.8299,
"step": 1208
},
{
"epoch": 0.909706546275395,
"grad_norm": 1.006885685076737,
"learning_rate": 4.2464742328155116e-07,
"loss": 1.8382,
"step": 1209
},
{
"epoch": 0.9104589917231001,
"grad_norm": 0.9864911526557583,
"learning_rate": 4.176489335048084e-07,
"loss": 1.8167,
"step": 1210
},
{
"epoch": 0.9112114371708051,
"grad_norm": 1.0044942635590397,
"learning_rate": 4.1070736384369423e-07,
"loss": 1.8382,
"step": 1211
},
{
"epoch": 0.9119638826185101,
"grad_norm": 0.9921151350636432,
"learning_rate": 4.0382275553182527e-07,
"loss": 1.8425,
"step": 1212
},
{
"epoch": 0.9127163280662152,
"grad_norm": 1.005766561453971,
"learning_rate": 3.9699514946445416e-07,
"loss": 1.8743,
"step": 1213
},
{
"epoch": 0.9134687735139202,
"grad_norm": 0.9954753064107937,
"learning_rate": 3.902245861982412e-07,
"loss": 1.8427,
"step": 1214
},
{
"epoch": 0.9142212189616253,
"grad_norm": 0.9959764478431177,
"learning_rate": 3.835111059510022e-07,
"loss": 1.8225,
"step": 1215
},
{
"epoch": 0.9149736644093304,
"grad_norm": 0.9740325377481212,
"learning_rate": 3.768547486014751e-07,
"loss": 1.8243,
"step": 1216
},
{
"epoch": 0.9157261098570354,
"grad_norm": 0.9991600394138905,
"learning_rate": 3.7025555368908285e-07,
"loss": 1.8527,
"step": 1217
},
{
"epoch": 0.9164785553047404,
"grad_norm": 1.0190936168167124,
"learning_rate": 3.6371356041369874e-07,
"loss": 1.8599,
"step": 1218
},
{
"epoch": 0.9172310007524455,
"grad_norm": 0.988748332388004,
"learning_rate": 3.5722880763541134e-07,
"loss": 1.8377,
"step": 1219
},
{
"epoch": 0.9179834462001505,
"grad_norm": 1.0130245713312436,
"learning_rate": 3.508013338742944e-07,
"loss": 1.8615,
"step": 1220
},
{
"epoch": 0.9187358916478555,
"grad_norm": 0.9634756019126125,
"learning_rate": 3.444311773101794e-07,
"loss": 1.842,
"step": 1221
},
{
"epoch": 0.9194883370955605,
"grad_norm": 0.9894734454583846,
"learning_rate": 3.38118375782428e-07,
"loss": 1.8367,
"step": 1222
},
{
"epoch": 0.9202407825432656,
"grad_norm": 0.9856247258631955,
"learning_rate": 3.3186296678970885e-07,
"loss": 1.8492,
"step": 1223
},
{
"epoch": 0.9209932279909706,
"grad_norm": 0.9929443199092715,
"learning_rate": 3.25664987489771e-07,
"loss": 1.8432,
"step": 1224
},
{
"epoch": 0.9217456734386757,
"grad_norm": 1.0089613304152658,
"learning_rate": 3.1952447469922545e-07,
"loss": 1.8469,
"step": 1225
},
{
"epoch": 0.9224981188863808,
"grad_norm": 1.0192580267474831,
"learning_rate": 3.1344146489332705e-07,
"loss": 1.8315,
"step": 1226
},
{
"epoch": 0.9232505643340858,
"grad_norm": 0.9974432035704971,
"learning_rate": 3.074159942057586e-07,
"loss": 1.864,
"step": 1227
},
{
"epoch": 0.9240030097817908,
"grad_norm": 0.9905766821813555,
"learning_rate": 3.0144809842841293e-07,
"loss": 1.8458,
"step": 1228
},
{
"epoch": 0.9247554552294959,
"grad_norm": 1.0086858007652628,
"learning_rate": 2.955378130111819e-07,
"loss": 1.8298,
"step": 1229
},
{
"epoch": 0.9255079006772009,
"grad_norm": 0.9965986309031104,
"learning_rate": 2.896851730617489e-07,
"loss": 1.8609,
"step": 1230
},
{
"epoch": 0.9262603461249059,
"grad_norm": 1.033624750735306,
"learning_rate": 2.8389021334537357e-07,
"loss": 1.865,
"step": 1231
},
{
"epoch": 0.927012791572611,
"grad_norm": 1.0432127248116443,
"learning_rate": 2.7815296828469286e-07,
"loss": 1.8684,
"step": 1232
},
{
"epoch": 0.927765237020316,
"grad_norm": 0.9703391639946372,
"learning_rate": 2.7247347195951013e-07,
"loss": 1.8173,
"step": 1233
},
{
"epoch": 0.928517682468021,
"grad_norm": 1.020333175883032,
"learning_rate": 2.668517581065977e-07,
"loss": 1.8523,
"step": 1234
},
{
"epoch": 0.9292701279157262,
"grad_norm": 0.9911481200828857,
"learning_rate": 2.612878601194935e-07,
"loss": 1.8268,
"step": 1235
},
{
"epoch": 0.9300225733634312,
"grad_norm": 1.0037990170824629,
"learning_rate": 2.5578181104830347e-07,
"loss": 1.8602,
"step": 1236
},
{
"epoch": 0.9307750188111362,
"grad_norm": 0.9981236468539332,
"learning_rate": 2.5033364359950406e-07,
"loss": 1.8428,
"step": 1237
},
{
"epoch": 0.9315274642588413,
"grad_norm": 0.9997038796294978,
"learning_rate": 2.449433901357512e-07,
"loss": 1.8315,
"step": 1238
},
{
"epoch": 0.9322799097065463,
"grad_norm": 1.0121241415162892,
"learning_rate": 2.3961108267568365e-07,
"loss": 1.8481,
"step": 1239
},
{
"epoch": 0.9330323551542513,
"grad_norm": 1.0103729389318301,
"learning_rate": 2.343367528937379e-07,
"loss": 1.8578,
"step": 1240
},
{
"epoch": 0.9337848006019563,
"grad_norm": 1.0010596192605887,
"learning_rate": 2.2912043211995583e-07,
"loss": 1.7838,
"step": 1241
},
{
"epoch": 0.9345372460496614,
"grad_norm": 1.0012237155022985,
"learning_rate": 2.2396215133980047e-07,
"loss": 1.8462,
"step": 1242
},
{
"epoch": 0.9352896914973664,
"grad_norm": 1.0133392592424881,
"learning_rate": 2.1886194119396963e-07,
"loss": 1.8402,
"step": 1243
},
{
"epoch": 0.9360421369450714,
"grad_norm": 0.9909256162877704,
"learning_rate": 2.138198319782192e-07,
"loss": 1.8534,
"step": 1244
},
{
"epoch": 0.9367945823927766,
"grad_norm": 0.9981211767141346,
"learning_rate": 2.088358536431767e-07,
"loss": 1.8318,
"step": 1245
},
{
"epoch": 0.9375470278404816,
"grad_norm": 1.008409601797997,
"learning_rate": 2.0391003579416814e-07,
"loss": 1.8533,
"step": 1246
},
{
"epoch": 0.9382994732881866,
"grad_norm": 0.9959885048137673,
"learning_rate": 1.9904240769104022e-07,
"loss": 1.8181,
"step": 1247
},
{
"epoch": 0.9390519187358917,
"grad_norm": 0.9947073181315413,
"learning_rate": 1.9423299824798624e-07,
"loss": 1.8373,
"step": 1248
},
{
"epoch": 0.9398043641835967,
"grad_norm": 0.9822005139896943,
"learning_rate": 1.89481836033375e-07,
"loss": 1.8385,
"step": 1249
},
{
"epoch": 0.9405568096313017,
"grad_norm": 0.9771107928246694,
"learning_rate": 1.8478894926958203e-07,
"loss": 1.8193,
"step": 1250
},
{
"epoch": 0.9413092550790068,
"grad_norm": 1.0284632021044697,
"learning_rate": 1.8015436583281975e-07,
"loss": 1.8232,
"step": 1251
},
{
"epoch": 0.9420617005267118,
"grad_norm": 1.0145966195472258,
"learning_rate": 1.7557811325297324e-07,
"loss": 1.8353,
"step": 1252
},
{
"epoch": 0.9428141459744168,
"grad_norm": 0.9993767720622243,
"learning_rate": 1.7106021871343803e-07,
"loss": 1.842,
"step": 1253
},
{
"epoch": 0.9435665914221218,
"grad_norm": 0.9946780446035074,
"learning_rate": 1.666007090509525e-07,
"loss": 1.8131,
"step": 1254
},
{
"epoch": 0.944319036869827,
"grad_norm": 1.0002144925996097,
"learning_rate": 1.621996107554491e-07,
"loss": 1.8307,
"step": 1255
},
{
"epoch": 0.945071482317532,
"grad_norm": 0.9835433240608699,
"learning_rate": 1.5785694996988789e-07,
"loss": 1.8021,
"step": 1256
},
{
"epoch": 0.945823927765237,
"grad_norm": 0.9983332143259994,
"learning_rate": 1.5357275249010427e-07,
"loss": 1.8352,
"step": 1257
},
{
"epoch": 0.9465763732129421,
"grad_norm": 0.9972759832972569,
"learning_rate": 1.493470437646549e-07,
"loss": 1.8504,
"step": 1258
},
{
"epoch": 0.9473288186606471,
"grad_norm": 0.9961255073013742,
"learning_rate": 1.4517984889466985e-07,
"loss": 1.8185,
"step": 1259
},
{
"epoch": 0.9480812641083521,
"grad_norm": 1.0063467410681264,
"learning_rate": 1.410711926336994e-07,
"loss": 1.857,
"step": 1260
},
{
"epoch": 0.9488337095560572,
"grad_norm": 0.9526833803452752,
"learning_rate": 1.3702109938757092e-07,
"loss": 1.7668,
"step": 1261
},
{
"epoch": 0.9495861550037622,
"grad_norm": 1.002573108388787,
"learning_rate": 1.330295932142378e-07,
"loss": 1.8339,
"step": 1262
},
{
"epoch": 0.9503386004514672,
"grad_norm": 0.9932667489028059,
"learning_rate": 1.2909669782364409e-07,
"loss": 1.8171,
"step": 1263
},
{
"epoch": 0.9510910458991723,
"grad_norm": 0.9825762253960021,
"learning_rate": 1.252224365775767e-07,
"loss": 1.8333,
"step": 1264
},
{
"epoch": 0.9518434913468774,
"grad_norm": 1.0234192832820652,
"learning_rate": 1.2140683248953345e-07,
"loss": 1.8906,
"step": 1265
},
{
"epoch": 0.9525959367945824,
"grad_norm": 0.9768833855101954,
"learning_rate": 1.1764990822458078e-07,
"loss": 1.814,
"step": 1266
},
{
"epoch": 0.9533483822422875,
"grad_norm": 1.0022975360365913,
"learning_rate": 1.1395168609921959e-07,
"loss": 1.8555,
"step": 1267
},
{
"epoch": 0.9541008276899925,
"grad_norm": 0.9847756991016295,
"learning_rate": 1.1031218808125854e-07,
"loss": 1.8477,
"step": 1268
},
{
"epoch": 0.9548532731376975,
"grad_norm": 0.9661213568033307,
"learning_rate": 1.0673143578967427e-07,
"loss": 1.843,
"step": 1269
},
{
"epoch": 0.9556057185854026,
"grad_norm": 0.9671218425859786,
"learning_rate": 1.0320945049449249e-07,
"loss": 1.8535,
"step": 1270
},
{
"epoch": 0.9563581640331076,
"grad_norm": 1.016949762959402,
"learning_rate": 9.974625311665375e-08,
"loss": 1.8778,
"step": 1271
},
{
"epoch": 0.9571106094808126,
"grad_norm": 0.9786230305715762,
"learning_rate": 9.634186422789571e-08,
"loss": 1.8547,
"step": 1272
},
{
"epoch": 0.9578630549285176,
"grad_norm": 0.9839163184742172,
"learning_rate": 9.299630405062433e-08,
"loss": 1.8315,
"step": 1273
},
{
"epoch": 0.9586155003762227,
"grad_norm": 0.9931035001365742,
"learning_rate": 8.970959245780064e-08,
"loss": 1.8646,
"step": 1274
},
{
"epoch": 0.9593679458239278,
"grad_norm": 0.9955970859455311,
"learning_rate": 8.648174897281425e-08,
"loss": 1.8269,
"step": 1275
},
{
"epoch": 0.9601203912716328,
"grad_norm": 0.9867771082815295,
"learning_rate": 8.331279276937887e-08,
"loss": 1.8365,
"step": 1276
},
{
"epoch": 0.9608728367193379,
"grad_norm": 0.9857788969729198,
"learning_rate": 8.020274267140694e-08,
"loss": 1.8696,
"step": 1277
},
{
"epoch": 0.9616252821670429,
"grad_norm": 1.040076797404121,
"learning_rate": 7.71516171529052e-08,
"loss": 1.8966,
"step": 1278
},
{
"epoch": 0.9623777276147479,
"grad_norm": 0.9988100206954824,
"learning_rate": 7.415943433786043e-08,
"loss": 1.8562,
"step": 1279
},
{
"epoch": 0.963130173062453,
"grad_norm": 0.9995420968278054,
"learning_rate": 7.122621200013835e-08,
"loss": 1.82,
"step": 1280
},
{
"epoch": 0.963882618510158,
"grad_norm": 0.990740228639178,
"learning_rate": 6.835196756336704e-08,
"loss": 1.8171,
"step": 1281
},
{
"epoch": 0.964635063957863,
"grad_norm": 1.0017629230198868,
"learning_rate": 6.553671810084483e-08,
"loss": 1.8571,
"step": 1282
},
{
"epoch": 0.9653875094055681,
"grad_norm": 0.9859200259477047,
"learning_rate": 6.278048033543371e-08,
"loss": 1.8416,
"step": 1283
},
{
"epoch": 0.9661399548532731,
"grad_norm": 0.9745572860466265,
"learning_rate": 6.008327063945718e-08,
"loss": 1.7901,
"step": 1284
},
{
"epoch": 0.9668924003009782,
"grad_norm": 1.0169054264845245,
"learning_rate": 5.744510503461143e-08,
"loss": 1.8831,
"step": 1285
},
{
"epoch": 0.9676448457486833,
"grad_norm": 0.9844322304738157,
"learning_rate": 5.486599919185875e-08,
"loss": 1.8508,
"step": 1286
},
{
"epoch": 0.9683972911963883,
"grad_norm": 1.014070069456265,
"learning_rate": 5.234596843134543e-08,
"loss": 1.8567,
"step": 1287
},
{
"epoch": 0.9691497366440933,
"grad_norm": 1.003260872283204,
"learning_rate": 4.988502772230286e-08,
"loss": 1.8438,
"step": 1288
},
{
"epoch": 0.9699021820917983,
"grad_norm": 1.0114531562661886,
"learning_rate": 4.7483191682964333e-08,
"loss": 1.8496,
"step": 1289
},
{
"epoch": 0.9706546275395034,
"grad_norm": 1.0109898858870248,
"learning_rate": 4.514047458047288e-08,
"loss": 1.8458,
"step": 1290
},
{
"epoch": 0.9714070729872084,
"grad_norm": 0.9991253400478782,
"learning_rate": 4.2856890330801315e-08,
"loss": 1.8454,
"step": 1291
},
{
"epoch": 0.9721595184349134,
"grad_norm": 0.9782075679601189,
"learning_rate": 4.063245249866454e-08,
"loss": 1.8229,
"step": 1292
},
{
"epoch": 0.9729119638826185,
"grad_norm": 0.9918901422516555,
"learning_rate": 3.84671742974474e-08,
"loss": 1.8111,
"step": 1293
},
{
"epoch": 0.9736644093303235,
"grad_norm": 0.9883012028951532,
"learning_rate": 3.63610685891147e-08,
"loss": 1.7931,
"step": 1294
},
{
"epoch": 0.9744168547780286,
"grad_norm": 0.9781414283613079,
"learning_rate": 3.4314147884143554e-08,
"loss": 1.8257,
"step": 1295
},
{
"epoch": 0.9751693002257337,
"grad_norm": 0.9767192390841168,
"learning_rate": 3.2326424341445616e-08,
"loss": 1.8761,
"step": 1296
},
{
"epoch": 0.9759217456734387,
"grad_norm": 0.9784981741272334,
"learning_rate": 3.039790976829715e-08,
"loss": 1.8011,
"step": 1297
},
{
"epoch": 0.9766741911211437,
"grad_norm": 0.9768598274361385,
"learning_rate": 2.8528615620265766e-08,
"loss": 1.8292,
"step": 1298
},
{
"epoch": 0.9774266365688488,
"grad_norm": 0.9932232958530512,
"learning_rate": 2.6718553001142676e-08,
"loss": 1.8487,
"step": 1299
},
{
"epoch": 0.9781790820165538,
"grad_norm": 1.0083603662961036,
"learning_rate": 2.496773266288055e-08,
"loss": 1.8565,
"step": 1300
},
{
"epoch": 0.9789315274642588,
"grad_norm": 1.0253112203643229,
"learning_rate": 2.3276165005524652e-08,
"loss": 1.8874,
"step": 1301
},
{
"epoch": 0.9796839729119639,
"grad_norm": 1.010053834928846,
"learning_rate": 2.164386007715624e-08,
"loss": 1.8608,
"step": 1302
},
{
"epoch": 0.9804364183596689,
"grad_norm": 1.017579586685599,
"learning_rate": 2.0070827573827055e-08,
"loss": 1.8471,
"step": 1303
},
{
"epoch": 0.9811888638073739,
"grad_norm": 1.0019104828077237,
"learning_rate": 1.855707683950714e-08,
"loss": 1.8442,
"step": 1304
},
{
"epoch": 0.981941309255079,
"grad_norm": 0.9806065092454616,
"learning_rate": 1.710261686602488e-08,
"loss": 1.8186,
"step": 1305
},
{
"epoch": 0.9826937547027841,
"grad_norm": 0.9963472915227872,
"learning_rate": 1.5707456293018177e-08,
"loss": 1.8017,
"step": 1306
},
{
"epoch": 0.9834462001504891,
"grad_norm": 0.9936155457237965,
"learning_rate": 1.4371603407878909e-08,
"loss": 1.8489,
"step": 1307
},
{
"epoch": 0.9841986455981941,
"grad_norm": 0.9988518558062381,
"learning_rate": 1.3095066145704105e-08,
"loss": 1.8351,
"step": 1308
},
{
"epoch": 0.9849510910458992,
"grad_norm": 0.9929036204643935,
"learning_rate": 1.1877852089253739e-08,
"loss": 1.838,
"step": 1309
},
{
"epoch": 0.9857035364936042,
"grad_norm": 0.9855707975624383,
"learning_rate": 1.0719968468898556e-08,
"loss": 1.8381,
"step": 1310
},
{
"epoch": 0.9864559819413092,
"grad_norm": 0.9694213183289568,
"learning_rate": 9.621422162583437e-09,
"loss": 1.8274,
"step": 1311
},
{
"epoch": 0.9872084273890143,
"grad_norm": 0.9858900955762706,
"learning_rate": 8.58221969578077e-09,
"loss": 1.8577,
"step": 1312
},
{
"epoch": 0.9879608728367193,
"grad_norm": 0.9727390455861563,
"learning_rate": 7.602367241458241e-09,
"loss": 1.8248,
"step": 1313
},
{
"epoch": 0.9887133182844243,
"grad_norm": 0.971479377200592,
"learning_rate": 6.681870620034448e-09,
"loss": 1.8297,
"step": 1314
},
{
"epoch": 0.9894657637321295,
"grad_norm": 0.9922476205799885,
"learning_rate": 5.820735299352231e-09,
"loss": 1.8545,
"step": 1315
},
{
"epoch": 0.9902182091798345,
"grad_norm": 0.9932081810891289,
"learning_rate": 5.018966394639835e-09,
"loss": 1.8387,
"step": 1316
},
{
"epoch": 0.9909706546275395,
"grad_norm": 0.9837801570755045,
"learning_rate": 4.276568668485359e-09,
"loss": 1.8031,
"step": 1317
},
{
"epoch": 0.9917231000752446,
"grad_norm": 0.9713591553585486,
"learning_rate": 3.59354653080346e-09,
"loss": 1.8428,
"step": 1318
},
{
"epoch": 0.9924755455229496,
"grad_norm": 1.005652741652822,
"learning_rate": 2.9699040388131427e-09,
"loss": 1.8306,
"step": 1319
},
{
"epoch": 0.9932279909706546,
"grad_norm": 1.0051611442984065,
"learning_rate": 2.4056448970144474e-09,
"loss": 1.8818,
"step": 1320
},
{
"epoch": 0.9939804364183596,
"grad_norm": 0.9738440231910677,
"learning_rate": 1.9007724571606935e-09,
"loss": 1.8467,
"step": 1321
},
{
"epoch": 0.9947328818660647,
"grad_norm": 1.006211528832604,
"learning_rate": 1.4552897182462667e-09,
"loss": 1.8498,
"step": 1322
},
{
"epoch": 0.9954853273137697,
"grad_norm": 1.0171099301738729,
"learning_rate": 1.069199326481085e-09,
"loss": 1.8116,
"step": 1323
},
{
"epoch": 0.9962377727614747,
"grad_norm": 0.97266438015028,
"learning_rate": 7.425035752817167e-10,
"loss": 1.846,
"step": 1324
},
{
"epoch": 0.9969902182091799,
"grad_norm": 1.0439381594163042,
"learning_rate": 4.752044052513949e-10,
"loss": 1.8728,
"step": 1325
},
{
"epoch": 0.9977426636568849,
"grad_norm": 0.9976896960980217,
"learning_rate": 2.673034041755784e-10,
"loss": 1.8456,
"step": 1326
},
{
"epoch": 0.9984951091045899,
"grad_norm": 0.9953495475342304,
"learning_rate": 1.1880180700640787e-10,
"loss": 1.8433,
"step": 1327
},
{
"epoch": 0.999247554552295,
"grad_norm": 0.9965862178177205,
"learning_rate": 2.970049585715451e-11,
"loss": 1.8321,
"step": 1328
},
{
"epoch": 1.0,
"grad_norm": 0.9659214946424362,
"learning_rate": 0.0,
"loss": 1.8484,
"step": 1329
},
{
"epoch": 1.0,
"step": 1329,
"total_flos": 208699171799040.0,
"train_loss": 1.9003729102663567,
"train_runtime": 27485.2321,
"train_samples_per_second": 9.281,
"train_steps_per_second": 0.048
}
],
"logging_steps": 1.0,
"max_steps": 1329,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100000,
"total_flos": 208699171799040.0,
"train_batch_size": 24,
"trial_name": null,
"trial_params": null
}