{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.3535169785169785,
"eval_steps": 500,
"global_step": 500000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 4.596311569213867,
"learning_rate": 6.249999999999999e-07,
"loss": 9.2114,
"step": 500
},
{
"epoch": 0.0,
"grad_norm": 5.16270637512207,
"learning_rate": 1.2499999999999999e-06,
"loss": 7.3081,
"step": 1000
},
{
"epoch": 0.0,
"grad_norm": 3.267263412475586,
"learning_rate": 1.875e-06,
"loss": 6.1634,
"step": 1500
},
{
"epoch": 0.01,
"grad_norm": 2.1591992378234863,
"learning_rate": 2.4999999999999998e-06,
"loss": 5.421,
"step": 2000
},
{
"epoch": 0.01,
"grad_norm": 2.160722494125366,
"learning_rate": 3.125e-06,
"loss": 4.856,
"step": 2500
},
{
"epoch": 0.01,
"grad_norm": 2.330343723297119,
"learning_rate": 3.75e-06,
"loss": 4.3826,
"step": 3000
},
{
"epoch": 0.01,
"grad_norm": 2.1275618076324463,
"learning_rate": 4.3750000000000005e-06,
"loss": 3.9848,
"step": 3500
},
{
"epoch": 0.01,
"grad_norm": 2.1402294635772705,
"learning_rate": 4.9999999999999996e-06,
"loss": 3.6491,
"step": 4000
},
{
"epoch": 0.01,
"grad_norm": 2.159619092941284,
"learning_rate": 5.625e-06,
"loss": 3.3861,
"step": 4500
},
{
"epoch": 0.01,
"grad_norm": 2.5018796920776367,
"learning_rate": 6.25e-06,
"loss": 3.1845,
"step": 5000
},
{
"epoch": 0.01,
"grad_norm": 1.9996334314346313,
"learning_rate": 6.875e-06,
"loss": 3.0335,
"step": 5500
},
{
"epoch": 0.02,
"grad_norm": 2.103320598602295,
"learning_rate": 7.5e-06,
"loss": 2.9096,
"step": 6000
},
{
"epoch": 0.02,
"grad_norm": 2.025847911834717,
"learning_rate": 8.125e-06,
"loss": 2.8088,
"step": 6500
},
{
"epoch": 0.02,
"grad_norm": 2.030522108078003,
"learning_rate": 8.750000000000001e-06,
"loss": 2.7156,
"step": 7000
},
{
"epoch": 0.02,
"grad_norm": 1.992558479309082,
"learning_rate": 9.375000000000001e-06,
"loss": 2.6262,
"step": 7500
},
{
"epoch": 0.02,
"grad_norm": 2.1512062549591064,
"learning_rate": 9.999999999999999e-06,
"loss": 2.5432,
"step": 8000
},
{
"epoch": 0.02,
"grad_norm": 2.0734474658966064,
"learning_rate": 1.0625e-05,
"loss": 2.4722,
"step": 8500
},
{
"epoch": 0.02,
"grad_norm": 1.9478808641433716,
"learning_rate": 1.125e-05,
"loss": 2.4111,
"step": 9000
},
{
"epoch": 0.03,
"grad_norm": 1.762665033340454,
"learning_rate": 1.1874999999999999e-05,
"loss": 2.3521,
"step": 9500
},
{
"epoch": 0.03,
"grad_norm": 1.8274019956588745,
"learning_rate": 1.25e-05,
"loss": 2.3099,
"step": 10000
},
{
"epoch": 0.03,
"grad_norm": 1.905918002128601,
"learning_rate": 1.3125e-05,
"loss": 2.2629,
"step": 10500
},
{
"epoch": 0.03,
"grad_norm": 1.8081414699554443,
"learning_rate": 1.375e-05,
"loss": 2.2239,
"step": 11000
},
{
"epoch": 0.03,
"grad_norm": 7.712226867675781,
"learning_rate": 1.4375e-05,
"loss": 2.1907,
"step": 11500
},
{
"epoch": 0.03,
"grad_norm": 1.6963427066802979,
"learning_rate": 1.5e-05,
"loss": 2.1602,
"step": 12000
},
{
"epoch": 0.03,
"grad_norm": 1.717537522315979,
"learning_rate": 1.5625e-05,
"loss": 2.1384,
"step": 12500
},
{
"epoch": 0.04,
"grad_norm": 1.745806336402893,
"learning_rate": 1.625e-05,
"loss": 2.1061,
"step": 13000
},
{
"epoch": 0.04,
"grad_norm": 1.7633601427078247,
"learning_rate": 1.6875e-05,
"loss": 2.0838,
"step": 13500
},
{
"epoch": 0.04,
"grad_norm": 1.7061880826950073,
"learning_rate": 1.7500000000000002e-05,
"loss": 2.0648,
"step": 14000
},
{
"epoch": 0.04,
"grad_norm": 1.7471063137054443,
"learning_rate": 1.8125e-05,
"loss": 2.0462,
"step": 14500
},
{
"epoch": 0.04,
"grad_norm": 1.705340027809143,
"learning_rate": 1.8750000000000002e-05,
"loss": 2.0281,
"step": 15000
},
{
"epoch": 0.04,
"grad_norm": 41.675968170166016,
"learning_rate": 1.9375e-05,
"loss": 2.003,
"step": 15500
},
{
"epoch": 0.04,
"grad_norm": 1.737722396850586,
"learning_rate": 1.9999999999999998e-05,
"loss": 1.9914,
"step": 16000
},
{
"epoch": 0.04,
"grad_norm": 1.8232406377792358,
"learning_rate": 2.0625e-05,
"loss": 1.9724,
"step": 16500
},
{
"epoch": 0.05,
"grad_norm": 1.8312487602233887,
"learning_rate": 2.125e-05,
"loss": 1.9577,
"step": 17000
},
{
"epoch": 0.05,
"grad_norm": 2.025630235671997,
"learning_rate": 2.1875e-05,
"loss": 1.9411,
"step": 17500
},
{
"epoch": 0.05,
"grad_norm": 1.9454607963562012,
"learning_rate": 2.25e-05,
"loss": 1.9263,
"step": 18000
},
{
"epoch": 0.05,
"grad_norm": 1.637341856956482,
"learning_rate": 2.3125000000000003e-05,
"loss": 1.9221,
"step": 18500
},
{
"epoch": 0.05,
"grad_norm": 1.846366286277771,
"learning_rate": 2.3749999999999998e-05,
"loss": 1.9086,
"step": 19000
},
{
"epoch": 0.05,
"grad_norm": 1.802040457725525,
"learning_rate": 2.4375e-05,
"loss": 1.8961,
"step": 19500
},
{
"epoch": 0.05,
"grad_norm": 1.7378031015396118,
"learning_rate": 2.5e-05,
"loss": 1.8893,
"step": 20000
},
{
"epoch": 0.06,
"grad_norm": 1.6410856246948242,
"learning_rate": 2.5625e-05,
"loss": 1.8752,
"step": 20500
},
{
"epoch": 0.06,
"grad_norm": 1.7153388261795044,
"learning_rate": 2.625e-05,
"loss": 1.862,
"step": 21000
},
{
"epoch": 0.06,
"grad_norm": 1.6210004091262817,
"learning_rate": 2.6875000000000003e-05,
"loss": 1.855,
"step": 21500
},
{
"epoch": 0.06,
"grad_norm": 1.6593818664550781,
"learning_rate": 2.75e-05,
"loss": 1.8478,
"step": 22000
},
{
"epoch": 0.06,
"grad_norm": 1.659287691116333,
"learning_rate": 2.8125e-05,
"loss": 1.8353,
"step": 22500
},
{
"epoch": 0.06,
"grad_norm": 1.703875184059143,
"learning_rate": 2.875e-05,
"loss": 1.8288,
"step": 23000
},
{
"epoch": 0.06,
"grad_norm": 1.7122712135314941,
"learning_rate": 2.9375e-05,
"loss": 1.8289,
"step": 23500
},
{
"epoch": 0.06,
"grad_norm": 1.6744304895401,
"learning_rate": 3e-05,
"loss": 1.8219,
"step": 24000
},
{
"epoch": 0.07,
"grad_norm": 1.7783963680267334,
"learning_rate": 2.9968487394957983e-05,
"loss": 1.8141,
"step": 24500
},
{
"epoch": 0.07,
"grad_norm": 1.7388477325439453,
"learning_rate": 2.9936974789915968e-05,
"loss": 1.805,
"step": 25000
},
{
"epoch": 0.07,
"grad_norm": 1.6574689149856567,
"learning_rate": 2.990546218487395e-05,
"loss": 1.8005,
"step": 25500
},
{
"epoch": 0.07,
"grad_norm": 1.6803966760635376,
"learning_rate": 2.9873949579831935e-05,
"loss": 1.7902,
"step": 26000
},
{
"epoch": 0.07,
"grad_norm": 1.6314315795898438,
"learning_rate": 2.9842436974789916e-05,
"loss": 1.7832,
"step": 26500
},
{
"epoch": 0.07,
"grad_norm": 1.6180912256240845,
"learning_rate": 2.98109243697479e-05,
"loss": 1.7774,
"step": 27000
},
{
"epoch": 0.07,
"grad_norm": 1.6669533252716064,
"learning_rate": 2.9779411764705883e-05,
"loss": 1.774,
"step": 27500
},
{
"epoch": 0.08,
"grad_norm": 1.5653916597366333,
"learning_rate": 2.9747899159663868e-05,
"loss": 1.7673,
"step": 28000
},
{
"epoch": 0.08,
"grad_norm": 1.6632215976715088,
"learning_rate": 2.971638655462185e-05,
"loss": 1.7639,
"step": 28500
},
{
"epoch": 0.08,
"grad_norm": 1.6262154579162598,
"learning_rate": 2.9684873949579835e-05,
"loss": 1.757,
"step": 29000
},
{
"epoch": 0.08,
"grad_norm": 4.847783088684082,
"learning_rate": 2.9653361344537817e-05,
"loss": 1.9286,
"step": 29500
},
{
"epoch": 0.08,
"grad_norm": 2.6416807174682617,
"learning_rate": 2.9621848739495802e-05,
"loss": 3.7773,
"step": 30000
},
{
"epoch": 0.08,
"grad_norm": 3.4526023864746094,
"learning_rate": 2.9590336134453784e-05,
"loss": 4.3611,
"step": 30500
},
{
"epoch": 0.08,
"grad_norm": 65.76104736328125,
"learning_rate": 2.9558823529411766e-05,
"loss": 4.5628,
"step": 31000
},
{
"epoch": 0.09,
"grad_norm": 6.145516395568848,
"learning_rate": 2.9527310924369747e-05,
"loss": 4.388,
"step": 31500
},
{
"epoch": 0.09,
"grad_norm": 4.991481781005859,
"learning_rate": 2.949579831932773e-05,
"loss": 4.1991,
"step": 32000
},
{
"epoch": 0.09,
"grad_norm": 2.632403612136841,
"learning_rate": 2.9464285714285714e-05,
"loss": 3.7935,
"step": 32500
},
{
"epoch": 0.09,
"grad_norm": 3.691666841506958,
"learning_rate": 2.9432773109243696e-05,
"loss": 3.4704,
"step": 33000
},
{
"epoch": 0.09,
"grad_norm": 14.81291675567627,
"learning_rate": 2.940126050420168e-05,
"loss": 2.6663,
"step": 33500
},
{
"epoch": 0.09,
"grad_norm": 2.4295215606689453,
"learning_rate": 2.9369747899159663e-05,
"loss": 2.4661,
"step": 34000
},
{
"epoch": 0.09,
"grad_norm": 52.97163391113281,
"learning_rate": 2.9338235294117648e-05,
"loss": 2.1129,
"step": 34500
},
{
"epoch": 0.09,
"grad_norm": 2.337153196334839,
"learning_rate": 2.930672268907563e-05,
"loss": 1.7961,
"step": 35000
},
{
"epoch": 0.1,
"grad_norm": 6.669353008270264,
"learning_rate": 2.9275210084033615e-05,
"loss": 1.7907,
"step": 35500
},
{
"epoch": 0.1,
"grad_norm": 1.5874249935150146,
"learning_rate": 2.9243697478991596e-05,
"loss": 1.7663,
"step": 36000
},
{
"epoch": 0.1,
"grad_norm": 1.7114965915679932,
"learning_rate": 2.921218487394958e-05,
"loss": 1.7439,
"step": 36500
},
{
"epoch": 0.1,
"grad_norm": 1.8134816884994507,
"learning_rate": 2.9180672268907563e-05,
"loss": 1.7361,
"step": 37000
},
{
"epoch": 0.1,
"grad_norm": 1.505012035369873,
"learning_rate": 2.9149159663865545e-05,
"loss": 1.7323,
"step": 37500
},
{
"epoch": 0.1,
"grad_norm": 1.6047751903533936,
"learning_rate": 2.911764705882353e-05,
"loss": 1.7212,
"step": 38000
},
{
"epoch": 0.1,
"grad_norm": 1.5497486591339111,
"learning_rate": 2.9086134453781512e-05,
"loss": 1.7215,
"step": 38500
},
{
"epoch": 0.11,
"grad_norm": 1.5367647409439087,
"learning_rate": 2.9054621848739497e-05,
"loss": 1.7027,
"step": 39000
},
{
"epoch": 0.11,
"grad_norm": 4.223250865936279,
"learning_rate": 2.902310924369748e-05,
"loss": 1.6914,
"step": 39500
},
{
"epoch": 0.11,
"grad_norm": 1.5872981548309326,
"learning_rate": 2.8991596638655464e-05,
"loss": 1.6878,
"step": 40000
},
{
"epoch": 0.11,
"grad_norm": 1.5480022430419922,
"learning_rate": 2.8960084033613446e-05,
"loss": 1.6816,
"step": 40500
},
{
"epoch": 0.11,
"grad_norm": 1.5464568138122559,
"learning_rate": 2.892857142857143e-05,
"loss": 1.6796,
"step": 41000
},
{
"epoch": 0.11,
"grad_norm": 1.557543158531189,
"learning_rate": 2.8897058823529413e-05,
"loss": 1.6709,
"step": 41500
},
{
"epoch": 0.11,
"grad_norm": 1.5462812185287476,
"learning_rate": 2.8865546218487398e-05,
"loss": 1.6728,
"step": 42000
},
{
"epoch": 0.12,
"grad_norm": 1.5833927392959595,
"learning_rate": 2.883403361344538e-05,
"loss": 1.6676,
"step": 42500
},
{
"epoch": 0.12,
"grad_norm": 1.63410222530365,
"learning_rate": 2.8802521008403365e-05,
"loss": 1.6696,
"step": 43000
},
{
"epoch": 0.12,
"grad_norm": 1.4682618379592896,
"learning_rate": 2.8771008403361346e-05,
"loss": 1.6693,
"step": 43500
},
{
"epoch": 0.12,
"grad_norm": 1.5386840105056763,
"learning_rate": 2.8739495798319328e-05,
"loss": 1.6602,
"step": 44000
},
{
"epoch": 0.12,
"grad_norm": 1.5572445392608643,
"learning_rate": 2.8707983193277313e-05,
"loss": 1.6581,
"step": 44500
},
{
"epoch": 0.12,
"grad_norm": 1.5247888565063477,
"learning_rate": 2.8676470588235295e-05,
"loss": 1.6546,
"step": 45000
},
{
"epoch": 0.12,
"grad_norm": 1.5297437906265259,
"learning_rate": 2.864495798319328e-05,
"loss": 1.6467,
"step": 45500
},
{
"epoch": 0.12,
"grad_norm": 1.5252556800842285,
"learning_rate": 2.8613445378151262e-05,
"loss": 1.6504,
"step": 46000
},
{
"epoch": 0.13,
"grad_norm": 1.4626063108444214,
"learning_rate": 2.8581932773109244e-05,
"loss": 1.6441,
"step": 46500
},
{
"epoch": 0.13,
"grad_norm": 1.511093020439148,
"learning_rate": 2.8550420168067225e-05,
"loss": 1.6433,
"step": 47000
},
{
"epoch": 0.13,
"grad_norm": 1.572654366493225,
"learning_rate": 2.851890756302521e-05,
"loss": 1.6527,
"step": 47500
},
{
"epoch": 0.13,
"grad_norm": 1.5643205642700195,
"learning_rate": 2.8487394957983192e-05,
"loss": 1.6376,
"step": 48000
},
{
"epoch": 0.13,
"grad_norm": 1.497128963470459,
"learning_rate": 2.8455882352941177e-05,
"loss": 1.6397,
"step": 48500
},
{
"epoch": 0.13,
"grad_norm": 1.464203953742981,
"learning_rate": 2.842436974789916e-05,
"loss": 1.6358,
"step": 49000
},
{
"epoch": 0.13,
"grad_norm": 1.8414405584335327,
"learning_rate": 2.8392857142857144e-05,
"loss": 1.6366,
"step": 49500
},
{
"epoch": 0.14,
"grad_norm": 1.7834322452545166,
"learning_rate": 2.8361344537815126e-05,
"loss": 1.642,
"step": 50000
},
{
"epoch": 0.14,
"grad_norm": 1.477858304977417,
"learning_rate": 2.8329831932773108e-05,
"loss": 1.6342,
"step": 50500
},
{
"epoch": 0.14,
"grad_norm": 1.5328236818313599,
"learning_rate": 2.8298319327731093e-05,
"loss": 1.6333,
"step": 51000
},
{
"epoch": 0.14,
"grad_norm": 1.540300965309143,
"learning_rate": 2.8266806722689075e-05,
"loss": 1.6352,
"step": 51500
},
{
"epoch": 0.14,
"grad_norm": 1.8767386674880981,
"learning_rate": 2.823529411764706e-05,
"loss": 1.6328,
"step": 52000
},
{
"epoch": 0.14,
"grad_norm": 1.5387629270553589,
"learning_rate": 2.820378151260504e-05,
"loss": 1.632,
"step": 52500
},
{
"epoch": 0.14,
"grad_norm": 1.6315770149230957,
"learning_rate": 2.8172268907563027e-05,
"loss": 1.627,
"step": 53000
},
{
"epoch": 0.14,
"grad_norm": 5.726038455963135,
"learning_rate": 2.814075630252101e-05,
"loss": 1.6293,
"step": 53500
},
{
"epoch": 0.15,
"grad_norm": 1.5697258710861206,
"learning_rate": 2.8109243697478993e-05,
"loss": 1.6211,
"step": 54000
},
{
"epoch": 0.15,
"grad_norm": 1.5938401222229004,
"learning_rate": 2.8077731092436975e-05,
"loss": 1.6196,
"step": 54500
},
{
"epoch": 0.15,
"grad_norm": 1.5256606340408325,
"learning_rate": 2.804621848739496e-05,
"loss": 1.6177,
"step": 55000
},
{
"epoch": 0.15,
"grad_norm": 2.223390817642212,
"learning_rate": 2.8014705882352942e-05,
"loss": 1.6246,
"step": 55500
},
{
"epoch": 0.15,
"grad_norm": 1.4948030710220337,
"learning_rate": 2.7983193277310927e-05,
"loss": 1.6239,
"step": 56000
},
{
"epoch": 0.15,
"grad_norm": 1.5147298574447632,
"learning_rate": 2.795168067226891e-05,
"loss": 1.6164,
"step": 56500
},
{
"epoch": 0.15,
"grad_norm": 1.5068755149841309,
"learning_rate": 2.792016806722689e-05,
"loss": 1.612,
"step": 57000
},
{
"epoch": 0.16,
"grad_norm": 1.5074622631072998,
"learning_rate": 2.7888655462184876e-05,
"loss": 1.6113,
"step": 57500
},
{
"epoch": 0.16,
"grad_norm": 1.4880355596542358,
"learning_rate": 2.7857142857142858e-05,
"loss": 1.6102,
"step": 58000
},
{
"epoch": 0.16,
"grad_norm": 1.6379941701889038,
"learning_rate": 2.7825630252100843e-05,
"loss": 1.6084,
"step": 58500
},
{
"epoch": 0.16,
"grad_norm": 1.4973347187042236,
"learning_rate": 2.7794117647058824e-05,
"loss": 1.6007,
"step": 59000
},
{
"epoch": 0.16,
"grad_norm": 1.5474885702133179,
"learning_rate": 2.776260504201681e-05,
"loss": 1.6042,
"step": 59500
},
{
"epoch": 0.16,
"grad_norm": 1.602220058441162,
"learning_rate": 2.773109243697479e-05,
"loss": 1.6106,
"step": 60000
},
{
"epoch": 0.16,
"grad_norm": 1.6185747385025024,
"learning_rate": 2.7699579831932776e-05,
"loss": 1.6058,
"step": 60500
},
{
"epoch": 0.17,
"grad_norm": 1.56905996799469,
"learning_rate": 2.7668067226890758e-05,
"loss": 1.6013,
"step": 61000
},
{
"epoch": 0.17,
"grad_norm": 1.5619949102401733,
"learning_rate": 2.763655462184874e-05,
"loss": 1.6034,
"step": 61500
},
{
"epoch": 0.17,
"grad_norm": 1.504239559173584,
"learning_rate": 2.7605042016806722e-05,
"loss": 1.6057,
"step": 62000
},
{
"epoch": 0.17,
"grad_norm": 1.4879348278045654,
"learning_rate": 2.7573529411764707e-05,
"loss": 1.6021,
"step": 62500
},
{
"epoch": 0.17,
"grad_norm": 1.5099623203277588,
"learning_rate": 2.754201680672269e-05,
"loss": 1.6026,
"step": 63000
},
{
"epoch": 0.17,
"grad_norm": 1.4979091882705688,
"learning_rate": 2.751050420168067e-05,
"loss": 1.5986,
"step": 63500
},
{
"epoch": 0.17,
"grad_norm": 1.4825040102005005,
"learning_rate": 2.7478991596638655e-05,
"loss": 1.5957,
"step": 64000
},
{
"epoch": 0.17,
"grad_norm": 1.493453860282898,
"learning_rate": 2.7447478991596637e-05,
"loss": 1.5989,
"step": 64500
},
{
"epoch": 0.18,
"grad_norm": 1.530388593673706,
"learning_rate": 2.7415966386554622e-05,
"loss": 1.5953,
"step": 65000
},
{
"epoch": 0.18,
"grad_norm": 1.5459638833999634,
"learning_rate": 2.7384453781512604e-05,
"loss": 1.5957,
"step": 65500
},
{
"epoch": 0.18,
"grad_norm": 2.0421242713928223,
"learning_rate": 2.735294117647059e-05,
"loss": 1.5984,
"step": 66000
},
{
"epoch": 0.18,
"grad_norm": 1.4634993076324463,
"learning_rate": 2.732142857142857e-05,
"loss": 1.5897,
"step": 66500
},
{
"epoch": 0.18,
"grad_norm": 1.530594825744629,
"learning_rate": 2.7289915966386556e-05,
"loss": 1.5902,
"step": 67000
},
{
"epoch": 0.18,
"grad_norm": 1.5332798957824707,
"learning_rate": 2.7258403361344538e-05,
"loss": 1.5874,
"step": 67500
},
{
"epoch": 0.18,
"grad_norm": 1.753754734992981,
"learning_rate": 2.7226890756302523e-05,
"loss": 1.59,
"step": 68000
},
{
"epoch": 0.19,
"grad_norm": 1.5545145273208618,
"learning_rate": 2.7195378151260505e-05,
"loss": 1.5949,
"step": 68500
},
{
"epoch": 0.19,
"grad_norm": 1.5194141864776611,
"learning_rate": 2.716386554621849e-05,
"loss": 1.588,
"step": 69000
},
{
"epoch": 0.19,
"grad_norm": 1.532632827758789,
"learning_rate": 2.713235294117647e-05,
"loss": 1.5918,
"step": 69500
},
{
"epoch": 0.19,
"grad_norm": 1.4970754384994507,
"learning_rate": 2.7100840336134453e-05,
"loss": 1.5851,
"step": 70000
},
{
"epoch": 0.19,
"grad_norm": 1.4157612323760986,
"learning_rate": 2.706932773109244e-05,
"loss": 1.5823,
"step": 70500
},
{
"epoch": 0.19,
"grad_norm": 1.5014020204544067,
"learning_rate": 2.703781512605042e-05,
"loss": 1.5847,
"step": 71000
},
{
"epoch": 0.19,
"grad_norm": 1.4652481079101562,
"learning_rate": 2.7006302521008405e-05,
"loss": 1.5886,
"step": 71500
},
{
"epoch": 0.19,
"grad_norm": 1.5810528993606567,
"learning_rate": 2.6974789915966387e-05,
"loss": 1.5805,
"step": 72000
},
{
"epoch": 0.2,
"grad_norm": 1.4908738136291504,
"learning_rate": 2.6943277310924372e-05,
"loss": 1.5812,
"step": 72500
},
{
"epoch": 0.2,
"grad_norm": 1.4520491361618042,
"learning_rate": 2.6911764705882354e-05,
"loss": 1.5837,
"step": 73000
},
{
"epoch": 0.2,
"grad_norm": 1.46824049949646,
"learning_rate": 2.688025210084034e-05,
"loss": 1.5778,
"step": 73500
},
{
"epoch": 0.2,
"grad_norm": 1.5032325983047485,
"learning_rate": 2.684873949579832e-05,
"loss": 1.5777,
"step": 74000
},
{
"epoch": 0.2,
"grad_norm": 1.5338232517242432,
"learning_rate": 2.6817226890756306e-05,
"loss": 1.5768,
"step": 74500
},
{
"epoch": 0.2,
"grad_norm": 1.5439281463623047,
"learning_rate": 2.6785714285714288e-05,
"loss": 1.5782,
"step": 75000
},
{
"epoch": 0.2,
"grad_norm": 1.536665439605713,
"learning_rate": 2.675420168067227e-05,
"loss": 1.5758,
"step": 75500
},
{
"epoch": 0.21,
"grad_norm": 1.4520212411880493,
"learning_rate": 2.6722689075630255e-05,
"loss": 1.5732,
"step": 76000
},
{
"epoch": 0.21,
"grad_norm": 1.5352224111557007,
"learning_rate": 2.6691176470588233e-05,
"loss": 1.5745,
"step": 76500
},
{
"epoch": 0.21,
"grad_norm": 1.4939314126968384,
"learning_rate": 2.6659663865546218e-05,
"loss": 1.5724,
"step": 77000
},
{
"epoch": 0.21,
"grad_norm": 1.4967976808547974,
"learning_rate": 2.66281512605042e-05,
"loss": 1.5693,
"step": 77500
},
{
"epoch": 0.21,
"grad_norm": 1.4980648756027222,
"learning_rate": 2.6596638655462185e-05,
"loss": 1.5721,
"step": 78000
},
{
"epoch": 0.21,
"grad_norm": 1.5700784921646118,
"learning_rate": 2.6565126050420167e-05,
"loss": 1.5713,
"step": 78500
},
{
"epoch": 0.21,
"grad_norm": 1.5124626159667969,
"learning_rate": 2.6533613445378152e-05,
"loss": 1.5709,
"step": 79000
},
{
"epoch": 0.22,
"grad_norm": 1.465012788772583,
"learning_rate": 2.6502100840336134e-05,
"loss": 1.5702,
"step": 79500
},
{
"epoch": 0.22,
"grad_norm": 1.4589452743530273,
"learning_rate": 2.647058823529412e-05,
"loss": 1.5675,
"step": 80000
},
{
"epoch": 0.22,
"grad_norm": 1.547255516052246,
"learning_rate": 2.64390756302521e-05,
"loss": 1.567,
"step": 80500
},
{
"epoch": 0.22,
"grad_norm": 1.5208017826080322,
"learning_rate": 2.6407563025210086e-05,
"loss": 1.5654,
"step": 81000
},
{
"epoch": 0.22,
"grad_norm": 1.563560128211975,
"learning_rate": 2.6376050420168067e-05,
"loss": 1.5651,
"step": 81500
},
{
"epoch": 0.22,
"grad_norm": 1.4551901817321777,
"learning_rate": 2.634453781512605e-05,
"loss": 1.5692,
"step": 82000
},
{
"epoch": 0.22,
"grad_norm": 3.783536672592163,
"learning_rate": 2.6313025210084034e-05,
"loss": 1.5698,
"step": 82500
},
{
"epoch": 0.22,
"grad_norm": 1.5397638082504272,
"learning_rate": 2.6281512605042016e-05,
"loss": 1.5614,
"step": 83000
},
{
"epoch": 0.23,
"grad_norm": 1.5307060480117798,
"learning_rate": 2.625e-05,
"loss": 1.5596,
"step": 83500
},
{
"epoch": 0.23,
"grad_norm": 1.5148283243179321,
"learning_rate": 2.6218487394957983e-05,
"loss": 1.5612,
"step": 84000
},
{
"epoch": 0.23,
"grad_norm": 1.531973958015442,
"learning_rate": 2.6186974789915968e-05,
"loss": 1.559,
"step": 84500
},
{
"epoch": 0.23,
"grad_norm": 1.5402531623840332,
"learning_rate": 2.615546218487395e-05,
"loss": 1.5624,
"step": 85000
},
{
"epoch": 0.23,
"grad_norm": 1.486365795135498,
"learning_rate": 2.6123949579831935e-05,
"loss": 1.5601,
"step": 85500
},
{
"epoch": 0.23,
"grad_norm": 1.513438105583191,
"learning_rate": 2.6092436974789917e-05,
"loss": 1.5567,
"step": 86000
},
{
"epoch": 0.23,
"grad_norm": 1.5112252235412598,
"learning_rate": 2.6060924369747902e-05,
"loss": 1.5574,
"step": 86500
},
{
"epoch": 0.24,
"grad_norm": 1.4394776821136475,
"learning_rate": 2.6029411764705883e-05,
"loss": 1.5562,
"step": 87000
},
{
"epoch": 0.24,
"grad_norm": 1.6592140197753906,
"learning_rate": 2.599789915966387e-05,
"loss": 1.5551,
"step": 87500
},
{
"epoch": 0.24,
"grad_norm": 1.4790719747543335,
"learning_rate": 2.596638655462185e-05,
"loss": 1.5544,
"step": 88000
},
{
"epoch": 0.24,
"grad_norm": 1.4369221925735474,
"learning_rate": 2.5934873949579832e-05,
"loss": 1.5538,
"step": 88500
},
{
"epoch": 0.24,
"grad_norm": 1.5175668001174927,
"learning_rate": 2.5903361344537817e-05,
"loss": 1.5556,
"step": 89000
},
{
"epoch": 0.24,
"grad_norm": 1.4514554738998413,
"learning_rate": 2.58718487394958e-05,
"loss": 1.5539,
"step": 89500
},
{
"epoch": 0.24,
"grad_norm": 1.4288485050201416,
"learning_rate": 2.5840336134453784e-05,
"loss": 1.5525,
"step": 90000
},
{
"epoch": 0.24,
"grad_norm": 1.546531081199646,
"learning_rate": 2.5808823529411766e-05,
"loss": 1.5527,
"step": 90500
},
{
"epoch": 0.25,
"grad_norm": 1.567368507385254,
"learning_rate": 2.5777310924369748e-05,
"loss": 1.5491,
"step": 91000
},
{
"epoch": 0.25,
"grad_norm": 1.5126845836639404,
"learning_rate": 2.574579831932773e-05,
"loss": 1.5504,
"step": 91500
},
{
"epoch": 0.25,
"grad_norm": 1.5570114850997925,
"learning_rate": 2.5714285714285714e-05,
"loss": 1.5469,
"step": 92000
},
{
"epoch": 0.25,
"grad_norm": 1.4678915739059448,
"learning_rate": 2.5682773109243696e-05,
"loss": 1.5493,
"step": 92500
},
{
"epoch": 0.25,
"grad_norm": 1.4618594646453857,
"learning_rate": 2.565126050420168e-05,
"loss": 1.555,
"step": 93000
},
{
"epoch": 0.25,
"grad_norm": 1.5945430994033813,
"learning_rate": 2.5619747899159663e-05,
"loss": 1.547,
"step": 93500
},
{
"epoch": 0.25,
"grad_norm": 1.4740761518478394,
"learning_rate": 2.5588235294117648e-05,
"loss": 1.5463,
"step": 94000
},
{
"epoch": 0.26,
"grad_norm": 1.4022290706634521,
"learning_rate": 2.555672268907563e-05,
"loss": 1.5449,
"step": 94500
},
{
"epoch": 0.26,
"grad_norm": 2.622828722000122,
"learning_rate": 2.552521008403361e-05,
"loss": 1.55,
"step": 95000
},
{
"epoch": 0.26,
"grad_norm": 1.409568428993225,
"learning_rate": 2.5493697478991597e-05,
"loss": 1.5436,
"step": 95500
},
{
"epoch": 0.26,
"grad_norm": 1.4889922142028809,
"learning_rate": 2.546218487394958e-05,
"loss": 1.5441,
"step": 96000
},
{
"epoch": 0.26,
"grad_norm": 1.4589875936508179,
"learning_rate": 2.5430672268907564e-05,
"loss": 1.5468,
"step": 96500
},
{
"epoch": 0.26,
"grad_norm": 1.4680520296096802,
"learning_rate": 2.5399159663865545e-05,
"loss": 1.5429,
"step": 97000
},
{
"epoch": 0.26,
"grad_norm": 1.4456883668899536,
"learning_rate": 2.536764705882353e-05,
"loss": 1.5458,
"step": 97500
},
{
"epoch": 0.27,
"grad_norm": 1.4655406475067139,
"learning_rate": 2.5336134453781512e-05,
"loss": 1.5399,
"step": 98000
},
{
"epoch": 0.27,
"grad_norm": 7.581863880157471,
"learning_rate": 2.5304621848739497e-05,
"loss": 1.5423,
"step": 98500
},
{
"epoch": 0.27,
"grad_norm": 1.5289582014083862,
"learning_rate": 2.527310924369748e-05,
"loss": 1.5434,
"step": 99000
},
{
"epoch": 0.27,
"grad_norm": 1.475637674331665,
"learning_rate": 2.5241596638655464e-05,
"loss": 1.5415,
"step": 99500
},
{
"epoch": 0.27,
"grad_norm": 1.45746910572052,
"learning_rate": 2.5210084033613446e-05,
"loss": 1.5401,
"step": 100000
},
{
"epoch": 0.27,
"grad_norm": 1.4924384355545044,
"learning_rate": 2.517857142857143e-05,
"loss": 1.5382,
"step": 100500
},
{
"epoch": 0.27,
"grad_norm": 1.4440650939941406,
"learning_rate": 2.5147058823529413e-05,
"loss": 1.539,
"step": 101000
},
{
"epoch": 0.27,
"grad_norm": 1.5022001266479492,
"learning_rate": 2.5115546218487395e-05,
"loss": 1.5375,
"step": 101500
},
{
"epoch": 0.28,
"grad_norm": 1.4573357105255127,
"learning_rate": 2.508403361344538e-05,
"loss": 1.5423,
"step": 102000
},
{
"epoch": 0.28,
"grad_norm": 1.4948347806930542,
"learning_rate": 2.505252100840336e-05,
"loss": 1.538,
"step": 102500
},
{
"epoch": 0.28,
"grad_norm": 1.5028940439224243,
"learning_rate": 2.5021008403361347e-05,
"loss": 1.5368,
"step": 103000
},
{
"epoch": 0.28,
"grad_norm": 1.510446310043335,
"learning_rate": 2.498949579831933e-05,
"loss": 1.534,
"step": 103500
},
{
"epoch": 0.28,
"grad_norm": 1.516194462776184,
"learning_rate": 2.4957983193277314e-05,
"loss": 1.5404,
"step": 104000
},
{
"epoch": 0.28,
"grad_norm": 1.452358365058899,
"learning_rate": 2.4926470588235295e-05,
"loss": 1.5349,
"step": 104500
},
{
"epoch": 0.28,
"grad_norm": 1.4550226926803589,
"learning_rate": 2.489495798319328e-05,
"loss": 1.5373,
"step": 105000
},
{
"epoch": 0.29,
"grad_norm": 1.4559545516967773,
"learning_rate": 2.4863445378151262e-05,
"loss": 1.5341,
"step": 105500
},
{
"epoch": 0.29,
"grad_norm": 1.4436681270599365,
"learning_rate": 2.4831932773109244e-05,
"loss": 1.5344,
"step": 106000
},
{
"epoch": 0.29,
"grad_norm": 1.4642813205718994,
"learning_rate": 2.4800420168067226e-05,
"loss": 1.5333,
"step": 106500
},
{
"epoch": 0.29,
"grad_norm": 1.4824906587600708,
"learning_rate": 2.476890756302521e-05,
"loss": 1.5291,
"step": 107000
},
{
"epoch": 0.29,
"grad_norm": 1.515098214149475,
"learning_rate": 2.4737394957983193e-05,
"loss": 1.5285,
"step": 107500
},
{
"epoch": 0.29,
"grad_norm": 2.073720693588257,
"learning_rate": 2.4705882352941174e-05,
"loss": 1.5348,
"step": 108000
},
{
"epoch": 0.29,
"grad_norm": 1.884777545928955,
"learning_rate": 2.467436974789916e-05,
"loss": 1.5321,
"step": 108500
},
{
"epoch": 0.3,
"grad_norm": 1.4791995286941528,
"learning_rate": 2.464285714285714e-05,
"loss": 1.5305,
"step": 109000
},
{
"epoch": 0.3,
"grad_norm": 1.4546101093292236,
"learning_rate": 2.4611344537815126e-05,
"loss": 1.5308,
"step": 109500
},
{
"epoch": 0.3,
"grad_norm": 1.421767234802246,
"learning_rate": 2.4579831932773108e-05,
"loss": 1.532,
"step": 110000
},
{
"epoch": 0.3,
"grad_norm": 1.476372480392456,
"learning_rate": 2.4548319327731093e-05,
"loss": 1.5303,
"step": 110500
},
{
"epoch": 0.3,
"grad_norm": 1.4746720790863037,
"learning_rate": 2.4516806722689075e-05,
"loss": 1.531,
"step": 111000
},
{
"epoch": 0.3,
"grad_norm": 1.486217975616455,
"learning_rate": 2.448529411764706e-05,
"loss": 1.5277,
"step": 111500
},
{
"epoch": 0.3,
"grad_norm": 1.4249714612960815,
"learning_rate": 2.4453781512605042e-05,
"loss": 1.525,
"step": 112000
},
{
"epoch": 0.3,
"grad_norm": 1.4237457513809204,
"learning_rate": 2.4422268907563027e-05,
"loss": 1.5263,
"step": 112500
},
{
"epoch": 0.31,
"grad_norm": 1.4878206253051758,
"learning_rate": 2.439075630252101e-05,
"loss": 1.5239,
"step": 113000
},
{
"epoch": 0.31,
"grad_norm": 1.4781346321105957,
"learning_rate": 2.4359243697478994e-05,
"loss": 1.528,
"step": 113500
},
{
"epoch": 0.31,
"grad_norm": 1.4943785667419434,
"learning_rate": 2.4327731092436976e-05,
"loss": 1.5231,
"step": 114000
},
{
"epoch": 0.31,
"grad_norm": 1.466009497642517,
"learning_rate": 2.4296218487394957e-05,
"loss": 1.5233,
"step": 114500
},
{
"epoch": 0.31,
"grad_norm": 2.4329051971435547,
"learning_rate": 2.4264705882352942e-05,
"loss": 1.5266,
"step": 115000
},
{
"epoch": 0.31,
"grad_norm": 1.477039098739624,
"learning_rate": 2.4233193277310924e-05,
"loss": 1.5278,
"step": 115500
},
{
"epoch": 0.31,
"grad_norm": 1.5693820714950562,
"learning_rate": 2.420168067226891e-05,
"loss": 1.5254,
"step": 116000
},
{
"epoch": 0.32,
"grad_norm": 1.4393528699874878,
"learning_rate": 2.417016806722689e-05,
"loss": 1.5236,
"step": 116500
},
{
"epoch": 0.32,
"grad_norm": 1.4845529794692993,
"learning_rate": 2.4138655462184876e-05,
"loss": 1.5206,
"step": 117000
},
{
"epoch": 0.32,
"grad_norm": 1.476683259010315,
"learning_rate": 2.4107142857142858e-05,
"loss": 1.5208,
"step": 117500
},
{
"epoch": 0.32,
"grad_norm": 1.428836703300476,
"learning_rate": 2.4075630252100843e-05,
"loss": 1.5234,
"step": 118000
},
{
"epoch": 0.32,
"grad_norm": 1.449540138244629,
"learning_rate": 2.4044117647058825e-05,
"loss": 1.5234,
"step": 118500
},
{
"epoch": 0.32,
"grad_norm": 1.4410090446472168,
"learning_rate": 2.401260504201681e-05,
"loss": 1.5203,
"step": 119000
},
{
"epoch": 0.32,
"grad_norm": 1.4714431762695312,
"learning_rate": 2.398109243697479e-05,
"loss": 1.5208,
"step": 119500
},
{
"epoch": 0.32,
"grad_norm": 1.469762921333313,
"learning_rate": 2.3949579831932777e-05,
"loss": 1.524,
"step": 120000
},
{
"epoch": 0.33,
"grad_norm": 1.5507971048355103,
"learning_rate": 2.391806722689076e-05,
"loss": 1.5224,
"step": 120500
},
{
"epoch": 0.33,
"grad_norm": 1.5093679428100586,
"learning_rate": 2.3886554621848737e-05,
"loss": 1.5235,
"step": 121000
},
{
"epoch": 0.33,
"grad_norm": 1.492244839668274,
"learning_rate": 2.3855042016806722e-05,
"loss": 1.5196,
"step": 121500
},
{
"epoch": 0.33,
"grad_norm": 1.4522676467895508,
"learning_rate": 2.3823529411764704e-05,
"loss": 1.5209,
"step": 122000
},
{
"epoch": 0.33,
"grad_norm": 1.527627944946289,
"learning_rate": 2.379201680672269e-05,
"loss": 1.5198,
"step": 122500
},
{
"epoch": 0.33,
"grad_norm": 1.488146424293518,
"learning_rate": 2.376050420168067e-05,
"loss": 1.5165,
"step": 123000
},
{
"epoch": 0.33,
"grad_norm": 1.4484755992889404,
"learning_rate": 2.3728991596638656e-05,
"loss": 1.5123,
"step": 123500
},
{
"epoch": 0.34,
"grad_norm": 1.5184931755065918,
"learning_rate": 2.3697478991596638e-05,
"loss": 1.5177,
"step": 124000
},
{
"epoch": 0.34,
"grad_norm": 1.4979966878890991,
"learning_rate": 2.3665966386554623e-05,
"loss": 1.5193,
"step": 124500
},
{
"epoch": 0.34,
"grad_norm": 1.4858919382095337,
"learning_rate": 2.3634453781512604e-05,
"loss": 1.5129,
"step": 125000
},
{
"epoch": 0.34,
"grad_norm": 1.6100457906723022,
"learning_rate": 2.360294117647059e-05,
"loss": 1.5153,
"step": 125500
},
{
"epoch": 0.34,
"grad_norm": 1.4573218822479248,
"learning_rate": 2.357142857142857e-05,
"loss": 1.5173,
"step": 126000
},
{
"epoch": 0.34,
"grad_norm": 1.4780622720718384,
"learning_rate": 2.3539915966386556e-05,
"loss": 1.5142,
"step": 126500
},
{
"epoch": 0.34,
"grad_norm": 1.4847768545150757,
"learning_rate": 2.3508403361344538e-05,
"loss": 1.5123,
"step": 127000
},
{
"epoch": 0.35,
"grad_norm": 1.789902925491333,
"learning_rate": 2.347689075630252e-05,
"loss": 1.5128,
"step": 127500
},
{
"epoch": 0.35,
"grad_norm": 1.4414323568344116,
"learning_rate": 2.3445378151260505e-05,
"loss": 1.5112,
"step": 128000
},
{
"epoch": 0.35,
"grad_norm": 1.542536735534668,
"learning_rate": 2.3413865546218487e-05,
"loss": 1.5132,
"step": 128500
},
{
"epoch": 0.35,
"grad_norm": 1.479336142539978,
"learning_rate": 2.3382352941176472e-05,
"loss": 1.5091,
"step": 129000
},
{
"epoch": 0.35,
"grad_norm": 1.5068061351776123,
"learning_rate": 2.3350840336134454e-05,
"loss": 1.5157,
"step": 129500
},
{
"epoch": 0.35,
"grad_norm": 1.5134038925170898,
"learning_rate": 2.331932773109244e-05,
"loss": 1.5145,
"step": 130000
},
{
"epoch": 0.35,
"grad_norm": 2.804521083831787,
"learning_rate": 2.328781512605042e-05,
"loss": 1.71,
"step": 130500
},
{
"epoch": 0.35,
"grad_norm": 9.153915405273438,
"learning_rate": 2.3256302521008406e-05,
"loss": 1.5874,
"step": 131000
},
{
"epoch": 0.36,
"grad_norm": 3.567737579345703,
"learning_rate": 2.3224789915966387e-05,
"loss": 1.5532,
"step": 131500
},
{
"epoch": 0.36,
"grad_norm": 1.5058925151824951,
"learning_rate": 2.3193277310924373e-05,
"loss": 1.5241,
"step": 132000
},
{
"epoch": 0.36,
"grad_norm": 1.48910653591156,
"learning_rate": 2.3161764705882354e-05,
"loss": 1.5197,
"step": 132500
},
{
"epoch": 0.36,
"grad_norm": 1.477921962738037,
"learning_rate": 2.313025210084034e-05,
"loss": 1.5191,
"step": 133000
},
{
"epoch": 0.36,
"grad_norm": 1.503013014793396,
"learning_rate": 2.309873949579832e-05,
"loss": 1.5112,
"step": 133500
},
{
"epoch": 0.36,
"grad_norm": 1.457146406173706,
"learning_rate": 2.3067226890756303e-05,
"loss": 1.5158,
"step": 134000
},
{
"epoch": 0.36,
"grad_norm": 1.8954756259918213,
"learning_rate": 2.3035714285714288e-05,
"loss": 1.5138,
"step": 134500
},
{
"epoch": 0.37,
"grad_norm": 1.5171183347702026,
"learning_rate": 2.300420168067227e-05,
"loss": 1.5201,
"step": 135000
},
{
"epoch": 0.37,
"grad_norm": 1.454849362373352,
"learning_rate": 2.2972689075630255e-05,
"loss": 1.5113,
"step": 135500
},
{
"epoch": 0.37,
"grad_norm": 2.3639023303985596,
"learning_rate": 2.2941176470588233e-05,
"loss": 1.5089,
"step": 136000
},
{
"epoch": 0.37,
"grad_norm": 1.4599758386611938,
"learning_rate": 2.290966386554622e-05,
"loss": 1.5099,
"step": 136500
},
{
"epoch": 0.37,
"grad_norm": 1.5151523351669312,
"learning_rate": 2.28781512605042e-05,
"loss": 1.5077,
"step": 137000
},
{
"epoch": 0.37,
"grad_norm": 1.518723726272583,
"learning_rate": 2.2846638655462185e-05,
"loss": 1.5097,
"step": 137500
},
{
"epoch": 0.37,
"grad_norm": 1.5430985689163208,
"learning_rate": 2.2815126050420167e-05,
"loss": 1.5089,
"step": 138000
},
{
"epoch": 0.37,
"grad_norm": 1.468233585357666,
"learning_rate": 2.2783613445378152e-05,
"loss": 1.5075,
"step": 138500
},
{
"epoch": 0.38,
"grad_norm": 1.540824294090271,
"learning_rate": 2.2752100840336134e-05,
"loss": 1.5095,
"step": 139000
},
{
"epoch": 0.38,
"grad_norm": 1.4792211055755615,
"learning_rate": 2.272058823529412e-05,
"loss": 1.5123,
"step": 139500
},
{
"epoch": 0.38,
"grad_norm": 1.4582479000091553,
"learning_rate": 2.26890756302521e-05,
"loss": 1.5041,
"step": 140000
},
{
"epoch": 0.38,
"grad_norm": 1.4484353065490723,
"learning_rate": 2.2657563025210083e-05,
"loss": 1.5098,
"step": 140500
},
{
"epoch": 0.38,
"grad_norm": 2.090087413787842,
"learning_rate": 2.2626050420168068e-05,
"loss": 1.504,
"step": 141000
},
{
"epoch": 0.38,
"grad_norm": 1.5165677070617676,
"learning_rate": 2.259453781512605e-05,
"loss": 1.5037,
"step": 141500
},
{
"epoch": 0.38,
"grad_norm": 1.4467180967330933,
"learning_rate": 2.2563025210084035e-05,
"loss": 1.5037,
"step": 142000
},
{
"epoch": 0.39,
"grad_norm": 1.53107750415802,
"learning_rate": 2.2531512605042016e-05,
"loss": 1.5048,
"step": 142500
},
{
"epoch": 0.39,
"grad_norm": 1.685832142829895,
"learning_rate": 2.25e-05,
"loss": 1.5051,
"step": 143000
},
{
"epoch": 0.39,
"grad_norm": 1.722901701927185,
"learning_rate": 2.2468487394957983e-05,
"loss": 1.5038,
"step": 143500
},
{
"epoch": 0.39,
"grad_norm": 1.5191560983657837,
"learning_rate": 2.2436974789915968e-05,
"loss": 1.5021,
"step": 144000
},
{
"epoch": 0.39,
"grad_norm": 1.6680717468261719,
"learning_rate": 2.240546218487395e-05,
"loss": 1.5019,
"step": 144500
},
{
"epoch": 0.39,
"grad_norm": 1.5664371252059937,
"learning_rate": 2.2373949579831935e-05,
"loss": 1.5028,
"step": 145000
},
{
"epoch": 0.39,
"grad_norm": 1.484131932258606,
"learning_rate": 2.2342436974789917e-05,
"loss": 1.5028,
"step": 145500
},
{
"epoch": 0.4,
"grad_norm": 1.4882657527923584,
"learning_rate": 2.2310924369747902e-05,
"loss": 1.4993,
"step": 146000
},
{
"epoch": 0.4,
"grad_norm": 1.4583569765090942,
"learning_rate": 2.2279411764705884e-05,
"loss": 1.5037,
"step": 146500
},
{
"epoch": 0.4,
"grad_norm": 1.559399127960205,
"learning_rate": 2.2247899159663866e-05,
"loss": 1.4994,
"step": 147000
},
{
"epoch": 0.4,
"grad_norm": 1.537287950515747,
"learning_rate": 2.221638655462185e-05,
"loss": 1.5008,
"step": 147500
},
{
"epoch": 0.4,
"grad_norm": 1.4840517044067383,
"learning_rate": 2.2184873949579832e-05,
"loss": 1.5003,
"step": 148000
},
{
"epoch": 0.4,
"grad_norm": 1.6292195320129395,
"learning_rate": 2.2153361344537818e-05,
"loss": 1.4975,
"step": 148500
},
{
"epoch": 0.4,
"grad_norm": 1.4870771169662476,
"learning_rate": 2.21218487394958e-05,
"loss": 1.4962,
"step": 149000
},
{
"epoch": 0.4,
"grad_norm": 1.4792907238006592,
"learning_rate": 2.2090336134453784e-05,
"loss": 1.4978,
"step": 149500
},
{
"epoch": 0.41,
"grad_norm": 1.4179558753967285,
"learning_rate": 2.2058823529411766e-05,
"loss": 1.5012,
"step": 150000
},
{
"epoch": 0.41,
"grad_norm": 1.4594039916992188,
"learning_rate": 2.2027310924369748e-05,
"loss": 1.4987,
"step": 150500
},
{
"epoch": 0.41,
"grad_norm": 1.5356736183166504,
"learning_rate": 2.199579831932773e-05,
"loss": 1.4975,
"step": 151000
},
{
"epoch": 0.41,
"grad_norm": 1.4961708784103394,
"learning_rate": 2.1964285714285715e-05,
"loss": 1.4966,
"step": 151500
},
{
"epoch": 0.41,
"grad_norm": 1.5061964988708496,
"learning_rate": 2.1932773109243697e-05,
"loss": 1.4952,
"step": 152000
},
{
"epoch": 0.41,
"grad_norm": 1.4668192863464355,
"learning_rate": 2.190126050420168e-05,
"loss": 1.4955,
"step": 152500
},
{
"epoch": 0.41,
"grad_norm": 1.520202398300171,
"learning_rate": 2.1869747899159663e-05,
"loss": 1.4987,
"step": 153000
},
{
"epoch": 0.42,
"grad_norm": 1.5048165321350098,
"learning_rate": 2.1838235294117645e-05,
"loss": 1.4943,
"step": 153500
},
{
"epoch": 0.42,
"grad_norm": 1.4194804430007935,
"learning_rate": 2.180672268907563e-05,
"loss": 1.4962,
"step": 154000
},
{
"epoch": 0.42,
"grad_norm": 1.4963053464889526,
"learning_rate": 2.1775210084033612e-05,
"loss": 1.4939,
"step": 154500
},
{
"epoch": 0.42,
"grad_norm": 1.5189534425735474,
"learning_rate": 2.1743697478991597e-05,
"loss": 1.4955,
"step": 155000
},
{
"epoch": 0.42,
"grad_norm": 1.844502329826355,
"learning_rate": 2.171218487394958e-05,
"loss": 1.4932,
"step": 155500
},
{
"epoch": 0.42,
"grad_norm": 1.6127697229385376,
"learning_rate": 2.1680672268907564e-05,
"loss": 1.4972,
"step": 156000
},
{
"epoch": 0.42,
"grad_norm": 2.39309024810791,
"learning_rate": 2.1649159663865546e-05,
"loss": 1.4961,
"step": 156500
},
{
"epoch": 0.43,
"grad_norm": 1.7886457443237305,
"learning_rate": 2.161764705882353e-05,
"loss": 1.4981,
"step": 157000
},
{
"epoch": 0.43,
"grad_norm": 1.5055351257324219,
"learning_rate": 2.1586134453781513e-05,
"loss": 1.4937,
"step": 157500
},
{
"epoch": 0.43,
"grad_norm": 2.2209436893463135,
"learning_rate": 2.1554621848739498e-05,
"loss": 1.4958,
"step": 158000
},
{
"epoch": 0.43,
"grad_norm": 1.4863665103912354,
"learning_rate": 2.152310924369748e-05,
"loss": 1.4937,
"step": 158500
},
{
"epoch": 0.43,
"grad_norm": 1.6290695667266846,
"learning_rate": 2.1491596638655465e-05,
"loss": 1.4934,
"step": 159000
},
{
"epoch": 0.43,
"grad_norm": 1.5069892406463623,
"learning_rate": 2.1460084033613446e-05,
"loss": 1.4966,
"step": 159500
},
{
"epoch": 0.43,
"grad_norm": 1.4480432271957397,
"learning_rate": 2.1428571428571428e-05,
"loss": 1.4928,
"step": 160000
},
{
"epoch": 0.43,
"grad_norm": 1.4599815607070923,
"learning_rate": 2.1397058823529413e-05,
"loss": 1.4907,
"step": 160500
},
{
"epoch": 0.44,
"grad_norm": 1.5667592287063599,
"learning_rate": 2.1365546218487395e-05,
"loss": 1.4946,
"step": 161000
},
{
"epoch": 0.44,
"grad_norm": 1.591620683670044,
"learning_rate": 2.133403361344538e-05,
"loss": 1.4932,
"step": 161500
},
{
"epoch": 0.44,
"grad_norm": 1.4108275175094604,
"learning_rate": 2.1302521008403362e-05,
"loss": 1.4918,
"step": 162000
},
{
"epoch": 0.44,
"grad_norm": 1.3984153270721436,
"learning_rate": 2.1271008403361347e-05,
"loss": 1.4912,
"step": 162500
},
{
"epoch": 0.44,
"grad_norm": 1.5187551975250244,
"learning_rate": 2.123949579831933e-05,
"loss": 1.4896,
"step": 163000
},
{
"epoch": 0.44,
"grad_norm": 1.4671634435653687,
"learning_rate": 2.1207983193277314e-05,
"loss": 1.4909,
"step": 163500
},
{
"epoch": 0.44,
"grad_norm": 1.5398577451705933,
"learning_rate": 2.1176470588235296e-05,
"loss": 1.4898,
"step": 164000
},
{
"epoch": 0.45,
"grad_norm": 1.4390913248062134,
"learning_rate": 2.114495798319328e-05,
"loss": 1.4917,
"step": 164500
},
{
"epoch": 0.45,
"grad_norm": 1.466871976852417,
"learning_rate": 2.1113445378151263e-05,
"loss": 1.486,
"step": 165000
},
{
"epoch": 0.45,
"grad_norm": 1.4268947839736938,
"learning_rate": 2.1081932773109244e-05,
"loss": 1.486,
"step": 165500
},
{
"epoch": 0.45,
"grad_norm": 1.473212718963623,
"learning_rate": 2.1050420168067226e-05,
"loss": 1.4906,
"step": 166000
},
{
"epoch": 0.45,
"grad_norm": 1.4817694425582886,
"learning_rate": 2.1018907563025208e-05,
"loss": 1.4876,
"step": 166500
},
{
"epoch": 0.45,
"grad_norm": 1.4899072647094727,
"learning_rate": 2.0987394957983193e-05,
"loss": 1.4853,
"step": 167000
},
{
"epoch": 0.45,
"grad_norm": 1.472068428993225,
"learning_rate": 2.0955882352941175e-05,
"loss": 1.4859,
"step": 167500
},
{
"epoch": 0.45,
"grad_norm": 1.4609180688858032,
"learning_rate": 2.092436974789916e-05,
"loss": 1.4867,
"step": 168000
},
{
"epoch": 0.46,
"grad_norm": 1.3884390592575073,
"learning_rate": 2.089285714285714e-05,
"loss": 1.4845,
"step": 168500
},
{
"epoch": 0.46,
"grad_norm": 1.4505021572113037,
"learning_rate": 2.0861344537815127e-05,
"loss": 1.4804,
"step": 169000
},
{
"epoch": 0.46,
"grad_norm": 1.4579660892486572,
"learning_rate": 2.082983193277311e-05,
"loss": 1.4828,
"step": 169500
},
{
"epoch": 0.46,
"grad_norm": 1.4193936586380005,
"learning_rate": 2.0798319327731094e-05,
"loss": 1.4846,
"step": 170000
},
{
"epoch": 0.46,
"grad_norm": 1.8833608627319336,
"learning_rate": 2.0766806722689075e-05,
"loss": 1.4832,
"step": 170500
},
{
"epoch": 0.46,
"grad_norm": 1.394463062286377,
"learning_rate": 2.073529411764706e-05,
"loss": 1.4858,
"step": 171000
},
{
"epoch": 0.46,
"grad_norm": 1.4402869939804077,
"learning_rate": 2.0703781512605042e-05,
"loss": 1.4853,
"step": 171500
},
{
"epoch": 0.47,
"grad_norm": 1.5677118301391602,
"learning_rate": 2.0672268907563024e-05,
"loss": 1.4828,
"step": 172000
},
{
"epoch": 0.47,
"grad_norm": 1.412744402885437,
"learning_rate": 2.064075630252101e-05,
"loss": 1.4861,
"step": 172500
},
{
"epoch": 0.47,
"grad_norm": 1.578121542930603,
"learning_rate": 2.060924369747899e-05,
"loss": 1.4825,
"step": 173000
},
{
"epoch": 0.47,
"grad_norm": 1.4429398775100708,
"learning_rate": 2.0577731092436976e-05,
"loss": 1.4806,
"step": 173500
},
{
"epoch": 0.47,
"grad_norm": 1.5229464769363403,
"learning_rate": 2.0546218487394958e-05,
"loss": 1.4822,
"step": 174000
},
{
"epoch": 0.47,
"grad_norm": 1.533868670463562,
"learning_rate": 2.0514705882352943e-05,
"loss": 1.4788,
"step": 174500
},
{
"epoch": 0.47,
"grad_norm": 1.4442238807678223,
"learning_rate": 2.0483193277310925e-05,
"loss": 1.4845,
"step": 175000
},
{
"epoch": 0.48,
"grad_norm": 1.8768386840820312,
"learning_rate": 2.045168067226891e-05,
"loss": 1.481,
"step": 175500
},
{
"epoch": 0.48,
"grad_norm": 1.5719354152679443,
"learning_rate": 2.042016806722689e-05,
"loss": 1.4815,
"step": 176000
},
{
"epoch": 0.48,
"grad_norm": 1.6776522397994995,
"learning_rate": 2.0388655462184877e-05,
"loss": 1.4834,
"step": 176500
},
{
"epoch": 0.48,
"grad_norm": 1.462403416633606,
"learning_rate": 2.0357142857142858e-05,
"loss": 1.4829,
"step": 177000
},
{
"epoch": 0.48,
"grad_norm": 1.441434621810913,
"learning_rate": 2.0325630252100843e-05,
"loss": 1.4817,
"step": 177500
},
{
"epoch": 0.48,
"grad_norm": 1.7203949689865112,
"learning_rate": 2.0294117647058825e-05,
"loss": 1.4819,
"step": 178000
},
{
"epoch": 0.48,
"grad_norm": 1.6117925643920898,
"learning_rate": 2.0262605042016807e-05,
"loss": 1.48,
"step": 178500
},
{
"epoch": 0.48,
"grad_norm": 1.4840322732925415,
"learning_rate": 2.0231092436974792e-05,
"loss": 1.4804,
"step": 179000
},
{
"epoch": 0.49,
"grad_norm": 1.4823276996612549,
"learning_rate": 2.0199579831932774e-05,
"loss": 1.4783,
"step": 179500
},
{
"epoch": 0.49,
"grad_norm": 1.467035174369812,
"learning_rate": 2.016806722689076e-05,
"loss": 1.4826,
"step": 180000
},
{
"epoch": 0.49,
"grad_norm": 1.4519331455230713,
"learning_rate": 2.0136554621848737e-05,
"loss": 1.4793,
"step": 180500
},
{
"epoch": 0.49,
"grad_norm": 1.4830392599105835,
"learning_rate": 2.0105042016806722e-05,
"loss": 1.478,
"step": 181000
},
{
"epoch": 0.49,
"grad_norm": 1.4889652729034424,
"learning_rate": 2.0073529411764704e-05,
"loss": 1.4825,
"step": 181500
},
{
"epoch": 0.49,
"grad_norm": 1.4417020082473755,
"learning_rate": 2.004201680672269e-05,
"loss": 1.4781,
"step": 182000
},
{
"epoch": 0.49,
"grad_norm": 1.5612033605575562,
"learning_rate": 2.001050420168067e-05,
"loss": 1.4749,
"step": 182500
},
{
"epoch": 0.5,
"grad_norm": 1.923521637916565,
"learning_rate": 1.9978991596638656e-05,
"loss": 1.4742,
"step": 183000
},
{
"epoch": 0.5,
"grad_norm": 1.4759869575500488,
"learning_rate": 1.9947478991596638e-05,
"loss": 1.4772,
"step": 183500
},
{
"epoch": 0.5,
"grad_norm": 1.4529997110366821,
"learning_rate": 1.9915966386554623e-05,
"loss": 1.4758,
"step": 184000
},
{
"epoch": 0.5,
"grad_norm": 1.4907563924789429,
"learning_rate": 1.9884453781512605e-05,
"loss": 1.477,
"step": 184500
},
{
"epoch": 0.5,
"grad_norm": 1.4529681205749512,
"learning_rate": 1.9852941176470586e-05,
"loss": 1.4754,
"step": 185000
},
{
"epoch": 0.5,
"grad_norm": 1.4950664043426514,
"learning_rate": 1.982142857142857e-05,
"loss": 1.477,
"step": 185500
},
{
"epoch": 0.5,
"grad_norm": 1.5445144176483154,
"learning_rate": 1.9789915966386553e-05,
"loss": 1.4763,
"step": 186000
},
{
"epoch": 0.5,
"grad_norm": 2.2947561740875244,
"learning_rate": 1.975840336134454e-05,
"loss": 1.4771,
"step": 186500
},
{
"epoch": 0.51,
"grad_norm": 1.4762338399887085,
"learning_rate": 1.972689075630252e-05,
"loss": 1.4748,
"step": 187000
},
{
"epoch": 0.51,
"grad_norm": 1.5006557703018188,
"learning_rate": 1.9695378151260505e-05,
"loss": 1.474,
"step": 187500
},
{
"epoch": 0.51,
"grad_norm": 1.5126187801361084,
"learning_rate": 1.9663865546218487e-05,
"loss": 1.4769,
"step": 188000
},
{
"epoch": 0.51,
"grad_norm": 3.9213035106658936,
"learning_rate": 1.9632352941176472e-05,
"loss": 1.4724,
"step": 188500
},
{
"epoch": 0.51,
"grad_norm": 1.3832660913467407,
"learning_rate": 1.9600840336134454e-05,
"loss": 1.4743,
"step": 189000
},
{
"epoch": 0.51,
"grad_norm": 1.438021183013916,
"learning_rate": 1.956932773109244e-05,
"loss": 1.4732,
"step": 189500
},
{
"epoch": 0.51,
"grad_norm": 1.552357792854309,
"learning_rate": 1.953781512605042e-05,
"loss": 1.4693,
"step": 190000
},
{
"epoch": 0.52,
"grad_norm": 1.4992841482162476,
"learning_rate": 1.9506302521008406e-05,
"loss": 1.4741,
"step": 190500
},
{
"epoch": 0.52,
"grad_norm": 1.4546705484390259,
"learning_rate": 1.9474789915966388e-05,
"loss": 1.4709,
"step": 191000
},
{
"epoch": 0.52,
"grad_norm": 1.5536097288131714,
"learning_rate": 1.944327731092437e-05,
"loss": 1.4715,
"step": 191500
},
{
"epoch": 0.52,
"grad_norm": 1.4430129528045654,
"learning_rate": 1.9411764705882355e-05,
"loss": 1.4694,
"step": 192000
},
{
"epoch": 0.52,
"grad_norm": 1.4931637048721313,
"learning_rate": 1.9380252100840336e-05,
"loss": 1.4704,
"step": 192500
},
{
"epoch": 0.52,
"grad_norm": 1.4820243120193481,
"learning_rate": 1.934873949579832e-05,
"loss": 1.4707,
"step": 193000
},
{
"epoch": 0.52,
"grad_norm": 1.5232768058776855,
"learning_rate": 1.9317226890756303e-05,
"loss": 1.4692,
"step": 193500
},
{
"epoch": 0.53,
"grad_norm": 1.517333745956421,
"learning_rate": 1.928571428571429e-05,
"loss": 1.4731,
"step": 194000
},
{
"epoch": 0.53,
"grad_norm": 1.4523952007293701,
"learning_rate": 1.925420168067227e-05,
"loss": 1.4698,
"step": 194500
},
{
"epoch": 0.53,
"grad_norm": 1.4807761907577515,
"learning_rate": 1.9222689075630255e-05,
"loss": 1.4719,
"step": 195000
},
{
"epoch": 0.53,
"grad_norm": 1.4389820098876953,
"learning_rate": 1.9191176470588234e-05,
"loss": 1.4709,
"step": 195500
},
{
"epoch": 0.53,
"grad_norm": 3.7379424571990967,
"learning_rate": 1.915966386554622e-05,
"loss": 1.4663,
"step": 196000
},
{
"epoch": 0.53,
"grad_norm": 1.4896109104156494,
"learning_rate": 1.91281512605042e-05,
"loss": 1.4709,
"step": 196500
},
{
"epoch": 0.53,
"grad_norm": 5.979303359985352,
"learning_rate": 1.9096638655462186e-05,
"loss": 1.4743,
"step": 197000
},
{
"epoch": 0.53,
"grad_norm": 1.4648813009262085,
"learning_rate": 1.9065126050420167e-05,
"loss": 1.4687,
"step": 197500
},
{
"epoch": 0.54,
"grad_norm": 1.739353895187378,
"learning_rate": 1.903361344537815e-05,
"loss": 1.4702,
"step": 198000
},
{
"epoch": 0.54,
"grad_norm": 1.4263814687728882,
"learning_rate": 1.9002100840336134e-05,
"loss": 1.4695,
"step": 198500
},
{
"epoch": 0.54,
"grad_norm": 1.5090336799621582,
"learning_rate": 1.8970588235294116e-05,
"loss": 1.4667,
"step": 199000
},
{
"epoch": 0.54,
"grad_norm": 1.4606796503067017,
"learning_rate": 1.89390756302521e-05,
"loss": 1.4665,
"step": 199500
},
{
"epoch": 0.54,
"grad_norm": 1.4979524612426758,
"learning_rate": 1.8907563025210083e-05,
"loss": 1.4645,
"step": 200000
},
{
"epoch": 0.54,
"grad_norm": 1.5032795667648315,
"learning_rate": 1.8876050420168068e-05,
"loss": 1.4697,
"step": 200500
},
{
"epoch": 0.54,
"grad_norm": 1.4917629957199097,
"learning_rate": 1.884453781512605e-05,
"loss": 1.4654,
"step": 201000
},
{
"epoch": 0.55,
"grad_norm": 1.5047801733016968,
"learning_rate": 1.8813025210084035e-05,
"loss": 1.4665,
"step": 201500
},
{
"epoch": 0.55,
"grad_norm": 1.5550223588943481,
"learning_rate": 1.8781512605042017e-05,
"loss": 1.4669,
"step": 202000
},
{
"epoch": 0.55,
"grad_norm": 1.4432892799377441,
"learning_rate": 1.8750000000000002e-05,
"loss": 1.4652,
"step": 202500
},
{
"epoch": 0.55,
"grad_norm": 1.4227643013000488,
"learning_rate": 1.8718487394957983e-05,
"loss": 1.465,
"step": 203000
},
{
"epoch": 0.55,
"grad_norm": 1.5878413915634155,
"learning_rate": 1.868697478991597e-05,
"loss": 1.4675,
"step": 203500
},
{
"epoch": 0.55,
"grad_norm": 1.5786782503128052,
"learning_rate": 1.865546218487395e-05,
"loss": 1.4596,
"step": 204000
},
{
"epoch": 0.55,
"grad_norm": 1.4224051237106323,
"learning_rate": 1.8623949579831932e-05,
"loss": 1.462,
"step": 204500
},
{
"epoch": 0.55,
"grad_norm": 1.7678115367889404,
"learning_rate": 1.8592436974789917e-05,
"loss": 1.4614,
"step": 205000
},
{
"epoch": 0.56,
"grad_norm": 1.4170020818710327,
"learning_rate": 1.85609243697479e-05,
"loss": 1.4649,
"step": 205500
},
{
"epoch": 0.56,
"grad_norm": 1.5474693775177002,
"learning_rate": 1.8529411764705884e-05,
"loss": 1.464,
"step": 206000
},
{
"epoch": 0.56,
"grad_norm": 1.4655749797821045,
"learning_rate": 1.8497899159663866e-05,
"loss": 1.4654,
"step": 206500
},
{
"epoch": 0.56,
"grad_norm": 1.6294610500335693,
"learning_rate": 1.846638655462185e-05,
"loss": 1.4616,
"step": 207000
},
{
"epoch": 0.56,
"grad_norm": 1.4760308265686035,
"learning_rate": 1.8434873949579833e-05,
"loss": 1.4643,
"step": 207500
},
{
"epoch": 0.56,
"grad_norm": 1.4796357154846191,
"learning_rate": 1.8403361344537818e-05,
"loss": 1.4659,
"step": 208000
},
{
"epoch": 0.56,
"grad_norm": 1.9592546224594116,
"learning_rate": 1.83718487394958e-05,
"loss": 1.4611,
"step": 208500
},
{
"epoch": 0.57,
"grad_norm": 1.493324637413025,
"learning_rate": 1.8340336134453785e-05,
"loss": 1.4626,
"step": 209000
},
{
"epoch": 0.57,
"grad_norm": 1.453369379043579,
"learning_rate": 1.8308823529411766e-05,
"loss": 1.4603,
"step": 209500
},
{
"epoch": 0.57,
"grad_norm": 1.5146046876907349,
"learning_rate": 1.8277310924369748e-05,
"loss": 1.4594,
"step": 210000
},
{
"epoch": 0.57,
"grad_norm": 1.424707293510437,
"learning_rate": 1.824579831932773e-05,
"loss": 1.4631,
"step": 210500
},
{
"epoch": 0.57,
"grad_norm": 1.464998722076416,
"learning_rate": 1.8214285714285712e-05,
"loss": 1.4617,
"step": 211000
},
{
"epoch": 0.57,
"grad_norm": 1.4314439296722412,
"learning_rate": 1.8182773109243697e-05,
"loss": 1.4611,
"step": 211500
},
{
"epoch": 0.57,
"grad_norm": 1.4533342123031616,
"learning_rate": 1.815126050420168e-05,
"loss": 1.4591,
"step": 212000
},
{
"epoch": 0.58,
"grad_norm": 1.5328502655029297,
"learning_rate": 1.8119747899159664e-05,
"loss": 1.4606,
"step": 212500
},
{
"epoch": 0.58,
"grad_norm": 1.4684851169586182,
"learning_rate": 1.8088235294117645e-05,
"loss": 1.463,
"step": 213000
},
{
"epoch": 0.58,
"grad_norm": 1.512421727180481,
"learning_rate": 1.805672268907563e-05,
"loss": 1.4585,
"step": 213500
},
{
"epoch": 0.58,
"grad_norm": 1.5069866180419922,
"learning_rate": 1.8025210084033612e-05,
"loss": 1.4565,
"step": 214000
},
{
"epoch": 0.58,
"grad_norm": 1.4224152565002441,
"learning_rate": 1.7993697478991597e-05,
"loss": 1.4575,
"step": 214500
},
{
"epoch": 0.58,
"grad_norm": 1.6329984664916992,
"learning_rate": 1.796218487394958e-05,
"loss": 1.4541,
"step": 215000
},
{
"epoch": 0.58,
"grad_norm": 1.587007761001587,
"learning_rate": 1.7930672268907564e-05,
"loss": 1.4572,
"step": 215500
},
{
"epoch": 0.58,
"grad_norm": 1.4805065393447876,
"learning_rate": 1.7899159663865546e-05,
"loss": 1.4618,
"step": 216000
},
{
"epoch": 0.59,
"grad_norm": 1.517993450164795,
"learning_rate": 1.786764705882353e-05,
"loss": 1.4538,
"step": 216500
},
{
"epoch": 0.59,
"grad_norm": 1.4399406909942627,
"learning_rate": 1.7836134453781513e-05,
"loss": 1.4576,
"step": 217000
},
{
"epoch": 0.59,
"grad_norm": 1.4458235502243042,
"learning_rate": 1.7804621848739495e-05,
"loss": 1.4558,
"step": 217500
},
{
"epoch": 0.59,
"grad_norm": 1.5840320587158203,
"learning_rate": 1.777310924369748e-05,
"loss": 1.4562,
"step": 218000
},
{
"epoch": 0.59,
"grad_norm": 1.4832299947738647,
"learning_rate": 1.774159663865546e-05,
"loss": 1.456,
"step": 218500
},
{
"epoch": 0.59,
"grad_norm": 1.4003788232803345,
"learning_rate": 1.7710084033613447e-05,
"loss": 1.4555,
"step": 219000
},
{
"epoch": 0.59,
"grad_norm": 1.5091036558151245,
"learning_rate": 1.767857142857143e-05,
"loss": 1.4596,
"step": 219500
},
{
"epoch": 0.6,
"grad_norm": 1.4758837223052979,
"learning_rate": 1.7647058823529414e-05,
"loss": 1.4566,
"step": 220000
},
{
"epoch": 0.6,
"grad_norm": 1.4372687339782715,
"learning_rate": 1.7615546218487395e-05,
"loss": 1.4524,
"step": 220500
},
{
"epoch": 0.6,
"grad_norm": 1.4391896724700928,
"learning_rate": 1.758403361344538e-05,
"loss": 1.4565,
"step": 221000
},
{
"epoch": 0.6,
"grad_norm": 1.4493831396102905,
"learning_rate": 1.7552521008403362e-05,
"loss": 1.4543,
"step": 221500
},
{
"epoch": 0.6,
"grad_norm": 2.0319833755493164,
"learning_rate": 1.7521008403361347e-05,
"loss": 1.4536,
"step": 222000
},
{
"epoch": 0.6,
"grad_norm": 1.4861342906951904,
"learning_rate": 1.748949579831933e-05,
"loss": 1.454,
"step": 222500
},
{
"epoch": 0.6,
"grad_norm": 1.4432348012924194,
"learning_rate": 1.7457983193277314e-05,
"loss": 1.4546,
"step": 223000
},
{
"epoch": 0.61,
"grad_norm": 1.4457755088806152,
"learning_rate": 1.7426470588235296e-05,
"loss": 1.4542,
"step": 223500
},
{
"epoch": 0.61,
"grad_norm": 1.4785292148590088,
"learning_rate": 1.7394957983193278e-05,
"loss": 1.4539,
"step": 224000
},
{
"epoch": 0.61,
"grad_norm": 1.4646965265274048,
"learning_rate": 1.7363445378151263e-05,
"loss": 1.4557,
"step": 224500
},
{
"epoch": 0.61,
"grad_norm": 1.3340420722961426,
"learning_rate": 1.733193277310924e-05,
"loss": 1.4512,
"step": 225000
},
{
"epoch": 0.61,
"grad_norm": 1.4864197969436646,
"learning_rate": 1.7300420168067226e-05,
"loss": 1.4514,
"step": 225500
},
{
"epoch": 0.61,
"grad_norm": 1.441954493522644,
"learning_rate": 1.7268907563025208e-05,
"loss": 1.4565,
"step": 226000
},
{
"epoch": 0.61,
"grad_norm": 1.4796494245529175,
"learning_rate": 1.7237394957983193e-05,
"loss": 1.4549,
"step": 226500
},
{
"epoch": 0.61,
"grad_norm": 1.5095195770263672,
"learning_rate": 1.7205882352941175e-05,
"loss": 1.4538,
"step": 227000
},
{
"epoch": 0.62,
"grad_norm": 1.6988993883132935,
"learning_rate": 1.717436974789916e-05,
"loss": 1.4552,
"step": 227500
},
{
"epoch": 0.62,
"grad_norm": 1.4422426223754883,
"learning_rate": 1.7142857142857142e-05,
"loss": 1.4514,
"step": 228000
},
{
"epoch": 0.62,
"grad_norm": 1.4488030672073364,
"learning_rate": 1.7111344537815127e-05,
"loss": 1.4545,
"step": 228500
},
{
"epoch": 0.62,
"grad_norm": 1.4784460067749023,
"learning_rate": 1.707983193277311e-05,
"loss": 1.4527,
"step": 229000
},
{
"epoch": 0.62,
"grad_norm": 1.4642586708068848,
"learning_rate": 1.7048319327731094e-05,
"loss": 1.4483,
"step": 229500
},
{
"epoch": 0.62,
"grad_norm": 1.509343147277832,
"learning_rate": 1.7016806722689076e-05,
"loss": 1.4543,
"step": 230000
},
{
"epoch": 0.62,
"grad_norm": 1.3862849473953247,
"learning_rate": 1.6985294117647057e-05,
"loss": 1.4531,
"step": 230500
},
{
"epoch": 0.63,
"grad_norm": 1.4223895072937012,
"learning_rate": 1.6953781512605042e-05,
"loss": 1.451,
"step": 231000
},
{
"epoch": 0.63,
"grad_norm": 1.4616318941116333,
"learning_rate": 1.6922268907563024e-05,
"loss": 1.4511,
"step": 231500
},
{
"epoch": 0.63,
"grad_norm": 1.4746378660202026,
"learning_rate": 1.689075630252101e-05,
"loss": 1.4497,
"step": 232000
},
{
"epoch": 0.63,
"grad_norm": 1.461519479751587,
"learning_rate": 1.685924369747899e-05,
"loss": 1.4516,
"step": 232500
},
{
"epoch": 0.63,
"grad_norm": 1.3925315141677856,
"learning_rate": 1.6827731092436976e-05,
"loss": 1.4507,
"step": 233000
},
{
"epoch": 0.63,
"grad_norm": 1.4032963514328003,
"learning_rate": 1.6796218487394958e-05,
"loss": 1.4497,
"step": 233500
},
{
"epoch": 0.63,
"grad_norm": 1.4162888526916504,
"learning_rate": 1.6764705882352943e-05,
"loss": 1.4482,
"step": 234000
},
{
"epoch": 0.63,
"grad_norm": 1.3672780990600586,
"learning_rate": 1.6733193277310925e-05,
"loss": 1.4518,
"step": 234500
},
{
"epoch": 0.64,
"grad_norm": 1.522310733795166,
"learning_rate": 1.670168067226891e-05,
"loss": 1.4516,
"step": 235000
},
{
"epoch": 0.64,
"grad_norm": 1.3994154930114746,
"learning_rate": 1.6670168067226892e-05,
"loss": 1.4468,
"step": 235500
},
{
"epoch": 0.64,
"grad_norm": 1.4941591024398804,
"learning_rate": 1.6638655462184877e-05,
"loss": 1.4491,
"step": 236000
},
{
"epoch": 0.64,
"grad_norm": 1.4521230459213257,
"learning_rate": 1.660714285714286e-05,
"loss": 1.4475,
"step": 236500
},
{
"epoch": 0.64,
"grad_norm": 1.528152585029602,
"learning_rate": 1.657563025210084e-05,
"loss": 1.4473,
"step": 237000
},
{
"epoch": 0.64,
"grad_norm": 1.4769060611724854,
"learning_rate": 1.6544117647058825e-05,
"loss": 1.4463,
"step": 237500
},
{
"epoch": 0.64,
"grad_norm": 1.4506659507751465,
"learning_rate": 1.6512605042016807e-05,
"loss": 1.4458,
"step": 238000
},
{
"epoch": 0.65,
"grad_norm": 1.491810917854309,
"learning_rate": 1.6481092436974792e-05,
"loss": 1.4498,
"step": 238500
},
{
"epoch": 0.65,
"grad_norm": 1.4600553512573242,
"learning_rate": 1.6449579831932774e-05,
"loss": 1.4444,
"step": 239000
},
{
"epoch": 0.65,
"grad_norm": 1.4451686143875122,
"learning_rate": 1.641806722689076e-05,
"loss": 1.4441,
"step": 239500
},
{
"epoch": 0.65,
"grad_norm": 1.4227120876312256,
"learning_rate": 1.6386554621848738e-05,
"loss": 1.4448,
"step": 240000
},
{
"epoch": 0.65,
"grad_norm": 1.5668320655822754,
"learning_rate": 1.6355042016806723e-05,
"loss": 1.4456,
"step": 240500
},
{
"epoch": 0.65,
"grad_norm": 1.3923659324645996,
"learning_rate": 1.6323529411764704e-05,
"loss": 1.4477,
"step": 241000
},
{
"epoch": 0.65,
"grad_norm": 1.4962598085403442,
"learning_rate": 1.629201680672269e-05,
"loss": 1.4454,
"step": 241500
},
{
"epoch": 0.66,
"grad_norm": 1.4878734350204468,
"learning_rate": 1.626050420168067e-05,
"loss": 1.4461,
"step": 242000
},
{
"epoch": 0.66,
"grad_norm": 1.4973180294036865,
"learning_rate": 1.6228991596638656e-05,
"loss": 1.4464,
"step": 242500
},
{
"epoch": 0.66,
"grad_norm": 1.4737753868103027,
"learning_rate": 1.6197478991596638e-05,
"loss": 1.444,
"step": 243000
},
{
"epoch": 0.66,
"grad_norm": 1.4609256982803345,
"learning_rate": 1.616596638655462e-05,
"loss": 1.4479,
"step": 243500
},
{
"epoch": 0.66,
"grad_norm": 1.4048258066177368,
"learning_rate": 1.6134453781512605e-05,
"loss": 1.4428,
"step": 244000
},
{
"epoch": 0.66,
"grad_norm": 1.399703025817871,
"learning_rate": 1.6102941176470587e-05,
"loss": 1.4433,
"step": 244500
},
{
"epoch": 0.66,
"grad_norm": 1.5445500612258911,
"learning_rate": 1.6071428571428572e-05,
"loss": 1.4455,
"step": 245000
},
{
"epoch": 0.66,
"grad_norm": 1.4742292165756226,
"learning_rate": 1.6039915966386554e-05,
"loss": 1.4428,
"step": 245500
},
{
"epoch": 0.67,
"grad_norm": 1.4535382986068726,
"learning_rate": 1.600840336134454e-05,
"loss": 1.4453,
"step": 246000
},
{
"epoch": 0.67,
"grad_norm": 1.467373013496399,
"learning_rate": 1.597689075630252e-05,
"loss": 1.4459,
"step": 246500
},
{
"epoch": 0.67,
"grad_norm": 1.4863603115081787,
"learning_rate": 1.5945378151260506e-05,
"loss": 1.4444,
"step": 247000
},
{
"epoch": 0.67,
"grad_norm": 1.5373426675796509,
"learning_rate": 1.5913865546218487e-05,
"loss": 1.4418,
"step": 247500
},
{
"epoch": 0.67,
"grad_norm": 1.4747397899627686,
"learning_rate": 1.5882352941176473e-05,
"loss": 1.4423,
"step": 248000
},
{
"epoch": 0.67,
"grad_norm": 1.5024008750915527,
"learning_rate": 1.5850840336134454e-05,
"loss": 1.4466,
"step": 248500
},
{
"epoch": 0.67,
"grad_norm": 1.481330394744873,
"learning_rate": 1.581932773109244e-05,
"loss": 1.4395,
"step": 249000
},
{
"epoch": 0.68,
"grad_norm": 1.419636607170105,
"learning_rate": 1.578781512605042e-05,
"loss": 1.4416,
"step": 249500
},
{
"epoch": 0.68,
"grad_norm": 1.4620583057403564,
"learning_rate": 1.5756302521008403e-05,
"loss": 1.447,
"step": 250000
},
{
"epoch": 0.68,
"grad_norm": 1.4666600227355957,
"learning_rate": 1.5724789915966388e-05,
"loss": 1.4378,
"step": 250500
},
{
"epoch": 0.68,
"grad_norm": 1.4554154872894287,
"learning_rate": 1.569327731092437e-05,
"loss": 1.4439,
"step": 251000
},
{
"epoch": 0.68,
"grad_norm": 1.4908123016357422,
"learning_rate": 1.5661764705882355e-05,
"loss": 1.4427,
"step": 251500
},
{
"epoch": 0.68,
"grad_norm": 1.471479892730713,
"learning_rate": 1.5630252100840337e-05,
"loss": 1.4433,
"step": 252000
},
{
"epoch": 0.68,
"grad_norm": 1.4541757106781006,
"learning_rate": 1.5598739495798322e-05,
"loss": 1.4438,
"step": 252500
},
{
"epoch": 0.68,
"grad_norm": 1.7064818143844604,
"learning_rate": 1.5567226890756304e-05,
"loss": 1.4409,
"step": 253000
},
{
"epoch": 0.69,
"grad_norm": 1.5056750774383545,
"learning_rate": 1.553571428571429e-05,
"loss": 1.4405,
"step": 253500
},
{
"epoch": 0.69,
"grad_norm": 1.4601994752883911,
"learning_rate": 1.550420168067227e-05,
"loss": 1.4407,
"step": 254000
},
{
"epoch": 0.69,
"grad_norm": 1.4508180618286133,
"learning_rate": 1.5472689075630256e-05,
"loss": 1.4471,
"step": 254500
},
{
"epoch": 0.69,
"grad_norm": 1.476529598236084,
"learning_rate": 1.5441176470588234e-05,
"loss": 1.4416,
"step": 255000
},
{
"epoch": 0.69,
"grad_norm": 1.5242764949798584,
"learning_rate": 1.540966386554622e-05,
"loss": 1.4406,
"step": 255500
},
{
"epoch": 0.69,
"grad_norm": 1.405678153038025,
"learning_rate": 1.53781512605042e-05,
"loss": 1.4399,
"step": 256000
},
{
"epoch": 0.69,
"grad_norm": 1.4689253568649292,
"learning_rate": 1.5346638655462183e-05,
"loss": 1.4409,
"step": 256500
},
{
"epoch": 0.7,
"grad_norm": 1.5302820205688477,
"learning_rate": 1.5315126050420168e-05,
"loss": 1.4435,
"step": 257000
},
{
"epoch": 0.7,
"grad_norm": 1.4745590686798096,
"learning_rate": 1.528361344537815e-05,
"loss": 1.4411,
"step": 257500
},
{
"epoch": 0.7,
"grad_norm": 1.5703048706054688,
"learning_rate": 1.5252100840336135e-05,
"loss": 1.4372,
"step": 258000
},
{
"epoch": 0.7,
"grad_norm": 1.4982346296310425,
"learning_rate": 1.5220588235294118e-05,
"loss": 1.4342,
"step": 258500
},
{
"epoch": 0.7,
"grad_norm": 1.4562139511108398,
"learning_rate": 1.51890756302521e-05,
"loss": 1.4403,
"step": 259000
},
{
"epoch": 0.7,
"grad_norm": 1.5004678964614868,
"learning_rate": 1.5157563025210083e-05,
"loss": 1.4405,
"step": 259500
},
{
"epoch": 0.7,
"grad_norm": 1.4451349973678589,
"learning_rate": 1.5126050420168067e-05,
"loss": 1.436,
"step": 260000
},
{
"epoch": 0.71,
"grad_norm": 1.420857548713684,
"learning_rate": 1.509453781512605e-05,
"loss": 1.4402,
"step": 260500
},
{
"epoch": 0.71,
"grad_norm": 1.4772206544876099,
"learning_rate": 1.5063025210084034e-05,
"loss": 1.4373,
"step": 261000
},
{
"epoch": 0.71,
"grad_norm": 1.4933620691299438,
"learning_rate": 1.5031512605042017e-05,
"loss": 1.4392,
"step": 261500
},
{
"epoch": 0.71,
"grad_norm": 1.5023765563964844,
"learning_rate": 1.5e-05,
"loss": 1.438,
"step": 262000
},
{
"epoch": 0.71,
"grad_norm": 1.4560567140579224,
"learning_rate": 1.4968487394957984e-05,
"loss": 1.439,
"step": 262500
},
{
"epoch": 0.71,
"grad_norm": 1.5497692823410034,
"learning_rate": 1.4936974789915967e-05,
"loss": 1.4347,
"step": 263000
},
{
"epoch": 0.71,
"grad_norm": 1.5201669931411743,
"learning_rate": 1.490546218487395e-05,
"loss": 1.4365,
"step": 263500
},
{
"epoch": 0.71,
"grad_norm": 1.4907211065292358,
"learning_rate": 1.4873949579831934e-05,
"loss": 1.4334,
"step": 264000
},
{
"epoch": 0.72,
"grad_norm": 1.4821357727050781,
"learning_rate": 1.4842436974789918e-05,
"loss": 1.4361,
"step": 264500
},
{
"epoch": 0.72,
"grad_norm": 1.4968074560165405,
"learning_rate": 1.4810924369747901e-05,
"loss": 1.4352,
"step": 265000
},
{
"epoch": 0.72,
"grad_norm": 1.475728154182434,
"learning_rate": 1.4779411764705883e-05,
"loss": 1.4365,
"step": 265500
},
{
"epoch": 0.72,
"grad_norm": 1.560935378074646,
"learning_rate": 1.4747899159663864e-05,
"loss": 1.4381,
"step": 266000
},
{
"epoch": 0.72,
"grad_norm": 1.4216580390930176,
"learning_rate": 1.4716386554621848e-05,
"loss": 1.4322,
"step": 266500
},
{
"epoch": 0.72,
"grad_norm": 1.499648094177246,
"learning_rate": 1.4684873949579831e-05,
"loss": 1.4378,
"step": 267000
},
{
"epoch": 0.72,
"grad_norm": 1.4971799850463867,
"learning_rate": 1.4653361344537815e-05,
"loss": 1.4334,
"step": 267500
},
{
"epoch": 0.73,
"grad_norm": 1.5106513500213623,
"learning_rate": 1.4621848739495798e-05,
"loss": 1.4347,
"step": 268000
},
{
"epoch": 0.73,
"grad_norm": 1.488006353378296,
"learning_rate": 1.4590336134453782e-05,
"loss": 1.4361,
"step": 268500
},
{
"epoch": 0.73,
"grad_norm": 1.484994888305664,
"learning_rate": 1.4558823529411765e-05,
"loss": 1.4389,
"step": 269000
},
{
"epoch": 0.73,
"grad_norm": 1.4334303140640259,
"learning_rate": 1.4527310924369749e-05,
"loss": 1.4366,
"step": 269500
},
{
"epoch": 0.73,
"grad_norm": 1.4980212450027466,
"learning_rate": 1.4495798319327732e-05,
"loss": 1.4335,
"step": 270000
},
{
"epoch": 0.73,
"grad_norm": 1.4758628606796265,
"learning_rate": 1.4464285714285715e-05,
"loss": 1.4367,
"step": 270500
},
{
"epoch": 0.73,
"grad_norm": 1.4914411306381226,
"learning_rate": 1.4432773109243699e-05,
"loss": 1.4373,
"step": 271000
},
{
"epoch": 0.73,
"grad_norm": 1.5274006128311157,
"learning_rate": 1.4401260504201682e-05,
"loss": 1.4364,
"step": 271500
},
{
"epoch": 0.74,
"grad_norm": 1.4571418762207031,
"learning_rate": 1.4369747899159664e-05,
"loss": 1.4354,
"step": 272000
},
{
"epoch": 0.74,
"grad_norm": 1.5726255178451538,
"learning_rate": 1.4338235294117647e-05,
"loss": 1.4338,
"step": 272500
},
{
"epoch": 0.74,
"grad_norm": 1.5626286268234253,
"learning_rate": 1.4306722689075631e-05,
"loss": 1.4345,
"step": 273000
},
{
"epoch": 0.74,
"grad_norm": 1.4581658840179443,
"learning_rate": 1.4275210084033613e-05,
"loss": 1.4339,
"step": 273500
},
{
"epoch": 0.74,
"grad_norm": 1.4836556911468506,
"learning_rate": 1.4243697478991596e-05,
"loss": 1.4331,
"step": 274000
},
{
"epoch": 0.74,
"grad_norm": 1.4955805540084839,
"learning_rate": 1.421218487394958e-05,
"loss": 1.434,
"step": 274500
},
{
"epoch": 0.74,
"grad_norm": 1.5095798969268799,
"learning_rate": 1.4180672268907563e-05,
"loss": 1.4335,
"step": 275000
},
{
"epoch": 0.75,
"grad_norm": 1.517565131187439,
"learning_rate": 1.4149159663865546e-05,
"loss": 1.4339,
"step": 275500
},
{
"epoch": 0.75,
"grad_norm": 1.5089333057403564,
"learning_rate": 1.411764705882353e-05,
"loss": 1.4303,
"step": 276000
},
{
"epoch": 0.75,
"grad_norm": 1.490110993385315,
"learning_rate": 1.4086134453781513e-05,
"loss": 1.4378,
"step": 276500
},
{
"epoch": 0.75,
"grad_norm": 1.4934676885604858,
"learning_rate": 1.4054621848739497e-05,
"loss": 1.4309,
"step": 277000
},
{
"epoch": 0.75,
"grad_norm": 1.453904628753662,
"learning_rate": 1.402310924369748e-05,
"loss": 1.4345,
"step": 277500
},
{
"epoch": 0.75,
"grad_norm": 1.4364333152770996,
"learning_rate": 1.3991596638655464e-05,
"loss": 1.4347,
"step": 278000
},
{
"epoch": 0.75,
"grad_norm": 1.5105829238891602,
"learning_rate": 1.3960084033613445e-05,
"loss": 1.4373,
"step": 278500
},
{
"epoch": 0.76,
"grad_norm": 1.5879383087158203,
"learning_rate": 1.3928571428571429e-05,
"loss": 1.4337,
"step": 279000
},
{
"epoch": 0.76,
"grad_norm": 1.4907859563827515,
"learning_rate": 1.3897058823529412e-05,
"loss": 1.4378,
"step": 279500
},
{
"epoch": 0.76,
"grad_norm": 1.4965413808822632,
"learning_rate": 1.3865546218487396e-05,
"loss": 1.4332,
"step": 280000
},
{
"epoch": 0.76,
"grad_norm": 1.4512360095977783,
"learning_rate": 1.3834033613445379e-05,
"loss": 1.4293,
"step": 280500
},
{
"epoch": 0.76,
"grad_norm": 1.5323312282562256,
"learning_rate": 1.3802521008403361e-05,
"loss": 1.4348,
"step": 281000
},
{
"epoch": 0.76,
"grad_norm": 1.515937089920044,
"learning_rate": 1.3771008403361344e-05,
"loss": 1.435,
"step": 281500
},
{
"epoch": 0.76,
"grad_norm": 1.5589243173599243,
"learning_rate": 1.3739495798319328e-05,
"loss": 1.4276,
"step": 282000
},
{
"epoch": 0.76,
"grad_norm": 1.4904866218566895,
"learning_rate": 1.3707983193277311e-05,
"loss": 1.4317,
"step": 282500
},
{
"epoch": 0.77,
"grad_norm": 1.4851187467575073,
"learning_rate": 1.3676470588235295e-05,
"loss": 1.4297,
"step": 283000
},
{
"epoch": 0.77,
"grad_norm": 1.3728834390640259,
"learning_rate": 1.3644957983193278e-05,
"loss": 1.4322,
"step": 283500
},
{
"epoch": 0.77,
"grad_norm": 1.738533854484558,
"learning_rate": 1.3613445378151261e-05,
"loss": 1.4293,
"step": 284000
},
{
"epoch": 0.77,
"grad_norm": 1.5092045068740845,
"learning_rate": 1.3581932773109245e-05,
"loss": 1.4292,
"step": 284500
},
{
"epoch": 0.77,
"grad_norm": 1.5049362182617188,
"learning_rate": 1.3550420168067227e-05,
"loss": 1.4286,
"step": 285000
},
{
"epoch": 0.77,
"grad_norm": 1.4427067041397095,
"learning_rate": 1.351890756302521e-05,
"loss": 1.4279,
"step": 285500
},
{
"epoch": 0.77,
"grad_norm": 1.4460445642471313,
"learning_rate": 1.3487394957983194e-05,
"loss": 1.4301,
"step": 286000
},
{
"epoch": 0.78,
"grad_norm": 1.5012342929840088,
"learning_rate": 1.3455882352941177e-05,
"loss": 1.4287,
"step": 286500
},
{
"epoch": 0.78,
"grad_norm": 1.4399917125701904,
"learning_rate": 1.342436974789916e-05,
"loss": 1.4308,
"step": 287000
},
{
"epoch": 0.78,
"grad_norm": 1.4089640378952026,
"learning_rate": 1.3392857142857144e-05,
"loss": 1.4264,
"step": 287500
},
{
"epoch": 0.78,
"grad_norm": 1.5012991428375244,
"learning_rate": 1.3361344537815127e-05,
"loss": 1.4296,
"step": 288000
},
{
"epoch": 0.78,
"grad_norm": 1.4144240617752075,
"learning_rate": 1.3329831932773109e-05,
"loss": 1.4259,
"step": 288500
},
{
"epoch": 0.78,
"grad_norm": 1.4895191192626953,
"learning_rate": 1.3298319327731092e-05,
"loss": 1.4312,
"step": 289000
},
{
"epoch": 0.78,
"grad_norm": 1.5855236053466797,
"learning_rate": 1.3266806722689076e-05,
"loss": 1.4275,
"step": 289500
},
{
"epoch": 0.79,
"grad_norm": 1.4119740724563599,
"learning_rate": 1.323529411764706e-05,
"loss": 1.428,
"step": 290000
},
{
"epoch": 0.79,
"grad_norm": 1.5101768970489502,
"learning_rate": 1.3203781512605043e-05,
"loss": 1.4289,
"step": 290500
},
{
"epoch": 0.79,
"grad_norm": 1.4803494215011597,
"learning_rate": 1.3172268907563025e-05,
"loss": 1.4273,
"step": 291000
},
{
"epoch": 0.79,
"grad_norm": 1.5688806772232056,
"learning_rate": 1.3140756302521008e-05,
"loss": 1.4276,
"step": 291500
},
{
"epoch": 0.79,
"grad_norm": 2.2357559204101562,
"learning_rate": 1.3109243697478991e-05,
"loss": 1.4294,
"step": 292000
},
{
"epoch": 0.79,
"grad_norm": 1.4668666124343872,
"learning_rate": 1.3077731092436975e-05,
"loss": 1.4293,
"step": 292500
},
{
"epoch": 0.79,
"grad_norm": 1.46941339969635,
"learning_rate": 1.3046218487394958e-05,
"loss": 1.4321,
"step": 293000
},
{
"epoch": 0.79,
"grad_norm": 1.633657455444336,
"learning_rate": 1.3014705882352942e-05,
"loss": 1.4272,
"step": 293500
},
{
"epoch": 0.8,
"grad_norm": 1.6233292818069458,
"learning_rate": 1.2983193277310925e-05,
"loss": 1.4268,
"step": 294000
},
{
"epoch": 0.8,
"grad_norm": 1.4441863298416138,
"learning_rate": 1.2951680672268909e-05,
"loss": 1.4262,
"step": 294500
},
{
"epoch": 0.8,
"grad_norm": 1.5020571947097778,
"learning_rate": 1.2920168067226892e-05,
"loss": 1.4247,
"step": 295000
},
{
"epoch": 0.8,
"grad_norm": 1.476090669631958,
"learning_rate": 1.2888655462184874e-05,
"loss": 1.426,
"step": 295500
},
{
"epoch": 0.8,
"grad_norm": 1.4784507751464844,
"learning_rate": 1.2857142857142857e-05,
"loss": 1.4262,
"step": 296000
},
{
"epoch": 0.8,
"grad_norm": 1.4484635591506958,
"learning_rate": 1.282563025210084e-05,
"loss": 1.426,
"step": 296500
},
{
"epoch": 0.8,
"grad_norm": 1.5106843709945679,
"learning_rate": 1.2794117647058824e-05,
"loss": 1.4282,
"step": 297000
},
{
"epoch": 0.81,
"grad_norm": 1.401078701019287,
"learning_rate": 1.2762605042016806e-05,
"loss": 1.4229,
"step": 297500
},
{
"epoch": 0.81,
"grad_norm": 1.4721170663833618,
"learning_rate": 1.273109243697479e-05,
"loss": 1.4281,
"step": 298000
},
{
"epoch": 0.81,
"grad_norm": 1.5121667385101318,
"learning_rate": 1.2699579831932773e-05,
"loss": 1.4272,
"step": 298500
},
{
"epoch": 0.81,
"grad_norm": 1.4307163953781128,
"learning_rate": 1.2668067226890756e-05,
"loss": 1.4269,
"step": 299000
},
{
"epoch": 0.81,
"grad_norm": 1.520992398262024,
"learning_rate": 1.263655462184874e-05,
"loss": 1.426,
"step": 299500
},
{
"epoch": 0.81,
"grad_norm": 1.4671803712844849,
"learning_rate": 1.2605042016806723e-05,
"loss": 1.4207,
"step": 300000
},
{
"epoch": 0.81,
"grad_norm": 1.4773739576339722,
"learning_rate": 1.2573529411764706e-05,
"loss": 1.4248,
"step": 300500
},
{
"epoch": 0.81,
"grad_norm": 1.4782676696777344,
"learning_rate": 1.254201680672269e-05,
"loss": 1.4265,
"step": 301000
},
{
"epoch": 0.82,
"grad_norm": 1.5411614179611206,
"learning_rate": 1.2510504201680673e-05,
"loss": 1.4223,
"step": 301500
},
{
"epoch": 0.82,
"grad_norm": 1.4932873249053955,
"learning_rate": 1.2478991596638657e-05,
"loss": 1.4252,
"step": 302000
},
{
"epoch": 0.82,
"grad_norm": 1.451866626739502,
"learning_rate": 1.244747899159664e-05,
"loss": 1.4234,
"step": 302500
},
{
"epoch": 0.82,
"grad_norm": 1.4181545972824097,
"learning_rate": 1.2415966386554622e-05,
"loss": 1.4249,
"step": 303000
},
{
"epoch": 0.82,
"grad_norm": 1.460598349571228,
"learning_rate": 1.2384453781512605e-05,
"loss": 1.4237,
"step": 303500
},
{
"epoch": 0.82,
"grad_norm": 1.4560647010803223,
"learning_rate": 1.2352941176470587e-05,
"loss": 1.4199,
"step": 304000
},
{
"epoch": 0.82,
"grad_norm": 1.4535589218139648,
"learning_rate": 1.232142857142857e-05,
"loss": 1.4248,
"step": 304500
},
{
"epoch": 0.83,
"grad_norm": 1.4643712043762207,
"learning_rate": 1.2289915966386554e-05,
"loss": 1.4257,
"step": 305000
},
{
"epoch": 0.83,
"grad_norm": 1.5106630325317383,
"learning_rate": 1.2258403361344537e-05,
"loss": 1.4248,
"step": 305500
},
{
"epoch": 0.83,
"grad_norm": 1.489579439163208,
"learning_rate": 1.2226890756302521e-05,
"loss": 1.4215,
"step": 306000
},
{
"epoch": 0.83,
"grad_norm": 1.4746323823928833,
"learning_rate": 1.2195378151260504e-05,
"loss": 1.4202,
"step": 306500
},
{
"epoch": 0.83,
"grad_norm": 1.4702941179275513,
"learning_rate": 1.2163865546218488e-05,
"loss": 1.4214,
"step": 307000
},
{
"epoch": 0.83,
"grad_norm": 1.5852062702178955,
"learning_rate": 1.2132352941176471e-05,
"loss": 1.4229,
"step": 307500
},
{
"epoch": 0.83,
"grad_norm": 1.5045883655548096,
"learning_rate": 1.2100840336134455e-05,
"loss": 1.4245,
"step": 308000
},
{
"epoch": 0.84,
"grad_norm": 1.4635881185531616,
"learning_rate": 1.2069327731092438e-05,
"loss": 1.425,
"step": 308500
},
{
"epoch": 0.84,
"grad_norm": 1.4574062824249268,
"learning_rate": 1.2037815126050422e-05,
"loss": 1.4241,
"step": 309000
},
{
"epoch": 0.84,
"grad_norm": 1.4566025733947754,
"learning_rate": 1.2006302521008405e-05,
"loss": 1.4204,
"step": 309500
},
{
"epoch": 0.84,
"grad_norm": 1.525225281715393,
"learning_rate": 1.1974789915966388e-05,
"loss": 1.4218,
"step": 310000
},
{
"epoch": 0.84,
"grad_norm": 1.4726413488388062,
"learning_rate": 1.1943277310924368e-05,
"loss": 1.422,
"step": 310500
},
{
"epoch": 0.84,
"grad_norm": 1.4462370872497559,
"learning_rate": 1.1911764705882352e-05,
"loss": 1.4174,
"step": 311000
},
{
"epoch": 0.84,
"grad_norm": 1.4930446147918701,
"learning_rate": 1.1880252100840335e-05,
"loss": 1.4168,
"step": 311500
},
{
"epoch": 0.84,
"grad_norm": 2.050973892211914,
"learning_rate": 1.1848739495798319e-05,
"loss": 1.4205,
"step": 312000
},
{
"epoch": 0.85,
"grad_norm": 1.514642596244812,
"learning_rate": 1.1817226890756302e-05,
"loss": 1.42,
"step": 312500
},
{
"epoch": 0.85,
"grad_norm": 1.4417085647583008,
"learning_rate": 1.1785714285714286e-05,
"loss": 1.42,
"step": 313000
},
{
"epoch": 0.85,
"grad_norm": 1.473029375076294,
"learning_rate": 1.1754201680672269e-05,
"loss": 1.4228,
"step": 313500
},
{
"epoch": 0.85,
"grad_norm": 1.573533296585083,
"learning_rate": 1.1722689075630253e-05,
"loss": 1.4193,
"step": 314000
},
{
"epoch": 0.85,
"grad_norm": 1.5040185451507568,
"learning_rate": 1.1691176470588236e-05,
"loss": 1.4209,
"step": 314500
},
{
"epoch": 0.85,
"grad_norm": 1.472280740737915,
"learning_rate": 1.165966386554622e-05,
"loss": 1.4203,
"step": 315000
},
{
"epoch": 0.85,
"grad_norm": 1.4371939897537231,
"learning_rate": 1.1628151260504203e-05,
"loss": 1.4197,
"step": 315500
},
{
"epoch": 0.86,
"grad_norm": 1.74043607711792,
"learning_rate": 1.1596638655462186e-05,
"loss": 1.4189,
"step": 316000
},
{
"epoch": 0.86,
"grad_norm": 1.5340248346328735,
"learning_rate": 1.156512605042017e-05,
"loss": 1.4178,
"step": 316500
},
{
"epoch": 0.86,
"grad_norm": 1.4650968313217163,
"learning_rate": 1.1533613445378151e-05,
"loss": 1.4157,
"step": 317000
},
{
"epoch": 0.86,
"grad_norm": 1.6052621603012085,
"learning_rate": 1.1502100840336135e-05,
"loss": 1.4221,
"step": 317500
},
{
"epoch": 0.86,
"grad_norm": 1.4934183359146118,
"learning_rate": 1.1470588235294117e-05,
"loss": 1.4219,
"step": 318000
},
{
"epoch": 0.86,
"grad_norm": 1.6604057550430298,
"learning_rate": 1.14390756302521e-05,
"loss": 1.4165,
"step": 318500
},
{
"epoch": 0.86,
"grad_norm": 1.448686957359314,
"learning_rate": 1.1407563025210084e-05,
"loss": 1.4167,
"step": 319000
},
{
"epoch": 0.86,
"grad_norm": 1.4600298404693604,
"learning_rate": 1.1376050420168067e-05,
"loss": 1.4196,
"step": 319500
},
{
"epoch": 0.87,
"grad_norm": 1.4856675863265991,
"learning_rate": 1.134453781512605e-05,
"loss": 1.4188,
"step": 320000
},
{
"epoch": 0.87,
"grad_norm": 1.5987657308578491,
"learning_rate": 1.1313025210084034e-05,
"loss": 1.4176,
"step": 320500
},
{
"epoch": 0.87,
"grad_norm": 1.4707138538360596,
"learning_rate": 1.1281512605042017e-05,
"loss": 1.4177,
"step": 321000
},
{
"epoch": 0.87,
"grad_norm": 1.4592325687408447,
"learning_rate": 1.125e-05,
"loss": 1.419,
"step": 321500
},
{
"epoch": 0.87,
"grad_norm": 1.477171540260315,
"learning_rate": 1.1218487394957984e-05,
"loss": 1.4118,
"step": 322000
},
{
"epoch": 0.87,
"grad_norm": 1.5284925699234009,
"learning_rate": 1.1186974789915968e-05,
"loss": 1.418,
"step": 322500
},
{
"epoch": 0.87,
"grad_norm": 1.5696572065353394,
"learning_rate": 1.1155462184873951e-05,
"loss": 1.4175,
"step": 323000
},
{
"epoch": 0.88,
"grad_norm": 1.5421068668365479,
"learning_rate": 1.1123949579831933e-05,
"loss": 1.4134,
"step": 323500
},
{
"epoch": 0.88,
"grad_norm": 1.5944511890411377,
"learning_rate": 1.1092436974789916e-05,
"loss": 1.4139,
"step": 324000
},
{
"epoch": 0.88,
"grad_norm": 1.4496880769729614,
"learning_rate": 1.10609243697479e-05,
"loss": 1.4131,
"step": 324500
},
{
"epoch": 0.88,
"grad_norm": 1.5021952390670776,
"learning_rate": 1.1029411764705883e-05,
"loss": 1.4144,
"step": 325000
},
{
"epoch": 0.88,
"grad_norm": 1.5261799097061157,
"learning_rate": 1.0997899159663865e-05,
"loss": 1.4149,
"step": 325500
},
{
"epoch": 0.88,
"grad_norm": 1.396974802017212,
"learning_rate": 1.0966386554621848e-05,
"loss": 1.4149,
"step": 326000
},
{
"epoch": 0.88,
"grad_norm": 1.561023235321045,
"learning_rate": 1.0934873949579832e-05,
"loss": 1.4183,
"step": 326500
},
{
"epoch": 0.89,
"grad_norm": 1.509398102760315,
"learning_rate": 1.0903361344537815e-05,
"loss": 1.4158,
"step": 327000
},
{
"epoch": 0.89,
"grad_norm": 1.5046377182006836,
"learning_rate": 1.0871848739495799e-05,
"loss": 1.4137,
"step": 327500
},
{
"epoch": 0.89,
"grad_norm": 1.504531979560852,
"learning_rate": 1.0840336134453782e-05,
"loss": 1.4155,
"step": 328000
},
{
"epoch": 0.89,
"grad_norm": 1.6807337999343872,
"learning_rate": 1.0808823529411765e-05,
"loss": 1.4161,
"step": 328500
},
{
"epoch": 0.89,
"grad_norm": 1.4374127388000488,
"learning_rate": 1.0777310924369749e-05,
"loss": 1.4162,
"step": 329000
},
{
"epoch": 0.89,
"grad_norm": 1.4737296104431152,
"learning_rate": 1.0745798319327732e-05,
"loss": 1.4176,
"step": 329500
},
{
"epoch": 0.89,
"grad_norm": 1.5063775777816772,
"learning_rate": 1.0714285714285714e-05,
"loss": 1.4128,
"step": 330000
},
{
"epoch": 0.89,
"grad_norm": 1.506156325340271,
"learning_rate": 1.0682773109243698e-05,
"loss": 1.4176,
"step": 330500
},
{
"epoch": 0.9,
"grad_norm": 1.5394564867019653,
"learning_rate": 1.0651260504201681e-05,
"loss": 1.4119,
"step": 331000
},
{
"epoch": 0.9,
"grad_norm": 1.4483675956726074,
"learning_rate": 1.0619747899159664e-05,
"loss": 1.4138,
"step": 331500
},
{
"epoch": 0.9,
"grad_norm": 2.412644147872925,
"learning_rate": 1.0588235294117648e-05,
"loss": 1.4146,
"step": 332000
},
{
"epoch": 0.9,
"grad_norm": 1.9123421907424927,
"learning_rate": 1.0556722689075631e-05,
"loss": 1.4194,
"step": 332500
},
{
"epoch": 0.9,
"grad_norm": 1.4911080598831177,
"learning_rate": 1.0525210084033613e-05,
"loss": 1.418,
"step": 333000
},
{
"epoch": 0.9,
"grad_norm": 1.511194109916687,
"learning_rate": 1.0493697478991596e-05,
"loss": 1.4114,
"step": 333500
},
{
"epoch": 0.9,
"grad_norm": 1.4733537435531616,
"learning_rate": 1.046218487394958e-05,
"loss": 1.4149,
"step": 334000
},
{
"epoch": 0.91,
"grad_norm": 1.4742454290390015,
"learning_rate": 1.0430672268907563e-05,
"loss": 1.4163,
"step": 334500
},
{
"epoch": 0.91,
"grad_norm": 1.4842146635055542,
"learning_rate": 1.0399159663865547e-05,
"loss": 1.4118,
"step": 335000
},
{
"epoch": 0.91,
"grad_norm": 1.5346875190734863,
"learning_rate": 1.036764705882353e-05,
"loss": 1.4148,
"step": 335500
},
{
"epoch": 0.91,
"grad_norm": 1.6554747819900513,
"learning_rate": 1.0336134453781512e-05,
"loss": 1.416,
"step": 336000
},
{
"epoch": 0.91,
"grad_norm": 1.5015145540237427,
"learning_rate": 1.0304621848739495e-05,
"loss": 1.4146,
"step": 336500
},
{
"epoch": 0.91,
"grad_norm": 1.4634381532669067,
"learning_rate": 1.0273109243697479e-05,
"loss": 1.4199,
"step": 337000
},
{
"epoch": 0.91,
"grad_norm": 1.7802950143814087,
"learning_rate": 1.0241596638655462e-05,
"loss": 1.4127,
"step": 337500
},
{
"epoch": 0.91,
"grad_norm": 3.0422604084014893,
"learning_rate": 1.0210084033613446e-05,
"loss": 1.4121,
"step": 338000
},
{
"epoch": 0.92,
"grad_norm": 1.4957752227783203,
"learning_rate": 1.0178571428571429e-05,
"loss": 1.4151,
"step": 338500
},
{
"epoch": 0.92,
"grad_norm": 1.6368649005889893,
"learning_rate": 1.0147058823529413e-05,
"loss": 1.4211,
"step": 339000
},
{
"epoch": 0.92,
"grad_norm": 1.493455410003662,
"learning_rate": 1.0115546218487396e-05,
"loss": 1.4131,
"step": 339500
},
{
"epoch": 0.92,
"grad_norm": 1.5789108276367188,
"learning_rate": 1.008403361344538e-05,
"loss": 1.413,
"step": 340000
},
{
"epoch": 0.92,
"grad_norm": 1.4984022378921509,
"learning_rate": 1.0052521008403361e-05,
"loss": 1.4156,
"step": 340500
},
{
"epoch": 0.92,
"grad_norm": 1.443871021270752,
"learning_rate": 1.0021008403361345e-05,
"loss": 1.4123,
"step": 341000
},
{
"epoch": 0.92,
"grad_norm": 1.532205581665039,
"learning_rate": 9.989495798319328e-06,
"loss": 1.4145,
"step": 341500
},
{
"epoch": 0.93,
"grad_norm": 1.487888216972351,
"learning_rate": 9.957983193277312e-06,
"loss": 1.4132,
"step": 342000
},
{
"epoch": 0.93,
"grad_norm": 1.5009286403656006,
"learning_rate": 9.926470588235293e-06,
"loss": 1.4132,
"step": 342500
},
{
"epoch": 0.93,
"grad_norm": 1.53665292263031,
"learning_rate": 9.894957983193277e-06,
"loss": 1.4114,
"step": 343000
},
{
"epoch": 0.93,
"grad_norm": 1.4559004306793213,
"learning_rate": 9.86344537815126e-06,
"loss": 1.4128,
"step": 343500
},
{
"epoch": 0.93,
"grad_norm": 1.472882628440857,
"learning_rate": 9.831932773109244e-06,
"loss": 1.4106,
"step": 344000
},
{
"epoch": 0.93,
"grad_norm": 1.528029203414917,
"learning_rate": 9.800420168067227e-06,
"loss": 1.4133,
"step": 344500
},
{
"epoch": 0.93,
"grad_norm": 1.4509416818618774,
"learning_rate": 9.76890756302521e-06,
"loss": 1.4099,
"step": 345000
},
{
"epoch": 0.94,
"grad_norm": 1.644581913948059,
"learning_rate": 9.737394957983194e-06,
"loss": 1.4102,
"step": 345500
},
{
"epoch": 0.94,
"grad_norm": 1.5054335594177246,
"learning_rate": 9.705882352941177e-06,
"loss": 1.4119,
"step": 346000
},
{
"epoch": 0.94,
"grad_norm": 1.47361421585083,
"learning_rate": 9.67436974789916e-06,
"loss": 1.4094,
"step": 346500
},
{
"epoch": 0.94,
"grad_norm": 1.461796522140503,
"learning_rate": 9.642857142857144e-06,
"loss": 1.4108,
"step": 347000
},
{
"epoch": 0.94,
"grad_norm": 1.6115666627883911,
"learning_rate": 9.611344537815128e-06,
"loss": 1.4096,
"step": 347500
},
{
"epoch": 0.94,
"grad_norm": 1.526082992553711,
"learning_rate": 9.57983193277311e-06,
"loss": 1.4094,
"step": 348000
},
{
"epoch": 0.94,
"grad_norm": 1.4482905864715576,
"learning_rate": 9.548319327731093e-06,
"loss": 1.4082,
"step": 348500
},
{
"epoch": 0.94,
"grad_norm": 1.5066174268722534,
"learning_rate": 9.516806722689075e-06,
"loss": 1.4122,
"step": 349000
},
{
"epoch": 0.95,
"grad_norm": 1.5225650072097778,
"learning_rate": 9.485294117647058e-06,
"loss": 1.4069,
"step": 349500
},
{
"epoch": 0.95,
"grad_norm": 1.4794243574142456,
"learning_rate": 9.453781512605041e-06,
"loss": 1.4087,
"step": 350000
},
{
"epoch": 0.95,
"grad_norm": 1.4825611114501953,
"learning_rate": 9.422268907563025e-06,
"loss": 1.4098,
"step": 350500
},
{
"epoch": 0.95,
"grad_norm": 1.50911283493042,
"learning_rate": 9.390756302521008e-06,
"loss": 1.4066,
"step": 351000
},
{
"epoch": 0.95,
"grad_norm": 1.5070313215255737,
"learning_rate": 9.359243697478992e-06,
"loss": 1.4067,
"step": 351500
},
{
"epoch": 0.95,
"grad_norm": 1.4434587955474854,
"learning_rate": 9.327731092436975e-06,
"loss": 1.4074,
"step": 352000
},
{
"epoch": 0.95,
"grad_norm": 1.4484858512878418,
"learning_rate": 9.296218487394959e-06,
"loss": 1.4056,
"step": 352500
},
{
"epoch": 0.96,
"grad_norm": 1.6141736507415771,
"learning_rate": 9.264705882352942e-06,
"loss": 1.4084,
"step": 353000
},
{
"epoch": 0.96,
"grad_norm": 1.4847619533538818,
"learning_rate": 9.233193277310925e-06,
"loss": 1.4092,
"step": 353500
},
{
"epoch": 0.96,
"grad_norm": 1.4862167835235596,
"learning_rate": 9.201680672268909e-06,
"loss": 1.4086,
"step": 354000
},
{
"epoch": 0.96,
"grad_norm": 1.5454356670379639,
"learning_rate": 9.170168067226892e-06,
"loss": 1.4088,
"step": 354500
},
{
"epoch": 0.96,
"grad_norm": 1.4676494598388672,
"learning_rate": 9.138655462184874e-06,
"loss": 1.4094,
"step": 355000
},
{
"epoch": 0.96,
"grad_norm": 1.4859504699707031,
"learning_rate": 9.107142857142856e-06,
"loss": 1.4076,
"step": 355500
},
{
"epoch": 0.96,
"grad_norm": 1.499040961265564,
"learning_rate": 9.07563025210084e-06,
"loss": 1.4104,
"step": 356000
},
{
"epoch": 0.97,
"grad_norm": 1.4864604473114014,
"learning_rate": 9.044117647058823e-06,
"loss": 1.4061,
"step": 356500
},
{
"epoch": 0.97,
"grad_norm": 1.4507191181182861,
"learning_rate": 9.012605042016806e-06,
"loss": 1.4062,
"step": 357000
},
{
"epoch": 0.97,
"grad_norm": 1.468526840209961,
"learning_rate": 8.98109243697479e-06,
"loss": 1.4081,
"step": 357500
},
{
"epoch": 0.97,
"grad_norm": 1.6709305047988892,
"learning_rate": 8.949579831932773e-06,
"loss": 1.4126,
"step": 358000
},
{
"epoch": 0.97,
"grad_norm": 1.9611443281173706,
"learning_rate": 8.918067226890756e-06,
"loss": 1.4079,
"step": 358500
},
{
"epoch": 0.97,
"grad_norm": 1.6809275150299072,
"learning_rate": 8.88655462184874e-06,
"loss": 1.4114,
"step": 359000
},
{
"epoch": 0.97,
"grad_norm": 5.746359825134277,
"learning_rate": 8.855042016806723e-06,
"loss": 1.4084,
"step": 359500
},
{
"epoch": 0.97,
"grad_norm": 5.197726726531982,
"learning_rate": 8.823529411764707e-06,
"loss": 1.4066,
"step": 360000
},
{
"epoch": 0.98,
"grad_norm": 1.4346739053726196,
"learning_rate": 8.79201680672269e-06,
"loss": 1.4066,
"step": 360500
},
{
"epoch": 0.98,
"grad_norm": 1.571542739868164,
"learning_rate": 8.760504201680674e-06,
"loss": 1.4097,
"step": 361000
},
{
"epoch": 0.98,
"grad_norm": 1.5356281995773315,
"learning_rate": 8.728991596638657e-06,
"loss": 1.4045,
"step": 361500
},
{
"epoch": 0.98,
"grad_norm": 1.7401924133300781,
"learning_rate": 8.697478991596639e-06,
"loss": 1.4067,
"step": 362000
},
{
"epoch": 0.98,
"grad_norm": 1.5491187572479248,
"learning_rate": 8.66596638655462e-06,
"loss": 1.4042,
"step": 362500
},
{
"epoch": 0.98,
"grad_norm": 1.5863696336746216,
"learning_rate": 8.634453781512604e-06,
"loss": 1.4074,
"step": 363000
},
{
"epoch": 0.98,
"grad_norm": 1.450952410697937,
"learning_rate": 8.602941176470587e-06,
"loss": 1.4076,
"step": 363500
},
{
"epoch": 0.99,
"grad_norm": 1.5750932693481445,
"learning_rate": 8.571428571428571e-06,
"loss": 1.41,
"step": 364000
},
{
"epoch": 0.99,
"grad_norm": 1.4661774635314941,
"learning_rate": 8.539915966386554e-06,
"loss": 1.4091,
"step": 364500
},
{
"epoch": 0.99,
"grad_norm": 1.540864109992981,
"learning_rate": 8.508403361344538e-06,
"loss": 1.4052,
"step": 365000
},
{
"epoch": 0.99,
"grad_norm": 1.5120595693588257,
"learning_rate": 8.476890756302521e-06,
"loss": 1.4072,
"step": 365500
},
{
"epoch": 0.99,
"grad_norm": 1.5357037782669067,
"learning_rate": 8.445378151260505e-06,
"loss": 1.4097,
"step": 366000
},
{
"epoch": 0.99,
"grad_norm": 1.5010443925857544,
"learning_rate": 8.413865546218488e-06,
"loss": 1.4094,
"step": 366500
},
{
"epoch": 0.99,
"grad_norm": 1.4643309116363525,
"learning_rate": 8.382352941176472e-06,
"loss": 1.4077,
"step": 367000
},
{
"epoch": 0.99,
"grad_norm": 1.4524095058441162,
"learning_rate": 8.350840336134455e-06,
"loss": 1.4065,
"step": 367500
},
{
"epoch": 1.0,
"grad_norm": 1.5203324556350708,
"learning_rate": 8.319327731092438e-06,
"loss": 1.4035,
"step": 368000
},
{
"epoch": 1.0,
"grad_norm": 1.4688167572021484,
"learning_rate": 8.28781512605042e-06,
"loss": 1.4067,
"step": 368500
},
{
"epoch": 1.0,
"grad_norm": 1.5595752000808716,
"learning_rate": 8.256302521008404e-06,
"loss": 1.4059,
"step": 369000
},
{
"epoch": 1.0,
"grad_norm": 1.4404747486114502,
"learning_rate": 8.224789915966387e-06,
"loss": 1.4035,
"step": 369500
},
{
"epoch": 1.0,
"grad_norm": 1.6032897233963013,
"learning_rate": 8.193277310924369e-06,
"loss": 1.4001,
"step": 370000
},
{
"epoch": 1.0,
"grad_norm": 1.6836262941360474,
"learning_rate": 8.161764705882352e-06,
"loss": 1.3981,
"step": 370500
},
{
"epoch": 1.0,
"grad_norm": 1.5205241441726685,
"learning_rate": 8.130252100840336e-06,
"loss": 1.3994,
"step": 371000
},
{
"epoch": 1.01,
"grad_norm": 1.7194490432739258,
"learning_rate": 8.098739495798319e-06,
"loss": 1.4027,
"step": 371500
},
{
"epoch": 1.01,
"grad_norm": 1.4517977237701416,
"learning_rate": 8.067226890756303e-06,
"loss": 1.4022,
"step": 372000
},
{
"epoch": 1.01,
"grad_norm": 1.6818935871124268,
"learning_rate": 8.035714285714286e-06,
"loss": 1.4028,
"step": 372500
},
{
"epoch": 1.01,
"grad_norm": 1.5117074251174927,
"learning_rate": 8.00420168067227e-06,
"loss": 1.4021,
"step": 373000
},
{
"epoch": 1.01,
"grad_norm": 1.4689205884933472,
"learning_rate": 7.972689075630253e-06,
"loss": 1.4057,
"step": 373500
},
{
"epoch": 1.01,
"grad_norm": 1.525889277458191,
"learning_rate": 7.941176470588236e-06,
"loss": 1.4041,
"step": 374000
},
{
"epoch": 1.01,
"grad_norm": 1.4896938800811768,
"learning_rate": 7.90966386554622e-06,
"loss": 1.4027,
"step": 374500
},
{
"epoch": 1.02,
"grad_norm": 1.4765034914016724,
"learning_rate": 7.878151260504201e-06,
"loss": 1.4005,
"step": 375000
},
{
"epoch": 1.02,
"grad_norm": 1.5386637449264526,
"learning_rate": 7.846638655462185e-06,
"loss": 1.397,
"step": 375500
},
{
"epoch": 1.02,
"grad_norm": 1.4808331727981567,
"learning_rate": 7.815126050420168e-06,
"loss": 1.401,
"step": 376000
},
{
"epoch": 1.02,
"grad_norm": 1.517560362815857,
"learning_rate": 7.783613445378152e-06,
"loss": 1.4037,
"step": 376500
},
{
"epoch": 1.02,
"grad_norm": 1.6733453273773193,
"learning_rate": 7.752100840336135e-06,
"loss": 1.3976,
"step": 377000
},
{
"epoch": 1.02,
"grad_norm": 1.480815052986145,
"learning_rate": 7.720588235294117e-06,
"loss": 1.4,
"step": 377500
},
{
"epoch": 1.02,
"grad_norm": 1.4836503267288208,
"learning_rate": 7.6890756302521e-06,
"loss": 1.3977,
"step": 378000
},
{
"epoch": 1.02,
"grad_norm": 1.442256212234497,
"learning_rate": 7.657563025210084e-06,
"loss": 1.399,
"step": 378500
},
{
"epoch": 1.03,
"grad_norm": 1.8496633768081665,
"learning_rate": 7.626050420168067e-06,
"loss": 1.4038,
"step": 379000
},
{
"epoch": 1.03,
"grad_norm": 1.4886460304260254,
"learning_rate": 7.59453781512605e-06,
"loss": 1.4061,
"step": 379500
},
{
"epoch": 1.03,
"grad_norm": 1.550764799118042,
"learning_rate": 7.563025210084033e-06,
"loss": 1.4003,
"step": 380000
},
{
"epoch": 1.03,
"grad_norm": 1.5111615657806396,
"learning_rate": 7.531512605042017e-06,
"loss": 1.4021,
"step": 380500
},
{
"epoch": 1.03,
"grad_norm": 1.5873339176177979,
"learning_rate": 7.5e-06,
"loss": 1.4003,
"step": 381000
},
{
"epoch": 1.03,
"grad_norm": 1.5139081478118896,
"learning_rate": 7.468487394957984e-06,
"loss": 1.3974,
"step": 381500
},
{
"epoch": 1.03,
"grad_norm": 1.4700753688812256,
"learning_rate": 7.436974789915967e-06,
"loss": 1.4009,
"step": 382000
},
{
"epoch": 1.04,
"grad_norm": 1.4294934272766113,
"learning_rate": 7.4054621848739505e-06,
"loss": 1.3997,
"step": 382500
},
{
"epoch": 1.04,
"grad_norm": 1.432667851448059,
"learning_rate": 7.373949579831932e-06,
"loss": 1.3992,
"step": 383000
},
{
"epoch": 1.04,
"grad_norm": 1.6012872457504272,
"learning_rate": 7.342436974789916e-06,
"loss": 1.3988,
"step": 383500
},
{
"epoch": 1.04,
"grad_norm": 1.5000537633895874,
"learning_rate": 7.310924369747899e-06,
"loss": 1.399,
"step": 384000
},
{
"epoch": 1.04,
"grad_norm": 1.5064808130264282,
"learning_rate": 7.2794117647058826e-06,
"loss": 1.4022,
"step": 384500
},
{
"epoch": 1.04,
"grad_norm": 1.5001455545425415,
"learning_rate": 7.247899159663866e-06,
"loss": 1.3947,
"step": 385000
},
{
"epoch": 1.04,
"grad_norm": 1.4360790252685547,
"learning_rate": 7.2163865546218494e-06,
"loss": 1.3983,
"step": 385500
},
{
"epoch": 1.04,
"grad_norm": 1.4993146657943726,
"learning_rate": 7.184873949579832e-06,
"loss": 1.3987,
"step": 386000
},
{
"epoch": 1.05,
"grad_norm": 1.4621449708938599,
"learning_rate": 7.1533613445378155e-06,
"loss": 1.3974,
"step": 386500
},
{
"epoch": 1.05,
"grad_norm": 1.7409414052963257,
"learning_rate": 7.121848739495798e-06,
"loss": 1.4004,
"step": 387000
},
{
"epoch": 1.05,
"grad_norm": 1.4486150741577148,
"learning_rate": 7.0903361344537815e-06,
"loss": 1.3982,
"step": 387500
},
{
"epoch": 1.05,
"grad_norm": 1.5252596139907837,
"learning_rate": 7.058823529411765e-06,
"loss": 1.4013,
"step": 388000
},
{
"epoch": 1.05,
"grad_norm": 1.4874343872070312,
"learning_rate": 7.027310924369748e-06,
"loss": 1.3995,
"step": 388500
},
{
"epoch": 1.05,
"grad_norm": 1.5078623294830322,
"learning_rate": 6.995798319327732e-06,
"loss": 1.3985,
"step": 389000
},
{
"epoch": 1.05,
"grad_norm": 1.5256296396255493,
"learning_rate": 6.964285714285714e-06,
"loss": 1.4005,
"step": 389500
},
{
"epoch": 1.06,
"grad_norm": 1.5369598865509033,
"learning_rate": 6.932773109243698e-06,
"loss": 1.3929,
"step": 390000
},
{
"epoch": 1.06,
"grad_norm": 1.4955265522003174,
"learning_rate": 6.9012605042016804e-06,
"loss": 1.3968,
"step": 390500
},
{
"epoch": 1.06,
"grad_norm": 1.501406192779541,
"learning_rate": 6.869747899159664e-06,
"loss": 1.3982,
"step": 391000
},
{
"epoch": 1.06,
"grad_norm": 1.5695279836654663,
"learning_rate": 6.838235294117647e-06,
"loss": 1.3986,
"step": 391500
},
{
"epoch": 1.06,
"grad_norm": 1.590920329093933,
"learning_rate": 6.806722689075631e-06,
"loss": 1.3989,
"step": 392000
},
{
"epoch": 1.06,
"grad_norm": 1.4469817876815796,
"learning_rate": 6.775210084033613e-06,
"loss": 1.3958,
"step": 392500
},
{
"epoch": 1.06,
"grad_norm": 1.4517157077789307,
"learning_rate": 6.743697478991597e-06,
"loss": 1.3948,
"step": 393000
},
{
"epoch": 1.07,
"grad_norm": 1.477184534072876,
"learning_rate": 6.71218487394958e-06,
"loss": 1.3955,
"step": 393500
},
{
"epoch": 1.07,
"grad_norm": 2.1850063800811768,
"learning_rate": 6.680672268907564e-06,
"loss": 1.3977,
"step": 394000
},
{
"epoch": 1.07,
"grad_norm": 1.4544538259506226,
"learning_rate": 6.649159663865546e-06,
"loss": 1.3974,
"step": 394500
},
{
"epoch": 1.07,
"grad_norm": 1.4682557582855225,
"learning_rate": 6.61764705882353e-06,
"loss": 1.3976,
"step": 395000
},
{
"epoch": 1.07,
"grad_norm": 1.4401472806930542,
"learning_rate": 6.586134453781512e-06,
"loss": 1.4002,
"step": 395500
},
{
"epoch": 1.07,
"grad_norm": 1.5497291088104248,
"learning_rate": 6.554621848739496e-06,
"loss": 1.3945,
"step": 396000
},
{
"epoch": 1.07,
"grad_norm": 1.525145173072815,
"learning_rate": 6.523109243697479e-06,
"loss": 1.4006,
"step": 396500
},
{
"epoch": 1.07,
"grad_norm": 1.5119032859802246,
"learning_rate": 6.491596638655463e-06,
"loss": 1.3984,
"step": 397000
},
{
"epoch": 1.08,
"grad_norm": 1.7145532369613647,
"learning_rate": 6.460084033613446e-06,
"loss": 1.398,
"step": 397500
},
{
"epoch": 1.08,
"grad_norm": 1.5175354480743408,
"learning_rate": 6.428571428571429e-06,
"loss": 1.3971,
"step": 398000
},
{
"epoch": 1.08,
"grad_norm": 1.4529006481170654,
"learning_rate": 6.397058823529412e-06,
"loss": 1.3986,
"step": 398500
},
{
"epoch": 1.08,
"grad_norm": 1.4779740571975708,
"learning_rate": 6.365546218487395e-06,
"loss": 1.3985,
"step": 399000
},
{
"epoch": 1.08,
"grad_norm": 1.591557502746582,
"learning_rate": 6.334033613445378e-06,
"loss": 1.3971,
"step": 399500
},
{
"epoch": 1.08,
"grad_norm": 1.5829887390136719,
"learning_rate": 6.3025210084033615e-06,
"loss": 1.3989,
"step": 400000
},
{
"epoch": 1.08,
"grad_norm": 1.546576976776123,
"learning_rate": 6.271008403361345e-06,
"loss": 1.398,
"step": 400500
},
{
"epoch": 1.09,
"grad_norm": 1.4360915422439575,
"learning_rate": 6.239495798319328e-06,
"loss": 1.3933,
"step": 401000
},
{
"epoch": 1.09,
"grad_norm": 1.555240273475647,
"learning_rate": 6.207983193277311e-06,
"loss": 1.3964,
"step": 401500
},
{
"epoch": 1.09,
"grad_norm": 1.5486465692520142,
"learning_rate": 6.176470588235294e-06,
"loss": 1.3922,
"step": 402000
},
{
"epoch": 1.09,
"grad_norm": 1.6140353679656982,
"learning_rate": 6.144957983193277e-06,
"loss": 1.3941,
"step": 402500
},
{
"epoch": 1.09,
"grad_norm": 1.422938346862793,
"learning_rate": 6.1134453781512605e-06,
"loss": 1.3946,
"step": 403000
},
{
"epoch": 1.09,
"grad_norm": 1.673789620399475,
"learning_rate": 6.081932773109244e-06,
"loss": 1.3965,
"step": 403500
},
{
"epoch": 1.09,
"grad_norm": 1.52051842212677,
"learning_rate": 6.050420168067227e-06,
"loss": 1.3935,
"step": 404000
},
{
"epoch": 1.09,
"grad_norm": 1.5157978534698486,
"learning_rate": 6.018907563025211e-06,
"loss": 1.3938,
"step": 404500
},
{
"epoch": 1.1,
"grad_norm": 1.5434610843658447,
"learning_rate": 5.987394957983194e-06,
"loss": 1.3931,
"step": 405000
},
{
"epoch": 1.1,
"grad_norm": 1.7399873733520508,
"learning_rate": 5.955882352941176e-06,
"loss": 1.3924,
"step": 405500
},
{
"epoch": 1.1,
"grad_norm": 1.482820749282837,
"learning_rate": 5.924369747899159e-06,
"loss": 1.3923,
"step": 406000
},
{
"epoch": 1.1,
"grad_norm": 4.893394947052002,
"learning_rate": 5.892857142857143e-06,
"loss": 1.393,
"step": 406500
},
{
"epoch": 1.1,
"grad_norm": 1.538550615310669,
"learning_rate": 5.861344537815126e-06,
"loss": 1.3938,
"step": 407000
},
{
"epoch": 1.1,
"grad_norm": 1.4997118711471558,
"learning_rate": 5.82983193277311e-06,
"loss": 1.3934,
"step": 407500
},
{
"epoch": 1.1,
"grad_norm": 1.5265237092971802,
"learning_rate": 5.798319327731093e-06,
"loss": 1.3915,
"step": 408000
},
{
"epoch": 1.11,
"grad_norm": 1.6841180324554443,
"learning_rate": 5.766806722689076e-06,
"loss": 1.3946,
"step": 408500
},
{
"epoch": 1.11,
"grad_norm": 1.4722718000411987,
"learning_rate": 5.735294117647058e-06,
"loss": 1.3949,
"step": 409000
},
{
"epoch": 1.11,
"grad_norm": 2.087042808532715,
"learning_rate": 5.703781512605042e-06,
"loss": 1.3925,
"step": 409500
},
{
"epoch": 1.11,
"grad_norm": 1.4858590364456177,
"learning_rate": 5.672268907563025e-06,
"loss": 1.3943,
"step": 410000
},
{
"epoch": 1.11,
"grad_norm": 1.4591546058654785,
"learning_rate": 5.640756302521009e-06,
"loss": 1.3924,
"step": 410500
},
{
"epoch": 1.11,
"grad_norm": 1.4490437507629395,
"learning_rate": 5.609243697478992e-06,
"loss": 1.3949,
"step": 411000
},
{
"epoch": 1.11,
"grad_norm": 1.5795851945877075,
"learning_rate": 5.5777310924369755e-06,
"loss": 1.3951,
"step": 411500
},
{
"epoch": 1.12,
"grad_norm": 1.5447410345077515,
"learning_rate": 5.546218487394958e-06,
"loss": 1.396,
"step": 412000
},
{
"epoch": 1.12,
"grad_norm": 1.510696530342102,
"learning_rate": 5.5147058823529415e-06,
"loss": 1.3929,
"step": 412500
},
{
"epoch": 1.12,
"grad_norm": 1.52991783618927,
"learning_rate": 5.483193277310924e-06,
"loss": 1.393,
"step": 413000
},
{
"epoch": 1.12,
"grad_norm": 1.5724798440933228,
"learning_rate": 5.4516806722689076e-06,
"loss": 1.3933,
"step": 413500
},
{
"epoch": 1.12,
"grad_norm": 1.9198040962219238,
"learning_rate": 5.420168067226891e-06,
"loss": 1.3934,
"step": 414000
},
{
"epoch": 1.12,
"grad_norm": 1.5322943925857544,
"learning_rate": 5.3886554621848744e-06,
"loss": 1.3925,
"step": 414500
},
{
"epoch": 1.12,
"grad_norm": 1.4684040546417236,
"learning_rate": 5.357142857142857e-06,
"loss": 1.3933,
"step": 415000
},
{
"epoch": 1.12,
"grad_norm": 1.4797214269638062,
"learning_rate": 5.3256302521008405e-06,
"loss": 1.3925,
"step": 415500
},
{
"epoch": 1.13,
"grad_norm": 1.524305820465088,
"learning_rate": 5.294117647058824e-06,
"loss": 1.3929,
"step": 416000
},
{
"epoch": 1.13,
"grad_norm": 1.4858139753341675,
"learning_rate": 5.2626050420168065e-06,
"loss": 1.3881,
"step": 416500
},
{
"epoch": 1.13,
"grad_norm": 1.5586313009262085,
"learning_rate": 5.23109243697479e-06,
"loss": 1.393,
"step": 417000
},
{
"epoch": 1.13,
"grad_norm": 1.54250168800354,
"learning_rate": 5.199579831932773e-06,
"loss": 1.3926,
"step": 417500
},
{
"epoch": 1.13,
"grad_norm": 9.902482986450195,
"learning_rate": 5.168067226890756e-06,
"loss": 1.3923,
"step": 418000
},
{
"epoch": 1.13,
"grad_norm": 3.239046573638916,
"learning_rate": 5.136554621848739e-06,
"loss": 1.3925,
"step": 418500
},
{
"epoch": 1.13,
"grad_norm": 1.5059127807617188,
"learning_rate": 5.105042016806723e-06,
"loss": 1.3936,
"step": 419000
},
{
"epoch": 1.14,
"grad_norm": 1.5107486248016357,
"learning_rate": 5.073529411764706e-06,
"loss": 1.3942,
"step": 419500
},
{
"epoch": 1.14,
"grad_norm": 1.577019214630127,
"learning_rate": 5.04201680672269e-06,
"loss": 1.3896,
"step": 420000
},
{
"epoch": 1.14,
"grad_norm": 1.4538390636444092,
"learning_rate": 5.010504201680672e-06,
"loss": 1.387,
"step": 420500
},
{
"epoch": 1.14,
"grad_norm": 1.593549132347107,
"learning_rate": 4.978991596638656e-06,
"loss": 1.3908,
"step": 421000
},
{
"epoch": 1.14,
"grad_norm": 1.4725204706192017,
"learning_rate": 4.947478991596638e-06,
"loss": 1.3904,
"step": 421500
},
{
"epoch": 1.14,
"grad_norm": 1.4892488718032837,
"learning_rate": 4.915966386554622e-06,
"loss": 1.3896,
"step": 422000
},
{
"epoch": 1.14,
"grad_norm": 1.503003478050232,
"learning_rate": 4.884453781512605e-06,
"loss": 1.3901,
"step": 422500
},
{
"epoch": 1.15,
"grad_norm": 1.5650583505630493,
"learning_rate": 4.852941176470589e-06,
"loss": 1.3879,
"step": 423000
},
{
"epoch": 1.15,
"grad_norm": 1.5746469497680664,
"learning_rate": 4.821428571428572e-06,
"loss": 1.3898,
"step": 423500
},
{
"epoch": 1.15,
"grad_norm": 1.4636718034744263,
"learning_rate": 4.789915966386555e-06,
"loss": 1.3946,
"step": 424000
},
{
"epoch": 1.15,
"grad_norm": 1.5072635412216187,
"learning_rate": 4.758403361344537e-06,
"loss": 1.3936,
"step": 424500
},
{
"epoch": 1.15,
"grad_norm": 1.9211359024047852,
"learning_rate": 4.726890756302521e-06,
"loss": 1.3919,
"step": 425000
},
{
"epoch": 1.15,
"grad_norm": 1.6186763048171997,
"learning_rate": 4.695378151260504e-06,
"loss": 1.3874,
"step": 425500
},
{
"epoch": 1.15,
"grad_norm": 1.6086759567260742,
"learning_rate": 4.663865546218488e-06,
"loss": 1.3911,
"step": 426000
},
{
"epoch": 1.15,
"grad_norm": 1.4456268548965454,
"learning_rate": 4.632352941176471e-06,
"loss": 1.3888,
"step": 426500
},
{
"epoch": 1.16,
"grad_norm": 1.5766582489013672,
"learning_rate": 4.6008403361344545e-06,
"loss": 1.3884,
"step": 427000
},
{
"epoch": 1.16,
"grad_norm": 1.4081532955169678,
"learning_rate": 4.569327731092437e-06,
"loss": 1.3904,
"step": 427500
},
{
"epoch": 1.16,
"grad_norm": 1.4901301860809326,
"learning_rate": 4.53781512605042e-06,
"loss": 1.389,
"step": 428000
},
{
"epoch": 1.16,
"grad_norm": 1.5027050971984863,
"learning_rate": 4.506302521008403e-06,
"loss": 1.3931,
"step": 428500
},
{
"epoch": 1.16,
"grad_norm": 1.4869219064712524,
"learning_rate": 4.4747899159663865e-06,
"loss": 1.3888,
"step": 429000
},
{
"epoch": 1.16,
"grad_norm": 1.439729928970337,
"learning_rate": 4.44327731092437e-06,
"loss": 1.3897,
"step": 429500
},
{
"epoch": 1.16,
"grad_norm": 1.5325324535369873,
"learning_rate": 4.411764705882353e-06,
"loss": 1.3891,
"step": 430000
},
{
"epoch": 1.17,
"grad_norm": 1.5293645858764648,
"learning_rate": 4.380252100840337e-06,
"loss": 1.3902,
"step": 430500
},
{
"epoch": 1.17,
"grad_norm": 1.4475960731506348,
"learning_rate": 4.3487394957983194e-06,
"loss": 1.388,
"step": 431000
},
{
"epoch": 1.17,
"grad_norm": 1.5612802505493164,
"learning_rate": 4.317226890756302e-06,
"loss": 1.3885,
"step": 431500
},
{
"epoch": 1.17,
"grad_norm": 1.682928204536438,
"learning_rate": 4.2857142857142855e-06,
"loss": 1.3899,
"step": 432000
},
{
"epoch": 1.17,
"grad_norm": 1.5231236219406128,
"learning_rate": 4.254201680672269e-06,
"loss": 1.3877,
"step": 432500
},
{
"epoch": 1.17,
"grad_norm": 1.446148157119751,
"learning_rate": 4.222689075630252e-06,
"loss": 1.3901,
"step": 433000
},
{
"epoch": 1.17,
"grad_norm": 1.4778817892074585,
"learning_rate": 4.191176470588236e-06,
"loss": 1.3865,
"step": 433500
},
{
"epoch": 1.17,
"grad_norm": 1.5888080596923828,
"learning_rate": 4.159663865546219e-06,
"loss": 1.3872,
"step": 434000
},
{
"epoch": 1.18,
"grad_norm": 1.6371558904647827,
"learning_rate": 4.128151260504202e-06,
"loss": 1.3893,
"step": 434500
},
{
"epoch": 1.18,
"grad_norm": 1.4442592859268188,
"learning_rate": 4.096638655462184e-06,
"loss": 1.3934,
"step": 435000
},
{
"epoch": 1.18,
"grad_norm": 1.7637091875076294,
"learning_rate": 4.065126050420168e-06,
"loss": 1.3892,
"step": 435500
},
{
"epoch": 1.18,
"grad_norm": 1.4838693141937256,
"learning_rate": 4.033613445378151e-06,
"loss": 1.3866,
"step": 436000
},
{
"epoch": 1.18,
"grad_norm": 1.5558868646621704,
"learning_rate": 4.002100840336135e-06,
"loss": 1.3914,
"step": 436500
},
{
"epoch": 1.18,
"grad_norm": 1.8331657648086548,
"learning_rate": 3.970588235294118e-06,
"loss": 1.389,
"step": 437000
},
{
"epoch": 1.18,
"grad_norm": 1.7367424964904785,
"learning_rate": 3.939075630252101e-06,
"loss": 1.3897,
"step": 437500
},
{
"epoch": 1.19,
"grad_norm": 1.5316094160079956,
"learning_rate": 3.907563025210084e-06,
"loss": 1.3871,
"step": 438000
},
{
"epoch": 1.19,
"grad_norm": 1.5062899589538574,
"learning_rate": 3.876050420168068e-06,
"loss": 1.3876,
"step": 438500
},
{
"epoch": 1.19,
"grad_norm": 1.5399343967437744,
"learning_rate": 3.84453781512605e-06,
"loss": 1.3873,
"step": 439000
},
{
"epoch": 1.19,
"grad_norm": 1.8311206102371216,
"learning_rate": 3.8130252100840336e-06,
"loss": 1.3886,
"step": 439500
},
{
"epoch": 1.19,
"grad_norm": 1.5011011362075806,
"learning_rate": 3.7815126050420167e-06,
"loss": 1.3877,
"step": 440000
},
{
"epoch": 1.19,
"grad_norm": 1.5647181272506714,
"learning_rate": 3.75e-06,
"loss": 1.3895,
"step": 440500
},
{
"epoch": 1.19,
"grad_norm": 1.9663615226745605,
"learning_rate": 3.7184873949579835e-06,
"loss": 1.3884,
"step": 441000
},
{
"epoch": 1.2,
"grad_norm": 2.4808692932128906,
"learning_rate": 3.686974789915966e-06,
"loss": 1.3879,
"step": 441500
},
{
"epoch": 1.2,
"grad_norm": 1.4271633625030518,
"learning_rate": 3.6554621848739496e-06,
"loss": 1.3913,
"step": 442000
},
{
"epoch": 1.2,
"grad_norm": 1.5341715812683105,
"learning_rate": 3.623949579831933e-06,
"loss": 1.3874,
"step": 442500
},
{
"epoch": 1.2,
"grad_norm": 1.4926517009735107,
"learning_rate": 3.592436974789916e-06,
"loss": 1.3873,
"step": 443000
},
{
"epoch": 1.2,
"grad_norm": 1.4709627628326416,
"learning_rate": 3.560924369747899e-06,
"loss": 1.3856,
"step": 443500
},
{
"epoch": 1.2,
"grad_norm": 1.4797513484954834,
"learning_rate": 3.5294117647058825e-06,
"loss": 1.3874,
"step": 444000
},
{
"epoch": 1.2,
"grad_norm": 1.506548523902893,
"learning_rate": 3.497899159663866e-06,
"loss": 1.3859,
"step": 444500
},
{
"epoch": 1.2,
"grad_norm": 1.4667857885360718,
"learning_rate": 3.466386554621849e-06,
"loss": 1.3889,
"step": 445000
},
{
"epoch": 1.21,
"grad_norm": 1.4796762466430664,
"learning_rate": 3.434873949579832e-06,
"loss": 1.3912,
"step": 445500
},
{
"epoch": 1.21,
"grad_norm": 1.534725546836853,
"learning_rate": 3.4033613445378154e-06,
"loss": 1.3881,
"step": 446000
},
{
"epoch": 1.21,
"grad_norm": 1.6512054204940796,
"learning_rate": 3.3718487394957984e-06,
"loss": 1.3874,
"step": 446500
},
{
"epoch": 1.21,
"grad_norm": 1.4926962852478027,
"learning_rate": 3.340336134453782e-06,
"loss": 1.3844,
"step": 447000
},
{
"epoch": 1.21,
"grad_norm": 1.479819416999817,
"learning_rate": 3.308823529411765e-06,
"loss": 1.3862,
"step": 447500
},
{
"epoch": 1.21,
"grad_norm": 1.429606318473816,
"learning_rate": 3.277310924369748e-06,
"loss": 1.3864,
"step": 448000
},
{
"epoch": 1.21,
"grad_norm": 1.526227593421936,
"learning_rate": 3.2457983193277313e-06,
"loss": 1.388,
"step": 448500
},
{
"epoch": 1.22,
"grad_norm": 1.5270380973815918,
"learning_rate": 3.2142857142857143e-06,
"loss": 1.3898,
"step": 449000
},
{
"epoch": 1.22,
"grad_norm": 1.6459033489227295,
"learning_rate": 3.1827731092436973e-06,
"loss": 1.3872,
"step": 449500
},
{
"epoch": 1.22,
"grad_norm": 1.5082780122756958,
"learning_rate": 3.1512605042016808e-06,
"loss": 1.3864,
"step": 450000
},
{
"epoch": 1.22,
"grad_norm": 1.4675207138061523,
"learning_rate": 3.119747899159664e-06,
"loss": 1.3858,
"step": 450500
},
{
"epoch": 1.22,
"grad_norm": 1.5487087965011597,
"learning_rate": 3.088235294117647e-06,
"loss": 1.3859,
"step": 451000
},
{
"epoch": 1.22,
"grad_norm": 1.5166810750961304,
"learning_rate": 3.0567226890756302e-06,
"loss": 1.3838,
"step": 451500
},
{
"epoch": 1.22,
"grad_norm": 1.4788706302642822,
"learning_rate": 3.0252100840336137e-06,
"loss": 1.3836,
"step": 452000
},
{
"epoch": 1.22,
"grad_norm": 1.6381962299346924,
"learning_rate": 2.993697478991597e-06,
"loss": 1.3853,
"step": 452500
},
{
"epoch": 1.23,
"grad_norm": 1.4548882246017456,
"learning_rate": 2.9621848739495797e-06,
"loss": 1.3878,
"step": 453000
},
{
"epoch": 1.23,
"grad_norm": 1.5543279647827148,
"learning_rate": 2.930672268907563e-06,
"loss": 1.3885,
"step": 453500
},
{
"epoch": 1.23,
"grad_norm": 1.5119037628173828,
"learning_rate": 2.8991596638655466e-06,
"loss": 1.3865,
"step": 454000
},
{
"epoch": 1.23,
"grad_norm": 1.5338330268859863,
"learning_rate": 2.867647058823529e-06,
"loss": 1.3825,
"step": 454500
},
{
"epoch": 1.23,
"grad_norm": 2.100884437561035,
"learning_rate": 2.8361344537815126e-06,
"loss": 1.3894,
"step": 455000
},
{
"epoch": 1.23,
"grad_norm": 1.4853757619857788,
"learning_rate": 2.804621848739496e-06,
"loss": 1.385,
"step": 455500
},
{
"epoch": 1.23,
"grad_norm": 1.545937180519104,
"learning_rate": 2.773109243697479e-06,
"loss": 1.3875,
"step": 456000
},
{
"epoch": 1.24,
"grad_norm": 1.4860107898712158,
"learning_rate": 2.741596638655462e-06,
"loss": 1.3839,
"step": 456500
},
{
"epoch": 1.24,
"grad_norm": 1.5260435342788696,
"learning_rate": 2.7100840336134455e-06,
"loss": 1.3815,
"step": 457000
},
{
"epoch": 1.24,
"grad_norm": 1.5752997398376465,
"learning_rate": 2.6785714285714285e-06,
"loss": 1.3845,
"step": 457500
},
{
"epoch": 1.24,
"grad_norm": 1.5157984495162964,
"learning_rate": 2.647058823529412e-06,
"loss": 1.3831,
"step": 458000
},
{
"epoch": 1.24,
"grad_norm": 1.5206942558288574,
"learning_rate": 2.615546218487395e-06,
"loss": 1.3866,
"step": 458500
},
{
"epoch": 1.24,
"grad_norm": 1.524672508239746,
"learning_rate": 2.584033613445378e-06,
"loss": 1.3869,
"step": 459000
},
{
"epoch": 1.24,
"grad_norm": 6.727693557739258,
"learning_rate": 2.5525210084033614e-06,
"loss": 1.3805,
"step": 459500
},
{
"epoch": 1.25,
"grad_norm": 1.5827701091766357,
"learning_rate": 2.521008403361345e-06,
"loss": 1.39,
"step": 460000
},
{
"epoch": 1.25,
"grad_norm": 1.4831866025924683,
"learning_rate": 2.489495798319328e-06,
"loss": 1.3886,
"step": 460500
},
{
"epoch": 1.25,
"grad_norm": 1.5272330045700073,
"learning_rate": 2.457983193277311e-06,
"loss": 1.3889,
"step": 461000
},
{
"epoch": 1.25,
"grad_norm": 1.478623628616333,
"learning_rate": 2.4264705882352943e-06,
"loss": 1.3878,
"step": 461500
},
{
"epoch": 1.25,
"grad_norm": 1.5272207260131836,
"learning_rate": 2.3949579831932773e-06,
"loss": 1.3834,
"step": 462000
},
{
"epoch": 1.25,
"grad_norm": 1.574120044708252,
"learning_rate": 2.3634453781512604e-06,
"loss": 1.3852,
"step": 462500
},
{
"epoch": 1.25,
"grad_norm": 1.5751044750213623,
"learning_rate": 2.331932773109244e-06,
"loss": 1.3829,
"step": 463000
},
{
"epoch": 1.25,
"grad_norm": 1.4704902172088623,
"learning_rate": 2.3004201680672272e-06,
"loss": 1.3817,
"step": 463500
},
{
"epoch": 1.26,
"grad_norm": 2.406973123550415,
"learning_rate": 2.26890756302521e-06,
"loss": 1.3872,
"step": 464000
},
{
"epoch": 1.26,
"grad_norm": 1.4869129657745361,
"learning_rate": 2.2373949579831933e-06,
"loss": 1.3825,
"step": 464500
},
{
"epoch": 1.26,
"grad_norm": 1.5050959587097168,
"learning_rate": 2.2058823529411767e-06,
"loss": 1.3821,
"step": 465000
},
{
"epoch": 1.26,
"grad_norm": 1.4652327299118042,
"learning_rate": 2.1743697478991597e-06,
"loss": 1.3831,
"step": 465500
},
{
"epoch": 1.26,
"grad_norm": 1.6011298894882202,
"learning_rate": 2.1428571428571427e-06,
"loss": 1.3824,
"step": 466000
},
{
"epoch": 1.26,
"grad_norm": 1.589460015296936,
"learning_rate": 2.111344537815126e-06,
"loss": 1.3816,
"step": 466500
},
{
"epoch": 1.26,
"grad_norm": 1.679612636566162,
"learning_rate": 2.0798319327731096e-06,
"loss": 1.383,
"step": 467000
},
{
"epoch": 1.27,
"grad_norm": 5.37538480758667,
"learning_rate": 2.048319327731092e-06,
"loss": 1.3818,
"step": 467500
},
{
"epoch": 1.27,
"grad_norm": 1.5256156921386719,
"learning_rate": 2.0168067226890756e-06,
"loss": 1.383,
"step": 468000
},
{
"epoch": 1.27,
"grad_norm": 1.546476125717163,
"learning_rate": 1.985294117647059e-06,
"loss": 1.3841,
"step": 468500
},
{
"epoch": 1.27,
"grad_norm": 1.429592251777649,
"learning_rate": 1.953781512605042e-06,
"loss": 1.3828,
"step": 469000
},
{
"epoch": 1.27,
"grad_norm": 1.4674160480499268,
"learning_rate": 1.922268907563025e-06,
"loss": 1.3847,
"step": 469500
},
{
"epoch": 1.27,
"grad_norm": 2.370859384536743,
"learning_rate": 1.8907563025210083e-06,
"loss": 1.3816,
"step": 470000
},
{
"epoch": 1.27,
"grad_norm": 1.5106278657913208,
"learning_rate": 1.8592436974789918e-06,
"loss": 1.3783,
"step": 470500
},
{
"epoch": 1.28,
"grad_norm": 1.5777826309204102,
"learning_rate": 1.8277310924369748e-06,
"loss": 1.3817,
"step": 471000
},
{
"epoch": 1.28,
"grad_norm": 1.4805636405944824,
"learning_rate": 1.796218487394958e-06,
"loss": 1.3831,
"step": 471500
},
{
"epoch": 1.28,
"grad_norm": 1.5154469013214111,
"learning_rate": 1.7647058823529412e-06,
"loss": 1.383,
"step": 472000
},
{
"epoch": 1.28,
"grad_norm": 1.54281747341156,
"learning_rate": 1.7331932773109245e-06,
"loss": 1.3852,
"step": 472500
},
{
"epoch": 1.28,
"grad_norm": 1.7247158288955688,
"learning_rate": 1.7016806722689077e-06,
"loss": 1.3819,
"step": 473000
},
{
"epoch": 1.28,
"grad_norm": 1.4723429679870605,
"learning_rate": 1.670168067226891e-06,
"loss": 1.38,
"step": 473500
},
{
"epoch": 1.28,
"grad_norm": 1.5267595052719116,
"learning_rate": 1.638655462184874e-06,
"loss": 1.3822,
"step": 474000
},
{
"epoch": 1.28,
"grad_norm": 1.566758155822754,
"learning_rate": 1.6071428571428572e-06,
"loss": 1.3837,
"step": 474500
},
{
"epoch": 1.29,
"grad_norm": 2.029449939727783,
"learning_rate": 1.5756302521008404e-06,
"loss": 1.3853,
"step": 475000
},
{
"epoch": 1.29,
"grad_norm": 1.4750381708145142,
"learning_rate": 1.5441176470588234e-06,
"loss": 1.3838,
"step": 475500
},
{
"epoch": 1.29,
"grad_norm": 1.5221339464187622,
"learning_rate": 1.5126050420168068e-06,
"loss": 1.3859,
"step": 476000
},
{
"epoch": 1.29,
"grad_norm": 1.518754243850708,
"learning_rate": 1.4810924369747898e-06,
"loss": 1.3783,
"step": 476500
},
{
"epoch": 1.29,
"grad_norm": 1.4300239086151123,
"learning_rate": 1.4495798319327733e-06,
"loss": 1.3769,
"step": 477000
},
{
"epoch": 1.29,
"grad_norm": 1.5566083192825317,
"learning_rate": 1.4180672268907563e-06,
"loss": 1.3788,
"step": 477500
},
{
"epoch": 1.29,
"grad_norm": 1.415859580039978,
"learning_rate": 1.3865546218487395e-06,
"loss": 1.385,
"step": 478000
},
{
"epoch": 1.3,
"grad_norm": 1.4944028854370117,
"learning_rate": 1.3550420168067228e-06,
"loss": 1.3815,
"step": 478500
},
{
"epoch": 1.3,
"grad_norm": 1.4514822959899902,
"learning_rate": 1.323529411764706e-06,
"loss": 1.3827,
"step": 479000
},
{
"epoch": 1.3,
"grad_norm": 1.5512882471084595,
"learning_rate": 1.292016806722689e-06,
"loss": 1.384,
"step": 479500
},
{
"epoch": 1.3,
"grad_norm": 1.574981689453125,
"learning_rate": 1.2605042016806724e-06,
"loss": 1.382,
"step": 480000
},
{
"epoch": 1.3,
"grad_norm": 1.570827603340149,
"learning_rate": 1.2289915966386554e-06,
"loss": 1.382,
"step": 480500
},
{
"epoch": 1.3,
"grad_norm": 1.5336010456085205,
"learning_rate": 1.1974789915966387e-06,
"loss": 1.3803,
"step": 481000
},
{
"epoch": 1.3,
"grad_norm": 1.4452096223831177,
"learning_rate": 1.165966386554622e-06,
"loss": 1.3804,
"step": 481500
},
{
"epoch": 1.3,
"grad_norm": 1.5529412031173706,
"learning_rate": 1.134453781512605e-06,
"loss": 1.3813,
"step": 482000
},
{
"epoch": 1.31,
"grad_norm": 1.5553141832351685,
"learning_rate": 1.1029411764705884e-06,
"loss": 1.3822,
"step": 482500
},
{
"epoch": 1.31,
"grad_norm": 1.5250602960586548,
"learning_rate": 1.0714285714285714e-06,
"loss": 1.379,
"step": 483000
},
{
"epoch": 1.31,
"grad_norm": 1.4803342819213867,
"learning_rate": 1.0399159663865548e-06,
"loss": 1.3846,
"step": 483500
},
{
"epoch": 1.31,
"grad_norm": 1.4097282886505127,
"learning_rate": 1.0084033613445378e-06,
"loss": 1.3855,
"step": 484000
},
{
"epoch": 1.31,
"grad_norm": 1.535632848739624,
"learning_rate": 9.76890756302521e-07,
"loss": 1.3807,
"step": 484500
},
{
"epoch": 1.31,
"grad_norm": 1.5535025596618652,
"learning_rate": 9.453781512605042e-07,
"loss": 1.379,
"step": 485000
},
{
"epoch": 1.31,
"grad_norm": 1.5092753171920776,
"learning_rate": 9.138655462184874e-07,
"loss": 1.3777,
"step": 485500
},
{
"epoch": 1.32,
"grad_norm": 1.5026346445083618,
"learning_rate": 8.823529411764706e-07,
"loss": 1.3844,
"step": 486000
},
{
"epoch": 1.32,
"grad_norm": 2.1724424362182617,
"learning_rate": 8.508403361344538e-07,
"loss": 1.3808,
"step": 486500
},
{
"epoch": 1.32,
"grad_norm": 1.5653128623962402,
"learning_rate": 8.19327731092437e-07,
"loss": 1.3826,
"step": 487000
},
{
"epoch": 1.32,
"grad_norm": 1.8672337532043457,
"learning_rate": 7.878151260504202e-07,
"loss": 1.3804,
"step": 487500
},
{
"epoch": 1.32,
"grad_norm": 1.5125828981399536,
"learning_rate": 7.563025210084034e-07,
"loss": 1.3785,
"step": 488000
},
{
"epoch": 1.32,
"grad_norm": 1.5895177125930786,
"learning_rate": 7.247899159663866e-07,
"loss": 1.3806,
"step": 488500
},
{
"epoch": 1.32,
"grad_norm": 1.505618929862976,
"learning_rate": 6.932773109243698e-07,
"loss": 1.3822,
"step": 489000
},
{
"epoch": 1.33,
"grad_norm": 1.4767976999282837,
"learning_rate": 6.61764705882353e-07,
"loss": 1.3809,
"step": 489500
},
{
"epoch": 1.33,
"grad_norm": 1.4713040590286255,
"learning_rate": 6.302521008403362e-07,
"loss": 1.38,
"step": 490000
},
{
"epoch": 1.33,
"grad_norm": 1.5712190866470337,
"learning_rate": 5.987394957983193e-07,
"loss": 1.3821,
"step": 490500
},
{
"epoch": 1.33,
"grad_norm": 1.520726203918457,
"learning_rate": 5.672268907563025e-07,
"loss": 1.3817,
"step": 491000
},
{
"epoch": 1.33,
"grad_norm": 1.4978504180908203,
"learning_rate": 5.357142857142857e-07,
"loss": 1.3825,
"step": 491500
},
{
"epoch": 1.33,
"grad_norm": 1.5783872604370117,
"learning_rate": 5.042016806722689e-07,
"loss": 1.3825,
"step": 492000
},
{
"epoch": 1.33,
"grad_norm": 1.5126821994781494,
"learning_rate": 4.726890756302521e-07,
"loss": 1.3803,
"step": 492500
},
{
"epoch": 1.33,
"grad_norm": 1.4677457809448242,
"learning_rate": 4.411764705882353e-07,
"loss": 1.3804,
"step": 493000
},
{
"epoch": 1.34,
"grad_norm": 1.5842092037200928,
"learning_rate": 4.096638655462185e-07,
"loss": 1.3818,
"step": 493500
},
{
"epoch": 1.34,
"grad_norm": 1.5152337551116943,
"learning_rate": 3.781512605042017e-07,
"loss": 1.3797,
"step": 494000
},
{
"epoch": 1.34,
"grad_norm": 1.5868217945098877,
"learning_rate": 3.466386554621849e-07,
"loss": 1.3829,
"step": 494500
},
{
"epoch": 1.34,
"grad_norm": 1.4543733596801758,
"learning_rate": 3.151260504201681e-07,
"loss": 1.3811,
"step": 495000
},
{
"epoch": 1.34,
"grad_norm": 1.5251801013946533,
"learning_rate": 2.8361344537815123e-07,
"loss": 1.3793,
"step": 495500
},
{
"epoch": 1.34,
"grad_norm": 1.5227956771850586,
"learning_rate": 2.5210084033613445e-07,
"loss": 1.3848,
"step": 496000
},
{
"epoch": 1.34,
"grad_norm": 1.506102204322815,
"learning_rate": 2.2058823529411765e-07,
"loss": 1.3789,
"step": 496500
},
{
"epoch": 1.35,
"grad_norm": 1.4776455163955688,
"learning_rate": 1.8907563025210085e-07,
"loss": 1.3837,
"step": 497000
},
{
"epoch": 1.35,
"grad_norm": 1.5449495315551758,
"learning_rate": 1.5756302521008405e-07,
"loss": 1.3823,
"step": 497500
},
{
"epoch": 1.35,
"grad_norm": 1.4903110265731812,
"learning_rate": 1.2605042016806723e-07,
"loss": 1.3816,
"step": 498000
},
{
"epoch": 1.35,
"grad_norm": 1.4964358806610107,
"learning_rate": 9.453781512605043e-08,
"loss": 1.3783,
"step": 498500
},
{
"epoch": 1.35,
"grad_norm": 1.6141352653503418,
"learning_rate": 6.302521008403361e-08,
"loss": 1.3819,
"step": 499000
},
{
"epoch": 1.35,
"grad_norm": 1.5006154775619507,
"learning_rate": 3.151260504201681e-08,
"loss": 1.3771,
"step": 499500
},
{
"epoch": 1.35,
"grad_norm": 1.5279935598373413,
"learning_rate": 0.0,
"loss": 1.3805,
"step": 500000
},
{
"epoch": 1.35,
"step": 500000,
"total_flos": 2.9824904071075946e+19,
"train_loss": 1.5473345408935546,
"train_runtime": 243315.0329,
"train_samples_per_second": 526.067,
"train_steps_per_second": 2.055
}
],
"logging_steps": 500,
"max_steps": 500000,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 10000,
"total_flos": 2.9824904071075946e+19,
"train_batch_size": 256,
"trial_name": null,
"trial_params": null
}