|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.3535169785169785, |
|
"eval_steps": 500, |
|
"global_step": 500000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 4.596311569213867, |
|
"learning_rate": 6.249999999999999e-07, |
|
"loss": 9.2114, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 5.16270637512207, |
|
"learning_rate": 1.2499999999999999e-06, |
|
"loss": 7.3081, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 3.267263412475586, |
|
"learning_rate": 1.875e-06, |
|
"loss": 6.1634, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 2.1591992378234863, |
|
"learning_rate": 2.4999999999999998e-06, |
|
"loss": 5.421, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 2.160722494125366, |
|
"learning_rate": 3.125e-06, |
|
"loss": 4.856, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 2.330343723297119, |
|
"learning_rate": 3.75e-06, |
|
"loss": 4.3826, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 2.1275618076324463, |
|
"learning_rate": 4.3750000000000005e-06, |
|
"loss": 3.9848, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 2.1402294635772705, |
|
"learning_rate": 4.9999999999999996e-06, |
|
"loss": 3.6491, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 2.159619092941284, |
|
"learning_rate": 5.625e-06, |
|
"loss": 3.3861, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 2.5018796920776367, |
|
"learning_rate": 6.25e-06, |
|
"loss": 3.1845, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.9996334314346313, |
|
"learning_rate": 6.875e-06, |
|
"loss": 3.0335, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 2.103320598602295, |
|
"learning_rate": 7.5e-06, |
|
"loss": 2.9096, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 2.025847911834717, |
|
"learning_rate": 8.125e-06, |
|
"loss": 2.8088, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 2.030522108078003, |
|
"learning_rate": 8.750000000000001e-06, |
|
"loss": 2.7156, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.992558479309082, |
|
"learning_rate": 9.375000000000001e-06, |
|
"loss": 2.6262, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 2.1512062549591064, |
|
"learning_rate": 9.999999999999999e-06, |
|
"loss": 2.5432, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 2.0734474658966064, |
|
"learning_rate": 1.0625e-05, |
|
"loss": 2.4722, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.9478808641433716, |
|
"learning_rate": 1.125e-05, |
|
"loss": 2.4111, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.762665033340454, |
|
"learning_rate": 1.1874999999999999e-05, |
|
"loss": 2.3521, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.8274019956588745, |
|
"learning_rate": 1.25e-05, |
|
"loss": 2.3099, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.905918002128601, |
|
"learning_rate": 1.3125e-05, |
|
"loss": 2.2629, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.8081414699554443, |
|
"learning_rate": 1.375e-05, |
|
"loss": 2.2239, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 7.712226867675781, |
|
"learning_rate": 1.4375e-05, |
|
"loss": 2.1907, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.6963427066802979, |
|
"learning_rate": 1.5e-05, |
|
"loss": 2.1602, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.717537522315979, |
|
"learning_rate": 1.5625e-05, |
|
"loss": 2.1384, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.745806336402893, |
|
"learning_rate": 1.625e-05, |
|
"loss": 2.1061, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.7633601427078247, |
|
"learning_rate": 1.6875e-05, |
|
"loss": 2.0838, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.7061880826950073, |
|
"learning_rate": 1.7500000000000002e-05, |
|
"loss": 2.0648, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.7471063137054443, |
|
"learning_rate": 1.8125e-05, |
|
"loss": 2.0462, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.705340027809143, |
|
"learning_rate": 1.8750000000000002e-05, |
|
"loss": 2.0281, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 41.675968170166016, |
|
"learning_rate": 1.9375e-05, |
|
"loss": 2.003, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.737722396850586, |
|
"learning_rate": 1.9999999999999998e-05, |
|
"loss": 1.9914, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.8232406377792358, |
|
"learning_rate": 2.0625e-05, |
|
"loss": 1.9724, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.8312487602233887, |
|
"learning_rate": 2.125e-05, |
|
"loss": 1.9577, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 2.025630235671997, |
|
"learning_rate": 2.1875e-05, |
|
"loss": 1.9411, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.9454607963562012, |
|
"learning_rate": 2.25e-05, |
|
"loss": 1.9263, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.637341856956482, |
|
"learning_rate": 2.3125000000000003e-05, |
|
"loss": 1.9221, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.846366286277771, |
|
"learning_rate": 2.3749999999999998e-05, |
|
"loss": 1.9086, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.802040457725525, |
|
"learning_rate": 2.4375e-05, |
|
"loss": 1.8961, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.7378031015396118, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.8893, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.6410856246948242, |
|
"learning_rate": 2.5625e-05, |
|
"loss": 1.8752, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.7153388261795044, |
|
"learning_rate": 2.625e-05, |
|
"loss": 1.862, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.6210004091262817, |
|
"learning_rate": 2.6875000000000003e-05, |
|
"loss": 1.855, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.6593818664550781, |
|
"learning_rate": 2.75e-05, |
|
"loss": 1.8478, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.659287691116333, |
|
"learning_rate": 2.8125e-05, |
|
"loss": 1.8353, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.703875184059143, |
|
"learning_rate": 2.875e-05, |
|
"loss": 1.8288, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.7122712135314941, |
|
"learning_rate": 2.9375e-05, |
|
"loss": 1.8289, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.6744304895401, |
|
"learning_rate": 3e-05, |
|
"loss": 1.8219, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.7783963680267334, |
|
"learning_rate": 2.9968487394957983e-05, |
|
"loss": 1.8141, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.7388477325439453, |
|
"learning_rate": 2.9936974789915968e-05, |
|
"loss": 1.805, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.6574689149856567, |
|
"learning_rate": 2.990546218487395e-05, |
|
"loss": 1.8005, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.6803966760635376, |
|
"learning_rate": 2.9873949579831935e-05, |
|
"loss": 1.7902, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.6314315795898438, |
|
"learning_rate": 2.9842436974789916e-05, |
|
"loss": 1.7832, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.6180912256240845, |
|
"learning_rate": 2.98109243697479e-05, |
|
"loss": 1.7774, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.6669533252716064, |
|
"learning_rate": 2.9779411764705883e-05, |
|
"loss": 1.774, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.5653916597366333, |
|
"learning_rate": 2.9747899159663868e-05, |
|
"loss": 1.7673, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.6632215976715088, |
|
"learning_rate": 2.971638655462185e-05, |
|
"loss": 1.7639, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.6262154579162598, |
|
"learning_rate": 2.9684873949579835e-05, |
|
"loss": 1.757, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 4.847783088684082, |
|
"learning_rate": 2.9653361344537817e-05, |
|
"loss": 1.9286, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 2.6416807174682617, |
|
"learning_rate": 2.9621848739495802e-05, |
|
"loss": 3.7773, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 3.4526023864746094, |
|
"learning_rate": 2.9590336134453784e-05, |
|
"loss": 4.3611, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 65.76104736328125, |
|
"learning_rate": 2.9558823529411766e-05, |
|
"loss": 4.5628, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 6.145516395568848, |
|
"learning_rate": 2.9527310924369747e-05, |
|
"loss": 4.388, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 4.991481781005859, |
|
"learning_rate": 2.949579831932773e-05, |
|
"loss": 4.1991, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 2.632403612136841, |
|
"learning_rate": 2.9464285714285714e-05, |
|
"loss": 3.7935, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 3.691666841506958, |
|
"learning_rate": 2.9432773109243696e-05, |
|
"loss": 3.4704, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 14.81291675567627, |
|
"learning_rate": 2.940126050420168e-05, |
|
"loss": 2.6663, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 2.4295215606689453, |
|
"learning_rate": 2.9369747899159663e-05, |
|
"loss": 2.4661, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 52.97163391113281, |
|
"learning_rate": 2.9338235294117648e-05, |
|
"loss": 2.1129, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 2.337153196334839, |
|
"learning_rate": 2.930672268907563e-05, |
|
"loss": 1.7961, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 6.669353008270264, |
|
"learning_rate": 2.9275210084033615e-05, |
|
"loss": 1.7907, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.5874249935150146, |
|
"learning_rate": 2.9243697478991596e-05, |
|
"loss": 1.7663, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.7114965915679932, |
|
"learning_rate": 2.921218487394958e-05, |
|
"loss": 1.7439, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.8134816884994507, |
|
"learning_rate": 2.9180672268907563e-05, |
|
"loss": 1.7361, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.505012035369873, |
|
"learning_rate": 2.9149159663865545e-05, |
|
"loss": 1.7323, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.6047751903533936, |
|
"learning_rate": 2.911764705882353e-05, |
|
"loss": 1.7212, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.5497486591339111, |
|
"learning_rate": 2.9086134453781512e-05, |
|
"loss": 1.7215, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.5367647409439087, |
|
"learning_rate": 2.9054621848739497e-05, |
|
"loss": 1.7027, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 4.223250865936279, |
|
"learning_rate": 2.902310924369748e-05, |
|
"loss": 1.6914, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.5872981548309326, |
|
"learning_rate": 2.8991596638655464e-05, |
|
"loss": 1.6878, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.5480022430419922, |
|
"learning_rate": 2.8960084033613446e-05, |
|
"loss": 1.6816, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.5464568138122559, |
|
"learning_rate": 2.892857142857143e-05, |
|
"loss": 1.6796, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.557543158531189, |
|
"learning_rate": 2.8897058823529413e-05, |
|
"loss": 1.6709, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.5462812185287476, |
|
"learning_rate": 2.8865546218487398e-05, |
|
"loss": 1.6728, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.5833927392959595, |
|
"learning_rate": 2.883403361344538e-05, |
|
"loss": 1.6676, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.63410222530365, |
|
"learning_rate": 2.8802521008403365e-05, |
|
"loss": 1.6696, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.4682618379592896, |
|
"learning_rate": 2.8771008403361346e-05, |
|
"loss": 1.6693, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.5386840105056763, |
|
"learning_rate": 2.8739495798319328e-05, |
|
"loss": 1.6602, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.5572445392608643, |
|
"learning_rate": 2.8707983193277313e-05, |
|
"loss": 1.6581, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.5247888565063477, |
|
"learning_rate": 2.8676470588235295e-05, |
|
"loss": 1.6546, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.5297437906265259, |
|
"learning_rate": 2.864495798319328e-05, |
|
"loss": 1.6467, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.5252556800842285, |
|
"learning_rate": 2.8613445378151262e-05, |
|
"loss": 1.6504, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.4626063108444214, |
|
"learning_rate": 2.8581932773109244e-05, |
|
"loss": 1.6441, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.511093020439148, |
|
"learning_rate": 2.8550420168067225e-05, |
|
"loss": 1.6433, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.572654366493225, |
|
"learning_rate": 2.851890756302521e-05, |
|
"loss": 1.6527, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.5643205642700195, |
|
"learning_rate": 2.8487394957983192e-05, |
|
"loss": 1.6376, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.497128963470459, |
|
"learning_rate": 2.8455882352941177e-05, |
|
"loss": 1.6397, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.464203953742981, |
|
"learning_rate": 2.842436974789916e-05, |
|
"loss": 1.6358, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.8414405584335327, |
|
"learning_rate": 2.8392857142857144e-05, |
|
"loss": 1.6366, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.7834322452545166, |
|
"learning_rate": 2.8361344537815126e-05, |
|
"loss": 1.642, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.477858304977417, |
|
"learning_rate": 2.8329831932773108e-05, |
|
"loss": 1.6342, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.5328236818313599, |
|
"learning_rate": 2.8298319327731093e-05, |
|
"loss": 1.6333, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.540300965309143, |
|
"learning_rate": 2.8266806722689075e-05, |
|
"loss": 1.6352, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.8767386674880981, |
|
"learning_rate": 2.823529411764706e-05, |
|
"loss": 1.6328, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.5387629270553589, |
|
"learning_rate": 2.820378151260504e-05, |
|
"loss": 1.632, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.6315770149230957, |
|
"learning_rate": 2.8172268907563027e-05, |
|
"loss": 1.627, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 5.726038455963135, |
|
"learning_rate": 2.814075630252101e-05, |
|
"loss": 1.6293, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.5697258710861206, |
|
"learning_rate": 2.8109243697478993e-05, |
|
"loss": 1.6211, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.5938401222229004, |
|
"learning_rate": 2.8077731092436975e-05, |
|
"loss": 1.6196, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.5256606340408325, |
|
"learning_rate": 2.804621848739496e-05, |
|
"loss": 1.6177, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 2.223390817642212, |
|
"learning_rate": 2.8014705882352942e-05, |
|
"loss": 1.6246, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.4948030710220337, |
|
"learning_rate": 2.7983193277310927e-05, |
|
"loss": 1.6239, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.5147298574447632, |
|
"learning_rate": 2.795168067226891e-05, |
|
"loss": 1.6164, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.5068755149841309, |
|
"learning_rate": 2.792016806722689e-05, |
|
"loss": 1.612, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.5074622631072998, |
|
"learning_rate": 2.7888655462184876e-05, |
|
"loss": 1.6113, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.4880355596542358, |
|
"learning_rate": 2.7857142857142858e-05, |
|
"loss": 1.6102, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.6379941701889038, |
|
"learning_rate": 2.7825630252100843e-05, |
|
"loss": 1.6084, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.4973347187042236, |
|
"learning_rate": 2.7794117647058824e-05, |
|
"loss": 1.6007, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.5474885702133179, |
|
"learning_rate": 2.776260504201681e-05, |
|
"loss": 1.6042, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.602220058441162, |
|
"learning_rate": 2.773109243697479e-05, |
|
"loss": 1.6106, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.6185747385025024, |
|
"learning_rate": 2.7699579831932776e-05, |
|
"loss": 1.6058, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.56905996799469, |
|
"learning_rate": 2.7668067226890758e-05, |
|
"loss": 1.6013, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.5619949102401733, |
|
"learning_rate": 2.763655462184874e-05, |
|
"loss": 1.6034, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.504239559173584, |
|
"learning_rate": 2.7605042016806722e-05, |
|
"loss": 1.6057, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.4879348278045654, |
|
"learning_rate": 2.7573529411764707e-05, |
|
"loss": 1.6021, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.5099623203277588, |
|
"learning_rate": 2.754201680672269e-05, |
|
"loss": 1.6026, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.4979091882705688, |
|
"learning_rate": 2.751050420168067e-05, |
|
"loss": 1.5986, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.4825040102005005, |
|
"learning_rate": 2.7478991596638655e-05, |
|
"loss": 1.5957, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.493453860282898, |
|
"learning_rate": 2.7447478991596637e-05, |
|
"loss": 1.5989, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.530388593673706, |
|
"learning_rate": 2.7415966386554622e-05, |
|
"loss": 1.5953, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.5459638833999634, |
|
"learning_rate": 2.7384453781512604e-05, |
|
"loss": 1.5957, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 2.0421242713928223, |
|
"learning_rate": 2.735294117647059e-05, |
|
"loss": 1.5984, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.4634993076324463, |
|
"learning_rate": 2.732142857142857e-05, |
|
"loss": 1.5897, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.530594825744629, |
|
"learning_rate": 2.7289915966386556e-05, |
|
"loss": 1.5902, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.5332798957824707, |
|
"learning_rate": 2.7258403361344538e-05, |
|
"loss": 1.5874, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.753754734992981, |
|
"learning_rate": 2.7226890756302523e-05, |
|
"loss": 1.59, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.5545145273208618, |
|
"learning_rate": 2.7195378151260505e-05, |
|
"loss": 1.5949, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.5194141864776611, |
|
"learning_rate": 2.716386554621849e-05, |
|
"loss": 1.588, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.532632827758789, |
|
"learning_rate": 2.713235294117647e-05, |
|
"loss": 1.5918, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.4970754384994507, |
|
"learning_rate": 2.7100840336134453e-05, |
|
"loss": 1.5851, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.4157612323760986, |
|
"learning_rate": 2.706932773109244e-05, |
|
"loss": 1.5823, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.5014020204544067, |
|
"learning_rate": 2.703781512605042e-05, |
|
"loss": 1.5847, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.4652481079101562, |
|
"learning_rate": 2.7006302521008405e-05, |
|
"loss": 1.5886, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.5810528993606567, |
|
"learning_rate": 2.6974789915966387e-05, |
|
"loss": 1.5805, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.4908738136291504, |
|
"learning_rate": 2.6943277310924372e-05, |
|
"loss": 1.5812, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.4520491361618042, |
|
"learning_rate": 2.6911764705882354e-05, |
|
"loss": 1.5837, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.46824049949646, |
|
"learning_rate": 2.688025210084034e-05, |
|
"loss": 1.5778, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.5032325983047485, |
|
"learning_rate": 2.684873949579832e-05, |
|
"loss": 1.5777, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.5338232517242432, |
|
"learning_rate": 2.6817226890756306e-05, |
|
"loss": 1.5768, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.5439281463623047, |
|
"learning_rate": 2.6785714285714288e-05, |
|
"loss": 1.5782, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.536665439605713, |
|
"learning_rate": 2.675420168067227e-05, |
|
"loss": 1.5758, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.4520212411880493, |
|
"learning_rate": 2.6722689075630255e-05, |
|
"loss": 1.5732, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.5352224111557007, |
|
"learning_rate": 2.6691176470588233e-05, |
|
"loss": 1.5745, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.4939314126968384, |
|
"learning_rate": 2.6659663865546218e-05, |
|
"loss": 1.5724, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.4967976808547974, |
|
"learning_rate": 2.66281512605042e-05, |
|
"loss": 1.5693, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.4980648756027222, |
|
"learning_rate": 2.6596638655462185e-05, |
|
"loss": 1.5721, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.5700784921646118, |
|
"learning_rate": 2.6565126050420167e-05, |
|
"loss": 1.5713, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.5124626159667969, |
|
"learning_rate": 2.6533613445378152e-05, |
|
"loss": 1.5709, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.465012788772583, |
|
"learning_rate": 2.6502100840336134e-05, |
|
"loss": 1.5702, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.4589452743530273, |
|
"learning_rate": 2.647058823529412e-05, |
|
"loss": 1.5675, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.547255516052246, |
|
"learning_rate": 2.64390756302521e-05, |
|
"loss": 1.567, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.5208017826080322, |
|
"learning_rate": 2.6407563025210086e-05, |
|
"loss": 1.5654, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.563560128211975, |
|
"learning_rate": 2.6376050420168067e-05, |
|
"loss": 1.5651, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.4551901817321777, |
|
"learning_rate": 2.634453781512605e-05, |
|
"loss": 1.5692, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 3.783536672592163, |
|
"learning_rate": 2.6313025210084034e-05, |
|
"loss": 1.5698, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.5397638082504272, |
|
"learning_rate": 2.6281512605042016e-05, |
|
"loss": 1.5614, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.5307060480117798, |
|
"learning_rate": 2.625e-05, |
|
"loss": 1.5596, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.5148283243179321, |
|
"learning_rate": 2.6218487394957983e-05, |
|
"loss": 1.5612, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.531973958015442, |
|
"learning_rate": 2.6186974789915968e-05, |
|
"loss": 1.559, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.5402531623840332, |
|
"learning_rate": 2.615546218487395e-05, |
|
"loss": 1.5624, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.486365795135498, |
|
"learning_rate": 2.6123949579831935e-05, |
|
"loss": 1.5601, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.513438105583191, |
|
"learning_rate": 2.6092436974789917e-05, |
|
"loss": 1.5567, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.5112252235412598, |
|
"learning_rate": 2.6060924369747902e-05, |
|
"loss": 1.5574, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.4394776821136475, |
|
"learning_rate": 2.6029411764705883e-05, |
|
"loss": 1.5562, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.6592140197753906, |
|
"learning_rate": 2.599789915966387e-05, |
|
"loss": 1.5551, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.4790719747543335, |
|
"learning_rate": 2.596638655462185e-05, |
|
"loss": 1.5544, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.4369221925735474, |
|
"learning_rate": 2.5934873949579832e-05, |
|
"loss": 1.5538, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.5175668001174927, |
|
"learning_rate": 2.5903361344537817e-05, |
|
"loss": 1.5556, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.4514554738998413, |
|
"learning_rate": 2.58718487394958e-05, |
|
"loss": 1.5539, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.4288485050201416, |
|
"learning_rate": 2.5840336134453784e-05, |
|
"loss": 1.5525, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.546531081199646, |
|
"learning_rate": 2.5808823529411766e-05, |
|
"loss": 1.5527, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.567368507385254, |
|
"learning_rate": 2.5777310924369748e-05, |
|
"loss": 1.5491, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.5126845836639404, |
|
"learning_rate": 2.574579831932773e-05, |
|
"loss": 1.5504, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.5570114850997925, |
|
"learning_rate": 2.5714285714285714e-05, |
|
"loss": 1.5469, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.4678915739059448, |
|
"learning_rate": 2.5682773109243696e-05, |
|
"loss": 1.5493, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.4618594646453857, |
|
"learning_rate": 2.565126050420168e-05, |
|
"loss": 1.555, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.5945430994033813, |
|
"learning_rate": 2.5619747899159663e-05, |
|
"loss": 1.547, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.4740761518478394, |
|
"learning_rate": 2.5588235294117648e-05, |
|
"loss": 1.5463, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.4022290706634521, |
|
"learning_rate": 2.555672268907563e-05, |
|
"loss": 1.5449, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 2.622828722000122, |
|
"learning_rate": 2.552521008403361e-05, |
|
"loss": 1.55, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.409568428993225, |
|
"learning_rate": 2.5493697478991597e-05, |
|
"loss": 1.5436, |
|
"step": 95500 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.4889922142028809, |
|
"learning_rate": 2.546218487394958e-05, |
|
"loss": 1.5441, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.4589875936508179, |
|
"learning_rate": 2.5430672268907564e-05, |
|
"loss": 1.5468, |
|
"step": 96500 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.4680520296096802, |
|
"learning_rate": 2.5399159663865545e-05, |
|
"loss": 1.5429, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.4456883668899536, |
|
"learning_rate": 2.536764705882353e-05, |
|
"loss": 1.5458, |
|
"step": 97500 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.4655406475067139, |
|
"learning_rate": 2.5336134453781512e-05, |
|
"loss": 1.5399, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 7.581863880157471, |
|
"learning_rate": 2.5304621848739497e-05, |
|
"loss": 1.5423, |
|
"step": 98500 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.5289582014083862, |
|
"learning_rate": 2.527310924369748e-05, |
|
"loss": 1.5434, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.475637674331665, |
|
"learning_rate": 2.5241596638655464e-05, |
|
"loss": 1.5415, |
|
"step": 99500 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.45746910572052, |
|
"learning_rate": 2.5210084033613446e-05, |
|
"loss": 1.5401, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.4924384355545044, |
|
"learning_rate": 2.517857142857143e-05, |
|
"loss": 1.5382, |
|
"step": 100500 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.4440650939941406, |
|
"learning_rate": 2.5147058823529413e-05, |
|
"loss": 1.539, |
|
"step": 101000 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.5022001266479492, |
|
"learning_rate": 2.5115546218487395e-05, |
|
"loss": 1.5375, |
|
"step": 101500 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 1.4573357105255127, |
|
"learning_rate": 2.508403361344538e-05, |
|
"loss": 1.5423, |
|
"step": 102000 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 1.4948347806930542, |
|
"learning_rate": 2.505252100840336e-05, |
|
"loss": 1.538, |
|
"step": 102500 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 1.5028940439224243, |
|
"learning_rate": 2.5021008403361347e-05, |
|
"loss": 1.5368, |
|
"step": 103000 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 1.510446310043335, |
|
"learning_rate": 2.498949579831933e-05, |
|
"loss": 1.534, |
|
"step": 103500 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 1.516194462776184, |
|
"learning_rate": 2.4957983193277314e-05, |
|
"loss": 1.5404, |
|
"step": 104000 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 1.452358365058899, |
|
"learning_rate": 2.4926470588235295e-05, |
|
"loss": 1.5349, |
|
"step": 104500 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 1.4550226926803589, |
|
"learning_rate": 2.489495798319328e-05, |
|
"loss": 1.5373, |
|
"step": 105000 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 1.4559545516967773, |
|
"learning_rate": 2.4863445378151262e-05, |
|
"loss": 1.5341, |
|
"step": 105500 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 1.4436681270599365, |
|
"learning_rate": 2.4831932773109244e-05, |
|
"loss": 1.5344, |
|
"step": 106000 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 1.4642813205718994, |
|
"learning_rate": 2.4800420168067226e-05, |
|
"loss": 1.5333, |
|
"step": 106500 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 1.4824906587600708, |
|
"learning_rate": 2.476890756302521e-05, |
|
"loss": 1.5291, |
|
"step": 107000 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 1.515098214149475, |
|
"learning_rate": 2.4737394957983193e-05, |
|
"loss": 1.5285, |
|
"step": 107500 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 2.073720693588257, |
|
"learning_rate": 2.4705882352941174e-05, |
|
"loss": 1.5348, |
|
"step": 108000 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 1.884777545928955, |
|
"learning_rate": 2.467436974789916e-05, |
|
"loss": 1.5321, |
|
"step": 108500 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.4791995286941528, |
|
"learning_rate": 2.464285714285714e-05, |
|
"loss": 1.5305, |
|
"step": 109000 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.4546101093292236, |
|
"learning_rate": 2.4611344537815126e-05, |
|
"loss": 1.5308, |
|
"step": 109500 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.421767234802246, |
|
"learning_rate": 2.4579831932773108e-05, |
|
"loss": 1.532, |
|
"step": 110000 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.476372480392456, |
|
"learning_rate": 2.4548319327731093e-05, |
|
"loss": 1.5303, |
|
"step": 110500 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.4746720790863037, |
|
"learning_rate": 2.4516806722689075e-05, |
|
"loss": 1.531, |
|
"step": 111000 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.486217975616455, |
|
"learning_rate": 2.448529411764706e-05, |
|
"loss": 1.5277, |
|
"step": 111500 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.4249714612960815, |
|
"learning_rate": 2.4453781512605042e-05, |
|
"loss": 1.525, |
|
"step": 112000 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.4237457513809204, |
|
"learning_rate": 2.4422268907563027e-05, |
|
"loss": 1.5263, |
|
"step": 112500 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.4878206253051758, |
|
"learning_rate": 2.439075630252101e-05, |
|
"loss": 1.5239, |
|
"step": 113000 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.4781346321105957, |
|
"learning_rate": 2.4359243697478994e-05, |
|
"loss": 1.528, |
|
"step": 113500 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.4943785667419434, |
|
"learning_rate": 2.4327731092436976e-05, |
|
"loss": 1.5231, |
|
"step": 114000 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.466009497642517, |
|
"learning_rate": 2.4296218487394957e-05, |
|
"loss": 1.5233, |
|
"step": 114500 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 2.4329051971435547, |
|
"learning_rate": 2.4264705882352942e-05, |
|
"loss": 1.5266, |
|
"step": 115000 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.477039098739624, |
|
"learning_rate": 2.4233193277310924e-05, |
|
"loss": 1.5278, |
|
"step": 115500 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.5693820714950562, |
|
"learning_rate": 2.420168067226891e-05, |
|
"loss": 1.5254, |
|
"step": 116000 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.4393528699874878, |
|
"learning_rate": 2.417016806722689e-05, |
|
"loss": 1.5236, |
|
"step": 116500 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.4845529794692993, |
|
"learning_rate": 2.4138655462184876e-05, |
|
"loss": 1.5206, |
|
"step": 117000 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.476683259010315, |
|
"learning_rate": 2.4107142857142858e-05, |
|
"loss": 1.5208, |
|
"step": 117500 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.428836703300476, |
|
"learning_rate": 2.4075630252100843e-05, |
|
"loss": 1.5234, |
|
"step": 118000 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.449540138244629, |
|
"learning_rate": 2.4044117647058825e-05, |
|
"loss": 1.5234, |
|
"step": 118500 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.4410090446472168, |
|
"learning_rate": 2.401260504201681e-05, |
|
"loss": 1.5203, |
|
"step": 119000 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.4714431762695312, |
|
"learning_rate": 2.398109243697479e-05, |
|
"loss": 1.5208, |
|
"step": 119500 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.469762921333313, |
|
"learning_rate": 2.3949579831932777e-05, |
|
"loss": 1.524, |
|
"step": 120000 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.5507971048355103, |
|
"learning_rate": 2.391806722689076e-05, |
|
"loss": 1.5224, |
|
"step": 120500 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.5093679428100586, |
|
"learning_rate": 2.3886554621848737e-05, |
|
"loss": 1.5235, |
|
"step": 121000 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.492244839668274, |
|
"learning_rate": 2.3855042016806722e-05, |
|
"loss": 1.5196, |
|
"step": 121500 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.4522676467895508, |
|
"learning_rate": 2.3823529411764704e-05, |
|
"loss": 1.5209, |
|
"step": 122000 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.527627944946289, |
|
"learning_rate": 2.379201680672269e-05, |
|
"loss": 1.5198, |
|
"step": 122500 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.488146424293518, |
|
"learning_rate": 2.376050420168067e-05, |
|
"loss": 1.5165, |
|
"step": 123000 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.4484755992889404, |
|
"learning_rate": 2.3728991596638656e-05, |
|
"loss": 1.5123, |
|
"step": 123500 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.5184931755065918, |
|
"learning_rate": 2.3697478991596638e-05, |
|
"loss": 1.5177, |
|
"step": 124000 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.4979966878890991, |
|
"learning_rate": 2.3665966386554623e-05, |
|
"loss": 1.5193, |
|
"step": 124500 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.4858919382095337, |
|
"learning_rate": 2.3634453781512604e-05, |
|
"loss": 1.5129, |
|
"step": 125000 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.6100457906723022, |
|
"learning_rate": 2.360294117647059e-05, |
|
"loss": 1.5153, |
|
"step": 125500 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.4573218822479248, |
|
"learning_rate": 2.357142857142857e-05, |
|
"loss": 1.5173, |
|
"step": 126000 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.4780622720718384, |
|
"learning_rate": 2.3539915966386556e-05, |
|
"loss": 1.5142, |
|
"step": 126500 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.4847768545150757, |
|
"learning_rate": 2.3508403361344538e-05, |
|
"loss": 1.5123, |
|
"step": 127000 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 1.789902925491333, |
|
"learning_rate": 2.347689075630252e-05, |
|
"loss": 1.5128, |
|
"step": 127500 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 1.4414323568344116, |
|
"learning_rate": 2.3445378151260505e-05, |
|
"loss": 1.5112, |
|
"step": 128000 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 1.542536735534668, |
|
"learning_rate": 2.3413865546218487e-05, |
|
"loss": 1.5132, |
|
"step": 128500 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 1.479336142539978, |
|
"learning_rate": 2.3382352941176472e-05, |
|
"loss": 1.5091, |
|
"step": 129000 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 1.5068061351776123, |
|
"learning_rate": 2.3350840336134454e-05, |
|
"loss": 1.5157, |
|
"step": 129500 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 1.5134038925170898, |
|
"learning_rate": 2.331932773109244e-05, |
|
"loss": 1.5145, |
|
"step": 130000 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 2.804521083831787, |
|
"learning_rate": 2.328781512605042e-05, |
|
"loss": 1.71, |
|
"step": 130500 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 9.153915405273438, |
|
"learning_rate": 2.3256302521008406e-05, |
|
"loss": 1.5874, |
|
"step": 131000 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 3.567737579345703, |
|
"learning_rate": 2.3224789915966387e-05, |
|
"loss": 1.5532, |
|
"step": 131500 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.5058925151824951, |
|
"learning_rate": 2.3193277310924373e-05, |
|
"loss": 1.5241, |
|
"step": 132000 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.48910653591156, |
|
"learning_rate": 2.3161764705882354e-05, |
|
"loss": 1.5197, |
|
"step": 132500 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.477921962738037, |
|
"learning_rate": 2.313025210084034e-05, |
|
"loss": 1.5191, |
|
"step": 133000 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.503013014793396, |
|
"learning_rate": 2.309873949579832e-05, |
|
"loss": 1.5112, |
|
"step": 133500 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.457146406173706, |
|
"learning_rate": 2.3067226890756303e-05, |
|
"loss": 1.5158, |
|
"step": 134000 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.8954756259918213, |
|
"learning_rate": 2.3035714285714288e-05, |
|
"loss": 1.5138, |
|
"step": 134500 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 1.5171183347702026, |
|
"learning_rate": 2.300420168067227e-05, |
|
"loss": 1.5201, |
|
"step": 135000 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 1.454849362373352, |
|
"learning_rate": 2.2972689075630255e-05, |
|
"loss": 1.5113, |
|
"step": 135500 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 2.3639023303985596, |
|
"learning_rate": 2.2941176470588233e-05, |
|
"loss": 1.5089, |
|
"step": 136000 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 1.4599758386611938, |
|
"learning_rate": 2.290966386554622e-05, |
|
"loss": 1.5099, |
|
"step": 136500 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 1.5151523351669312, |
|
"learning_rate": 2.28781512605042e-05, |
|
"loss": 1.5077, |
|
"step": 137000 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 1.518723726272583, |
|
"learning_rate": 2.2846638655462185e-05, |
|
"loss": 1.5097, |
|
"step": 137500 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 1.5430985689163208, |
|
"learning_rate": 2.2815126050420167e-05, |
|
"loss": 1.5089, |
|
"step": 138000 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 1.468233585357666, |
|
"learning_rate": 2.2783613445378152e-05, |
|
"loss": 1.5075, |
|
"step": 138500 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.540824294090271, |
|
"learning_rate": 2.2752100840336134e-05, |
|
"loss": 1.5095, |
|
"step": 139000 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.4792211055755615, |
|
"learning_rate": 2.272058823529412e-05, |
|
"loss": 1.5123, |
|
"step": 139500 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.4582479000091553, |
|
"learning_rate": 2.26890756302521e-05, |
|
"loss": 1.5041, |
|
"step": 140000 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.4484353065490723, |
|
"learning_rate": 2.2657563025210083e-05, |
|
"loss": 1.5098, |
|
"step": 140500 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 2.090087413787842, |
|
"learning_rate": 2.2626050420168068e-05, |
|
"loss": 1.504, |
|
"step": 141000 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.5165677070617676, |
|
"learning_rate": 2.259453781512605e-05, |
|
"loss": 1.5037, |
|
"step": 141500 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.4467180967330933, |
|
"learning_rate": 2.2563025210084035e-05, |
|
"loss": 1.5037, |
|
"step": 142000 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 1.53107750415802, |
|
"learning_rate": 2.2531512605042016e-05, |
|
"loss": 1.5048, |
|
"step": 142500 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 1.685832142829895, |
|
"learning_rate": 2.25e-05, |
|
"loss": 1.5051, |
|
"step": 143000 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 1.722901701927185, |
|
"learning_rate": 2.2468487394957983e-05, |
|
"loss": 1.5038, |
|
"step": 143500 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 1.5191560983657837, |
|
"learning_rate": 2.2436974789915968e-05, |
|
"loss": 1.5021, |
|
"step": 144000 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 1.6680717468261719, |
|
"learning_rate": 2.240546218487395e-05, |
|
"loss": 1.5019, |
|
"step": 144500 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 1.5664371252059937, |
|
"learning_rate": 2.2373949579831935e-05, |
|
"loss": 1.5028, |
|
"step": 145000 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 1.484131932258606, |
|
"learning_rate": 2.2342436974789917e-05, |
|
"loss": 1.5028, |
|
"step": 145500 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.4882657527923584, |
|
"learning_rate": 2.2310924369747902e-05, |
|
"loss": 1.4993, |
|
"step": 146000 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.4583569765090942, |
|
"learning_rate": 2.2279411764705884e-05, |
|
"loss": 1.5037, |
|
"step": 146500 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.559399127960205, |
|
"learning_rate": 2.2247899159663866e-05, |
|
"loss": 1.4994, |
|
"step": 147000 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.537287950515747, |
|
"learning_rate": 2.221638655462185e-05, |
|
"loss": 1.5008, |
|
"step": 147500 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.4840517044067383, |
|
"learning_rate": 2.2184873949579832e-05, |
|
"loss": 1.5003, |
|
"step": 148000 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.6292195320129395, |
|
"learning_rate": 2.2153361344537818e-05, |
|
"loss": 1.4975, |
|
"step": 148500 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.4870771169662476, |
|
"learning_rate": 2.21218487394958e-05, |
|
"loss": 1.4962, |
|
"step": 149000 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.4792907238006592, |
|
"learning_rate": 2.2090336134453784e-05, |
|
"loss": 1.4978, |
|
"step": 149500 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 1.4179558753967285, |
|
"learning_rate": 2.2058823529411766e-05, |
|
"loss": 1.5012, |
|
"step": 150000 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 1.4594039916992188, |
|
"learning_rate": 2.2027310924369748e-05, |
|
"loss": 1.4987, |
|
"step": 150500 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 1.5356736183166504, |
|
"learning_rate": 2.199579831932773e-05, |
|
"loss": 1.4975, |
|
"step": 151000 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 1.4961708784103394, |
|
"learning_rate": 2.1964285714285715e-05, |
|
"loss": 1.4966, |
|
"step": 151500 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 1.5061964988708496, |
|
"learning_rate": 2.1932773109243697e-05, |
|
"loss": 1.4952, |
|
"step": 152000 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 1.4668192863464355, |
|
"learning_rate": 2.190126050420168e-05, |
|
"loss": 1.4955, |
|
"step": 152500 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 1.520202398300171, |
|
"learning_rate": 2.1869747899159663e-05, |
|
"loss": 1.4987, |
|
"step": 153000 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.5048165321350098, |
|
"learning_rate": 2.1838235294117645e-05, |
|
"loss": 1.4943, |
|
"step": 153500 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.4194804430007935, |
|
"learning_rate": 2.180672268907563e-05, |
|
"loss": 1.4962, |
|
"step": 154000 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.4963053464889526, |
|
"learning_rate": 2.1775210084033612e-05, |
|
"loss": 1.4939, |
|
"step": 154500 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.5189534425735474, |
|
"learning_rate": 2.1743697478991597e-05, |
|
"loss": 1.4955, |
|
"step": 155000 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.844502329826355, |
|
"learning_rate": 2.171218487394958e-05, |
|
"loss": 1.4932, |
|
"step": 155500 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.6127697229385376, |
|
"learning_rate": 2.1680672268907564e-05, |
|
"loss": 1.4972, |
|
"step": 156000 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 2.39309024810791, |
|
"learning_rate": 2.1649159663865546e-05, |
|
"loss": 1.4961, |
|
"step": 156500 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.7886457443237305, |
|
"learning_rate": 2.161764705882353e-05, |
|
"loss": 1.4981, |
|
"step": 157000 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.5055351257324219, |
|
"learning_rate": 2.1586134453781513e-05, |
|
"loss": 1.4937, |
|
"step": 157500 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 2.2209436893463135, |
|
"learning_rate": 2.1554621848739498e-05, |
|
"loss": 1.4958, |
|
"step": 158000 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.4863665103912354, |
|
"learning_rate": 2.152310924369748e-05, |
|
"loss": 1.4937, |
|
"step": 158500 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.6290695667266846, |
|
"learning_rate": 2.1491596638655465e-05, |
|
"loss": 1.4934, |
|
"step": 159000 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.5069892406463623, |
|
"learning_rate": 2.1460084033613446e-05, |
|
"loss": 1.4966, |
|
"step": 159500 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.4480432271957397, |
|
"learning_rate": 2.1428571428571428e-05, |
|
"loss": 1.4928, |
|
"step": 160000 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.4599815607070923, |
|
"learning_rate": 2.1397058823529413e-05, |
|
"loss": 1.4907, |
|
"step": 160500 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 1.5667592287063599, |
|
"learning_rate": 2.1365546218487395e-05, |
|
"loss": 1.4946, |
|
"step": 161000 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 1.591620683670044, |
|
"learning_rate": 2.133403361344538e-05, |
|
"loss": 1.4932, |
|
"step": 161500 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 1.4108275175094604, |
|
"learning_rate": 2.1302521008403362e-05, |
|
"loss": 1.4918, |
|
"step": 162000 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 1.3984153270721436, |
|
"learning_rate": 2.1271008403361347e-05, |
|
"loss": 1.4912, |
|
"step": 162500 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 1.5187551975250244, |
|
"learning_rate": 2.123949579831933e-05, |
|
"loss": 1.4896, |
|
"step": 163000 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 1.4671634435653687, |
|
"learning_rate": 2.1207983193277314e-05, |
|
"loss": 1.4909, |
|
"step": 163500 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 1.5398577451705933, |
|
"learning_rate": 2.1176470588235296e-05, |
|
"loss": 1.4898, |
|
"step": 164000 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.4390913248062134, |
|
"learning_rate": 2.114495798319328e-05, |
|
"loss": 1.4917, |
|
"step": 164500 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.466871976852417, |
|
"learning_rate": 2.1113445378151263e-05, |
|
"loss": 1.486, |
|
"step": 165000 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.4268947839736938, |
|
"learning_rate": 2.1081932773109244e-05, |
|
"loss": 1.486, |
|
"step": 165500 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.473212718963623, |
|
"learning_rate": 2.1050420168067226e-05, |
|
"loss": 1.4906, |
|
"step": 166000 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.4817694425582886, |
|
"learning_rate": 2.1018907563025208e-05, |
|
"loss": 1.4876, |
|
"step": 166500 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.4899072647094727, |
|
"learning_rate": 2.0987394957983193e-05, |
|
"loss": 1.4853, |
|
"step": 167000 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.472068428993225, |
|
"learning_rate": 2.0955882352941175e-05, |
|
"loss": 1.4859, |
|
"step": 167500 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.4609180688858032, |
|
"learning_rate": 2.092436974789916e-05, |
|
"loss": 1.4867, |
|
"step": 168000 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.3884390592575073, |
|
"learning_rate": 2.089285714285714e-05, |
|
"loss": 1.4845, |
|
"step": 168500 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.4505021572113037, |
|
"learning_rate": 2.0861344537815127e-05, |
|
"loss": 1.4804, |
|
"step": 169000 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.4579660892486572, |
|
"learning_rate": 2.082983193277311e-05, |
|
"loss": 1.4828, |
|
"step": 169500 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.4193936586380005, |
|
"learning_rate": 2.0798319327731094e-05, |
|
"loss": 1.4846, |
|
"step": 170000 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.8833608627319336, |
|
"learning_rate": 2.0766806722689075e-05, |
|
"loss": 1.4832, |
|
"step": 170500 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.394463062286377, |
|
"learning_rate": 2.073529411764706e-05, |
|
"loss": 1.4858, |
|
"step": 171000 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.4402869939804077, |
|
"learning_rate": 2.0703781512605042e-05, |
|
"loss": 1.4853, |
|
"step": 171500 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.5677118301391602, |
|
"learning_rate": 2.0672268907563024e-05, |
|
"loss": 1.4828, |
|
"step": 172000 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.412744402885437, |
|
"learning_rate": 2.064075630252101e-05, |
|
"loss": 1.4861, |
|
"step": 172500 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.578121542930603, |
|
"learning_rate": 2.060924369747899e-05, |
|
"loss": 1.4825, |
|
"step": 173000 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.4429398775100708, |
|
"learning_rate": 2.0577731092436976e-05, |
|
"loss": 1.4806, |
|
"step": 173500 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.5229464769363403, |
|
"learning_rate": 2.0546218487394958e-05, |
|
"loss": 1.4822, |
|
"step": 174000 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.533868670463562, |
|
"learning_rate": 2.0514705882352943e-05, |
|
"loss": 1.4788, |
|
"step": 174500 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.4442238807678223, |
|
"learning_rate": 2.0483193277310925e-05, |
|
"loss": 1.4845, |
|
"step": 175000 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.8768386840820312, |
|
"learning_rate": 2.045168067226891e-05, |
|
"loss": 1.481, |
|
"step": 175500 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.5719354152679443, |
|
"learning_rate": 2.042016806722689e-05, |
|
"loss": 1.4815, |
|
"step": 176000 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.6776522397994995, |
|
"learning_rate": 2.0388655462184877e-05, |
|
"loss": 1.4834, |
|
"step": 176500 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.462403416633606, |
|
"learning_rate": 2.0357142857142858e-05, |
|
"loss": 1.4829, |
|
"step": 177000 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.441434621810913, |
|
"learning_rate": 2.0325630252100843e-05, |
|
"loss": 1.4817, |
|
"step": 177500 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.7203949689865112, |
|
"learning_rate": 2.0294117647058825e-05, |
|
"loss": 1.4819, |
|
"step": 178000 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.6117925643920898, |
|
"learning_rate": 2.0262605042016807e-05, |
|
"loss": 1.48, |
|
"step": 178500 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.4840322732925415, |
|
"learning_rate": 2.0231092436974792e-05, |
|
"loss": 1.4804, |
|
"step": 179000 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 1.4823276996612549, |
|
"learning_rate": 2.0199579831932774e-05, |
|
"loss": 1.4783, |
|
"step": 179500 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 1.467035174369812, |
|
"learning_rate": 2.016806722689076e-05, |
|
"loss": 1.4826, |
|
"step": 180000 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 1.4519331455230713, |
|
"learning_rate": 2.0136554621848737e-05, |
|
"loss": 1.4793, |
|
"step": 180500 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 1.4830392599105835, |
|
"learning_rate": 2.0105042016806722e-05, |
|
"loss": 1.478, |
|
"step": 181000 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 1.4889652729034424, |
|
"learning_rate": 2.0073529411764704e-05, |
|
"loss": 1.4825, |
|
"step": 181500 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 1.4417020082473755, |
|
"learning_rate": 2.004201680672269e-05, |
|
"loss": 1.4781, |
|
"step": 182000 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 1.5612033605575562, |
|
"learning_rate": 2.001050420168067e-05, |
|
"loss": 1.4749, |
|
"step": 182500 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.923521637916565, |
|
"learning_rate": 1.9978991596638656e-05, |
|
"loss": 1.4742, |
|
"step": 183000 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.4759869575500488, |
|
"learning_rate": 1.9947478991596638e-05, |
|
"loss": 1.4772, |
|
"step": 183500 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.4529997110366821, |
|
"learning_rate": 1.9915966386554623e-05, |
|
"loss": 1.4758, |
|
"step": 184000 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.4907563924789429, |
|
"learning_rate": 1.9884453781512605e-05, |
|
"loss": 1.477, |
|
"step": 184500 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.4529681205749512, |
|
"learning_rate": 1.9852941176470586e-05, |
|
"loss": 1.4754, |
|
"step": 185000 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.4950664043426514, |
|
"learning_rate": 1.982142857142857e-05, |
|
"loss": 1.477, |
|
"step": 185500 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.5445144176483154, |
|
"learning_rate": 1.9789915966386553e-05, |
|
"loss": 1.4763, |
|
"step": 186000 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 2.2947561740875244, |
|
"learning_rate": 1.975840336134454e-05, |
|
"loss": 1.4771, |
|
"step": 186500 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 1.4762338399887085, |
|
"learning_rate": 1.972689075630252e-05, |
|
"loss": 1.4748, |
|
"step": 187000 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 1.5006557703018188, |
|
"learning_rate": 1.9695378151260505e-05, |
|
"loss": 1.474, |
|
"step": 187500 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 1.5126187801361084, |
|
"learning_rate": 1.9663865546218487e-05, |
|
"loss": 1.4769, |
|
"step": 188000 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 3.9213035106658936, |
|
"learning_rate": 1.9632352941176472e-05, |
|
"loss": 1.4724, |
|
"step": 188500 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 1.3832660913467407, |
|
"learning_rate": 1.9600840336134454e-05, |
|
"loss": 1.4743, |
|
"step": 189000 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 1.438021183013916, |
|
"learning_rate": 1.956932773109244e-05, |
|
"loss": 1.4732, |
|
"step": 189500 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 1.552357792854309, |
|
"learning_rate": 1.953781512605042e-05, |
|
"loss": 1.4693, |
|
"step": 190000 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.4992841482162476, |
|
"learning_rate": 1.9506302521008406e-05, |
|
"loss": 1.4741, |
|
"step": 190500 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.4546705484390259, |
|
"learning_rate": 1.9474789915966388e-05, |
|
"loss": 1.4709, |
|
"step": 191000 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.5536097288131714, |
|
"learning_rate": 1.944327731092437e-05, |
|
"loss": 1.4715, |
|
"step": 191500 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.4430129528045654, |
|
"learning_rate": 1.9411764705882355e-05, |
|
"loss": 1.4694, |
|
"step": 192000 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.4931637048721313, |
|
"learning_rate": 1.9380252100840336e-05, |
|
"loss": 1.4704, |
|
"step": 192500 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.4820243120193481, |
|
"learning_rate": 1.934873949579832e-05, |
|
"loss": 1.4707, |
|
"step": 193000 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.5232768058776855, |
|
"learning_rate": 1.9317226890756303e-05, |
|
"loss": 1.4692, |
|
"step": 193500 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.517333745956421, |
|
"learning_rate": 1.928571428571429e-05, |
|
"loss": 1.4731, |
|
"step": 194000 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.4523952007293701, |
|
"learning_rate": 1.925420168067227e-05, |
|
"loss": 1.4698, |
|
"step": 194500 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.4807761907577515, |
|
"learning_rate": 1.9222689075630255e-05, |
|
"loss": 1.4719, |
|
"step": 195000 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.4389820098876953, |
|
"learning_rate": 1.9191176470588234e-05, |
|
"loss": 1.4709, |
|
"step": 195500 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 3.7379424571990967, |
|
"learning_rate": 1.915966386554622e-05, |
|
"loss": 1.4663, |
|
"step": 196000 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.4896109104156494, |
|
"learning_rate": 1.91281512605042e-05, |
|
"loss": 1.4709, |
|
"step": 196500 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 5.979303359985352, |
|
"learning_rate": 1.9096638655462186e-05, |
|
"loss": 1.4743, |
|
"step": 197000 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.4648813009262085, |
|
"learning_rate": 1.9065126050420167e-05, |
|
"loss": 1.4687, |
|
"step": 197500 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.739353895187378, |
|
"learning_rate": 1.903361344537815e-05, |
|
"loss": 1.4702, |
|
"step": 198000 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.4263814687728882, |
|
"learning_rate": 1.9002100840336134e-05, |
|
"loss": 1.4695, |
|
"step": 198500 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.5090336799621582, |
|
"learning_rate": 1.8970588235294116e-05, |
|
"loss": 1.4667, |
|
"step": 199000 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.4606796503067017, |
|
"learning_rate": 1.89390756302521e-05, |
|
"loss": 1.4665, |
|
"step": 199500 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.4979524612426758, |
|
"learning_rate": 1.8907563025210083e-05, |
|
"loss": 1.4645, |
|
"step": 200000 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.5032795667648315, |
|
"learning_rate": 1.8876050420168068e-05, |
|
"loss": 1.4697, |
|
"step": 200500 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.4917629957199097, |
|
"learning_rate": 1.884453781512605e-05, |
|
"loss": 1.4654, |
|
"step": 201000 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.5047801733016968, |
|
"learning_rate": 1.8813025210084035e-05, |
|
"loss": 1.4665, |
|
"step": 201500 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.5550223588943481, |
|
"learning_rate": 1.8781512605042017e-05, |
|
"loss": 1.4669, |
|
"step": 202000 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.4432892799377441, |
|
"learning_rate": 1.8750000000000002e-05, |
|
"loss": 1.4652, |
|
"step": 202500 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.4227643013000488, |
|
"learning_rate": 1.8718487394957983e-05, |
|
"loss": 1.465, |
|
"step": 203000 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.5878413915634155, |
|
"learning_rate": 1.868697478991597e-05, |
|
"loss": 1.4675, |
|
"step": 203500 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.5786782503128052, |
|
"learning_rate": 1.865546218487395e-05, |
|
"loss": 1.4596, |
|
"step": 204000 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.4224051237106323, |
|
"learning_rate": 1.8623949579831932e-05, |
|
"loss": 1.462, |
|
"step": 204500 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.7678115367889404, |
|
"learning_rate": 1.8592436974789917e-05, |
|
"loss": 1.4614, |
|
"step": 205000 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 1.4170020818710327, |
|
"learning_rate": 1.85609243697479e-05, |
|
"loss": 1.4649, |
|
"step": 205500 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 1.5474693775177002, |
|
"learning_rate": 1.8529411764705884e-05, |
|
"loss": 1.464, |
|
"step": 206000 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 1.4655749797821045, |
|
"learning_rate": 1.8497899159663866e-05, |
|
"loss": 1.4654, |
|
"step": 206500 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 1.6294610500335693, |
|
"learning_rate": 1.846638655462185e-05, |
|
"loss": 1.4616, |
|
"step": 207000 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 1.4760308265686035, |
|
"learning_rate": 1.8434873949579833e-05, |
|
"loss": 1.4643, |
|
"step": 207500 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 1.4796357154846191, |
|
"learning_rate": 1.8403361344537818e-05, |
|
"loss": 1.4659, |
|
"step": 208000 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 1.9592546224594116, |
|
"learning_rate": 1.83718487394958e-05, |
|
"loss": 1.4611, |
|
"step": 208500 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.493324637413025, |
|
"learning_rate": 1.8340336134453785e-05, |
|
"loss": 1.4626, |
|
"step": 209000 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.453369379043579, |
|
"learning_rate": 1.8308823529411766e-05, |
|
"loss": 1.4603, |
|
"step": 209500 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.5146046876907349, |
|
"learning_rate": 1.8277310924369748e-05, |
|
"loss": 1.4594, |
|
"step": 210000 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.424707293510437, |
|
"learning_rate": 1.824579831932773e-05, |
|
"loss": 1.4631, |
|
"step": 210500 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.464998722076416, |
|
"learning_rate": 1.8214285714285712e-05, |
|
"loss": 1.4617, |
|
"step": 211000 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.4314439296722412, |
|
"learning_rate": 1.8182773109243697e-05, |
|
"loss": 1.4611, |
|
"step": 211500 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.4533342123031616, |
|
"learning_rate": 1.815126050420168e-05, |
|
"loss": 1.4591, |
|
"step": 212000 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.5328502655029297, |
|
"learning_rate": 1.8119747899159664e-05, |
|
"loss": 1.4606, |
|
"step": 212500 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.4684851169586182, |
|
"learning_rate": 1.8088235294117645e-05, |
|
"loss": 1.463, |
|
"step": 213000 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.512421727180481, |
|
"learning_rate": 1.805672268907563e-05, |
|
"loss": 1.4585, |
|
"step": 213500 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.5069866180419922, |
|
"learning_rate": 1.8025210084033612e-05, |
|
"loss": 1.4565, |
|
"step": 214000 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.4224152565002441, |
|
"learning_rate": 1.7993697478991597e-05, |
|
"loss": 1.4575, |
|
"step": 214500 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.6329984664916992, |
|
"learning_rate": 1.796218487394958e-05, |
|
"loss": 1.4541, |
|
"step": 215000 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.587007761001587, |
|
"learning_rate": 1.7930672268907564e-05, |
|
"loss": 1.4572, |
|
"step": 215500 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.4805065393447876, |
|
"learning_rate": 1.7899159663865546e-05, |
|
"loss": 1.4618, |
|
"step": 216000 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.517993450164795, |
|
"learning_rate": 1.786764705882353e-05, |
|
"loss": 1.4538, |
|
"step": 216500 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.4399406909942627, |
|
"learning_rate": 1.7836134453781513e-05, |
|
"loss": 1.4576, |
|
"step": 217000 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.4458235502243042, |
|
"learning_rate": 1.7804621848739495e-05, |
|
"loss": 1.4558, |
|
"step": 217500 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.5840320587158203, |
|
"learning_rate": 1.777310924369748e-05, |
|
"loss": 1.4562, |
|
"step": 218000 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.4832299947738647, |
|
"learning_rate": 1.774159663865546e-05, |
|
"loss": 1.456, |
|
"step": 218500 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.4003788232803345, |
|
"learning_rate": 1.7710084033613447e-05, |
|
"loss": 1.4555, |
|
"step": 219000 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.5091036558151245, |
|
"learning_rate": 1.767857142857143e-05, |
|
"loss": 1.4596, |
|
"step": 219500 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.4758837223052979, |
|
"learning_rate": 1.7647058823529414e-05, |
|
"loss": 1.4566, |
|
"step": 220000 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.4372687339782715, |
|
"learning_rate": 1.7615546218487395e-05, |
|
"loss": 1.4524, |
|
"step": 220500 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.4391896724700928, |
|
"learning_rate": 1.758403361344538e-05, |
|
"loss": 1.4565, |
|
"step": 221000 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.4493831396102905, |
|
"learning_rate": 1.7552521008403362e-05, |
|
"loss": 1.4543, |
|
"step": 221500 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 2.0319833755493164, |
|
"learning_rate": 1.7521008403361347e-05, |
|
"loss": 1.4536, |
|
"step": 222000 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.4861342906951904, |
|
"learning_rate": 1.748949579831933e-05, |
|
"loss": 1.454, |
|
"step": 222500 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.4432348012924194, |
|
"learning_rate": 1.7457983193277314e-05, |
|
"loss": 1.4546, |
|
"step": 223000 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 1.4457755088806152, |
|
"learning_rate": 1.7426470588235296e-05, |
|
"loss": 1.4542, |
|
"step": 223500 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 1.4785292148590088, |
|
"learning_rate": 1.7394957983193278e-05, |
|
"loss": 1.4539, |
|
"step": 224000 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 1.4646965265274048, |
|
"learning_rate": 1.7363445378151263e-05, |
|
"loss": 1.4557, |
|
"step": 224500 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 1.3340420722961426, |
|
"learning_rate": 1.733193277310924e-05, |
|
"loss": 1.4512, |
|
"step": 225000 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 1.4864197969436646, |
|
"learning_rate": 1.7300420168067226e-05, |
|
"loss": 1.4514, |
|
"step": 225500 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 1.441954493522644, |
|
"learning_rate": 1.7268907563025208e-05, |
|
"loss": 1.4565, |
|
"step": 226000 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 1.4796494245529175, |
|
"learning_rate": 1.7237394957983193e-05, |
|
"loss": 1.4549, |
|
"step": 226500 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 1.5095195770263672, |
|
"learning_rate": 1.7205882352941175e-05, |
|
"loss": 1.4538, |
|
"step": 227000 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.6988993883132935, |
|
"learning_rate": 1.717436974789916e-05, |
|
"loss": 1.4552, |
|
"step": 227500 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.4422426223754883, |
|
"learning_rate": 1.7142857142857142e-05, |
|
"loss": 1.4514, |
|
"step": 228000 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.4488030672073364, |
|
"learning_rate": 1.7111344537815127e-05, |
|
"loss": 1.4545, |
|
"step": 228500 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.4784460067749023, |
|
"learning_rate": 1.707983193277311e-05, |
|
"loss": 1.4527, |
|
"step": 229000 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.4642586708068848, |
|
"learning_rate": 1.7048319327731094e-05, |
|
"loss": 1.4483, |
|
"step": 229500 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.509343147277832, |
|
"learning_rate": 1.7016806722689076e-05, |
|
"loss": 1.4543, |
|
"step": 230000 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.3862849473953247, |
|
"learning_rate": 1.6985294117647057e-05, |
|
"loss": 1.4531, |
|
"step": 230500 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 1.4223895072937012, |
|
"learning_rate": 1.6953781512605042e-05, |
|
"loss": 1.451, |
|
"step": 231000 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 1.4616318941116333, |
|
"learning_rate": 1.6922268907563024e-05, |
|
"loss": 1.4511, |
|
"step": 231500 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 1.4746378660202026, |
|
"learning_rate": 1.689075630252101e-05, |
|
"loss": 1.4497, |
|
"step": 232000 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 1.461519479751587, |
|
"learning_rate": 1.685924369747899e-05, |
|
"loss": 1.4516, |
|
"step": 232500 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 1.3925315141677856, |
|
"learning_rate": 1.6827731092436976e-05, |
|
"loss": 1.4507, |
|
"step": 233000 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 1.4032963514328003, |
|
"learning_rate": 1.6796218487394958e-05, |
|
"loss": 1.4497, |
|
"step": 233500 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 1.4162888526916504, |
|
"learning_rate": 1.6764705882352943e-05, |
|
"loss": 1.4482, |
|
"step": 234000 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 1.3672780990600586, |
|
"learning_rate": 1.6733193277310925e-05, |
|
"loss": 1.4518, |
|
"step": 234500 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.522310733795166, |
|
"learning_rate": 1.670168067226891e-05, |
|
"loss": 1.4516, |
|
"step": 235000 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.3994154930114746, |
|
"learning_rate": 1.6670168067226892e-05, |
|
"loss": 1.4468, |
|
"step": 235500 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.4941591024398804, |
|
"learning_rate": 1.6638655462184877e-05, |
|
"loss": 1.4491, |
|
"step": 236000 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.4521230459213257, |
|
"learning_rate": 1.660714285714286e-05, |
|
"loss": 1.4475, |
|
"step": 236500 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.528152585029602, |
|
"learning_rate": 1.657563025210084e-05, |
|
"loss": 1.4473, |
|
"step": 237000 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.4769060611724854, |
|
"learning_rate": 1.6544117647058825e-05, |
|
"loss": 1.4463, |
|
"step": 237500 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.4506659507751465, |
|
"learning_rate": 1.6512605042016807e-05, |
|
"loss": 1.4458, |
|
"step": 238000 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 1.491810917854309, |
|
"learning_rate": 1.6481092436974792e-05, |
|
"loss": 1.4498, |
|
"step": 238500 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 1.4600553512573242, |
|
"learning_rate": 1.6449579831932774e-05, |
|
"loss": 1.4444, |
|
"step": 239000 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 1.4451686143875122, |
|
"learning_rate": 1.641806722689076e-05, |
|
"loss": 1.4441, |
|
"step": 239500 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 1.4227120876312256, |
|
"learning_rate": 1.6386554621848738e-05, |
|
"loss": 1.4448, |
|
"step": 240000 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 1.5668320655822754, |
|
"learning_rate": 1.6355042016806723e-05, |
|
"loss": 1.4456, |
|
"step": 240500 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 1.3923659324645996, |
|
"learning_rate": 1.6323529411764704e-05, |
|
"loss": 1.4477, |
|
"step": 241000 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 1.4962598085403442, |
|
"learning_rate": 1.629201680672269e-05, |
|
"loss": 1.4454, |
|
"step": 241500 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.4878734350204468, |
|
"learning_rate": 1.626050420168067e-05, |
|
"loss": 1.4461, |
|
"step": 242000 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.4973180294036865, |
|
"learning_rate": 1.6228991596638656e-05, |
|
"loss": 1.4464, |
|
"step": 242500 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.4737753868103027, |
|
"learning_rate": 1.6197478991596638e-05, |
|
"loss": 1.444, |
|
"step": 243000 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.4609256982803345, |
|
"learning_rate": 1.616596638655462e-05, |
|
"loss": 1.4479, |
|
"step": 243500 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.4048258066177368, |
|
"learning_rate": 1.6134453781512605e-05, |
|
"loss": 1.4428, |
|
"step": 244000 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.399703025817871, |
|
"learning_rate": 1.6102941176470587e-05, |
|
"loss": 1.4433, |
|
"step": 244500 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.5445500612258911, |
|
"learning_rate": 1.6071428571428572e-05, |
|
"loss": 1.4455, |
|
"step": 245000 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.4742292165756226, |
|
"learning_rate": 1.6039915966386554e-05, |
|
"loss": 1.4428, |
|
"step": 245500 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 1.4535382986068726, |
|
"learning_rate": 1.600840336134454e-05, |
|
"loss": 1.4453, |
|
"step": 246000 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 1.467373013496399, |
|
"learning_rate": 1.597689075630252e-05, |
|
"loss": 1.4459, |
|
"step": 246500 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 1.4863603115081787, |
|
"learning_rate": 1.5945378151260506e-05, |
|
"loss": 1.4444, |
|
"step": 247000 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 1.5373426675796509, |
|
"learning_rate": 1.5913865546218487e-05, |
|
"loss": 1.4418, |
|
"step": 247500 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 1.4747397899627686, |
|
"learning_rate": 1.5882352941176473e-05, |
|
"loss": 1.4423, |
|
"step": 248000 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 1.5024008750915527, |
|
"learning_rate": 1.5850840336134454e-05, |
|
"loss": 1.4466, |
|
"step": 248500 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 1.481330394744873, |
|
"learning_rate": 1.581932773109244e-05, |
|
"loss": 1.4395, |
|
"step": 249000 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.419636607170105, |
|
"learning_rate": 1.578781512605042e-05, |
|
"loss": 1.4416, |
|
"step": 249500 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.4620583057403564, |
|
"learning_rate": 1.5756302521008403e-05, |
|
"loss": 1.447, |
|
"step": 250000 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.4666600227355957, |
|
"learning_rate": 1.5724789915966388e-05, |
|
"loss": 1.4378, |
|
"step": 250500 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.4554154872894287, |
|
"learning_rate": 1.569327731092437e-05, |
|
"loss": 1.4439, |
|
"step": 251000 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.4908123016357422, |
|
"learning_rate": 1.5661764705882355e-05, |
|
"loss": 1.4427, |
|
"step": 251500 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.471479892730713, |
|
"learning_rate": 1.5630252100840337e-05, |
|
"loss": 1.4433, |
|
"step": 252000 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.4541757106781006, |
|
"learning_rate": 1.5598739495798322e-05, |
|
"loss": 1.4438, |
|
"step": 252500 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.7064818143844604, |
|
"learning_rate": 1.5567226890756304e-05, |
|
"loss": 1.4409, |
|
"step": 253000 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 1.5056750774383545, |
|
"learning_rate": 1.553571428571429e-05, |
|
"loss": 1.4405, |
|
"step": 253500 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 1.4601994752883911, |
|
"learning_rate": 1.550420168067227e-05, |
|
"loss": 1.4407, |
|
"step": 254000 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 1.4508180618286133, |
|
"learning_rate": 1.5472689075630256e-05, |
|
"loss": 1.4471, |
|
"step": 254500 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 1.476529598236084, |
|
"learning_rate": 1.5441176470588234e-05, |
|
"loss": 1.4416, |
|
"step": 255000 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 1.5242764949798584, |
|
"learning_rate": 1.540966386554622e-05, |
|
"loss": 1.4406, |
|
"step": 255500 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 1.405678153038025, |
|
"learning_rate": 1.53781512605042e-05, |
|
"loss": 1.4399, |
|
"step": 256000 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 1.4689253568649292, |
|
"learning_rate": 1.5346638655462183e-05, |
|
"loss": 1.4409, |
|
"step": 256500 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.5302820205688477, |
|
"learning_rate": 1.5315126050420168e-05, |
|
"loss": 1.4435, |
|
"step": 257000 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.4745590686798096, |
|
"learning_rate": 1.528361344537815e-05, |
|
"loss": 1.4411, |
|
"step": 257500 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.5703048706054688, |
|
"learning_rate": 1.5252100840336135e-05, |
|
"loss": 1.4372, |
|
"step": 258000 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.4982346296310425, |
|
"learning_rate": 1.5220588235294118e-05, |
|
"loss": 1.4342, |
|
"step": 258500 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.4562139511108398, |
|
"learning_rate": 1.51890756302521e-05, |
|
"loss": 1.4403, |
|
"step": 259000 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.5004678964614868, |
|
"learning_rate": 1.5157563025210083e-05, |
|
"loss": 1.4405, |
|
"step": 259500 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.4451349973678589, |
|
"learning_rate": 1.5126050420168067e-05, |
|
"loss": 1.436, |
|
"step": 260000 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 1.420857548713684, |
|
"learning_rate": 1.509453781512605e-05, |
|
"loss": 1.4402, |
|
"step": 260500 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 1.4772206544876099, |
|
"learning_rate": 1.5063025210084034e-05, |
|
"loss": 1.4373, |
|
"step": 261000 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 1.4933620691299438, |
|
"learning_rate": 1.5031512605042017e-05, |
|
"loss": 1.4392, |
|
"step": 261500 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 1.5023765563964844, |
|
"learning_rate": 1.5e-05, |
|
"loss": 1.438, |
|
"step": 262000 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 1.4560567140579224, |
|
"learning_rate": 1.4968487394957984e-05, |
|
"loss": 1.439, |
|
"step": 262500 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 1.5497692823410034, |
|
"learning_rate": 1.4936974789915967e-05, |
|
"loss": 1.4347, |
|
"step": 263000 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 1.5201669931411743, |
|
"learning_rate": 1.490546218487395e-05, |
|
"loss": 1.4365, |
|
"step": 263500 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 1.4907211065292358, |
|
"learning_rate": 1.4873949579831934e-05, |
|
"loss": 1.4334, |
|
"step": 264000 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 1.4821357727050781, |
|
"learning_rate": 1.4842436974789918e-05, |
|
"loss": 1.4361, |
|
"step": 264500 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 1.4968074560165405, |
|
"learning_rate": 1.4810924369747901e-05, |
|
"loss": 1.4352, |
|
"step": 265000 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 1.475728154182434, |
|
"learning_rate": 1.4779411764705883e-05, |
|
"loss": 1.4365, |
|
"step": 265500 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 1.560935378074646, |
|
"learning_rate": 1.4747899159663864e-05, |
|
"loss": 1.4381, |
|
"step": 266000 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 1.4216580390930176, |
|
"learning_rate": 1.4716386554621848e-05, |
|
"loss": 1.4322, |
|
"step": 266500 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 1.499648094177246, |
|
"learning_rate": 1.4684873949579831e-05, |
|
"loss": 1.4378, |
|
"step": 267000 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 1.4971799850463867, |
|
"learning_rate": 1.4653361344537815e-05, |
|
"loss": 1.4334, |
|
"step": 267500 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 1.5106513500213623, |
|
"learning_rate": 1.4621848739495798e-05, |
|
"loss": 1.4347, |
|
"step": 268000 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 1.488006353378296, |
|
"learning_rate": 1.4590336134453782e-05, |
|
"loss": 1.4361, |
|
"step": 268500 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 1.484994888305664, |
|
"learning_rate": 1.4558823529411765e-05, |
|
"loss": 1.4389, |
|
"step": 269000 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 1.4334303140640259, |
|
"learning_rate": 1.4527310924369749e-05, |
|
"loss": 1.4366, |
|
"step": 269500 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 1.4980212450027466, |
|
"learning_rate": 1.4495798319327732e-05, |
|
"loss": 1.4335, |
|
"step": 270000 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 1.4758628606796265, |
|
"learning_rate": 1.4464285714285715e-05, |
|
"loss": 1.4367, |
|
"step": 270500 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 1.4914411306381226, |
|
"learning_rate": 1.4432773109243699e-05, |
|
"loss": 1.4373, |
|
"step": 271000 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 1.5274006128311157, |
|
"learning_rate": 1.4401260504201682e-05, |
|
"loss": 1.4364, |
|
"step": 271500 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 1.4571418762207031, |
|
"learning_rate": 1.4369747899159664e-05, |
|
"loss": 1.4354, |
|
"step": 272000 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 1.5726255178451538, |
|
"learning_rate": 1.4338235294117647e-05, |
|
"loss": 1.4338, |
|
"step": 272500 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 1.5626286268234253, |
|
"learning_rate": 1.4306722689075631e-05, |
|
"loss": 1.4345, |
|
"step": 273000 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 1.4581658840179443, |
|
"learning_rate": 1.4275210084033613e-05, |
|
"loss": 1.4339, |
|
"step": 273500 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 1.4836556911468506, |
|
"learning_rate": 1.4243697478991596e-05, |
|
"loss": 1.4331, |
|
"step": 274000 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 1.4955805540084839, |
|
"learning_rate": 1.421218487394958e-05, |
|
"loss": 1.434, |
|
"step": 274500 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 1.5095798969268799, |
|
"learning_rate": 1.4180672268907563e-05, |
|
"loss": 1.4335, |
|
"step": 275000 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.517565131187439, |
|
"learning_rate": 1.4149159663865546e-05, |
|
"loss": 1.4339, |
|
"step": 275500 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.5089333057403564, |
|
"learning_rate": 1.411764705882353e-05, |
|
"loss": 1.4303, |
|
"step": 276000 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.490110993385315, |
|
"learning_rate": 1.4086134453781513e-05, |
|
"loss": 1.4378, |
|
"step": 276500 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.4934676885604858, |
|
"learning_rate": 1.4054621848739497e-05, |
|
"loss": 1.4309, |
|
"step": 277000 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.453904628753662, |
|
"learning_rate": 1.402310924369748e-05, |
|
"loss": 1.4345, |
|
"step": 277500 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.4364333152770996, |
|
"learning_rate": 1.3991596638655464e-05, |
|
"loss": 1.4347, |
|
"step": 278000 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.5105829238891602, |
|
"learning_rate": 1.3960084033613445e-05, |
|
"loss": 1.4373, |
|
"step": 278500 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 1.5879383087158203, |
|
"learning_rate": 1.3928571428571429e-05, |
|
"loss": 1.4337, |
|
"step": 279000 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 1.4907859563827515, |
|
"learning_rate": 1.3897058823529412e-05, |
|
"loss": 1.4378, |
|
"step": 279500 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 1.4965413808822632, |
|
"learning_rate": 1.3865546218487396e-05, |
|
"loss": 1.4332, |
|
"step": 280000 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 1.4512360095977783, |
|
"learning_rate": 1.3834033613445379e-05, |
|
"loss": 1.4293, |
|
"step": 280500 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 1.5323312282562256, |
|
"learning_rate": 1.3802521008403361e-05, |
|
"loss": 1.4348, |
|
"step": 281000 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 1.515937089920044, |
|
"learning_rate": 1.3771008403361344e-05, |
|
"loss": 1.435, |
|
"step": 281500 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 1.5589243173599243, |
|
"learning_rate": 1.3739495798319328e-05, |
|
"loss": 1.4276, |
|
"step": 282000 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 1.4904866218566895, |
|
"learning_rate": 1.3707983193277311e-05, |
|
"loss": 1.4317, |
|
"step": 282500 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.4851187467575073, |
|
"learning_rate": 1.3676470588235295e-05, |
|
"loss": 1.4297, |
|
"step": 283000 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.3728834390640259, |
|
"learning_rate": 1.3644957983193278e-05, |
|
"loss": 1.4322, |
|
"step": 283500 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.738533854484558, |
|
"learning_rate": 1.3613445378151261e-05, |
|
"loss": 1.4293, |
|
"step": 284000 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.5092045068740845, |
|
"learning_rate": 1.3581932773109245e-05, |
|
"loss": 1.4292, |
|
"step": 284500 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.5049362182617188, |
|
"learning_rate": 1.3550420168067227e-05, |
|
"loss": 1.4286, |
|
"step": 285000 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.4427067041397095, |
|
"learning_rate": 1.351890756302521e-05, |
|
"loss": 1.4279, |
|
"step": 285500 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.4460445642471313, |
|
"learning_rate": 1.3487394957983194e-05, |
|
"loss": 1.4301, |
|
"step": 286000 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.5012342929840088, |
|
"learning_rate": 1.3455882352941177e-05, |
|
"loss": 1.4287, |
|
"step": 286500 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.4399917125701904, |
|
"learning_rate": 1.342436974789916e-05, |
|
"loss": 1.4308, |
|
"step": 287000 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.4089640378952026, |
|
"learning_rate": 1.3392857142857144e-05, |
|
"loss": 1.4264, |
|
"step": 287500 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.5012991428375244, |
|
"learning_rate": 1.3361344537815127e-05, |
|
"loss": 1.4296, |
|
"step": 288000 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.4144240617752075, |
|
"learning_rate": 1.3329831932773109e-05, |
|
"loss": 1.4259, |
|
"step": 288500 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.4895191192626953, |
|
"learning_rate": 1.3298319327731092e-05, |
|
"loss": 1.4312, |
|
"step": 289000 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.5855236053466797, |
|
"learning_rate": 1.3266806722689076e-05, |
|
"loss": 1.4275, |
|
"step": 289500 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.4119740724563599, |
|
"learning_rate": 1.323529411764706e-05, |
|
"loss": 1.428, |
|
"step": 290000 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.5101768970489502, |
|
"learning_rate": 1.3203781512605043e-05, |
|
"loss": 1.4289, |
|
"step": 290500 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.4803494215011597, |
|
"learning_rate": 1.3172268907563025e-05, |
|
"loss": 1.4273, |
|
"step": 291000 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.5688806772232056, |
|
"learning_rate": 1.3140756302521008e-05, |
|
"loss": 1.4276, |
|
"step": 291500 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 2.2357559204101562, |
|
"learning_rate": 1.3109243697478991e-05, |
|
"loss": 1.4294, |
|
"step": 292000 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.4668666124343872, |
|
"learning_rate": 1.3077731092436975e-05, |
|
"loss": 1.4293, |
|
"step": 292500 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.46941339969635, |
|
"learning_rate": 1.3046218487394958e-05, |
|
"loss": 1.4321, |
|
"step": 293000 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.633657455444336, |
|
"learning_rate": 1.3014705882352942e-05, |
|
"loss": 1.4272, |
|
"step": 293500 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.6233292818069458, |
|
"learning_rate": 1.2983193277310925e-05, |
|
"loss": 1.4268, |
|
"step": 294000 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.4441863298416138, |
|
"learning_rate": 1.2951680672268909e-05, |
|
"loss": 1.4262, |
|
"step": 294500 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.5020571947097778, |
|
"learning_rate": 1.2920168067226892e-05, |
|
"loss": 1.4247, |
|
"step": 295000 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.476090669631958, |
|
"learning_rate": 1.2888655462184874e-05, |
|
"loss": 1.426, |
|
"step": 295500 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.4784507751464844, |
|
"learning_rate": 1.2857142857142857e-05, |
|
"loss": 1.4262, |
|
"step": 296000 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.4484635591506958, |
|
"learning_rate": 1.282563025210084e-05, |
|
"loss": 1.426, |
|
"step": 296500 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.5106843709945679, |
|
"learning_rate": 1.2794117647058824e-05, |
|
"loss": 1.4282, |
|
"step": 297000 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 1.401078701019287, |
|
"learning_rate": 1.2762605042016806e-05, |
|
"loss": 1.4229, |
|
"step": 297500 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 1.4721170663833618, |
|
"learning_rate": 1.273109243697479e-05, |
|
"loss": 1.4281, |
|
"step": 298000 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 1.5121667385101318, |
|
"learning_rate": 1.2699579831932773e-05, |
|
"loss": 1.4272, |
|
"step": 298500 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 1.4307163953781128, |
|
"learning_rate": 1.2668067226890756e-05, |
|
"loss": 1.4269, |
|
"step": 299000 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 1.520992398262024, |
|
"learning_rate": 1.263655462184874e-05, |
|
"loss": 1.426, |
|
"step": 299500 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 1.4671803712844849, |
|
"learning_rate": 1.2605042016806723e-05, |
|
"loss": 1.4207, |
|
"step": 300000 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 1.4773739576339722, |
|
"learning_rate": 1.2573529411764706e-05, |
|
"loss": 1.4248, |
|
"step": 300500 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 1.4782676696777344, |
|
"learning_rate": 1.254201680672269e-05, |
|
"loss": 1.4265, |
|
"step": 301000 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 1.5411614179611206, |
|
"learning_rate": 1.2510504201680673e-05, |
|
"loss": 1.4223, |
|
"step": 301500 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 1.4932873249053955, |
|
"learning_rate": 1.2478991596638657e-05, |
|
"loss": 1.4252, |
|
"step": 302000 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 1.451866626739502, |
|
"learning_rate": 1.244747899159664e-05, |
|
"loss": 1.4234, |
|
"step": 302500 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 1.4181545972824097, |
|
"learning_rate": 1.2415966386554622e-05, |
|
"loss": 1.4249, |
|
"step": 303000 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 1.460598349571228, |
|
"learning_rate": 1.2384453781512605e-05, |
|
"loss": 1.4237, |
|
"step": 303500 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 1.4560647010803223, |
|
"learning_rate": 1.2352941176470587e-05, |
|
"loss": 1.4199, |
|
"step": 304000 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 1.4535589218139648, |
|
"learning_rate": 1.232142857142857e-05, |
|
"loss": 1.4248, |
|
"step": 304500 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 1.4643712043762207, |
|
"learning_rate": 1.2289915966386554e-05, |
|
"loss": 1.4257, |
|
"step": 305000 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 1.5106630325317383, |
|
"learning_rate": 1.2258403361344537e-05, |
|
"loss": 1.4248, |
|
"step": 305500 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 1.489579439163208, |
|
"learning_rate": 1.2226890756302521e-05, |
|
"loss": 1.4215, |
|
"step": 306000 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 1.4746323823928833, |
|
"learning_rate": 1.2195378151260504e-05, |
|
"loss": 1.4202, |
|
"step": 306500 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 1.4702941179275513, |
|
"learning_rate": 1.2163865546218488e-05, |
|
"loss": 1.4214, |
|
"step": 307000 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 1.5852062702178955, |
|
"learning_rate": 1.2132352941176471e-05, |
|
"loss": 1.4229, |
|
"step": 307500 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 1.5045883655548096, |
|
"learning_rate": 1.2100840336134455e-05, |
|
"loss": 1.4245, |
|
"step": 308000 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.4635881185531616, |
|
"learning_rate": 1.2069327731092438e-05, |
|
"loss": 1.425, |
|
"step": 308500 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.4574062824249268, |
|
"learning_rate": 1.2037815126050422e-05, |
|
"loss": 1.4241, |
|
"step": 309000 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.4566025733947754, |
|
"learning_rate": 1.2006302521008405e-05, |
|
"loss": 1.4204, |
|
"step": 309500 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.525225281715393, |
|
"learning_rate": 1.1974789915966388e-05, |
|
"loss": 1.4218, |
|
"step": 310000 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.4726413488388062, |
|
"learning_rate": 1.1943277310924368e-05, |
|
"loss": 1.422, |
|
"step": 310500 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.4462370872497559, |
|
"learning_rate": 1.1911764705882352e-05, |
|
"loss": 1.4174, |
|
"step": 311000 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.4930446147918701, |
|
"learning_rate": 1.1880252100840335e-05, |
|
"loss": 1.4168, |
|
"step": 311500 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 2.050973892211914, |
|
"learning_rate": 1.1848739495798319e-05, |
|
"loss": 1.4205, |
|
"step": 312000 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 1.514642596244812, |
|
"learning_rate": 1.1817226890756302e-05, |
|
"loss": 1.42, |
|
"step": 312500 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 1.4417085647583008, |
|
"learning_rate": 1.1785714285714286e-05, |
|
"loss": 1.42, |
|
"step": 313000 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 1.473029375076294, |
|
"learning_rate": 1.1754201680672269e-05, |
|
"loss": 1.4228, |
|
"step": 313500 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 1.573533296585083, |
|
"learning_rate": 1.1722689075630253e-05, |
|
"loss": 1.4193, |
|
"step": 314000 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 1.5040185451507568, |
|
"learning_rate": 1.1691176470588236e-05, |
|
"loss": 1.4209, |
|
"step": 314500 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 1.472280740737915, |
|
"learning_rate": 1.165966386554622e-05, |
|
"loss": 1.4203, |
|
"step": 315000 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 1.4371939897537231, |
|
"learning_rate": 1.1628151260504203e-05, |
|
"loss": 1.4197, |
|
"step": 315500 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.74043607711792, |
|
"learning_rate": 1.1596638655462186e-05, |
|
"loss": 1.4189, |
|
"step": 316000 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.5340248346328735, |
|
"learning_rate": 1.156512605042017e-05, |
|
"loss": 1.4178, |
|
"step": 316500 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.4650968313217163, |
|
"learning_rate": 1.1533613445378151e-05, |
|
"loss": 1.4157, |
|
"step": 317000 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.6052621603012085, |
|
"learning_rate": 1.1502100840336135e-05, |
|
"loss": 1.4221, |
|
"step": 317500 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.4934183359146118, |
|
"learning_rate": 1.1470588235294117e-05, |
|
"loss": 1.4219, |
|
"step": 318000 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.6604057550430298, |
|
"learning_rate": 1.14390756302521e-05, |
|
"loss": 1.4165, |
|
"step": 318500 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.448686957359314, |
|
"learning_rate": 1.1407563025210084e-05, |
|
"loss": 1.4167, |
|
"step": 319000 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.4600298404693604, |
|
"learning_rate": 1.1376050420168067e-05, |
|
"loss": 1.4196, |
|
"step": 319500 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 1.4856675863265991, |
|
"learning_rate": 1.134453781512605e-05, |
|
"loss": 1.4188, |
|
"step": 320000 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 1.5987657308578491, |
|
"learning_rate": 1.1313025210084034e-05, |
|
"loss": 1.4176, |
|
"step": 320500 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 1.4707138538360596, |
|
"learning_rate": 1.1281512605042017e-05, |
|
"loss": 1.4177, |
|
"step": 321000 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 1.4592325687408447, |
|
"learning_rate": 1.125e-05, |
|
"loss": 1.419, |
|
"step": 321500 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 1.477171540260315, |
|
"learning_rate": 1.1218487394957984e-05, |
|
"loss": 1.4118, |
|
"step": 322000 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 1.5284925699234009, |
|
"learning_rate": 1.1186974789915968e-05, |
|
"loss": 1.418, |
|
"step": 322500 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 1.5696572065353394, |
|
"learning_rate": 1.1155462184873951e-05, |
|
"loss": 1.4175, |
|
"step": 323000 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 1.5421068668365479, |
|
"learning_rate": 1.1123949579831933e-05, |
|
"loss": 1.4134, |
|
"step": 323500 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 1.5944511890411377, |
|
"learning_rate": 1.1092436974789916e-05, |
|
"loss": 1.4139, |
|
"step": 324000 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 1.4496880769729614, |
|
"learning_rate": 1.10609243697479e-05, |
|
"loss": 1.4131, |
|
"step": 324500 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 1.5021952390670776, |
|
"learning_rate": 1.1029411764705883e-05, |
|
"loss": 1.4144, |
|
"step": 325000 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 1.5261799097061157, |
|
"learning_rate": 1.0997899159663865e-05, |
|
"loss": 1.4149, |
|
"step": 325500 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 1.396974802017212, |
|
"learning_rate": 1.0966386554621848e-05, |
|
"loss": 1.4149, |
|
"step": 326000 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 1.561023235321045, |
|
"learning_rate": 1.0934873949579832e-05, |
|
"loss": 1.4183, |
|
"step": 326500 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 1.509398102760315, |
|
"learning_rate": 1.0903361344537815e-05, |
|
"loss": 1.4158, |
|
"step": 327000 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 1.5046377182006836, |
|
"learning_rate": 1.0871848739495799e-05, |
|
"loss": 1.4137, |
|
"step": 327500 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 1.504531979560852, |
|
"learning_rate": 1.0840336134453782e-05, |
|
"loss": 1.4155, |
|
"step": 328000 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 1.6807337999343872, |
|
"learning_rate": 1.0808823529411765e-05, |
|
"loss": 1.4161, |
|
"step": 328500 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 1.4374127388000488, |
|
"learning_rate": 1.0777310924369749e-05, |
|
"loss": 1.4162, |
|
"step": 329000 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 1.4737296104431152, |
|
"learning_rate": 1.0745798319327732e-05, |
|
"loss": 1.4176, |
|
"step": 329500 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 1.5063775777816772, |
|
"learning_rate": 1.0714285714285714e-05, |
|
"loss": 1.4128, |
|
"step": 330000 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 1.506156325340271, |
|
"learning_rate": 1.0682773109243698e-05, |
|
"loss": 1.4176, |
|
"step": 330500 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 1.5394564867019653, |
|
"learning_rate": 1.0651260504201681e-05, |
|
"loss": 1.4119, |
|
"step": 331000 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 1.4483675956726074, |
|
"learning_rate": 1.0619747899159664e-05, |
|
"loss": 1.4138, |
|
"step": 331500 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 2.412644147872925, |
|
"learning_rate": 1.0588235294117648e-05, |
|
"loss": 1.4146, |
|
"step": 332000 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 1.9123421907424927, |
|
"learning_rate": 1.0556722689075631e-05, |
|
"loss": 1.4194, |
|
"step": 332500 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 1.4911080598831177, |
|
"learning_rate": 1.0525210084033613e-05, |
|
"loss": 1.418, |
|
"step": 333000 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 1.511194109916687, |
|
"learning_rate": 1.0493697478991596e-05, |
|
"loss": 1.4114, |
|
"step": 333500 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 1.4733537435531616, |
|
"learning_rate": 1.046218487394958e-05, |
|
"loss": 1.4149, |
|
"step": 334000 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 1.4742454290390015, |
|
"learning_rate": 1.0430672268907563e-05, |
|
"loss": 1.4163, |
|
"step": 334500 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 1.4842146635055542, |
|
"learning_rate": 1.0399159663865547e-05, |
|
"loss": 1.4118, |
|
"step": 335000 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 1.5346875190734863, |
|
"learning_rate": 1.036764705882353e-05, |
|
"loss": 1.4148, |
|
"step": 335500 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 1.6554747819900513, |
|
"learning_rate": 1.0336134453781512e-05, |
|
"loss": 1.416, |
|
"step": 336000 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 1.5015145540237427, |
|
"learning_rate": 1.0304621848739495e-05, |
|
"loss": 1.4146, |
|
"step": 336500 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 1.4634381532669067, |
|
"learning_rate": 1.0273109243697479e-05, |
|
"loss": 1.4199, |
|
"step": 337000 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 1.7802950143814087, |
|
"learning_rate": 1.0241596638655462e-05, |
|
"loss": 1.4127, |
|
"step": 337500 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 3.0422604084014893, |
|
"learning_rate": 1.0210084033613446e-05, |
|
"loss": 1.4121, |
|
"step": 338000 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 1.4957752227783203, |
|
"learning_rate": 1.0178571428571429e-05, |
|
"loss": 1.4151, |
|
"step": 338500 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 1.6368649005889893, |
|
"learning_rate": 1.0147058823529413e-05, |
|
"loss": 1.4211, |
|
"step": 339000 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 1.493455410003662, |
|
"learning_rate": 1.0115546218487396e-05, |
|
"loss": 1.4131, |
|
"step": 339500 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 1.5789108276367188, |
|
"learning_rate": 1.008403361344538e-05, |
|
"loss": 1.413, |
|
"step": 340000 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 1.4984022378921509, |
|
"learning_rate": 1.0052521008403361e-05, |
|
"loss": 1.4156, |
|
"step": 340500 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 1.443871021270752, |
|
"learning_rate": 1.0021008403361345e-05, |
|
"loss": 1.4123, |
|
"step": 341000 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 1.532205581665039, |
|
"learning_rate": 9.989495798319328e-06, |
|
"loss": 1.4145, |
|
"step": 341500 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 1.487888216972351, |
|
"learning_rate": 9.957983193277312e-06, |
|
"loss": 1.4132, |
|
"step": 342000 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 1.5009286403656006, |
|
"learning_rate": 9.926470588235293e-06, |
|
"loss": 1.4132, |
|
"step": 342500 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 1.53665292263031, |
|
"learning_rate": 9.894957983193277e-06, |
|
"loss": 1.4114, |
|
"step": 343000 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 1.4559004306793213, |
|
"learning_rate": 9.86344537815126e-06, |
|
"loss": 1.4128, |
|
"step": 343500 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 1.472882628440857, |
|
"learning_rate": 9.831932773109244e-06, |
|
"loss": 1.4106, |
|
"step": 344000 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 1.528029203414917, |
|
"learning_rate": 9.800420168067227e-06, |
|
"loss": 1.4133, |
|
"step": 344500 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 1.4509416818618774, |
|
"learning_rate": 9.76890756302521e-06, |
|
"loss": 1.4099, |
|
"step": 345000 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 1.644581913948059, |
|
"learning_rate": 9.737394957983194e-06, |
|
"loss": 1.4102, |
|
"step": 345500 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 1.5054335594177246, |
|
"learning_rate": 9.705882352941177e-06, |
|
"loss": 1.4119, |
|
"step": 346000 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 1.47361421585083, |
|
"learning_rate": 9.67436974789916e-06, |
|
"loss": 1.4094, |
|
"step": 346500 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 1.461796522140503, |
|
"learning_rate": 9.642857142857144e-06, |
|
"loss": 1.4108, |
|
"step": 347000 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 1.6115666627883911, |
|
"learning_rate": 9.611344537815128e-06, |
|
"loss": 1.4096, |
|
"step": 347500 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 1.526082992553711, |
|
"learning_rate": 9.57983193277311e-06, |
|
"loss": 1.4094, |
|
"step": 348000 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 1.4482905864715576, |
|
"learning_rate": 9.548319327731093e-06, |
|
"loss": 1.4082, |
|
"step": 348500 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 1.5066174268722534, |
|
"learning_rate": 9.516806722689075e-06, |
|
"loss": 1.4122, |
|
"step": 349000 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 1.5225650072097778, |
|
"learning_rate": 9.485294117647058e-06, |
|
"loss": 1.4069, |
|
"step": 349500 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 1.4794243574142456, |
|
"learning_rate": 9.453781512605041e-06, |
|
"loss": 1.4087, |
|
"step": 350000 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 1.4825611114501953, |
|
"learning_rate": 9.422268907563025e-06, |
|
"loss": 1.4098, |
|
"step": 350500 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 1.50911283493042, |
|
"learning_rate": 9.390756302521008e-06, |
|
"loss": 1.4066, |
|
"step": 351000 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 1.5070313215255737, |
|
"learning_rate": 9.359243697478992e-06, |
|
"loss": 1.4067, |
|
"step": 351500 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 1.4434587955474854, |
|
"learning_rate": 9.327731092436975e-06, |
|
"loss": 1.4074, |
|
"step": 352000 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 1.4484858512878418, |
|
"learning_rate": 9.296218487394959e-06, |
|
"loss": 1.4056, |
|
"step": 352500 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 1.6141736507415771, |
|
"learning_rate": 9.264705882352942e-06, |
|
"loss": 1.4084, |
|
"step": 353000 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 1.4847619533538818, |
|
"learning_rate": 9.233193277310925e-06, |
|
"loss": 1.4092, |
|
"step": 353500 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 1.4862167835235596, |
|
"learning_rate": 9.201680672268909e-06, |
|
"loss": 1.4086, |
|
"step": 354000 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 1.5454356670379639, |
|
"learning_rate": 9.170168067226892e-06, |
|
"loss": 1.4088, |
|
"step": 354500 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 1.4676494598388672, |
|
"learning_rate": 9.138655462184874e-06, |
|
"loss": 1.4094, |
|
"step": 355000 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 1.4859504699707031, |
|
"learning_rate": 9.107142857142856e-06, |
|
"loss": 1.4076, |
|
"step": 355500 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 1.499040961265564, |
|
"learning_rate": 9.07563025210084e-06, |
|
"loss": 1.4104, |
|
"step": 356000 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 1.4864604473114014, |
|
"learning_rate": 9.044117647058823e-06, |
|
"loss": 1.4061, |
|
"step": 356500 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 1.4507191181182861, |
|
"learning_rate": 9.012605042016806e-06, |
|
"loss": 1.4062, |
|
"step": 357000 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 1.468526840209961, |
|
"learning_rate": 8.98109243697479e-06, |
|
"loss": 1.4081, |
|
"step": 357500 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 1.6709305047988892, |
|
"learning_rate": 8.949579831932773e-06, |
|
"loss": 1.4126, |
|
"step": 358000 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 1.9611443281173706, |
|
"learning_rate": 8.918067226890756e-06, |
|
"loss": 1.4079, |
|
"step": 358500 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 1.6809275150299072, |
|
"learning_rate": 8.88655462184874e-06, |
|
"loss": 1.4114, |
|
"step": 359000 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 5.746359825134277, |
|
"learning_rate": 8.855042016806723e-06, |
|
"loss": 1.4084, |
|
"step": 359500 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 5.197726726531982, |
|
"learning_rate": 8.823529411764707e-06, |
|
"loss": 1.4066, |
|
"step": 360000 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 1.4346739053726196, |
|
"learning_rate": 8.79201680672269e-06, |
|
"loss": 1.4066, |
|
"step": 360500 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 1.571542739868164, |
|
"learning_rate": 8.760504201680674e-06, |
|
"loss": 1.4097, |
|
"step": 361000 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 1.5356281995773315, |
|
"learning_rate": 8.728991596638657e-06, |
|
"loss": 1.4045, |
|
"step": 361500 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 1.7401924133300781, |
|
"learning_rate": 8.697478991596639e-06, |
|
"loss": 1.4067, |
|
"step": 362000 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 1.5491187572479248, |
|
"learning_rate": 8.66596638655462e-06, |
|
"loss": 1.4042, |
|
"step": 362500 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 1.5863696336746216, |
|
"learning_rate": 8.634453781512604e-06, |
|
"loss": 1.4074, |
|
"step": 363000 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 1.450952410697937, |
|
"learning_rate": 8.602941176470587e-06, |
|
"loss": 1.4076, |
|
"step": 363500 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 1.5750932693481445, |
|
"learning_rate": 8.571428571428571e-06, |
|
"loss": 1.41, |
|
"step": 364000 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 1.4661774635314941, |
|
"learning_rate": 8.539915966386554e-06, |
|
"loss": 1.4091, |
|
"step": 364500 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 1.540864109992981, |
|
"learning_rate": 8.508403361344538e-06, |
|
"loss": 1.4052, |
|
"step": 365000 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 1.5120595693588257, |
|
"learning_rate": 8.476890756302521e-06, |
|
"loss": 1.4072, |
|
"step": 365500 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 1.5357037782669067, |
|
"learning_rate": 8.445378151260505e-06, |
|
"loss": 1.4097, |
|
"step": 366000 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 1.5010443925857544, |
|
"learning_rate": 8.413865546218488e-06, |
|
"loss": 1.4094, |
|
"step": 366500 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 1.4643309116363525, |
|
"learning_rate": 8.382352941176472e-06, |
|
"loss": 1.4077, |
|
"step": 367000 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 1.4524095058441162, |
|
"learning_rate": 8.350840336134455e-06, |
|
"loss": 1.4065, |
|
"step": 367500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 1.5203324556350708, |
|
"learning_rate": 8.319327731092438e-06, |
|
"loss": 1.4035, |
|
"step": 368000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 1.4688167572021484, |
|
"learning_rate": 8.28781512605042e-06, |
|
"loss": 1.4067, |
|
"step": 368500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 1.5595752000808716, |
|
"learning_rate": 8.256302521008404e-06, |
|
"loss": 1.4059, |
|
"step": 369000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 1.4404747486114502, |
|
"learning_rate": 8.224789915966387e-06, |
|
"loss": 1.4035, |
|
"step": 369500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 1.6032897233963013, |
|
"learning_rate": 8.193277310924369e-06, |
|
"loss": 1.4001, |
|
"step": 370000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 1.6836262941360474, |
|
"learning_rate": 8.161764705882352e-06, |
|
"loss": 1.3981, |
|
"step": 370500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 1.5205241441726685, |
|
"learning_rate": 8.130252100840336e-06, |
|
"loss": 1.3994, |
|
"step": 371000 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 1.7194490432739258, |
|
"learning_rate": 8.098739495798319e-06, |
|
"loss": 1.4027, |
|
"step": 371500 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 1.4517977237701416, |
|
"learning_rate": 8.067226890756303e-06, |
|
"loss": 1.4022, |
|
"step": 372000 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 1.6818935871124268, |
|
"learning_rate": 8.035714285714286e-06, |
|
"loss": 1.4028, |
|
"step": 372500 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 1.5117074251174927, |
|
"learning_rate": 8.00420168067227e-06, |
|
"loss": 1.4021, |
|
"step": 373000 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 1.4689205884933472, |
|
"learning_rate": 7.972689075630253e-06, |
|
"loss": 1.4057, |
|
"step": 373500 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 1.525889277458191, |
|
"learning_rate": 7.941176470588236e-06, |
|
"loss": 1.4041, |
|
"step": 374000 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 1.4896938800811768, |
|
"learning_rate": 7.90966386554622e-06, |
|
"loss": 1.4027, |
|
"step": 374500 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 1.4765034914016724, |
|
"learning_rate": 7.878151260504201e-06, |
|
"loss": 1.4005, |
|
"step": 375000 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 1.5386637449264526, |
|
"learning_rate": 7.846638655462185e-06, |
|
"loss": 1.397, |
|
"step": 375500 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 1.4808331727981567, |
|
"learning_rate": 7.815126050420168e-06, |
|
"loss": 1.401, |
|
"step": 376000 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 1.517560362815857, |
|
"learning_rate": 7.783613445378152e-06, |
|
"loss": 1.4037, |
|
"step": 376500 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 1.6733453273773193, |
|
"learning_rate": 7.752100840336135e-06, |
|
"loss": 1.3976, |
|
"step": 377000 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 1.480815052986145, |
|
"learning_rate": 7.720588235294117e-06, |
|
"loss": 1.4, |
|
"step": 377500 |
|
}, |
|
{
"epoch": 1.02,
"grad_norm": 1.4836503267288208,
"learning_rate": 7.6890756302521e-06,
"loss": 1.3977,
"step": 378000
},
{
"epoch": 1.02,
"grad_norm": 1.442256212234497,
"learning_rate": 7.657563025210084e-06,
"loss": 1.399,
"step": 378500
},
{
"epoch": 1.03,
"grad_norm": 1.8496633768081665,
"learning_rate": 7.626050420168067e-06,
"loss": 1.4038,
"step": 379000
},
{
"epoch": 1.03,
"grad_norm": 1.4886460304260254,
"learning_rate": 7.59453781512605e-06,
"loss": 1.4061,
"step": 379500
},
{
"epoch": 1.03,
"grad_norm": 1.550764799118042,
"learning_rate": 7.563025210084033e-06,
"loss": 1.4003,
"step": 380000
},
{
"epoch": 1.03,
"grad_norm": 1.5111615657806396,
"learning_rate": 7.531512605042017e-06,
"loss": 1.4021,
"step": 380500
},
{
"epoch": 1.03,
"grad_norm": 1.5873339176177979,
"learning_rate": 7.5e-06,
"loss": 1.4003,
"step": 381000
},
{
"epoch": 1.03,
"grad_norm": 1.5139081478118896,
"learning_rate": 7.468487394957984e-06,
"loss": 1.3974,
"step": 381500
},
{
"epoch": 1.03,
"grad_norm": 1.4700753688812256,
"learning_rate": 7.436974789915967e-06,
"loss": 1.4009,
"step": 382000
},
{
"epoch": 1.04,
"grad_norm": 1.4294934272766113,
"learning_rate": 7.4054621848739505e-06,
"loss": 1.3997,
"step": 382500
},
{
"epoch": 1.04,
"grad_norm": 1.432667851448059,
"learning_rate": 7.373949579831932e-06,
"loss": 1.3992,
"step": 383000
},
{
"epoch": 1.04,
"grad_norm": 1.6012872457504272,
"learning_rate": 7.342436974789916e-06,
"loss": 1.3988,
"step": 383500
},
{
"epoch": 1.04,
"grad_norm": 1.5000537633895874,
"learning_rate": 7.310924369747899e-06,
"loss": 1.399,
"step": 384000
},
{
"epoch": 1.04,
"grad_norm": 1.5064808130264282,
"learning_rate": 7.2794117647058826e-06,
"loss": 1.4022,
"step": 384500
},
{
"epoch": 1.04,
"grad_norm": 1.5001455545425415,
"learning_rate": 7.247899159663866e-06,
"loss": 1.3947,
"step": 385000
},
{
"epoch": 1.04,
"grad_norm": 1.4360790252685547,
"learning_rate": 7.2163865546218494e-06,
"loss": 1.3983,
"step": 385500
},
{
"epoch": 1.04,
"grad_norm": 1.4993146657943726,
"learning_rate": 7.184873949579832e-06,
"loss": 1.3987,
"step": 386000
},
{
"epoch": 1.05,
"grad_norm": 1.4621449708938599,
"learning_rate": 7.1533613445378155e-06,
"loss": 1.3974,
"step": 386500
},
{
"epoch": 1.05,
"grad_norm": 1.7409414052963257,
"learning_rate": 7.121848739495798e-06,
"loss": 1.4004,
"step": 387000
},
{
"epoch": 1.05,
"grad_norm": 1.4486150741577148,
"learning_rate": 7.0903361344537815e-06,
"loss": 1.3982,
"step": 387500
},
{
"epoch": 1.05,
"grad_norm": 1.5252596139907837,
"learning_rate": 7.058823529411765e-06,
"loss": 1.4013,
"step": 388000
},
{
"epoch": 1.05,
"grad_norm": 1.4874343872070312,
"learning_rate": 7.027310924369748e-06,
"loss": 1.3995,
"step": 388500
},
{
"epoch": 1.05,
"grad_norm": 1.5078623294830322,
"learning_rate": 6.995798319327732e-06,
"loss": 1.3985,
"step": 389000
},
{
"epoch": 1.05,
"grad_norm": 1.5256296396255493,
"learning_rate": 6.964285714285714e-06,
"loss": 1.4005,
"step": 389500
},
{
"epoch": 1.06,
"grad_norm": 1.5369598865509033,
"learning_rate": 6.932773109243698e-06,
"loss": 1.3929,
"step": 390000
},
{
"epoch": 1.06,
"grad_norm": 1.4955265522003174,
"learning_rate": 6.9012605042016804e-06,
"loss": 1.3968,
"step": 390500
},
{
"epoch": 1.06,
"grad_norm": 1.501406192779541,
"learning_rate": 6.869747899159664e-06,
"loss": 1.3982,
"step": 391000
},
{
"epoch": 1.06,
"grad_norm": 1.5695279836654663,
"learning_rate": 6.838235294117647e-06,
"loss": 1.3986,
"step": 391500
},
{
"epoch": 1.06,
"grad_norm": 1.590920329093933,
"learning_rate": 6.806722689075631e-06,
"loss": 1.3989,
"step": 392000
},
{
"epoch": 1.06,
"grad_norm": 1.4469817876815796,
"learning_rate": 6.775210084033613e-06,
"loss": 1.3958,
"step": 392500
},
{
"epoch": 1.06,
"grad_norm": 1.4517157077789307,
"learning_rate": 6.743697478991597e-06,
"loss": 1.3948,
"step": 393000
},
{
"epoch": 1.07,
"grad_norm": 1.477184534072876,
"learning_rate": 6.71218487394958e-06,
"loss": 1.3955,
"step": 393500
},
{
"epoch": 1.07,
"grad_norm": 2.1850063800811768,
"learning_rate": 6.680672268907564e-06,
"loss": 1.3977,
"step": 394000
},
{
"epoch": 1.07,
"grad_norm": 1.4544538259506226,
"learning_rate": 6.649159663865546e-06,
"loss": 1.3974,
"step": 394500
},
{
"epoch": 1.07,
"grad_norm": 1.4682557582855225,
"learning_rate": 6.61764705882353e-06,
"loss": 1.3976,
"step": 395000
},
{
"epoch": 1.07,
"grad_norm": 1.4401472806930542,
"learning_rate": 6.586134453781512e-06,
"loss": 1.4002,
"step": 395500
},
{
"epoch": 1.07,
"grad_norm": 1.5497291088104248,
"learning_rate": 6.554621848739496e-06,
"loss": 1.3945,
"step": 396000
},
{
"epoch": 1.07,
"grad_norm": 1.525145173072815,
"learning_rate": 6.523109243697479e-06,
"loss": 1.4006,
"step": 396500
},
{
"epoch": 1.07,
"grad_norm": 1.5119032859802246,
"learning_rate": 6.491596638655463e-06,
"loss": 1.3984,
"step": 397000
},
{
"epoch": 1.08,
"grad_norm": 1.7145532369613647,
"learning_rate": 6.460084033613446e-06,
"loss": 1.398,
"step": 397500
},
{
"epoch": 1.08,
"grad_norm": 1.5175354480743408,
"learning_rate": 6.428571428571429e-06,
"loss": 1.3971,
"step": 398000
},
{
"epoch": 1.08,
"grad_norm": 1.4529006481170654,
"learning_rate": 6.397058823529412e-06,
"loss": 1.3986,
"step": 398500
},
{
"epoch": 1.08,
"grad_norm": 1.4779740571975708,
"learning_rate": 6.365546218487395e-06,
"loss": 1.3985,
"step": 399000
},
{
"epoch": 1.08,
"grad_norm": 1.591557502746582,
"learning_rate": 6.334033613445378e-06,
"loss": 1.3971,
"step": 399500
},
{
"epoch": 1.08,
"grad_norm": 1.5829887390136719,
"learning_rate": 6.3025210084033615e-06,
"loss": 1.3989,
"step": 400000
},
{
"epoch": 1.08,
"grad_norm": 1.546576976776123,
"learning_rate": 6.271008403361345e-06,
"loss": 1.398,
"step": 400500
},
{
"epoch": 1.09,
"grad_norm": 1.4360915422439575,
"learning_rate": 6.239495798319328e-06,
"loss": 1.3933,
"step": 401000
},
{
"epoch": 1.09,
"grad_norm": 1.555240273475647,
"learning_rate": 6.207983193277311e-06,
"loss": 1.3964,
"step": 401500
},
{
"epoch": 1.09,
"grad_norm": 1.5486465692520142,
"learning_rate": 6.176470588235294e-06,
"loss": 1.3922,
"step": 402000
},
{
"epoch": 1.09,
"grad_norm": 1.6140353679656982,
"learning_rate": 6.144957983193277e-06,
"loss": 1.3941,
"step": 402500
},
{
"epoch": 1.09,
"grad_norm": 1.422938346862793,
"learning_rate": 6.1134453781512605e-06,
"loss": 1.3946,
"step": 403000
},
{
"epoch": 1.09,
"grad_norm": 1.673789620399475,
"learning_rate": 6.081932773109244e-06,
"loss": 1.3965,
"step": 403500
},
{
"epoch": 1.09,
"grad_norm": 1.52051842212677,
"learning_rate": 6.050420168067227e-06,
"loss": 1.3935,
"step": 404000
},
{
"epoch": 1.09,
"grad_norm": 1.5157978534698486,
"learning_rate": 6.018907563025211e-06,
"loss": 1.3938,
"step": 404500
},
{
"epoch": 1.1,
"grad_norm": 1.5434610843658447,
"learning_rate": 5.987394957983194e-06,
"loss": 1.3931,
"step": 405000
},
{
"epoch": 1.1,
"grad_norm": 1.7399873733520508,
"learning_rate": 5.955882352941176e-06,
"loss": 1.3924,
"step": 405500
},
{
"epoch": 1.1,
"grad_norm": 1.482820749282837,
"learning_rate": 5.924369747899159e-06,
"loss": 1.3923,
"step": 406000
},
{
"epoch": 1.1,
"grad_norm": 4.893394947052002,
"learning_rate": 5.892857142857143e-06,
"loss": 1.393,
"step": 406500
},
{
"epoch": 1.1,
"grad_norm": 1.538550615310669,
"learning_rate": 5.861344537815126e-06,
"loss": 1.3938,
"step": 407000
},
{
"epoch": 1.1,
"grad_norm": 1.4997118711471558,
"learning_rate": 5.82983193277311e-06,
"loss": 1.3934,
"step": 407500
},
{
"epoch": 1.1,
"grad_norm": 1.5265237092971802,
"learning_rate": 5.798319327731093e-06,
"loss": 1.3915,
"step": 408000
},
{
"epoch": 1.11,
"grad_norm": 1.6841180324554443,
"learning_rate": 5.766806722689076e-06,
"loss": 1.3946,
"step": 408500
},
{
"epoch": 1.11,
"grad_norm": 1.4722718000411987,
"learning_rate": 5.735294117647058e-06,
"loss": 1.3949,
"step": 409000
},
{
"epoch": 1.11,
"grad_norm": 2.087042808532715,
"learning_rate": 5.703781512605042e-06,
"loss": 1.3925,
"step": 409500
},
{
"epoch": 1.11,
"grad_norm": 1.4858590364456177,
"learning_rate": 5.672268907563025e-06,
"loss": 1.3943,
"step": 410000
},
{
"epoch": 1.11,
"grad_norm": 1.4591546058654785,
"learning_rate": 5.640756302521009e-06,
"loss": 1.3924,
"step": 410500
},
{
"epoch": 1.11,
"grad_norm": 1.4490437507629395,
"learning_rate": 5.609243697478992e-06,
"loss": 1.3949,
"step": 411000
},
{
"epoch": 1.11,
"grad_norm": 1.5795851945877075,
"learning_rate": 5.5777310924369755e-06,
"loss": 1.3951,
"step": 411500
},
{
"epoch": 1.12,
"grad_norm": 1.5447410345077515,
"learning_rate": 5.546218487394958e-06,
"loss": 1.396,
"step": 412000
},
{
"epoch": 1.12,
"grad_norm": 1.510696530342102,
"learning_rate": 5.5147058823529415e-06,
"loss": 1.3929,
"step": 412500
},
{
"epoch": 1.12,
"grad_norm": 1.52991783618927,
"learning_rate": 5.483193277310924e-06,
"loss": 1.393,
"step": 413000
},
{
"epoch": 1.12,
"grad_norm": 1.5724798440933228,
"learning_rate": 5.4516806722689076e-06,
"loss": 1.3933,
"step": 413500
},
{
"epoch": 1.12,
"grad_norm": 1.9198040962219238,
"learning_rate": 5.420168067226891e-06,
"loss": 1.3934,
"step": 414000
},
{
"epoch": 1.12,
"grad_norm": 1.5322943925857544,
"learning_rate": 5.3886554621848744e-06,
"loss": 1.3925,
"step": 414500
},
{
"epoch": 1.12,
"grad_norm": 1.4684040546417236,
"learning_rate": 5.357142857142857e-06,
"loss": 1.3933,
"step": 415000
},
{
"epoch": 1.12,
"grad_norm": 1.4797214269638062,
"learning_rate": 5.3256302521008405e-06,
"loss": 1.3925,
"step": 415500
},
{
"epoch": 1.13,
"grad_norm": 1.524305820465088,
"learning_rate": 5.294117647058824e-06,
"loss": 1.3929,
"step": 416000
},
{
"epoch": 1.13,
"grad_norm": 1.4858139753341675,
"learning_rate": 5.2626050420168065e-06,
"loss": 1.3881,
"step": 416500
},
{
"epoch": 1.13,
"grad_norm": 1.5586313009262085,
"learning_rate": 5.23109243697479e-06,
"loss": 1.393,
"step": 417000
},
{
"epoch": 1.13,
"grad_norm": 1.54250168800354,
"learning_rate": 5.199579831932773e-06,
"loss": 1.3926,
"step": 417500
},
{
"epoch": 1.13,
"grad_norm": 9.902482986450195,
"learning_rate": 5.168067226890756e-06,
"loss": 1.3923,
"step": 418000
},
{
"epoch": 1.13,
"grad_norm": 3.239046573638916,
"learning_rate": 5.136554621848739e-06,
"loss": 1.3925,
"step": 418500
},
{
"epoch": 1.13,
"grad_norm": 1.5059127807617188,
"learning_rate": 5.105042016806723e-06,
"loss": 1.3936,
"step": 419000
},
{
"epoch": 1.14,
"grad_norm": 1.5107486248016357,
"learning_rate": 5.073529411764706e-06,
"loss": 1.3942,
"step": 419500
},
{
"epoch": 1.14,
"grad_norm": 1.577019214630127,
"learning_rate": 5.04201680672269e-06,
"loss": 1.3896,
"step": 420000
},
{
"epoch": 1.14,
"grad_norm": 1.4538390636444092,
"learning_rate": 5.010504201680672e-06,
"loss": 1.387,
"step": 420500
},
{
"epoch": 1.14,
"grad_norm": 1.593549132347107,
"learning_rate": 4.978991596638656e-06,
"loss": 1.3908,
"step": 421000
},
{
"epoch": 1.14,
"grad_norm": 1.4725204706192017,
"learning_rate": 4.947478991596638e-06,
"loss": 1.3904,
"step": 421500
},
{
"epoch": 1.14,
"grad_norm": 1.4892488718032837,
"learning_rate": 4.915966386554622e-06,
"loss": 1.3896,
"step": 422000
},
{
"epoch": 1.14,
"grad_norm": 1.503003478050232,
"learning_rate": 4.884453781512605e-06,
"loss": 1.3901,
"step": 422500
},
{
"epoch": 1.15,
"grad_norm": 1.5650583505630493,
"learning_rate": 4.852941176470589e-06,
"loss": 1.3879,
"step": 423000
},
{
"epoch": 1.15,
"grad_norm": 1.5746469497680664,
"learning_rate": 4.821428571428572e-06,
"loss": 1.3898,
"step": 423500
},
{
"epoch": 1.15,
"grad_norm": 1.4636718034744263,
"learning_rate": 4.789915966386555e-06,
"loss": 1.3946,
"step": 424000
},
{
"epoch": 1.15,
"grad_norm": 1.5072635412216187,
"learning_rate": 4.758403361344537e-06,
"loss": 1.3936,
"step": 424500
},
{
"epoch": 1.15,
"grad_norm": 1.9211359024047852,
"learning_rate": 4.726890756302521e-06,
"loss": 1.3919,
"step": 425000
},
{
"epoch": 1.15,
"grad_norm": 1.6186763048171997,
"learning_rate": 4.695378151260504e-06,
"loss": 1.3874,
"step": 425500
},
{
"epoch": 1.15,
"grad_norm": 1.6086759567260742,
"learning_rate": 4.663865546218488e-06,
"loss": 1.3911,
"step": 426000
},
{
"epoch": 1.15,
"grad_norm": 1.4456268548965454,
"learning_rate": 4.632352941176471e-06,
"loss": 1.3888,
"step": 426500
},
{
"epoch": 1.16,
"grad_norm": 1.5766582489013672,
"learning_rate": 4.6008403361344545e-06,
"loss": 1.3884,
"step": 427000
},
{
"epoch": 1.16,
"grad_norm": 1.4081532955169678,
"learning_rate": 4.569327731092437e-06,
"loss": 1.3904,
"step": 427500
},
{
"epoch": 1.16,
"grad_norm": 1.4901301860809326,
"learning_rate": 4.53781512605042e-06,
"loss": 1.389,
"step": 428000
},
{
"epoch": 1.16,
"grad_norm": 1.5027050971984863,
"learning_rate": 4.506302521008403e-06,
"loss": 1.3931,
"step": 428500
},
{
"epoch": 1.16,
"grad_norm": 1.4869219064712524,
"learning_rate": 4.4747899159663865e-06,
"loss": 1.3888,
"step": 429000
},
{
"epoch": 1.16,
"grad_norm": 1.439729928970337,
"learning_rate": 4.44327731092437e-06,
"loss": 1.3897,
"step": 429500
},
{
"epoch": 1.16,
"grad_norm": 1.5325324535369873,
"learning_rate": 4.411764705882353e-06,
"loss": 1.3891,
"step": 430000
},
{
"epoch": 1.17,
"grad_norm": 1.5293645858764648,
"learning_rate": 4.380252100840337e-06,
"loss": 1.3902,
"step": 430500
},
{
"epoch": 1.17,
"grad_norm": 1.4475960731506348,
"learning_rate": 4.3487394957983194e-06,
"loss": 1.388,
"step": 431000
},
{
"epoch": 1.17,
"grad_norm": 1.5612802505493164,
"learning_rate": 4.317226890756302e-06,
"loss": 1.3885,
"step": 431500
},
{
"epoch": 1.17,
"grad_norm": 1.682928204536438,
"learning_rate": 4.2857142857142855e-06,
"loss": 1.3899,
"step": 432000
},
{
"epoch": 1.17,
"grad_norm": 1.5231236219406128,
"learning_rate": 4.254201680672269e-06,
"loss": 1.3877,
"step": 432500
},
{
"epoch": 1.17,
"grad_norm": 1.446148157119751,
"learning_rate": 4.222689075630252e-06,
"loss": 1.3901,
"step": 433000
},
{
"epoch": 1.17,
"grad_norm": 1.4778817892074585,
"learning_rate": 4.191176470588236e-06,
"loss": 1.3865,
"step": 433500
},
{
"epoch": 1.17,
"grad_norm": 1.5888080596923828,
"learning_rate": 4.159663865546219e-06,
"loss": 1.3872,
"step": 434000
},
{
"epoch": 1.18,
"grad_norm": 1.6371558904647827,
"learning_rate": 4.128151260504202e-06,
"loss": 1.3893,
"step": 434500
},
{
"epoch": 1.18,
"grad_norm": 1.4442592859268188,
"learning_rate": 4.096638655462184e-06,
"loss": 1.3934,
"step": 435000
},
{
"epoch": 1.18,
"grad_norm": 1.7637091875076294,
"learning_rate": 4.065126050420168e-06,
"loss": 1.3892,
"step": 435500
},
{
"epoch": 1.18,
"grad_norm": 1.4838693141937256,
"learning_rate": 4.033613445378151e-06,
"loss": 1.3866,
"step": 436000
},
{
"epoch": 1.18,
"grad_norm": 1.5558868646621704,
"learning_rate": 4.002100840336135e-06,
"loss": 1.3914,
"step": 436500
},
{
"epoch": 1.18,
"grad_norm": 1.8331657648086548,
"learning_rate": 3.970588235294118e-06,
"loss": 1.389,
"step": 437000
},
{
"epoch": 1.18,
"grad_norm": 1.7367424964904785,
"learning_rate": 3.939075630252101e-06,
"loss": 1.3897,
"step": 437500
},
{
"epoch": 1.19,
"grad_norm": 1.5316094160079956,
"learning_rate": 3.907563025210084e-06,
"loss": 1.3871,
"step": 438000
},
{
"epoch": 1.19,
"grad_norm": 1.5062899589538574,
"learning_rate": 3.876050420168068e-06,
"loss": 1.3876,
"step": 438500
},
{
"epoch": 1.19,
"grad_norm": 1.5399343967437744,
"learning_rate": 3.84453781512605e-06,
"loss": 1.3873,
"step": 439000
},
{
"epoch": 1.19,
"grad_norm": 1.8311206102371216,
"learning_rate": 3.8130252100840336e-06,
"loss": 1.3886,
"step": 439500
},
{
"epoch": 1.19,
"grad_norm": 1.5011011362075806,
"learning_rate": 3.7815126050420167e-06,
"loss": 1.3877,
"step": 440000
},
{
"epoch": 1.19,
"grad_norm": 1.5647181272506714,
"learning_rate": 3.75e-06,
"loss": 1.3895,
"step": 440500
},
{
"epoch": 1.19,
"grad_norm": 1.9663615226745605,
"learning_rate": 3.7184873949579835e-06,
"loss": 1.3884,
"step": 441000
},
{
"epoch": 1.2,
"grad_norm": 2.4808692932128906,
"learning_rate": 3.686974789915966e-06,
"loss": 1.3879,
"step": 441500
},
{
"epoch": 1.2,
"grad_norm": 1.4271633625030518,
"learning_rate": 3.6554621848739496e-06,
"loss": 1.3913,
"step": 442000
},
{
"epoch": 1.2,
"grad_norm": 1.5341715812683105,
"learning_rate": 3.623949579831933e-06,
"loss": 1.3874,
"step": 442500
},
{
"epoch": 1.2,
"grad_norm": 1.4926517009735107,
"learning_rate": 3.592436974789916e-06,
"loss": 1.3873,
"step": 443000
},
{
"epoch": 1.2,
"grad_norm": 1.4709627628326416,
"learning_rate": 3.560924369747899e-06,
"loss": 1.3856,
"step": 443500
},
{
"epoch": 1.2,
"grad_norm": 1.4797513484954834,
"learning_rate": 3.5294117647058825e-06,
"loss": 1.3874,
"step": 444000
},
{
"epoch": 1.2,
"grad_norm": 1.506548523902893,
"learning_rate": 3.497899159663866e-06,
"loss": 1.3859,
"step": 444500
},
{
"epoch": 1.2,
"grad_norm": 1.4667857885360718,
"learning_rate": 3.466386554621849e-06,
"loss": 1.3889,
"step": 445000
},
{
"epoch": 1.21,
"grad_norm": 1.4796762466430664,
"learning_rate": 3.434873949579832e-06,
"loss": 1.3912,
"step": 445500
},
{
"epoch": 1.21,
"grad_norm": 1.534725546836853,
"learning_rate": 3.4033613445378154e-06,
"loss": 1.3881,
"step": 446000
},
{
"epoch": 1.21,
"grad_norm": 1.6512054204940796,
"learning_rate": 3.3718487394957984e-06,
"loss": 1.3874,
"step": 446500
},
{
"epoch": 1.21,
"grad_norm": 1.4926962852478027,
"learning_rate": 3.340336134453782e-06,
"loss": 1.3844,
"step": 447000
},
{
"epoch": 1.21,
"grad_norm": 1.479819416999817,
"learning_rate": 3.308823529411765e-06,
"loss": 1.3862,
"step": 447500
},
{
"epoch": 1.21,
"grad_norm": 1.429606318473816,
"learning_rate": 3.277310924369748e-06,
"loss": 1.3864,
"step": 448000
},
{
"epoch": 1.21,
"grad_norm": 1.526227593421936,
"learning_rate": 3.2457983193277313e-06,
"loss": 1.388,
"step": 448500
},
{
"epoch": 1.22,
"grad_norm": 1.5270380973815918,
"learning_rate": 3.2142857142857143e-06,
"loss": 1.3898,
"step": 449000
},
{
"epoch": 1.22,
"grad_norm": 1.6459033489227295,
"learning_rate": 3.1827731092436973e-06,
"loss": 1.3872,
"step": 449500
},
{
"epoch": 1.22,
"grad_norm": 1.5082780122756958,
"learning_rate": 3.1512605042016808e-06,
"loss": 1.3864,
"step": 450000
},
{
"epoch": 1.22,
"grad_norm": 1.4675207138061523,
"learning_rate": 3.119747899159664e-06,
"loss": 1.3858,
"step": 450500
},
{
"epoch": 1.22,
"grad_norm": 1.5487087965011597,
"learning_rate": 3.088235294117647e-06,
"loss": 1.3859,
"step": 451000
},
{
"epoch": 1.22,
"grad_norm": 1.5166810750961304,
"learning_rate": 3.0567226890756302e-06,
"loss": 1.3838,
"step": 451500
},
{
"epoch": 1.22,
"grad_norm": 1.4788706302642822,
"learning_rate": 3.0252100840336137e-06,
"loss": 1.3836,
"step": 452000
},
{
"epoch": 1.22,
"grad_norm": 1.6381962299346924,
"learning_rate": 2.993697478991597e-06,
"loss": 1.3853,
"step": 452500
},
{
"epoch": 1.23,
"grad_norm": 1.4548882246017456,
"learning_rate": 2.9621848739495797e-06,
"loss": 1.3878,
"step": 453000
},
{
"epoch": 1.23,
"grad_norm": 1.5543279647827148,
"learning_rate": 2.930672268907563e-06,
"loss": 1.3885,
"step": 453500
},
{
"epoch": 1.23,
"grad_norm": 1.5119037628173828,
"learning_rate": 2.8991596638655466e-06,
"loss": 1.3865,
"step": 454000
},
{
"epoch": 1.23,
"grad_norm": 1.5338330268859863,
"learning_rate": 2.867647058823529e-06,
"loss": 1.3825,
"step": 454500
},
{
"epoch": 1.23,
"grad_norm": 2.100884437561035,
"learning_rate": 2.8361344537815126e-06,
"loss": 1.3894,
"step": 455000
},
{
"epoch": 1.23,
"grad_norm": 1.4853757619857788,
"learning_rate": 2.804621848739496e-06,
"loss": 1.385,
"step": 455500
},
{
"epoch": 1.23,
"grad_norm": 1.545937180519104,
"learning_rate": 2.773109243697479e-06,
"loss": 1.3875,
"step": 456000
},
{
"epoch": 1.24,
"grad_norm": 1.4860107898712158,
"learning_rate": 2.741596638655462e-06,
"loss": 1.3839,
"step": 456500
},
{
"epoch": 1.24,
"grad_norm": 1.5260435342788696,
"learning_rate": 2.7100840336134455e-06,
"loss": 1.3815,
"step": 457000
},
{
"epoch": 1.24,
"grad_norm": 1.5752997398376465,
"learning_rate": 2.6785714285714285e-06,
"loss": 1.3845,
"step": 457500
},
{
"epoch": 1.24,
"grad_norm": 1.5157984495162964,
"learning_rate": 2.647058823529412e-06,
"loss": 1.3831,
"step": 458000
},
{
"epoch": 1.24,
"grad_norm": 1.5206942558288574,
"learning_rate": 2.615546218487395e-06,
"loss": 1.3866,
"step": 458500
},
{
"epoch": 1.24,
"grad_norm": 1.524672508239746,
"learning_rate": 2.584033613445378e-06,
"loss": 1.3869,
"step": 459000
},
{
"epoch": 1.24,
"grad_norm": 6.727693557739258,
"learning_rate": 2.5525210084033614e-06,
"loss": 1.3805,
"step": 459500
},
{
"epoch": 1.25,
"grad_norm": 1.5827701091766357,
"learning_rate": 2.521008403361345e-06,
"loss": 1.39,
"step": 460000
},
{
"epoch": 1.25,
"grad_norm": 1.4831866025924683,
"learning_rate": 2.489495798319328e-06,
"loss": 1.3886,
"step": 460500
},
{
"epoch": 1.25,
"grad_norm": 1.5272330045700073,
"learning_rate": 2.457983193277311e-06,
"loss": 1.3889,
"step": 461000
},
{
"epoch": 1.25,
"grad_norm": 1.478623628616333,
"learning_rate": 2.4264705882352943e-06,
"loss": 1.3878,
"step": 461500
},
{
"epoch": 1.25,
"grad_norm": 1.5272207260131836,
"learning_rate": 2.3949579831932773e-06,
"loss": 1.3834,
"step": 462000
},
{
"epoch": 1.25,
"grad_norm": 1.574120044708252,
"learning_rate": 2.3634453781512604e-06,
"loss": 1.3852,
"step": 462500
},
{
"epoch": 1.25,
"grad_norm": 1.5751044750213623,
"learning_rate": 2.331932773109244e-06,
"loss": 1.3829,
"step": 463000
},
{
"epoch": 1.25,
"grad_norm": 1.4704902172088623,
"learning_rate": 2.3004201680672272e-06,
"loss": 1.3817,
"step": 463500
},
{
"epoch": 1.26,
"grad_norm": 2.406973123550415,
"learning_rate": 2.26890756302521e-06,
"loss": 1.3872,
"step": 464000
},
{
"epoch": 1.26,
"grad_norm": 1.4869129657745361,
"learning_rate": 2.2373949579831933e-06,
"loss": 1.3825,
"step": 464500
},
{
"epoch": 1.26,
"grad_norm": 1.5050959587097168,
"learning_rate": 2.2058823529411767e-06,
"loss": 1.3821,
"step": 465000
},
{
"epoch": 1.26,
"grad_norm": 1.4652327299118042,
"learning_rate": 2.1743697478991597e-06,
"loss": 1.3831,
"step": 465500
},
{
"epoch": 1.26,
"grad_norm": 1.6011298894882202,
"learning_rate": 2.1428571428571427e-06,
"loss": 1.3824,
"step": 466000
},
{
"epoch": 1.26,
"grad_norm": 1.589460015296936,
"learning_rate": 2.111344537815126e-06,
"loss": 1.3816,
"step": 466500
},
{
"epoch": 1.26,
"grad_norm": 1.679612636566162,
"learning_rate": 2.0798319327731096e-06,
"loss": 1.383,
"step": 467000
},
{
"epoch": 1.27,
"grad_norm": 5.37538480758667,
"learning_rate": 2.048319327731092e-06,
"loss": 1.3818,
"step": 467500
},
{
"epoch": 1.27,
"grad_norm": 1.5256156921386719,
"learning_rate": 2.0168067226890756e-06,
"loss": 1.383,
"step": 468000
},
{
"epoch": 1.27,
"grad_norm": 1.546476125717163,
"learning_rate": 1.985294117647059e-06,
"loss": 1.3841,
"step": 468500
},
{
"epoch": 1.27,
"grad_norm": 1.429592251777649,
"learning_rate": 1.953781512605042e-06,
"loss": 1.3828,
"step": 469000
},
{
"epoch": 1.27,
"grad_norm": 1.4674160480499268,
"learning_rate": 1.922268907563025e-06,
"loss": 1.3847,
"step": 469500
},
{
"epoch": 1.27,
"grad_norm": 2.370859384536743,
"learning_rate": 1.8907563025210083e-06,
"loss": 1.3816,
"step": 470000
},
{
"epoch": 1.27,
"grad_norm": 1.5106278657913208,
"learning_rate": 1.8592436974789918e-06,
"loss": 1.3783,
"step": 470500
},
{
"epoch": 1.28,
"grad_norm": 1.5777826309204102,
"learning_rate": 1.8277310924369748e-06,
"loss": 1.3817,
"step": 471000
},
{
"epoch": 1.28,
"grad_norm": 1.4805636405944824,
"learning_rate": 1.796218487394958e-06,
"loss": 1.3831,
"step": 471500
},
{
"epoch": 1.28,
"grad_norm": 1.5154469013214111,
"learning_rate": 1.7647058823529412e-06,
"loss": 1.383,
"step": 472000
},
{
"epoch": 1.28,
"grad_norm": 1.54281747341156,
"learning_rate": 1.7331932773109245e-06,
"loss": 1.3852,
"step": 472500
},
{
"epoch": 1.28,
"grad_norm": 1.7247158288955688,
"learning_rate": 1.7016806722689077e-06,
"loss": 1.3819,
"step": 473000
},
{
"epoch": 1.28,
"grad_norm": 1.4723429679870605,
"learning_rate": 1.670168067226891e-06,
"loss": 1.38,
"step": 473500
},
{
"epoch": 1.28,
"grad_norm": 1.5267595052719116,
"learning_rate": 1.638655462184874e-06,
"loss": 1.3822,
"step": 474000
},
{
"epoch": 1.28,
"grad_norm": 1.566758155822754,
"learning_rate": 1.6071428571428572e-06,
"loss": 1.3837,
"step": 474500
},
{
"epoch": 1.29,
"grad_norm": 2.029449939727783,
"learning_rate": 1.5756302521008404e-06,
"loss": 1.3853,
"step": 475000
},
{
"epoch": 1.29,
"grad_norm": 1.4750381708145142,
"learning_rate": 1.5441176470588234e-06,
"loss": 1.3838,
"step": 475500
},
{
"epoch": 1.29,
"grad_norm": 1.5221339464187622,
"learning_rate": 1.5126050420168068e-06,
"loss": 1.3859,
"step": 476000
},
{
"epoch": 1.29,
"grad_norm": 1.518754243850708,
"learning_rate": 1.4810924369747898e-06,
"loss": 1.3783,
"step": 476500
},
{
"epoch": 1.29,
"grad_norm": 1.4300239086151123,
"learning_rate": 1.4495798319327733e-06,
"loss": 1.3769,
"step": 477000
},
{
"epoch": 1.29,
"grad_norm": 1.5566083192825317,
"learning_rate": 1.4180672268907563e-06,
"loss": 1.3788,
"step": 477500
},
{
"epoch": 1.29,
"grad_norm": 1.415859580039978,
"learning_rate": 1.3865546218487395e-06,
"loss": 1.385,
"step": 478000
},
{
"epoch": 1.3,
"grad_norm": 1.4944028854370117,
"learning_rate": 1.3550420168067228e-06,
"loss": 1.3815,
"step": 478500
},
{
"epoch": 1.3,
"grad_norm": 1.4514822959899902,
"learning_rate": 1.323529411764706e-06,
"loss": 1.3827,
"step": 479000
},
{
"epoch": 1.3,
"grad_norm": 1.5512882471084595,
"learning_rate": 1.292016806722689e-06,
"loss": 1.384,
"step": 479500
},
{
"epoch": 1.3,
"grad_norm": 1.574981689453125,
"learning_rate": 1.2605042016806724e-06,
"loss": 1.382,
"step": 480000
},
{
"epoch": 1.3,
"grad_norm": 1.570827603340149,
"learning_rate": 1.2289915966386554e-06,
"loss": 1.382,
"step": 480500
},
{
"epoch": 1.3,
"grad_norm": 1.5336010456085205,
"learning_rate": 1.1974789915966387e-06,
"loss": 1.3803,
"step": 481000
},
{
"epoch": 1.3,
"grad_norm": 1.4452096223831177,
"learning_rate": 1.165966386554622e-06,
"loss": 1.3804,
"step": 481500
},
{
"epoch": 1.3,
"grad_norm": 1.5529412031173706,
"learning_rate": 1.134453781512605e-06,
"loss": 1.3813,
"step": 482000
},
{
"epoch": 1.31,
"grad_norm": 1.5553141832351685,
"learning_rate": 1.1029411764705884e-06,
"loss": 1.3822,
"step": 482500
},
{
"epoch": 1.31,
"grad_norm": 1.5250602960586548,
"learning_rate": 1.0714285714285714e-06,
"loss": 1.379,
"step": 483000
},
{
"epoch": 1.31,
"grad_norm": 1.4803342819213867,
"learning_rate": 1.0399159663865548e-06,
"loss": 1.3846,
"step": 483500
},
{
"epoch": 1.31,
"grad_norm": 1.4097282886505127,
"learning_rate": 1.0084033613445378e-06,
"loss": 1.3855,
"step": 484000
},
{
"epoch": 1.31,
"grad_norm": 1.535632848739624,
"learning_rate": 9.76890756302521e-07,
"loss": 1.3807,
"step": 484500
},
{
"epoch": 1.31,
"grad_norm": 1.5535025596618652,
"learning_rate": 9.453781512605042e-07,
"loss": 1.379,
"step": 485000
},
{
"epoch": 1.31,
"grad_norm": 1.5092753171920776,
"learning_rate": 9.138655462184874e-07,
"loss": 1.3777,
"step": 485500
},
{
"epoch": 1.32,
"grad_norm": 1.5026346445083618,
"learning_rate": 8.823529411764706e-07,
"loss": 1.3844,
"step": 486000
},
{
"epoch": 1.32,
"grad_norm": 2.1724424362182617,
"learning_rate": 8.508403361344538e-07,
"loss": 1.3808,
"step": 486500
},
{
"epoch": 1.32,
"grad_norm": 1.5653128623962402,
"learning_rate": 8.19327731092437e-07,
"loss": 1.3826,
"step": 487000
},
{
"epoch": 1.32,
"grad_norm": 1.8672337532043457,
"learning_rate": 7.878151260504202e-07,
"loss": 1.3804,
"step": 487500
},
{
"epoch": 1.32,
"grad_norm": 1.5125828981399536,
"learning_rate": 7.563025210084034e-07,
"loss": 1.3785,
"step": 488000
},
{
"epoch": 1.32,
"grad_norm": 1.5895177125930786,
"learning_rate": 7.247899159663866e-07,
"loss": 1.3806,
"step": 488500
},
{
"epoch": 1.32,
"grad_norm": 1.505618929862976,
"learning_rate": 6.932773109243698e-07,
"loss": 1.3822,
"step": 489000
},
{
"epoch": 1.33,
"grad_norm": 1.4767976999282837,
"learning_rate": 6.61764705882353e-07,
"loss": 1.3809,
"step": 489500
},
{
"epoch": 1.33,
"grad_norm": 1.4713040590286255,
"learning_rate": 6.302521008403362e-07,
"loss": 1.38,
"step": 490000
},
{
"epoch": 1.33,
"grad_norm": 1.5712190866470337,
"learning_rate": 5.987394957983193e-07,
"loss": 1.3821,
"step": 490500
},
{
"epoch": 1.33,
"grad_norm": 1.520726203918457,
"learning_rate": 5.672268907563025e-07,
"loss": 1.3817,
"step": 491000
},
{
"epoch": 1.33,
"grad_norm": 1.4978504180908203,
"learning_rate": 5.357142857142857e-07,
"loss": 1.3825,
"step": 491500
},
{
"epoch": 1.33,
"grad_norm": 1.5783872604370117,
"learning_rate": 5.042016806722689e-07,
"loss": 1.3825,
"step": 492000
},
{
"epoch": 1.33,
"grad_norm": 1.5126821994781494,
"learning_rate": 4.726890756302521e-07,
"loss": 1.3803,
"step": 492500
},
{
"epoch": 1.33,
"grad_norm": 1.4677457809448242,
"learning_rate": 4.411764705882353e-07,
"loss": 1.3804,
"step": 493000
},
{
"epoch": 1.34,
"grad_norm": 1.5842092037200928,
"learning_rate": 4.096638655462185e-07,
"loss": 1.3818,
"step": 493500
},
{
"epoch": 1.34,
"grad_norm": 1.5152337551116943,
"learning_rate": 3.781512605042017e-07,
"loss": 1.3797,
"step": 494000
},
{
"epoch": 1.34,
"grad_norm": 1.5868217945098877,
"learning_rate": 3.466386554621849e-07,
"loss": 1.3829,
"step": 494500
},
{
"epoch": 1.34,
"grad_norm": 1.4543733596801758,
"learning_rate": 3.151260504201681e-07,
"loss": 1.3811,
"step": 495000
},
{
"epoch": 1.34,
"grad_norm": 1.5251801013946533,
"learning_rate": 2.8361344537815123e-07,
"loss": 1.3793,
"step": 495500
},
{
"epoch": 1.34,
"grad_norm": 1.5227956771850586,
"learning_rate": 2.5210084033613445e-07,
"loss": 1.3848,
"step": 496000
},
{
"epoch": 1.34,
"grad_norm": 1.506102204322815,
"learning_rate": 2.2058823529411765e-07,
"loss": 1.3789,
"step": 496500
},
{
"epoch": 1.35,
"grad_norm": 1.4776455163955688,
"learning_rate": 1.8907563025210085e-07,
"loss": 1.3837,
"step": 497000
},
{
"epoch": 1.35,
"grad_norm": 1.5449495315551758,
"learning_rate": 1.5756302521008405e-07,
"loss": 1.3823,
"step": 497500
},
{
"epoch": 1.35,
"grad_norm": 1.4903110265731812,
"learning_rate": 1.2605042016806723e-07,
"loss": 1.3816,
"step": 498000
},
{
"epoch": 1.35,
"grad_norm": 1.4964358806610107,
"learning_rate": 9.453781512605043e-08,
"loss": 1.3783,
"step": 498500
},
{
"epoch": 1.35,
"grad_norm": 1.6141352653503418,
"learning_rate": 6.302521008403361e-08,
"loss": 1.3819,
"step": 499000
},
{
"epoch": 1.35,
"grad_norm": 1.5006154775619507,
"learning_rate": 3.151260504201681e-08,
"loss": 1.3771,
"step": 499500
},
{
"epoch": 1.35,
"grad_norm": 1.5279935598373413,
"learning_rate": 0.0,
"loss": 1.3805,
"step": 500000
},
{
"epoch": 1.35,
"step": 500000,
"total_flos": 2.9824904071075946e+19,
"train_loss": 1.5473345408935546,
"train_runtime": 243315.0329,
"train_samples_per_second": 526.067,
"train_steps_per_second": 2.055
}
],
"logging_steps": 500,
"max_steps": 500000,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 10000,
"total_flos": 2.9824904071075946e+19,
"train_batch_size": 256,
"trial_name": null,
"trial_params": null
}