|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.9712, |
|
"eval_steps": 500, |
|
"global_step": 308000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0032, |
|
"grad_norm": 1.7230606079101562, |
|
"learning_rate": 4.99208e-05, |
|
"loss": 1.2281, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.0064, |
|
"grad_norm": 3.655383348464966, |
|
"learning_rate": 4.9840800000000006e-05, |
|
"loss": 0.7566, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.0096, |
|
"grad_norm": 1.2925927639007568, |
|
"learning_rate": 4.97608e-05, |
|
"loss": 0.6764, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.0128, |
|
"grad_norm": 1.286004900932312, |
|
"learning_rate": 4.968080000000001e-05, |
|
"loss": 0.6304, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.016, |
|
"grad_norm": 1.2140214443206787, |
|
"learning_rate": 4.96008e-05, |
|
"loss": 0.5981, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.0192, |
|
"grad_norm": 1.2525482177734375, |
|
"learning_rate": 4.95208e-05, |
|
"loss": 0.5767, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.0224, |
|
"grad_norm": 1.2310410737991333, |
|
"learning_rate": 4.94408e-05, |
|
"loss": 0.5597, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.0256, |
|
"grad_norm": 1.1735206842422485, |
|
"learning_rate": 4.9360800000000004e-05, |
|
"loss": 0.5418, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.0288, |
|
"grad_norm": 1.114688754081726, |
|
"learning_rate": 4.9280800000000004e-05, |
|
"loss": 0.5335, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.032, |
|
"grad_norm": 0.8874593377113342, |
|
"learning_rate": 4.9200800000000005e-05, |
|
"loss": 0.5237, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.0352, |
|
"grad_norm": 1.1261299848556519, |
|
"learning_rate": 4.91208e-05, |
|
"loss": 0.5135, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.0384, |
|
"grad_norm": 0.9994556307792664, |
|
"learning_rate": 4.9040800000000007e-05, |
|
"loss": 0.5059, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.0416, |
|
"grad_norm": 1.2349673509597778, |
|
"learning_rate": 4.89608e-05, |
|
"loss": 0.4939, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.0448, |
|
"grad_norm": 0.9770995378494263, |
|
"learning_rate": 4.88808e-05, |
|
"loss": 0.4824, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.048, |
|
"grad_norm": 0.981966495513916, |
|
"learning_rate": 4.88008e-05, |
|
"loss": 0.4875, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.0512, |
|
"grad_norm": 1.0177415609359741, |
|
"learning_rate": 4.87208e-05, |
|
"loss": 0.4785, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.0544, |
|
"grad_norm": 1.0521667003631592, |
|
"learning_rate": 4.8640800000000004e-05, |
|
"loss": 0.4731, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.0576, |
|
"grad_norm": 0.8560615181922913, |
|
"learning_rate": 4.85608e-05, |
|
"loss": 0.4633, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.0608, |
|
"grad_norm": 1.0170217752456665, |
|
"learning_rate": 4.8480800000000005e-05, |
|
"loss": 0.4576, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.064, |
|
"grad_norm": 0.9891325831413269, |
|
"learning_rate": 4.84008e-05, |
|
"loss": 0.4556, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.0672, |
|
"grad_norm": 1.0609711408615112, |
|
"learning_rate": 4.832080000000001e-05, |
|
"loss": 0.4493, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.0704, |
|
"grad_norm": 0.8623799681663513, |
|
"learning_rate": 4.82408e-05, |
|
"loss": 0.4459, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.0736, |
|
"grad_norm": 0.9587870240211487, |
|
"learning_rate": 4.81608e-05, |
|
"loss": 0.4418, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.0768, |
|
"grad_norm": 0.8939447999000549, |
|
"learning_rate": 4.80808e-05, |
|
"loss": 0.4327, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.9886033535003662, |
|
"learning_rate": 4.80008e-05, |
|
"loss": 0.438, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.0832, |
|
"grad_norm": 0.9157513976097107, |
|
"learning_rate": 4.7920800000000004e-05, |
|
"loss": 0.4323, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.0864, |
|
"grad_norm": 0.9085854887962341, |
|
"learning_rate": 4.7840800000000005e-05, |
|
"loss": 0.4303, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.0896, |
|
"grad_norm": 0.9123984575271606, |
|
"learning_rate": 4.77608e-05, |
|
"loss": 0.4247, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.0928, |
|
"grad_norm": 0.839026689529419, |
|
"learning_rate": 4.7680960000000004e-05, |
|
"loss": 0.4233, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.096, |
|
"grad_norm": 0.8110847473144531, |
|
"learning_rate": 4.760096e-05, |
|
"loss": 0.4207, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.0992, |
|
"grad_norm": 0.8462579250335693, |
|
"learning_rate": 4.7520960000000005e-05, |
|
"loss": 0.421, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.1024, |
|
"grad_norm": 0.8980106711387634, |
|
"learning_rate": 4.744096e-05, |
|
"loss": 0.417, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.1056, |
|
"grad_norm": 0.8297702074050903, |
|
"learning_rate": 4.736096000000001e-05, |
|
"loss": 0.4139, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.1088, |
|
"grad_norm": 0.9856173992156982, |
|
"learning_rate": 4.728096e-05, |
|
"loss": 0.419, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.112, |
|
"grad_norm": 0.934256911277771, |
|
"learning_rate": 4.720096e-05, |
|
"loss": 0.4098, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 0.1152, |
|
"grad_norm": 0.9190649390220642, |
|
"learning_rate": 4.712096e-05, |
|
"loss": 0.412, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.1184, |
|
"grad_norm": 0.9078772664070129, |
|
"learning_rate": 4.704096e-05, |
|
"loss": 0.4043, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 0.1216, |
|
"grad_norm": 1.082939624786377, |
|
"learning_rate": 4.696112e-05, |
|
"loss": 0.4045, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.1248, |
|
"grad_norm": 0.9159390926361084, |
|
"learning_rate": 4.688112e-05, |
|
"loss": 0.4098, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"grad_norm": 0.8420547842979431, |
|
"learning_rate": 4.680128e-05, |
|
"loss": 0.4033, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.1312, |
|
"grad_norm": 0.7658286094665527, |
|
"learning_rate": 4.672128e-05, |
|
"loss": 0.4002, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 0.1344, |
|
"grad_norm": 0.9074057340621948, |
|
"learning_rate": 4.664128e-05, |
|
"loss": 0.3964, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.1376, |
|
"grad_norm": 0.6065025329589844, |
|
"learning_rate": 4.656128e-05, |
|
"loss": 0.3984, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 0.1408, |
|
"grad_norm": 0.7523757219314575, |
|
"learning_rate": 4.6481280000000004e-05, |
|
"loss": 0.3959, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.144, |
|
"grad_norm": 0.807826042175293, |
|
"learning_rate": 4.6401280000000004e-05, |
|
"loss": 0.3921, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 0.1472, |
|
"grad_norm": 0.8530682325363159, |
|
"learning_rate": 4.632128e-05, |
|
"loss": 0.4002, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 0.1504, |
|
"grad_norm": 0.8661518692970276, |
|
"learning_rate": 4.6241280000000006e-05, |
|
"loss": 0.3856, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 0.1536, |
|
"grad_norm": 0.7473235130310059, |
|
"learning_rate": 4.616144e-05, |
|
"loss": 0.3854, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.1568, |
|
"grad_norm": 0.7954819202423096, |
|
"learning_rate": 4.6081440000000005e-05, |
|
"loss": 0.3871, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.8758727312088013, |
|
"learning_rate": 4.600144e-05, |
|
"loss": 0.3842, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 0.1632, |
|
"grad_norm": 0.8430293798446655, |
|
"learning_rate": 4.592144000000001e-05, |
|
"loss": 0.3886, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 0.1664, |
|
"grad_norm": 0.6557173728942871, |
|
"learning_rate": 4.584144e-05, |
|
"loss": 0.3854, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 0.1696, |
|
"grad_norm": 0.7791888117790222, |
|
"learning_rate": 4.576144e-05, |
|
"loss": 0.3796, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 0.1728, |
|
"grad_norm": 0.736084520816803, |
|
"learning_rate": 4.56816e-05, |
|
"loss": 0.3806, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.176, |
|
"grad_norm": 0.7714269161224365, |
|
"learning_rate": 4.56016e-05, |
|
"loss": 0.3781, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 0.1792, |
|
"grad_norm": 0.766144335269928, |
|
"learning_rate": 4.552176e-05, |
|
"loss": 0.3766, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 0.1824, |
|
"grad_norm": 0.7035301923751831, |
|
"learning_rate": 4.544176e-05, |
|
"loss": 0.3737, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 0.1856, |
|
"grad_norm": 0.7573793530464172, |
|
"learning_rate": 4.536176e-05, |
|
"loss": 0.3753, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 0.1888, |
|
"grad_norm": 0.8799508213996887, |
|
"learning_rate": 4.528176e-05, |
|
"loss": 0.373, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"grad_norm": 0.8543264269828796, |
|
"learning_rate": 4.520176e-05, |
|
"loss": 0.3735, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.1952, |
|
"grad_norm": 0.6768947243690491, |
|
"learning_rate": 4.512176e-05, |
|
"loss": 0.3697, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 0.1984, |
|
"grad_norm": 0.8239702582359314, |
|
"learning_rate": 4.504176e-05, |
|
"loss": 0.3675, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 0.2016, |
|
"grad_norm": 0.8310449123382568, |
|
"learning_rate": 4.4961760000000004e-05, |
|
"loss": 0.3695, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 0.2048, |
|
"grad_norm": 0.8459475040435791, |
|
"learning_rate": 4.488176e-05, |
|
"loss": 0.3694, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 0.208, |
|
"grad_norm": 0.7346063852310181, |
|
"learning_rate": 4.4801760000000006e-05, |
|
"loss": 0.3646, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 0.2112, |
|
"grad_norm": 0.6958354115486145, |
|
"learning_rate": 4.472176e-05, |
|
"loss": 0.3704, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 0.2144, |
|
"grad_norm": 0.8244686722755432, |
|
"learning_rate": 4.464176000000001e-05, |
|
"loss": 0.3647, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 0.2176, |
|
"grad_norm": 0.7559502124786377, |
|
"learning_rate": 4.456192e-05, |
|
"loss": 0.3665, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 0.2208, |
|
"grad_norm": 0.9046504497528076, |
|
"learning_rate": 4.4481920000000007e-05, |
|
"loss": 0.3637, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 0.224, |
|
"grad_norm": 0.7771899700164795, |
|
"learning_rate": 4.440192e-05, |
|
"loss": 0.3648, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 0.2272, |
|
"grad_norm": 0.6887528300285339, |
|
"learning_rate": 4.432192e-05, |
|
"loss": 0.3562, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 0.2304, |
|
"grad_norm": 0.7471407055854797, |
|
"learning_rate": 4.424192e-05, |
|
"loss": 0.3639, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 0.2336, |
|
"grad_norm": 0.7198163270950317, |
|
"learning_rate": 4.416192e-05, |
|
"loss": 0.3604, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 0.2368, |
|
"grad_norm": 0.7383478879928589, |
|
"learning_rate": 4.4081920000000004e-05, |
|
"loss": 0.3592, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.8052579760551453, |
|
"learning_rate": 4.4001920000000004e-05, |
|
"loss": 0.3563, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 0.2432, |
|
"grad_norm": 0.7765107154846191, |
|
"learning_rate": 4.392224e-05, |
|
"loss": 0.3548, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 0.2464, |
|
"grad_norm": 0.7250288724899292, |
|
"learning_rate": 4.384224e-05, |
|
"loss": 0.3605, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 0.2496, |
|
"grad_norm": 0.6914694309234619, |
|
"learning_rate": 4.376224e-05, |
|
"loss": 0.3551, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 0.2528, |
|
"grad_norm": 0.6636275053024292, |
|
"learning_rate": 4.368224e-05, |
|
"loss": 0.3587, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 0.256, |
|
"grad_norm": 0.710564911365509, |
|
"learning_rate": 4.360224e-05, |
|
"loss": 0.3537, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 0.2592, |
|
"grad_norm": 0.6195800304412842, |
|
"learning_rate": 4.3522240000000004e-05, |
|
"loss": 0.3537, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 0.2624, |
|
"grad_norm": 0.7131514549255371, |
|
"learning_rate": 4.34424e-05, |
|
"loss": 0.3531, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 0.2656, |
|
"grad_norm": 0.6594410538673401, |
|
"learning_rate": 4.336256e-05, |
|
"loss": 0.3518, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 0.2688, |
|
"grad_norm": 0.7651230096817017, |
|
"learning_rate": 4.328256e-05, |
|
"loss": 0.3516, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 0.272, |
|
"grad_norm": 0.756515622138977, |
|
"learning_rate": 4.320256e-05, |
|
"loss": 0.3461, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 0.2752, |
|
"grad_norm": 0.7201528549194336, |
|
"learning_rate": 4.3122560000000003e-05, |
|
"loss": 0.3497, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 0.2784, |
|
"grad_norm": 0.7436856031417847, |
|
"learning_rate": 4.3042560000000004e-05, |
|
"loss": 0.3505, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 0.2816, |
|
"grad_norm": 0.7914199829101562, |
|
"learning_rate": 4.2962560000000005e-05, |
|
"loss": 0.3439, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 0.2848, |
|
"grad_norm": 0.7488194704055786, |
|
"learning_rate": 4.288256e-05, |
|
"loss": 0.349, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 0.288, |
|
"grad_norm": 0.8654124736785889, |
|
"learning_rate": 4.280256e-05, |
|
"loss": 0.3491, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 0.2912, |
|
"grad_norm": 0.6817401647567749, |
|
"learning_rate": 4.272272e-05, |
|
"loss": 0.3447, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 0.2944, |
|
"grad_norm": 0.6439715623855591, |
|
"learning_rate": 4.2642720000000006e-05, |
|
"loss": 0.3453, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 0.2976, |
|
"grad_norm": 1.3840138912200928, |
|
"learning_rate": 4.256272e-05, |
|
"loss": 0.3445, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 0.3008, |
|
"grad_norm": 0.7245766520500183, |
|
"learning_rate": 4.248272e-05, |
|
"loss": 0.3462, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 0.304, |
|
"grad_norm": 0.6877666711807251, |
|
"learning_rate": 4.240288e-05, |
|
"loss": 0.3465, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 0.3072, |
|
"grad_norm": 0.8494886159896851, |
|
"learning_rate": 4.2322880000000006e-05, |
|
"loss": 0.348, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 0.3104, |
|
"grad_norm": 0.6704971790313721, |
|
"learning_rate": 4.224288e-05, |
|
"loss": 0.3403, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 0.3136, |
|
"grad_norm": 0.6239964962005615, |
|
"learning_rate": 4.216288000000001e-05, |
|
"loss": 0.3382, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 0.3168, |
|
"grad_norm": 0.7317768335342407, |
|
"learning_rate": 4.208288e-05, |
|
"loss": 0.3385, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.7397735118865967, |
|
"learning_rate": 4.200288e-05, |
|
"loss": 0.3405, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 0.3232, |
|
"grad_norm": 1.1299536228179932, |
|
"learning_rate": 4.1922880000000003e-05, |
|
"loss": 0.3431, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 0.3264, |
|
"grad_norm": 0.6406556963920593, |
|
"learning_rate": 4.184304e-05, |
|
"loss": 0.3384, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 0.3296, |
|
"grad_norm": 0.8084424734115601, |
|
"learning_rate": 4.17632e-05, |
|
"loss": 0.3365, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 0.3328, |
|
"grad_norm": 0.7525010704994202, |
|
"learning_rate": 4.16832e-05, |
|
"loss": 0.3399, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 0.336, |
|
"grad_norm": 0.7382110953330994, |
|
"learning_rate": 4.16032e-05, |
|
"loss": 0.335, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 0.3392, |
|
"grad_norm": 0.6454793810844421, |
|
"learning_rate": 4.15232e-05, |
|
"loss": 0.3354, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 0.3424, |
|
"grad_norm": 0.639664351940155, |
|
"learning_rate": 4.14432e-05, |
|
"loss": 0.3371, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 0.3456, |
|
"grad_norm": 0.5574499368667603, |
|
"learning_rate": 4.1363200000000004e-05, |
|
"loss": 0.3341, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 0.3488, |
|
"grad_norm": 0.6772671341896057, |
|
"learning_rate": 4.12832e-05, |
|
"loss": 0.3331, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 0.352, |
|
"grad_norm": 0.6943195462226868, |
|
"learning_rate": 4.120336e-05, |
|
"loss": 0.3365, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 0.3552, |
|
"grad_norm": 0.7460485100746155, |
|
"learning_rate": 4.112336e-05, |
|
"loss": 0.3308, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 0.3584, |
|
"grad_norm": 0.7071924805641174, |
|
"learning_rate": 4.1043360000000005e-05, |
|
"loss": 0.3312, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 0.3616, |
|
"grad_norm": 0.6678891181945801, |
|
"learning_rate": 4.0963519999999996e-05, |
|
"loss": 0.3314, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 0.3648, |
|
"grad_norm": 0.7100914120674133, |
|
"learning_rate": 4.0883520000000004e-05, |
|
"loss": 0.3307, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 0.368, |
|
"grad_norm": 0.6085671782493591, |
|
"learning_rate": 4.080352e-05, |
|
"loss": 0.3282, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 0.3712, |
|
"grad_norm": 0.6634243130683899, |
|
"learning_rate": 4.0723520000000005e-05, |
|
"loss": 0.3321, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 0.3744, |
|
"grad_norm": 0.7203409075737, |
|
"learning_rate": 4.064352e-05, |
|
"loss": 0.3318, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 0.3776, |
|
"grad_norm": 0.7934884428977966, |
|
"learning_rate": 4.056352e-05, |
|
"loss": 0.3239, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 0.3808, |
|
"grad_norm": 0.8591666221618652, |
|
"learning_rate": 4.048352e-05, |
|
"loss": 0.3275, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 0.384, |
|
"grad_norm": 0.6306772232055664, |
|
"learning_rate": 4.040352e-05, |
|
"loss": 0.3308, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 0.3872, |
|
"grad_norm": 0.6059302687644958, |
|
"learning_rate": 4.032352e-05, |
|
"loss": 0.3266, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 0.3904, |
|
"grad_norm": 0.6875105500221252, |
|
"learning_rate": 4.024352e-05, |
|
"loss": 0.3265, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 0.3936, |
|
"grad_norm": 0.6397412419319153, |
|
"learning_rate": 4.0163520000000004e-05, |
|
"loss": 0.3268, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 0.3968, |
|
"grad_norm": 0.7801005840301514, |
|
"learning_rate": 4.0083520000000005e-05, |
|
"loss": 0.3314, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.6966884136199951, |
|
"learning_rate": 4.000352e-05, |
|
"loss": 0.3263, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 0.4032, |
|
"grad_norm": 0.7413304448127747, |
|
"learning_rate": 3.9923520000000006e-05, |
|
"loss": 0.3284, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 0.4064, |
|
"grad_norm": 0.7089780569076538, |
|
"learning_rate": 3.984352e-05, |
|
"loss": 0.3252, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 0.4096, |
|
"grad_norm": 0.6669878959655762, |
|
"learning_rate": 3.976352e-05, |
|
"loss": 0.3239, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 0.4128, |
|
"grad_norm": 0.7352403998374939, |
|
"learning_rate": 3.968368e-05, |
|
"loss": 0.3226, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 0.416, |
|
"grad_norm": 0.6916635036468506, |
|
"learning_rate": 3.9603840000000005e-05, |
|
"loss": 0.3234, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 0.4192, |
|
"grad_norm": 0.6800302863121033, |
|
"learning_rate": 3.952384e-05, |
|
"loss": 0.3224, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 0.4224, |
|
"grad_norm": 0.6685224771499634, |
|
"learning_rate": 3.9443840000000006e-05, |
|
"loss": 0.3197, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 0.4256, |
|
"grad_norm": 0.7219159603118896, |
|
"learning_rate": 3.936384e-05, |
|
"loss": 0.3185, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 0.4288, |
|
"grad_norm": 0.5928858518600464, |
|
"learning_rate": 3.928384e-05, |
|
"loss": 0.3291, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 0.432, |
|
"grad_norm": 0.6616542339324951, |
|
"learning_rate": 3.920384e-05, |
|
"loss": 0.3266, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 0.4352, |
|
"grad_norm": 0.5957266092300415, |
|
"learning_rate": 3.912384e-05, |
|
"loss": 0.32, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 0.4384, |
|
"grad_norm": 0.6576407551765442, |
|
"learning_rate": 3.904384e-05, |
|
"loss": 0.3246, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 0.4416, |
|
"grad_norm": 0.6852056384086609, |
|
"learning_rate": 3.896416e-05, |
|
"loss": 0.3268, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 0.4448, |
|
"grad_norm": 0.780893087387085, |
|
"learning_rate": 3.888416e-05, |
|
"loss": 0.3229, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 0.448, |
|
"grad_norm": 0.6741476655006409, |
|
"learning_rate": 3.880416e-05, |
|
"loss": 0.3188, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 0.4512, |
|
"grad_norm": 0.5919800400733948, |
|
"learning_rate": 3.872416e-05, |
|
"loss": 0.3208, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 0.4544, |
|
"grad_norm": 0.6476633548736572, |
|
"learning_rate": 3.864416e-05, |
|
"loss": 0.322, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 0.4576, |
|
"grad_norm": 0.5667979717254639, |
|
"learning_rate": 3.8564159999999996e-05, |
|
"loss": 0.3151, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 0.4608, |
|
"grad_norm": 0.6126554608345032, |
|
"learning_rate": 3.8484160000000004e-05, |
|
"loss": 0.3185, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 0.464, |
|
"grad_norm": 0.7995546460151672, |
|
"learning_rate": 3.840416e-05, |
|
"loss": 0.3174, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 0.4672, |
|
"grad_norm": 0.5964981317520142, |
|
"learning_rate": 3.8324160000000005e-05, |
|
"loss": 0.3187, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 0.4704, |
|
"grad_norm": 0.7718212008476257, |
|
"learning_rate": 3.824416e-05, |
|
"loss": 0.3156, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 0.4736, |
|
"grad_norm": 0.7086686491966248, |
|
"learning_rate": 3.8164320000000005e-05, |
|
"loss": 0.3189, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 0.4768, |
|
"grad_norm": 0.7988029718399048, |
|
"learning_rate": 3.808432e-05, |
|
"loss": 0.3151, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.6092699766159058, |
|
"learning_rate": 3.8004320000000006e-05, |
|
"loss": 0.3153, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 0.4832, |
|
"grad_norm": 0.6181166768074036, |
|
"learning_rate": 3.792432e-05, |
|
"loss": 0.3113, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 0.4864, |
|
"grad_norm": 0.5952243208885193, |
|
"learning_rate": 3.784432e-05, |
|
"loss": 0.3091, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 0.4896, |
|
"grad_norm": 0.5732501745223999, |
|
"learning_rate": 3.776432e-05, |
|
"loss": 0.3169, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 0.4928, |
|
"grad_norm": 0.5866090059280396, |
|
"learning_rate": 3.768432e-05, |
|
"loss": 0.3135, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 0.496, |
|
"grad_norm": 0.6748520135879517, |
|
"learning_rate": 3.760432e-05, |
|
"loss": 0.3134, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 0.4992, |
|
"grad_norm": 0.5922159552574158, |
|
"learning_rate": 3.752448e-05, |
|
"loss": 0.3156, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 0.5024, |
|
"grad_norm": 0.6446545124053955, |
|
"learning_rate": 3.744448e-05, |
|
"loss": 0.3171, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 0.5056, |
|
"grad_norm": 0.6506426334381104, |
|
"learning_rate": 3.736448e-05, |
|
"loss": 0.3138, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 0.5088, |
|
"grad_norm": 0.6826354265213013, |
|
"learning_rate": 3.728448e-05, |
|
"loss": 0.3164, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 0.512, |
|
"grad_norm": 0.6866195797920227, |
|
"learning_rate": 3.72048e-05, |
|
"loss": 0.315, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 0.5152, |
|
"grad_norm": 0.5590147376060486, |
|
"learning_rate": 3.7124960000000005e-05, |
|
"loss": 0.3094, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 0.5184, |
|
"grad_norm": 0.6728788614273071, |
|
"learning_rate": 3.704496e-05, |
|
"loss": 0.3194, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 0.5216, |
|
"grad_norm": 0.6108749508857727, |
|
"learning_rate": 3.696496000000001e-05, |
|
"loss": 0.3128, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 0.5248, |
|
"grad_norm": 0.5888856649398804, |
|
"learning_rate": 3.688496e-05, |
|
"loss": 0.3121, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 0.528, |
|
"grad_norm": 0.727268397808075, |
|
"learning_rate": 3.680496e-05, |
|
"loss": 0.3193, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 0.5312, |
|
"grad_norm": 0.6358634233474731, |
|
"learning_rate": 3.672496e-05, |
|
"loss": 0.3092, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 0.5344, |
|
"grad_norm": 0.6482620239257812, |
|
"learning_rate": 3.664496e-05, |
|
"loss": 0.3098, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 0.5376, |
|
"grad_norm": 0.5968552827835083, |
|
"learning_rate": 3.6564960000000004e-05, |
|
"loss": 0.3108, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 0.5408, |
|
"grad_norm": 0.6621351838111877, |
|
"learning_rate": 3.6484960000000004e-05, |
|
"loss": 0.3065, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 0.544, |
|
"grad_norm": 0.5520649552345276, |
|
"learning_rate": 3.640496e-05, |
|
"loss": 0.3088, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 0.5472, |
|
"grad_norm": 0.6885005831718445, |
|
"learning_rate": 3.632496e-05, |
|
"loss": 0.3075, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 0.5504, |
|
"grad_norm": 0.666653573513031, |
|
"learning_rate": 3.624512e-05, |
|
"loss": 0.3113, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 0.5536, |
|
"grad_norm": 0.6344409584999084, |
|
"learning_rate": 3.6165120000000005e-05, |
|
"loss": 0.3085, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 0.5568, |
|
"grad_norm": 0.5792534947395325, |
|
"learning_rate": 3.608512e-05, |
|
"loss": 0.3132, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.6864989995956421, |
|
"learning_rate": 3.600512e-05, |
|
"loss": 0.3079, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 0.5632, |
|
"grad_norm": 0.6077435612678528, |
|
"learning_rate": 3.592512e-05, |
|
"loss": 0.3095, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 0.5664, |
|
"grad_norm": 0.7073134779930115, |
|
"learning_rate": 3.584512e-05, |
|
"loss": 0.3116, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 0.5696, |
|
"grad_norm": 0.6477733850479126, |
|
"learning_rate": 3.576512e-05, |
|
"loss": 0.3062, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 0.5728, |
|
"grad_norm": 0.7786093354225159, |
|
"learning_rate": 3.568512e-05, |
|
"loss": 0.3017, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 0.576, |
|
"grad_norm": 0.6447868943214417, |
|
"learning_rate": 3.560528e-05, |
|
"loss": 0.3077, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 0.5792, |
|
"grad_norm": 0.6663397550582886, |
|
"learning_rate": 3.552528e-05, |
|
"loss": 0.3089, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 0.5824, |
|
"grad_norm": 0.533214807510376, |
|
"learning_rate": 3.544528e-05, |
|
"loss": 0.3064, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 0.5856, |
|
"grad_norm": 0.6517444849014282, |
|
"learning_rate": 3.5365280000000004e-05, |
|
"loss": 0.3108, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 0.5888, |
|
"grad_norm": 0.7635303735733032, |
|
"learning_rate": 3.528544e-05, |
|
"loss": 0.3028, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 0.592, |
|
"grad_norm": 0.6636632680892944, |
|
"learning_rate": 3.520544e-05, |
|
"loss": 0.3015, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 0.5952, |
|
"grad_norm": 0.7296783924102783, |
|
"learning_rate": 3.5125440000000004e-05, |
|
"loss": 0.305, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 0.5984, |
|
"grad_norm": 0.5089054703712463, |
|
"learning_rate": 3.50456e-05, |
|
"loss": 0.3092, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 0.6016, |
|
"grad_norm": 0.6761330366134644, |
|
"learning_rate": 3.49656e-05, |
|
"loss": 0.3055, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 0.6048, |
|
"grad_norm": 0.6327843070030212, |
|
"learning_rate": 3.4885600000000004e-05, |
|
"loss": 0.3055, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 0.608, |
|
"grad_norm": 0.5940554141998291, |
|
"learning_rate": 3.48056e-05, |
|
"loss": 0.3017, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 0.6112, |
|
"grad_norm": 0.516828179359436, |
|
"learning_rate": 3.4725600000000005e-05, |
|
"loss": 0.3035, |
|
"step": 95500 |
|
}, |
|
{ |
|
"epoch": 0.6144, |
|
"grad_norm": 0.5835782289505005, |
|
"learning_rate": 3.46456e-05, |
|
"loss": 0.2978, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 0.6176, |
|
"grad_norm": 0.5978230237960815, |
|
"learning_rate": 3.456560000000001e-05, |
|
"loss": 0.301, |
|
"step": 96500 |
|
}, |
|
{ |
|
"epoch": 0.6208, |
|
"grad_norm": 0.5460017323493958, |
|
"learning_rate": 3.44856e-05, |
|
"loss": 0.3052, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 0.624, |
|
"grad_norm": 0.6875701546669006, |
|
"learning_rate": 3.44056e-05, |
|
"loss": 0.3028, |
|
"step": 97500 |
|
}, |
|
{ |
|
"epoch": 0.6272, |
|
"grad_norm": 0.5780492424964905, |
|
"learning_rate": 3.43256e-05, |
|
"loss": 0.2988, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 0.6304, |
|
"grad_norm": 0.5191554427146912, |
|
"learning_rate": 3.42456e-05, |
|
"loss": 0.3052, |
|
"step": 98500 |
|
}, |
|
{ |
|
"epoch": 0.6336, |
|
"grad_norm": 0.6811420917510986, |
|
"learning_rate": 3.416576e-05, |
|
"loss": 0.3032, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 0.6368, |
|
"grad_norm": 0.6301366686820984, |
|
"learning_rate": 3.408576e-05, |
|
"loss": 0.2979, |
|
"step": 99500 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.5777577757835388, |
|
"learning_rate": 3.400576e-05, |
|
"loss": 0.2991, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 0.6432, |
|
"grad_norm": 0.6444558501243591, |
|
"learning_rate": 3.392592e-05, |
|
"loss": 0.298, |
|
"step": 100500 |
|
}, |
|
{ |
|
"epoch": 0.6464, |
|
"grad_norm": 0.4793080985546112, |
|
"learning_rate": 3.384592e-05, |
|
"loss": 0.3014, |
|
"step": 101000 |
|
}, |
|
{ |
|
"epoch": 0.6496, |
|
"grad_norm": 0.6691552400588989, |
|
"learning_rate": 3.376608e-05, |
|
"loss": 0.3006, |
|
"step": 101500 |
|
}, |
|
{ |
|
"epoch": 0.6528, |
|
"grad_norm": 0.6318476796150208, |
|
"learning_rate": 3.368608e-05, |
|
"loss": 0.3032, |
|
"step": 102000 |
|
}, |
|
{ |
|
"epoch": 0.656, |
|
"grad_norm": 0.5805894136428833, |
|
"learning_rate": 3.360608e-05, |
|
"loss": 0.3014, |
|
"step": 102500 |
|
}, |
|
{ |
|
"epoch": 0.6592, |
|
"grad_norm": 0.5658220648765564, |
|
"learning_rate": 3.352608e-05, |
|
"loss": 0.3, |
|
"step": 103000 |
|
}, |
|
{ |
|
"epoch": 0.6624, |
|
"grad_norm": 0.6117516160011292, |
|
"learning_rate": 3.3446080000000004e-05, |
|
"loss": 0.3014, |
|
"step": 103500 |
|
}, |
|
{ |
|
"epoch": 0.6656, |
|
"grad_norm": 0.6763502359390259, |
|
"learning_rate": 3.336608e-05, |
|
"loss": 0.3043, |
|
"step": 104000 |
|
}, |
|
{ |
|
"epoch": 0.6688, |
|
"grad_norm": 0.6046746969223022, |
|
"learning_rate": 3.3286080000000005e-05, |
|
"loss": 0.2965, |
|
"step": 104500 |
|
}, |
|
{ |
|
"epoch": 0.672, |
|
"grad_norm": 0.7453213930130005, |
|
"learning_rate": 3.320608e-05, |
|
"loss": 0.2964, |
|
"step": 105000 |
|
}, |
|
{ |
|
"epoch": 0.6752, |
|
"grad_norm": 0.6010546088218689, |
|
"learning_rate": 3.3126080000000007e-05, |
|
"loss": 0.2975, |
|
"step": 105500 |
|
}, |
|
{ |
|
"epoch": 0.6784, |
|
"grad_norm": 0.7377296686172485, |
|
"learning_rate": 3.304608e-05, |
|
"loss": 0.2993, |
|
"step": 106000 |
|
}, |
|
{ |
|
"epoch": 0.6816, |
|
"grad_norm": 0.6612259745597839, |
|
"learning_rate": 3.2966240000000006e-05, |
|
"loss": 0.298, |
|
"step": 106500 |
|
}, |
|
{ |
|
"epoch": 0.6848, |
|
"grad_norm": 0.6570013165473938, |
|
"learning_rate": 3.288624e-05, |
|
"loss": 0.296, |
|
"step": 107000 |
|
}, |
|
{ |
|
"epoch": 0.688, |
|
"grad_norm": 0.633602499961853, |
|
"learning_rate": 3.280624e-05, |
|
"loss": 0.2989, |
|
"step": 107500 |
|
}, |
|
{ |
|
"epoch": 0.6912, |
|
"grad_norm": 0.5594373345375061, |
|
"learning_rate": 3.272624e-05, |
|
"loss": 0.2977, |
|
"step": 108000 |
|
}, |
|
{ |
|
"epoch": 0.6944, |
|
"grad_norm": 0.5643302202224731, |
|
"learning_rate": 3.264624e-05, |
|
"loss": 0.2941, |
|
"step": 108500 |
|
}, |
|
{ |
|
"epoch": 0.6976, |
|
"grad_norm": 0.5127794146537781, |
|
"learning_rate": 3.256624e-05, |
|
"loss": 0.2953, |
|
"step": 109000 |
|
}, |
|
{ |
|
"epoch": 0.7008, |
|
"grad_norm": 0.6273791790008545, |
|
"learning_rate": 3.24864e-05, |
|
"loss": 0.2944, |
|
"step": 109500 |
|
}, |
|
{ |
|
"epoch": 0.704, |
|
"grad_norm": 0.5089157223701477, |
|
"learning_rate": 3.24064e-05, |
|
"loss": 0.3, |
|
"step": 110000 |
|
}, |
|
{ |
|
"epoch": 0.7072, |
|
"grad_norm": 0.5816791653633118, |
|
"learning_rate": 3.232656e-05, |
|
"loss": 0.2957, |
|
"step": 110500 |
|
}, |
|
{ |
|
"epoch": 0.7104, |
|
"grad_norm": 0.6407476663589478, |
|
"learning_rate": 3.224656e-05, |
|
"loss": 0.2974, |
|
"step": 111000 |
|
}, |
|
{ |
|
"epoch": 0.7136, |
|
"grad_norm": 0.46444937586784363, |
|
"learning_rate": 3.216656e-05, |
|
"loss": 0.2969, |
|
"step": 111500 |
|
}, |
|
{ |
|
"epoch": 0.7168, |
|
"grad_norm": 0.4997446835041046, |
|
"learning_rate": 3.2086559999999996e-05, |
|
"loss": 0.2966, |
|
"step": 112000 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.6996490359306335, |
|
"learning_rate": 3.2006560000000003e-05, |
|
"loss": 0.2965, |
|
"step": 112500 |
|
}, |
|
{ |
|
"epoch": 0.7232, |
|
"grad_norm": 0.5806016325950623, |
|
"learning_rate": 3.192672e-05, |
|
"loss": 0.2952, |
|
"step": 113000 |
|
}, |
|
{ |
|
"epoch": 0.7264, |
|
"grad_norm": 0.6140916347503662, |
|
"learning_rate": 3.184672e-05, |
|
"loss": 0.2995, |
|
"step": 113500 |
|
}, |
|
{ |
|
"epoch": 0.7296, |
|
"grad_norm": 0.45879319310188293, |
|
"learning_rate": 3.1766719999999997e-05, |
|
"loss": 0.292, |
|
"step": 114000 |
|
}, |
|
{ |
|
"epoch": 0.7328, |
|
"grad_norm": 0.6141937971115112, |
|
"learning_rate": 3.1686720000000004e-05, |
|
"loss": 0.2945, |
|
"step": 114500 |
|
}, |
|
{ |
|
"epoch": 0.736, |
|
"grad_norm": 0.6565462946891785, |
|
"learning_rate": 3.160672e-05, |
|
"loss": 0.2982, |
|
"step": 115000 |
|
}, |
|
{ |
|
"epoch": 0.7392, |
|
"grad_norm": 0.5997145175933838, |
|
"learning_rate": 3.1526720000000006e-05, |
|
"loss": 0.2957, |
|
"step": 115500 |
|
}, |
|
{ |
|
"epoch": 0.7424, |
|
"grad_norm": 0.736965537071228, |
|
"learning_rate": 3.144672e-05, |
|
"loss": 0.2953, |
|
"step": 116000 |
|
}, |
|
{ |
|
"epoch": 0.7456, |
|
"grad_norm": 0.6587550640106201, |
|
"learning_rate": 3.136672e-05, |
|
"loss": 0.2917, |
|
"step": 116500 |
|
}, |
|
{ |
|
"epoch": 0.7488, |
|
"grad_norm": 0.7265971302986145, |
|
"learning_rate": 3.128672e-05, |
|
"loss": 0.2908, |
|
"step": 117000 |
|
}, |
|
{ |
|
"epoch": 0.752, |
|
"grad_norm": 0.6158114075660706, |
|
"learning_rate": 3.120672e-05, |
|
"loss": 0.2916, |
|
"step": 117500 |
|
}, |
|
{ |
|
"epoch": 0.7552, |
|
"grad_norm": 0.6521216034889221, |
|
"learning_rate": 3.112672e-05, |
|
"loss": 0.2947, |
|
"step": 118000 |
|
}, |
|
{ |
|
"epoch": 0.7584, |
|
"grad_norm": 0.5868868231773376, |
|
"learning_rate": 3.1046720000000004e-05, |
|
"loss": 0.2919, |
|
"step": 118500 |
|
}, |
|
{ |
|
"epoch": 0.7616, |
|
"grad_norm": 0.6495432257652283, |
|
"learning_rate": 3.096672e-05, |
|
"loss": 0.2974, |
|
"step": 119000 |
|
}, |
|
{ |
|
"epoch": 0.7648, |
|
"grad_norm": 0.6204816102981567, |
|
"learning_rate": 3.0886720000000005e-05, |
|
"loss": 0.2945, |
|
"step": 119500 |
|
}, |
|
{ |
|
"epoch": 0.768, |
|
"grad_norm": 0.6333968639373779, |
|
"learning_rate": 3.080672e-05, |
|
"loss": 0.292, |
|
"step": 120000 |
|
}, |
|
{ |
|
"epoch": 0.7712, |
|
"grad_norm": 0.5613961815834045, |
|
"learning_rate": 3.0726880000000004e-05, |
|
"loss": 0.2938, |
|
"step": 120500 |
|
}, |
|
{ |
|
"epoch": 0.7744, |
|
"grad_norm": 0.6623988151550293, |
|
"learning_rate": 3.064688e-05, |
|
"loss": 0.2954, |
|
"step": 121000 |
|
}, |
|
{ |
|
"epoch": 0.7776, |
|
"grad_norm": 0.6134264469146729, |
|
"learning_rate": 3.0566880000000006e-05, |
|
"loss": 0.2915, |
|
"step": 121500 |
|
}, |
|
{ |
|
"epoch": 0.7808, |
|
"grad_norm": 0.6159347891807556, |
|
"learning_rate": 3.048688e-05, |
|
"loss": 0.2887, |
|
"step": 122000 |
|
}, |
|
{ |
|
"epoch": 0.784, |
|
"grad_norm": 0.6079424023628235, |
|
"learning_rate": 3.0407040000000005e-05, |
|
"loss": 0.2915, |
|
"step": 122500 |
|
}, |
|
{ |
|
"epoch": 0.7872, |
|
"grad_norm": 0.7703385353088379, |
|
"learning_rate": 3.0327040000000002e-05, |
|
"loss": 0.2901, |
|
"step": 123000 |
|
}, |
|
{ |
|
"epoch": 0.7904, |
|
"grad_norm": 0.5626256465911865, |
|
"learning_rate": 3.024704e-05, |
|
"loss": 0.2938, |
|
"step": 123500 |
|
}, |
|
{ |
|
"epoch": 0.7936, |
|
"grad_norm": 0.554914653301239, |
|
"learning_rate": 3.016704e-05, |
|
"loss": 0.2913, |
|
"step": 124000 |
|
}, |
|
{ |
|
"epoch": 0.7968, |
|
"grad_norm": 0.6610060930252075, |
|
"learning_rate": 3.008704e-05, |
|
"loss": 0.2912, |
|
"step": 124500 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.6194009780883789, |
|
"learning_rate": 3.0007040000000002e-05, |
|
"loss": 0.2901, |
|
"step": 125000 |
|
}, |
|
{ |
|
"epoch": 0.8032, |
|
"grad_norm": 0.7150211930274963, |
|
"learning_rate": 2.992704e-05, |
|
"loss": 0.2895, |
|
"step": 125500 |
|
}, |
|
{ |
|
"epoch": 0.8064, |
|
"grad_norm": 0.6945148706436157, |
|
"learning_rate": 2.9847040000000003e-05, |
|
"loss": 0.2878, |
|
"step": 126000 |
|
}, |
|
{ |
|
"epoch": 0.8096, |
|
"grad_norm": 0.6546908617019653, |
|
"learning_rate": 2.9767200000000002e-05, |
|
"loss": 0.287, |
|
"step": 126500 |
|
}, |
|
{ |
|
"epoch": 0.8128, |
|
"grad_norm": 0.535040020942688, |
|
"learning_rate": 2.9687360000000004e-05, |
|
"loss": 0.2901, |
|
"step": 127000 |
|
}, |
|
{ |
|
"epoch": 0.816, |
|
"grad_norm": 0.6062806844711304, |
|
"learning_rate": 2.960736e-05, |
|
"loss": 0.2862, |
|
"step": 127500 |
|
}, |
|
{ |
|
"epoch": 0.8192, |
|
"grad_norm": 0.6202298998832703, |
|
"learning_rate": 2.9527360000000005e-05, |
|
"loss": 0.2884, |
|
"step": 128000 |
|
}, |
|
{ |
|
"epoch": 0.8224, |
|
"grad_norm": 0.5966545343399048, |
|
"learning_rate": 2.9447360000000003e-05, |
|
"loss": 0.2877, |
|
"step": 128500 |
|
}, |
|
{ |
|
"epoch": 0.8256, |
|
"grad_norm": 0.5024796724319458, |
|
"learning_rate": 2.936736e-05, |
|
"loss": 0.2882, |
|
"step": 129000 |
|
}, |
|
{ |
|
"epoch": 0.8288, |
|
"grad_norm": 0.5895559191703796, |
|
"learning_rate": 2.9287520000000002e-05, |
|
"loss": 0.288, |
|
"step": 129500 |
|
}, |
|
{ |
|
"epoch": 0.832, |
|
"grad_norm": 0.9302066564559937, |
|
"learning_rate": 2.920752e-05, |
|
"loss": 0.286, |
|
"step": 130000 |
|
}, |
|
{ |
|
"epoch": 0.8352, |
|
"grad_norm": 0.573466956615448, |
|
"learning_rate": 2.9127520000000003e-05, |
|
"loss": 0.2848, |
|
"step": 130500 |
|
}, |
|
{ |
|
"epoch": 0.8384, |
|
"grad_norm": 0.5901783108711243, |
|
"learning_rate": 2.904768e-05, |
|
"loss": 0.2883, |
|
"step": 131000 |
|
}, |
|
{ |
|
"epoch": 0.8416, |
|
"grad_norm": 0.7780030369758606, |
|
"learning_rate": 2.8967680000000002e-05, |
|
"loss": 0.2914, |
|
"step": 131500 |
|
}, |
|
{ |
|
"epoch": 0.8448, |
|
"grad_norm": 0.6630533933639526, |
|
"learning_rate": 2.888768e-05, |
|
"loss": 0.2878, |
|
"step": 132000 |
|
}, |
|
{ |
|
"epoch": 0.848, |
|
"grad_norm": 0.6001667976379395, |
|
"learning_rate": 2.8807680000000004e-05, |
|
"loss": 0.2818, |
|
"step": 132500 |
|
}, |
|
{ |
|
"epoch": 0.8512, |
|
"grad_norm": 0.6324682831764221, |
|
"learning_rate": 2.872768e-05, |
|
"loss": 0.2849, |
|
"step": 133000 |
|
}, |
|
{ |
|
"epoch": 0.8544, |
|
"grad_norm": 0.6814092993736267, |
|
"learning_rate": 2.864768e-05, |
|
"loss": 0.288, |
|
"step": 133500 |
|
}, |
|
{ |
|
"epoch": 0.8576, |
|
"grad_norm": 0.651709794998169, |
|
"learning_rate": 2.8567680000000003e-05, |
|
"loss": 0.2872, |
|
"step": 134000 |
|
}, |
|
{ |
|
"epoch": 0.8608, |
|
"grad_norm": 0.5912330746650696, |
|
"learning_rate": 2.848768e-05, |
|
"loss": 0.2824, |
|
"step": 134500 |
|
}, |
|
{ |
|
"epoch": 0.864, |
|
"grad_norm": 0.5821974277496338, |
|
"learning_rate": 2.8407680000000004e-05, |
|
"loss": 0.2853, |
|
"step": 135000 |
|
}, |
|
{ |
|
"epoch": 0.8672, |
|
"grad_norm": 0.6262611150741577, |
|
"learning_rate": 2.832784e-05, |
|
"loss": 0.2848, |
|
"step": 135500 |
|
}, |
|
{ |
|
"epoch": 0.8704, |
|
"grad_norm": 0.5360976457595825, |
|
"learning_rate": 2.8247840000000004e-05, |
|
"loss": 0.2869, |
|
"step": 136000 |
|
}, |
|
{ |
|
"epoch": 0.8736, |
|
"grad_norm": 0.6523284912109375, |
|
"learning_rate": 2.816784e-05, |
|
"loss": 0.2792, |
|
"step": 136500 |
|
}, |
|
{ |
|
"epoch": 0.8768, |
|
"grad_norm": 0.6329330205917358, |
|
"learning_rate": 2.808784e-05, |
|
"loss": 0.2865, |
|
"step": 137000 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.6053124666213989, |
|
"learning_rate": 2.8007840000000003e-05, |
|
"loss": 0.2844, |
|
"step": 137500 |
|
}, |
|
{ |
|
"epoch": 0.8832, |
|
"grad_norm": 0.6887571811676025, |
|
"learning_rate": 2.7927999999999998e-05, |
|
"loss": 0.288, |
|
"step": 138000 |
|
}, |
|
{ |
|
"epoch": 0.8864, |
|
"grad_norm": 0.7047476172447205, |
|
"learning_rate": 2.7848000000000002e-05, |
|
"loss": 0.2877, |
|
"step": 138500 |
|
}, |
|
{ |
|
"epoch": 0.8896, |
|
"grad_norm": 0.598227858543396, |
|
"learning_rate": 2.7768e-05, |
|
"loss": 0.2867, |
|
"step": 139000 |
|
}, |
|
{ |
|
"epoch": 0.8928, |
|
"grad_norm": 0.5094701051712036, |
|
"learning_rate": 2.7688000000000003e-05, |
|
"loss": 0.2832, |
|
"step": 139500 |
|
}, |
|
{ |
|
"epoch": 0.896, |
|
"grad_norm": 0.5749739408493042, |
|
"learning_rate": 2.7608e-05, |
|
"loss": 0.2821, |
|
"step": 140000 |
|
}, |
|
{ |
|
"epoch": 0.8992, |
|
"grad_norm": 0.4442578852176666, |
|
"learning_rate": 2.7528320000000003e-05, |
|
"loss": 0.282, |
|
"step": 140500 |
|
}, |
|
{ |
|
"epoch": 0.9024, |
|
"grad_norm": 0.5418574213981628, |
|
"learning_rate": 2.744832e-05, |
|
"loss": 0.2816, |
|
"step": 141000 |
|
}, |
|
{ |
|
"epoch": 0.9056, |
|
"grad_norm": 0.5984327793121338, |
|
"learning_rate": 2.736832e-05, |
|
"loss": 0.285, |
|
"step": 141500 |
|
}, |
|
{ |
|
"epoch": 0.9088, |
|
"grad_norm": 0.6572843194007874, |
|
"learning_rate": 2.728832e-05, |
|
"loss": 0.2817, |
|
"step": 142000 |
|
}, |
|
{ |
|
"epoch": 0.912, |
|
"grad_norm": 0.590993344783783, |
|
"learning_rate": 2.7208320000000003e-05, |
|
"loss": 0.288, |
|
"step": 142500 |
|
}, |
|
{ |
|
"epoch": 0.9152, |
|
"grad_norm": 0.6096624135971069, |
|
"learning_rate": 2.712832e-05, |
|
"loss": 0.2861, |
|
"step": 143000 |
|
}, |
|
{ |
|
"epoch": 0.9184, |
|
"grad_norm": 0.5189167261123657, |
|
"learning_rate": 2.7048319999999998e-05, |
|
"loss": 0.2857, |
|
"step": 143500 |
|
}, |
|
{ |
|
"epoch": 0.9216, |
|
"grad_norm": 0.5812899470329285, |
|
"learning_rate": 2.6968320000000002e-05, |
|
"loss": 0.2888, |
|
"step": 144000 |
|
}, |
|
{ |
|
"epoch": 0.9248, |
|
"grad_norm": 0.515201210975647, |
|
"learning_rate": 2.688832e-05, |
|
"loss": 0.2791, |
|
"step": 144500 |
|
}, |
|
{ |
|
"epoch": 0.928, |
|
"grad_norm": 0.6398504972457886, |
|
"learning_rate": 2.6808320000000004e-05, |
|
"loss": 0.282, |
|
"step": 145000 |
|
}, |
|
{ |
|
"epoch": 0.9312, |
|
"grad_norm": 0.5990891456604004, |
|
"learning_rate": 2.672832e-05, |
|
"loss": 0.28, |
|
"step": 145500 |
|
}, |
|
{ |
|
"epoch": 0.9344, |
|
"grad_norm": 0.5883029699325562, |
|
"learning_rate": 2.664832e-05, |
|
"loss": 0.2777, |
|
"step": 146000 |
|
}, |
|
{ |
|
"epoch": 0.9376, |
|
"grad_norm": 0.6432376503944397, |
|
"learning_rate": 2.656848e-05, |
|
"loss": 0.2804, |
|
"step": 146500 |
|
}, |
|
{ |
|
"epoch": 0.9408, |
|
"grad_norm": 0.5375948548316956, |
|
"learning_rate": 2.6488479999999997e-05, |
|
"loss": 0.2807, |
|
"step": 147000 |
|
}, |
|
{ |
|
"epoch": 0.944, |
|
"grad_norm": 0.6207411885261536, |
|
"learning_rate": 2.6408640000000003e-05, |
|
"loss": 0.283, |
|
"step": 147500 |
|
}, |
|
{ |
|
"epoch": 0.9472, |
|
"grad_norm": 0.5854378342628479, |
|
"learning_rate": 2.632864e-05, |
|
"loss": 0.2854, |
|
"step": 148000 |
|
}, |
|
{ |
|
"epoch": 0.9504, |
|
"grad_norm": 0.5260078310966492, |
|
"learning_rate": 2.6248800000000002e-05, |
|
"loss": 0.2836, |
|
"step": 148500 |
|
}, |
|
{ |
|
"epoch": 0.9536, |
|
"grad_norm": 0.6284717917442322, |
|
"learning_rate": 2.61688e-05, |
|
"loss": 0.2824, |
|
"step": 149000 |
|
}, |
|
{ |
|
"epoch": 0.9568, |
|
"grad_norm": 0.6092182397842407, |
|
"learning_rate": 2.608896e-05, |
|
"loss": 0.2804, |
|
"step": 149500 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.6028911471366882, |
|
"learning_rate": 2.600896e-05, |
|
"loss": 0.281, |
|
"step": 150000 |
|
}, |
|
{ |
|
"epoch": 0.9632, |
|
"grad_norm": 0.5008478164672852, |
|
"learning_rate": 2.5928960000000003e-05, |
|
"loss": 0.277, |
|
"step": 150500 |
|
}, |
|
{ |
|
"epoch": 0.9664, |
|
"grad_norm": 0.5233867168426514, |
|
"learning_rate": 2.584896e-05, |
|
"loss": 0.2807, |
|
"step": 151000 |
|
}, |
|
{ |
|
"epoch": 0.9696, |
|
"grad_norm": 0.5762408375740051, |
|
"learning_rate": 2.5768960000000004e-05, |
|
"loss": 0.2831, |
|
"step": 151500 |
|
}, |
|
{ |
|
"epoch": 0.9728, |
|
"grad_norm": 0.6097844243049622, |
|
"learning_rate": 2.568896e-05, |
|
"loss": 0.2803, |
|
"step": 152000 |
|
}, |
|
{ |
|
"epoch": 0.976, |
|
"grad_norm": 0.6696804761886597, |
|
"learning_rate": 2.560896e-05, |
|
"loss": 0.2742, |
|
"step": 152500 |
|
}, |
|
{ |
|
"epoch": 0.9792, |
|
"grad_norm": 0.6028556823730469, |
|
"learning_rate": 2.5528960000000003e-05, |
|
"loss": 0.282, |
|
"step": 153000 |
|
}, |
|
{ |
|
"epoch": 0.9824, |
|
"grad_norm": 0.6651898622512817, |
|
"learning_rate": 2.544896e-05, |
|
"loss": 0.2849, |
|
"step": 153500 |
|
}, |
|
{ |
|
"epoch": 0.9856, |
|
"grad_norm": 0.5219380855560303, |
|
"learning_rate": 2.536896e-05, |
|
"loss": 0.2785, |
|
"step": 154000 |
|
}, |
|
{ |
|
"epoch": 0.9888, |
|
"grad_norm": 0.6161176562309265, |
|
"learning_rate": 2.5288960000000002e-05, |
|
"loss": 0.2808, |
|
"step": 154500 |
|
}, |
|
{ |
|
"epoch": 0.992, |
|
"grad_norm": 0.7915316224098206, |
|
"learning_rate": 2.5208960000000003e-05, |
|
"loss": 0.2777, |
|
"step": 155000 |
|
}, |
|
{ |
|
"epoch": 0.9952, |
|
"grad_norm": 0.7261882424354553, |
|
"learning_rate": 2.512896e-05, |
|
"loss": 0.2767, |
|
"step": 155500 |
|
}, |
|
{ |
|
"epoch": 0.9984, |
|
"grad_norm": 0.5452406406402588, |
|
"learning_rate": 2.5048959999999997e-05, |
|
"loss": 0.2764, |
|
"step": 156000 |
|
}, |
|
{ |
|
"epoch": 1.0016, |
|
"grad_norm": 0.642181396484375, |
|
"learning_rate": 2.4969120000000003e-05, |
|
"loss": 0.2746, |
|
"step": 156500 |
|
}, |
|
{ |
|
"epoch": 1.0048, |
|
"grad_norm": 0.5900291204452515, |
|
"learning_rate": 2.4889120000000003e-05, |
|
"loss": 0.2721, |
|
"step": 157000 |
|
}, |
|
{ |
|
"epoch": 1.008, |
|
"grad_norm": 0.5960043668746948, |
|
"learning_rate": 2.480912e-05, |
|
"loss": 0.265, |
|
"step": 157500 |
|
}, |
|
{ |
|
"epoch": 1.0112, |
|
"grad_norm": 0.582115650177002, |
|
"learning_rate": 2.472912e-05, |
|
"loss": 0.2673, |
|
"step": 158000 |
|
}, |
|
{ |
|
"epoch": 1.0144, |
|
"grad_norm": 0.552392303943634, |
|
"learning_rate": 2.464912e-05, |
|
"loss": 0.2663, |
|
"step": 158500 |
|
}, |
|
{ |
|
"epoch": 1.0176, |
|
"grad_norm": 0.5585765242576599, |
|
"learning_rate": 2.456912e-05, |
|
"loss": 0.2688, |
|
"step": 159000 |
|
}, |
|
{ |
|
"epoch": 1.0208, |
|
"grad_norm": 0.6049332022666931, |
|
"learning_rate": 2.448912e-05, |
|
"loss": 0.266, |
|
"step": 159500 |
|
}, |
|
{ |
|
"epoch": 1.024, |
|
"grad_norm": 0.5749877095222473, |
|
"learning_rate": 2.440912e-05, |
|
"loss": 0.2689, |
|
"step": 160000 |
|
}, |
|
{ |
|
"epoch": 1.0272, |
|
"grad_norm": 0.5832675695419312, |
|
"learning_rate": 2.4329120000000002e-05, |
|
"loss": 0.2703, |
|
"step": 160500 |
|
}, |
|
{ |
|
"epoch": 1.0304, |
|
"grad_norm": 0.8549031019210815, |
|
"learning_rate": 2.424928e-05, |
|
"loss": 0.2623, |
|
"step": 161000 |
|
}, |
|
{ |
|
"epoch": 1.0336, |
|
"grad_norm": 0.5572855472564697, |
|
"learning_rate": 2.416928e-05, |
|
"loss": 0.2711, |
|
"step": 161500 |
|
}, |
|
{ |
|
"epoch": 1.0368, |
|
"grad_norm": 0.6818140745162964, |
|
"learning_rate": 2.408928e-05, |
|
"loss": 0.2652, |
|
"step": 162000 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 0.6900683045387268, |
|
"learning_rate": 2.400928e-05, |
|
"loss": 0.2669, |
|
"step": 162500 |
|
}, |
|
{ |
|
"epoch": 1.0432, |
|
"grad_norm": 0.6015618443489075, |
|
"learning_rate": 2.392944e-05, |
|
"loss": 0.2654, |
|
"step": 163000 |
|
}, |
|
{ |
|
"epoch": 1.0464, |
|
"grad_norm": 0.5343177318572998, |
|
"learning_rate": 2.3849440000000002e-05, |
|
"loss": 0.2656, |
|
"step": 163500 |
|
}, |
|
{ |
|
"epoch": 1.0496, |
|
"grad_norm": 0.6130079627037048, |
|
"learning_rate": 2.3769440000000003e-05, |
|
"loss": 0.2592, |
|
"step": 164000 |
|
}, |
|
{ |
|
"epoch": 1.0528, |
|
"grad_norm": 0.7150599956512451, |
|
"learning_rate": 2.368944e-05, |
|
"loss": 0.2634, |
|
"step": 164500 |
|
}, |
|
{ |
|
"epoch": 1.056, |
|
"grad_norm": 0.6321354508399963, |
|
"learning_rate": 2.360944e-05, |
|
"loss": 0.2683, |
|
"step": 165000 |
|
}, |
|
{ |
|
"epoch": 1.0592, |
|
"grad_norm": 0.6234462857246399, |
|
"learning_rate": 2.352976e-05, |
|
"loss": 0.2628, |
|
"step": 165500 |
|
}, |
|
{ |
|
"epoch": 1.0624, |
|
"grad_norm": 0.6542537808418274, |
|
"learning_rate": 2.344976e-05, |
|
"loss": 0.2618, |
|
"step": 166000 |
|
}, |
|
{ |
|
"epoch": 1.0656, |
|
"grad_norm": 0.6302633881568909, |
|
"learning_rate": 2.3369760000000002e-05, |
|
"loss": 0.2661, |
|
"step": 166500 |
|
}, |
|
{ |
|
"epoch": 1.0688, |
|
"grad_norm": 0.5890353322029114, |
|
"learning_rate": 2.3289760000000002e-05, |
|
"loss": 0.2646, |
|
"step": 167000 |
|
}, |
|
{ |
|
"epoch": 1.072, |
|
"grad_norm": 0.6490179300308228, |
|
"learning_rate": 2.320976e-05, |
|
"loss": 0.2635, |
|
"step": 167500 |
|
}, |
|
{ |
|
"epoch": 1.0752, |
|
"grad_norm": 0.648162305355072, |
|
"learning_rate": 2.312976e-05, |
|
"loss": 0.2646, |
|
"step": 168000 |
|
}, |
|
{ |
|
"epoch": 1.0784, |
|
"grad_norm": 0.675680935382843, |
|
"learning_rate": 2.304976e-05, |
|
"loss": 0.2626, |
|
"step": 168500 |
|
}, |
|
{ |
|
"epoch": 1.0816, |
|
"grad_norm": 0.6192341446876526, |
|
"learning_rate": 2.2969760000000002e-05, |
|
"loss": 0.2641, |
|
"step": 169000 |
|
}, |
|
{ |
|
"epoch": 1.0848, |
|
"grad_norm": 0.7046379446983337, |
|
"learning_rate": 2.288992e-05, |
|
"loss": 0.2643, |
|
"step": 169500 |
|
}, |
|
{ |
|
"epoch": 1.088, |
|
"grad_norm": 0.5477197170257568, |
|
"learning_rate": 2.280992e-05, |
|
"loss": 0.265, |
|
"step": 170000 |
|
}, |
|
{ |
|
"epoch": 1.0912, |
|
"grad_norm": 0.5775583982467651, |
|
"learning_rate": 2.2729920000000002e-05, |
|
"loss": 0.2645, |
|
"step": 170500 |
|
}, |
|
{ |
|
"epoch": 1.0944, |
|
"grad_norm": 0.6389047503471375, |
|
"learning_rate": 2.2649920000000003e-05, |
|
"loss": 0.2634, |
|
"step": 171000 |
|
}, |
|
{ |
|
"epoch": 1.0976, |
|
"grad_norm": 0.6169374585151672, |
|
"learning_rate": 2.256992e-05, |
|
"loss": 0.2642, |
|
"step": 171500 |
|
}, |
|
{ |
|
"epoch": 1.1008, |
|
"grad_norm": 0.5913782715797424, |
|
"learning_rate": 2.2490080000000002e-05, |
|
"loss": 0.2658, |
|
"step": 172000 |
|
}, |
|
{ |
|
"epoch": 1.104, |
|
"grad_norm": 0.7547928690910339, |
|
"learning_rate": 2.241008e-05, |
|
"loss": 0.2674, |
|
"step": 172500 |
|
}, |
|
{ |
|
"epoch": 1.1072, |
|
"grad_norm": 0.6277585625648499, |
|
"learning_rate": 2.233024e-05, |
|
"loss": 0.2686, |
|
"step": 173000 |
|
}, |
|
{ |
|
"epoch": 1.1104, |
|
"grad_norm": 0.6357282996177673, |
|
"learning_rate": 2.225024e-05, |
|
"loss": 0.2639, |
|
"step": 173500 |
|
}, |
|
{ |
|
"epoch": 1.1136, |
|
"grad_norm": 0.5262208580970764, |
|
"learning_rate": 2.2170400000000004e-05, |
|
"loss": 0.2641, |
|
"step": 174000 |
|
}, |
|
{ |
|
"epoch": 1.1168, |
|
"grad_norm": 0.6878075003623962, |
|
"learning_rate": 2.20904e-05, |
|
"loss": 0.2654, |
|
"step": 174500 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 0.5332186222076416, |
|
"learning_rate": 2.2010400000000002e-05, |
|
"loss": 0.2638, |
|
"step": 175000 |
|
}, |
|
{ |
|
"epoch": 1.1232, |
|
"grad_norm": 0.5562476515769958, |
|
"learning_rate": 2.19304e-05, |
|
"loss": 0.2648, |
|
"step": 175500 |
|
}, |
|
{ |
|
"epoch": 1.1264, |
|
"grad_norm": 0.5924221277236938, |
|
"learning_rate": 2.18504e-05, |
|
"loss": 0.2627, |
|
"step": 176000 |
|
}, |
|
{ |
|
"epoch": 1.1296, |
|
"grad_norm": 0.5250386595726013, |
|
"learning_rate": 2.17704e-05, |
|
"loss": 0.2619, |
|
"step": 176500 |
|
}, |
|
{ |
|
"epoch": 1.1328, |
|
"grad_norm": 0.7426069378852844, |
|
"learning_rate": 2.16904e-05, |
|
"loss": 0.2628, |
|
"step": 177000 |
|
}, |
|
{ |
|
"epoch": 1.1360000000000001, |
|
"grad_norm": 0.4925951063632965, |
|
"learning_rate": 2.16104e-05, |
|
"loss": 0.2661, |
|
"step": 177500 |
|
}, |
|
{ |
|
"epoch": 1.1392, |
|
"grad_norm": 0.5707270503044128, |
|
"learning_rate": 2.15304e-05, |
|
"loss": 0.2622, |
|
"step": 178000 |
|
}, |
|
{ |
|
"epoch": 1.1424, |
|
"grad_norm": 0.5793021321296692, |
|
"learning_rate": 2.14504e-05, |
|
"loss": 0.2671, |
|
"step": 178500 |
|
}, |
|
{ |
|
"epoch": 1.1456, |
|
"grad_norm": 0.5736916661262512, |
|
"learning_rate": 2.13704e-05, |
|
"loss": 0.2648, |
|
"step": 179000 |
|
}, |
|
{ |
|
"epoch": 1.1488, |
|
"grad_norm": 0.588550329208374, |
|
"learning_rate": 2.129056e-05, |
|
"loss": 0.2641, |
|
"step": 179500 |
|
}, |
|
{ |
|
"epoch": 1.152, |
|
"grad_norm": 0.5504462122917175, |
|
"learning_rate": 2.121056e-05, |
|
"loss": 0.2643, |
|
"step": 180000 |
|
}, |
|
{ |
|
"epoch": 1.1552, |
|
"grad_norm": 0.5439949035644531, |
|
"learning_rate": 2.113056e-05, |
|
"loss": 0.2639, |
|
"step": 180500 |
|
}, |
|
{ |
|
"epoch": 1.1584, |
|
"grad_norm": 0.6882042288780212, |
|
"learning_rate": 2.105056e-05, |
|
"loss": 0.2595, |
|
"step": 181000 |
|
}, |
|
{ |
|
"epoch": 1.1616, |
|
"grad_norm": 0.6735561490058899, |
|
"learning_rate": 2.097056e-05, |
|
"loss": 0.2624, |
|
"step": 181500 |
|
}, |
|
{ |
|
"epoch": 1.1648, |
|
"grad_norm": 0.5545785427093506, |
|
"learning_rate": 2.089056e-05, |
|
"loss": 0.2625, |
|
"step": 182000 |
|
}, |
|
{ |
|
"epoch": 1.168, |
|
"grad_norm": 0.6497994065284729, |
|
"learning_rate": 2.081056e-05, |
|
"loss": 0.2611, |
|
"step": 182500 |
|
}, |
|
{ |
|
"epoch": 1.1712, |
|
"grad_norm": 0.5887815356254578, |
|
"learning_rate": 2.073056e-05, |
|
"loss": 0.2632, |
|
"step": 183000 |
|
}, |
|
{ |
|
"epoch": 1.1743999999999999, |
|
"grad_norm": 0.6037270426750183, |
|
"learning_rate": 2.0650560000000002e-05, |
|
"loss": 0.2645, |
|
"step": 183500 |
|
}, |
|
{ |
|
"epoch": 1.1776, |
|
"grad_norm": 0.636946439743042, |
|
"learning_rate": 2.057072e-05, |
|
"loss": 0.2628, |
|
"step": 184000 |
|
}, |
|
{ |
|
"epoch": 1.1808, |
|
"grad_norm": 0.5285276770591736, |
|
"learning_rate": 2.049072e-05, |
|
"loss": 0.2629, |
|
"step": 184500 |
|
}, |
|
{ |
|
"epoch": 1.184, |
|
"grad_norm": 0.4634397625923157, |
|
"learning_rate": 2.041072e-05, |
|
"loss": 0.2615, |
|
"step": 185000 |
|
}, |
|
{ |
|
"epoch": 1.1872, |
|
"grad_norm": 0.5693604946136475, |
|
"learning_rate": 2.033072e-05, |
|
"loss": 0.2619, |
|
"step": 185500 |
|
}, |
|
{ |
|
"epoch": 1.1904, |
|
"grad_norm": 0.6433858275413513, |
|
"learning_rate": 2.025072e-05, |
|
"loss": 0.2591, |
|
"step": 186000 |
|
}, |
|
{ |
|
"epoch": 1.1936, |
|
"grad_norm": 0.5103280544281006, |
|
"learning_rate": 2.017088e-05, |
|
"loss": 0.2606, |
|
"step": 186500 |
|
}, |
|
{ |
|
"epoch": 1.1968, |
|
"grad_norm": 0.5591945648193359, |
|
"learning_rate": 2.009104e-05, |
|
"loss": 0.2628, |
|
"step": 187000 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.5560447573661804, |
|
"learning_rate": 2.001104e-05, |
|
"loss": 0.2603, |
|
"step": 187500 |
|
}, |
|
{ |
|
"epoch": 1.2032, |
|
"grad_norm": 0.5321928262710571, |
|
"learning_rate": 1.9931040000000002e-05, |
|
"loss": 0.2576, |
|
"step": 188000 |
|
}, |
|
{ |
|
"epoch": 1.2064, |
|
"grad_norm": 0.6455059051513672, |
|
"learning_rate": 1.9851040000000003e-05, |
|
"loss": 0.2615, |
|
"step": 188500 |
|
}, |
|
{ |
|
"epoch": 1.2096, |
|
"grad_norm": 0.6237916946411133, |
|
"learning_rate": 1.977104e-05, |
|
"loss": 0.2626, |
|
"step": 189000 |
|
}, |
|
{ |
|
"epoch": 1.2128, |
|
"grad_norm": 0.5269157886505127, |
|
"learning_rate": 1.969104e-05, |
|
"loss": 0.2597, |
|
"step": 189500 |
|
}, |
|
{ |
|
"epoch": 1.216, |
|
"grad_norm": 0.5521387457847595, |
|
"learning_rate": 1.961104e-05, |
|
"loss": 0.257, |
|
"step": 190000 |
|
}, |
|
{ |
|
"epoch": 1.2192, |
|
"grad_norm": 0.6061577796936035, |
|
"learning_rate": 1.953104e-05, |
|
"loss": 0.2626, |
|
"step": 190500 |
|
}, |
|
{ |
|
"epoch": 1.2224, |
|
"grad_norm": 0.6479594111442566, |
|
"learning_rate": 1.945104e-05, |
|
"loss": 0.2586, |
|
"step": 191000 |
|
}, |
|
{ |
|
"epoch": 1.2256, |
|
"grad_norm": 0.5330658555030823, |
|
"learning_rate": 1.937104e-05, |
|
"loss": 0.2573, |
|
"step": 191500 |
|
}, |
|
{ |
|
"epoch": 1.2288000000000001, |
|
"grad_norm": 0.5984029173851013, |
|
"learning_rate": 1.9291200000000003e-05, |
|
"loss": 0.2591, |
|
"step": 192000 |
|
}, |
|
{ |
|
"epoch": 1.232, |
|
"grad_norm": 0.8451948165893555, |
|
"learning_rate": 1.92112e-05, |
|
"loss": 0.2591, |
|
"step": 192500 |
|
}, |
|
{ |
|
"epoch": 1.2352, |
|
"grad_norm": 0.6519868969917297, |
|
"learning_rate": 1.91312e-05, |
|
"loss": 0.2608, |
|
"step": 193000 |
|
}, |
|
{ |
|
"epoch": 1.2384, |
|
"grad_norm": 0.6487559080123901, |
|
"learning_rate": 1.9051199999999998e-05, |
|
"loss": 0.2558, |
|
"step": 193500 |
|
}, |
|
{ |
|
"epoch": 1.2416, |
|
"grad_norm": 0.5544815063476562, |
|
"learning_rate": 1.897152e-05, |
|
"loss": 0.2609, |
|
"step": 194000 |
|
}, |
|
{ |
|
"epoch": 1.2448, |
|
"grad_norm": 0.594536542892456, |
|
"learning_rate": 1.8891520000000002e-05, |
|
"loss": 0.2592, |
|
"step": 194500 |
|
}, |
|
{ |
|
"epoch": 1.248, |
|
"grad_norm": 0.5301911234855652, |
|
"learning_rate": 1.8811520000000002e-05, |
|
"loss": 0.2579, |
|
"step": 195000 |
|
}, |
|
{ |
|
"epoch": 1.2511999999999999, |
|
"grad_norm": 0.6232271790504456, |
|
"learning_rate": 1.873152e-05, |
|
"loss": 0.2611, |
|
"step": 195500 |
|
}, |
|
{ |
|
"epoch": 1.2544, |
|
"grad_norm": 0.6571745276451111, |
|
"learning_rate": 1.865152e-05, |
|
"loss": 0.2617, |
|
"step": 196000 |
|
}, |
|
{ |
|
"epoch": 1.2576, |
|
"grad_norm": 0.6281866431236267, |
|
"learning_rate": 1.857152e-05, |
|
"loss": 0.2605, |
|
"step": 196500 |
|
}, |
|
{ |
|
"epoch": 1.2608, |
|
"grad_norm": 0.6584866642951965, |
|
"learning_rate": 1.8491520000000002e-05, |
|
"loss": 0.2575, |
|
"step": 197000 |
|
}, |
|
{ |
|
"epoch": 1.264, |
|
"grad_norm": 0.5791180729866028, |
|
"learning_rate": 1.8411520000000003e-05, |
|
"loss": 0.2572, |
|
"step": 197500 |
|
}, |
|
{ |
|
"epoch": 1.2671999999999999, |
|
"grad_norm": 0.5907946228981018, |
|
"learning_rate": 1.8331520000000004e-05, |
|
"loss": 0.2576, |
|
"step": 198000 |
|
}, |
|
{ |
|
"epoch": 1.2704, |
|
"grad_norm": 0.6532405614852905, |
|
"learning_rate": 1.8251680000000002e-05, |
|
"loss": 0.2588, |
|
"step": 198500 |
|
}, |
|
{ |
|
"epoch": 1.2736, |
|
"grad_norm": 0.5683246850967407, |
|
"learning_rate": 1.8171680000000003e-05, |
|
"loss": 0.2597, |
|
"step": 199000 |
|
}, |
|
{ |
|
"epoch": 1.2768, |
|
"grad_norm": 0.5847846865653992, |
|
"learning_rate": 1.809168e-05, |
|
"loss": 0.2628, |
|
"step": 199500 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 0.5554783344268799, |
|
"learning_rate": 1.801168e-05, |
|
"loss": 0.2542, |
|
"step": 200000 |
|
}, |
|
{ |
|
"epoch": 1.2832, |
|
"grad_norm": 0.6664928793907166, |
|
"learning_rate": 1.793184e-05, |
|
"loss": 0.2586, |
|
"step": 200500 |
|
}, |
|
{ |
|
"epoch": 1.2864, |
|
"grad_norm": 0.5993084907531738, |
|
"learning_rate": 1.785184e-05, |
|
"loss": 0.2571, |
|
"step": 201000 |
|
}, |
|
{ |
|
"epoch": 1.2896, |
|
"grad_norm": 0.4557185173034668, |
|
"learning_rate": 1.777184e-05, |
|
"loss": 0.2594, |
|
"step": 201500 |
|
}, |
|
{ |
|
"epoch": 1.2928, |
|
"grad_norm": 0.7798305749893188, |
|
"learning_rate": 1.7691840000000002e-05, |
|
"loss": 0.2561, |
|
"step": 202000 |
|
}, |
|
{ |
|
"epoch": 1.296, |
|
"grad_norm": 0.5406688451766968, |
|
"learning_rate": 1.7611840000000002e-05, |
|
"loss": 0.2562, |
|
"step": 202500 |
|
}, |
|
{ |
|
"epoch": 1.2992, |
|
"grad_norm": 0.5173208117485046, |
|
"learning_rate": 1.7531840000000003e-05, |
|
"loss": 0.2606, |
|
"step": 203000 |
|
}, |
|
{ |
|
"epoch": 1.3024, |
|
"grad_norm": 0.6803346872329712, |
|
"learning_rate": 1.7452e-05, |
|
"loss": 0.259, |
|
"step": 203500 |
|
}, |
|
{ |
|
"epoch": 1.3056, |
|
"grad_norm": 0.5223200917243958, |
|
"learning_rate": 1.7372000000000002e-05, |
|
"loss": 0.2571, |
|
"step": 204000 |
|
}, |
|
{ |
|
"epoch": 1.3088, |
|
"grad_norm": 0.6100528240203857, |
|
"learning_rate": 1.7292e-05, |
|
"loss": 0.2558, |
|
"step": 204500 |
|
}, |
|
{ |
|
"epoch": 1.312, |
|
"grad_norm": 0.623023271560669, |
|
"learning_rate": 1.7212e-05, |
|
"loss": 0.2563, |
|
"step": 205000 |
|
}, |
|
{ |
|
"epoch": 1.3152, |
|
"grad_norm": 0.5915964841842651, |
|
"learning_rate": 1.713216e-05, |
|
"loss": 0.2581, |
|
"step": 205500 |
|
}, |
|
{ |
|
"epoch": 1.3184, |
|
"grad_norm": 0.538467526435852, |
|
"learning_rate": 1.705216e-05, |
|
"loss": 0.2554, |
|
"step": 206000 |
|
}, |
|
{ |
|
"epoch": 1.3216, |
|
"grad_norm": 0.5382514595985413, |
|
"learning_rate": 1.697216e-05, |
|
"loss": 0.2581, |
|
"step": 206500 |
|
}, |
|
{ |
|
"epoch": 1.3248, |
|
"grad_norm": 0.6466744542121887, |
|
"learning_rate": 1.689216e-05, |
|
"loss": 0.2573, |
|
"step": 207000 |
|
}, |
|
{ |
|
"epoch": 1.328, |
|
"grad_norm": 0.742675244808197, |
|
"learning_rate": 1.6812160000000002e-05, |
|
"loss": 0.2572, |
|
"step": 207500 |
|
}, |
|
{ |
|
"epoch": 1.3312, |
|
"grad_norm": 0.6123968362808228, |
|
"learning_rate": 1.673232e-05, |
|
"loss": 0.2598, |
|
"step": 208000 |
|
}, |
|
{ |
|
"epoch": 1.3344, |
|
"grad_norm": 0.6710489392280579, |
|
"learning_rate": 1.665232e-05, |
|
"loss": 0.2604, |
|
"step": 208500 |
|
}, |
|
{ |
|
"epoch": 1.3376000000000001, |
|
"grad_norm": 0.685879111289978, |
|
"learning_rate": 1.657232e-05, |
|
"loss": 0.2576, |
|
"step": 209000 |
|
}, |
|
{ |
|
"epoch": 1.3408, |
|
"grad_norm": 0.5600978136062622, |
|
"learning_rate": 1.649232e-05, |
|
"loss": 0.2606, |
|
"step": 209500 |
|
}, |
|
{ |
|
"epoch": 1.3439999999999999, |
|
"grad_norm": 0.5358079075813293, |
|
"learning_rate": 1.6412640000000002e-05, |
|
"loss": 0.2578, |
|
"step": 210000 |
|
}, |
|
{ |
|
"epoch": 1.3472, |
|
"grad_norm": 0.7245667576789856, |
|
"learning_rate": 1.63328e-05, |
|
"loss": 0.2613, |
|
"step": 210500 |
|
}, |
|
{ |
|
"epoch": 1.3504, |
|
"grad_norm": 0.5963015556335449, |
|
"learning_rate": 1.62528e-05, |
|
"loss": 0.2568, |
|
"step": 211000 |
|
}, |
|
{ |
|
"epoch": 1.3536000000000001, |
|
"grad_norm": 0.6139352917671204, |
|
"learning_rate": 1.6172800000000002e-05, |
|
"loss": 0.2501, |
|
"step": 211500 |
|
}, |
|
{ |
|
"epoch": 1.3568, |
|
"grad_norm": 0.5434224605560303, |
|
"learning_rate": 1.60928e-05, |
|
"loss": 0.2583, |
|
"step": 212000 |
|
}, |
|
{ |
|
"epoch": 1.3599999999999999, |
|
"grad_norm": 0.5723361372947693, |
|
"learning_rate": 1.60128e-05, |
|
"loss": 0.2582, |
|
"step": 212500 |
|
}, |
|
{ |
|
"epoch": 1.3632, |
|
"grad_norm": 0.5621342658996582, |
|
"learning_rate": 1.593296e-05, |
|
"loss": 0.2583, |
|
"step": 213000 |
|
}, |
|
{ |
|
"epoch": 1.3664, |
|
"grad_norm": 0.5716707706451416, |
|
"learning_rate": 1.585296e-05, |
|
"loss": 0.2548, |
|
"step": 213500 |
|
}, |
|
{ |
|
"epoch": 1.3696, |
|
"grad_norm": 0.6344952583312988, |
|
"learning_rate": 1.577296e-05, |
|
"loss": 0.2526, |
|
"step": 214000 |
|
}, |
|
{ |
|
"epoch": 1.3728, |
|
"grad_norm": 0.6360082030296326, |
|
"learning_rate": 1.569296e-05, |
|
"loss": 0.259, |
|
"step": 214500 |
|
}, |
|
{ |
|
"epoch": 1.376, |
|
"grad_norm": 0.5400614142417908, |
|
"learning_rate": 1.5612960000000002e-05, |
|
"loss": 0.26, |
|
"step": 215000 |
|
}, |
|
{ |
|
"epoch": 1.3792, |
|
"grad_norm": 0.6992815732955933, |
|
"learning_rate": 1.5532960000000002e-05, |
|
"loss": 0.259, |
|
"step": 215500 |
|
}, |
|
{ |
|
"epoch": 1.3824, |
|
"grad_norm": 0.4903436601161957, |
|
"learning_rate": 1.545296e-05, |
|
"loss": 0.2582, |
|
"step": 216000 |
|
}, |
|
{ |
|
"epoch": 1.3856, |
|
"grad_norm": 0.5602136850357056, |
|
"learning_rate": 1.537296e-05, |
|
"loss": 0.2563, |
|
"step": 216500 |
|
}, |
|
{ |
|
"epoch": 1.3888, |
|
"grad_norm": 0.5858916640281677, |
|
"learning_rate": 1.529296e-05, |
|
"loss": 0.2553, |
|
"step": 217000 |
|
}, |
|
{ |
|
"epoch": 1.392, |
|
"grad_norm": 0.438550740480423, |
|
"learning_rate": 1.521312e-05, |
|
"loss": 0.2567, |
|
"step": 217500 |
|
}, |
|
{ |
|
"epoch": 1.3952, |
|
"grad_norm": 0.5660952925682068, |
|
"learning_rate": 1.513312e-05, |
|
"loss": 0.2552, |
|
"step": 218000 |
|
}, |
|
{ |
|
"epoch": 1.3984, |
|
"grad_norm": 0.6139314770698547, |
|
"learning_rate": 1.5053120000000001e-05, |
|
"loss": 0.2506, |
|
"step": 218500 |
|
}, |
|
{ |
|
"epoch": 1.4016, |
|
"grad_norm": 0.5470092296600342, |
|
"learning_rate": 1.4973120000000002e-05, |
|
"loss": 0.2549, |
|
"step": 219000 |
|
}, |
|
{ |
|
"epoch": 1.4048, |
|
"grad_norm": 0.5565882325172424, |
|
"learning_rate": 1.489328e-05, |
|
"loss": 0.256, |
|
"step": 219500 |
|
}, |
|
{ |
|
"epoch": 1.408, |
|
"grad_norm": 0.4755209684371948, |
|
"learning_rate": 1.4813280000000001e-05, |
|
"loss": 0.2571, |
|
"step": 220000 |
|
}, |
|
{ |
|
"epoch": 1.4112, |
|
"grad_norm": 0.5266921520233154, |
|
"learning_rate": 1.4733280000000002e-05, |
|
"loss": 0.2546, |
|
"step": 220500 |
|
}, |
|
{ |
|
"epoch": 1.4144, |
|
"grad_norm": 0.5858850479125977, |
|
"learning_rate": 1.465328e-05, |
|
"loss": 0.251, |
|
"step": 221000 |
|
}, |
|
{ |
|
"epoch": 1.4176, |
|
"grad_norm": 0.5382100343704224, |
|
"learning_rate": 1.457328e-05, |
|
"loss": 0.254, |
|
"step": 221500 |
|
}, |
|
{ |
|
"epoch": 1.4208, |
|
"grad_norm": 0.6082443594932556, |
|
"learning_rate": 1.4493280000000001e-05, |
|
"loss": 0.2549, |
|
"step": 222000 |
|
}, |
|
{ |
|
"epoch": 1.424, |
|
"grad_norm": 0.56458979845047, |
|
"learning_rate": 1.4413440000000001e-05, |
|
"loss": 0.2556, |
|
"step": 222500 |
|
}, |
|
{ |
|
"epoch": 1.4272, |
|
"grad_norm": 0.5702414512634277, |
|
"learning_rate": 1.433344e-05, |
|
"loss": 0.2523, |
|
"step": 223000 |
|
}, |
|
{ |
|
"epoch": 1.4304000000000001, |
|
"grad_norm": 0.5704798102378845, |
|
"learning_rate": 1.4253440000000001e-05, |
|
"loss": 0.2539, |
|
"step": 223500 |
|
}, |
|
{ |
|
"epoch": 1.4336, |
|
"grad_norm": 0.675832211971283, |
|
"learning_rate": 1.4173440000000002e-05, |
|
"loss": 0.256, |
|
"step": 224000 |
|
}, |
|
{ |
|
"epoch": 1.4368, |
|
"grad_norm": 0.8129279017448425, |
|
"learning_rate": 1.4093439999999999e-05, |
|
"loss": 0.2533, |
|
"step": 224500 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 0.6167120933532715, |
|
"learning_rate": 1.401344e-05, |
|
"loss": 0.254, |
|
"step": 225000 |
|
}, |
|
{ |
|
"epoch": 1.4432, |
|
"grad_norm": 0.6647797226905823, |
|
"learning_rate": 1.39336e-05, |
|
"loss": 0.2497, |
|
"step": 225500 |
|
}, |
|
{ |
|
"epoch": 1.4464000000000001, |
|
"grad_norm": 0.5046721696853638, |
|
"learning_rate": 1.38536e-05, |
|
"loss": 0.2513, |
|
"step": 226000 |
|
}, |
|
{ |
|
"epoch": 1.4496, |
|
"grad_norm": 0.5725312232971191, |
|
"learning_rate": 1.37736e-05, |
|
"loss": 0.2547, |
|
"step": 226500 |
|
}, |
|
{ |
|
"epoch": 1.4527999999999999, |
|
"grad_norm": 0.514900803565979, |
|
"learning_rate": 1.36936e-05, |
|
"loss": 0.2504, |
|
"step": 227000 |
|
}, |
|
{ |
|
"epoch": 1.456, |
|
"grad_norm": 0.3646963834762573, |
|
"learning_rate": 1.3613600000000001e-05, |
|
"loss": 0.2548, |
|
"step": 227500 |
|
}, |
|
{ |
|
"epoch": 1.4592, |
|
"grad_norm": 0.6462276577949524, |
|
"learning_rate": 1.3533600000000002e-05, |
|
"loss": 0.2537, |
|
"step": 228000 |
|
}, |
|
{ |
|
"epoch": 1.4624, |
|
"grad_norm": 0.5525985956192017, |
|
"learning_rate": 1.34536e-05, |
|
"loss": 0.253, |
|
"step": 228500 |
|
}, |
|
{ |
|
"epoch": 1.4656, |
|
"grad_norm": 0.5146868824958801, |
|
"learning_rate": 1.33736e-05, |
|
"loss": 0.2531, |
|
"step": 229000 |
|
}, |
|
{ |
|
"epoch": 1.4687999999999999, |
|
"grad_norm": 0.6087847948074341, |
|
"learning_rate": 1.329376e-05, |
|
"loss": 0.2517, |
|
"step": 229500 |
|
}, |
|
{ |
|
"epoch": 1.472, |
|
"grad_norm": 0.5387943387031555, |
|
"learning_rate": 1.321376e-05, |
|
"loss": 0.2514, |
|
"step": 230000 |
|
}, |
|
{ |
|
"epoch": 1.4752, |
|
"grad_norm": 0.5926588177680969, |
|
"learning_rate": 1.313376e-05, |
|
"loss": 0.2544, |
|
"step": 230500 |
|
}, |
|
{ |
|
"epoch": 1.4784, |
|
"grad_norm": 0.5444154143333435, |
|
"learning_rate": 1.3053760000000001e-05, |
|
"loss": 0.253, |
|
"step": 231000 |
|
}, |
|
{ |
|
"epoch": 1.4816, |
|
"grad_norm": 0.5707711577415466, |
|
"learning_rate": 1.2973760000000002e-05, |
|
"loss": 0.2509, |
|
"step": 231500 |
|
}, |
|
{ |
|
"epoch": 1.4848, |
|
"grad_norm": 0.5120610594749451, |
|
"learning_rate": 1.2893760000000002e-05, |
|
"loss": 0.2535, |
|
"step": 232000 |
|
}, |
|
{ |
|
"epoch": 1.488, |
|
"grad_norm": 0.6814270615577698, |
|
"learning_rate": 1.281392e-05, |
|
"loss": 0.2542, |
|
"step": 232500 |
|
}, |
|
{ |
|
"epoch": 1.4912, |
|
"grad_norm": 0.5387424826622009, |
|
"learning_rate": 1.2733920000000002e-05, |
|
"loss": 0.2559, |
|
"step": 233000 |
|
}, |
|
{ |
|
"epoch": 1.4944, |
|
"grad_norm": 0.6061420440673828, |
|
"learning_rate": 1.2653919999999999e-05, |
|
"loss": 0.2492, |
|
"step": 233500 |
|
}, |
|
{ |
|
"epoch": 1.4976, |
|
"grad_norm": 0.5238478183746338, |
|
"learning_rate": 1.257392e-05, |
|
"loss": 0.2495, |
|
"step": 234000 |
|
}, |
|
{ |
|
"epoch": 1.5008, |
|
"grad_norm": 0.6245620846748352, |
|
"learning_rate": 1.2494080000000002e-05, |
|
"loss": 0.2503, |
|
"step": 234500 |
|
}, |
|
{ |
|
"epoch": 1.504, |
|
"grad_norm": 0.594336211681366, |
|
"learning_rate": 1.241408e-05, |
|
"loss": 0.2526, |
|
"step": 235000 |
|
}, |
|
{ |
|
"epoch": 1.5072, |
|
"grad_norm": 0.6665235161781311, |
|
"learning_rate": 1.233424e-05, |
|
"loss": 0.2565, |
|
"step": 235500 |
|
}, |
|
{ |
|
"epoch": 1.5104, |
|
"grad_norm": 0.4540468454360962, |
|
"learning_rate": 1.225424e-05, |
|
"loss": 0.2509, |
|
"step": 236000 |
|
}, |
|
{ |
|
"epoch": 1.5135999999999998, |
|
"grad_norm": 0.48204490542411804, |
|
"learning_rate": 1.217424e-05, |
|
"loss": 0.2566, |
|
"step": 236500 |
|
}, |
|
{ |
|
"epoch": 1.5168, |
|
"grad_norm": 0.7209044098854065, |
|
"learning_rate": 1.2094240000000001e-05, |
|
"loss": 0.2485, |
|
"step": 237000 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 0.5661574006080627, |
|
"learning_rate": 1.201424e-05, |
|
"loss": 0.2496, |
|
"step": 237500 |
|
}, |
|
{ |
|
"epoch": 1.5232, |
|
"grad_norm": 0.4637988805770874, |
|
"learning_rate": 1.1934240000000001e-05, |
|
"loss": 0.2498, |
|
"step": 238000 |
|
}, |
|
{ |
|
"epoch": 1.5264, |
|
"grad_norm": 0.5440483093261719, |
|
"learning_rate": 1.1854240000000002e-05, |
|
"loss": 0.2539, |
|
"step": 238500 |
|
}, |
|
{ |
|
"epoch": 1.5295999999999998, |
|
"grad_norm": 0.6143088936805725, |
|
"learning_rate": 1.1774240000000001e-05, |
|
"loss": 0.2507, |
|
"step": 239000 |
|
}, |
|
{ |
|
"epoch": 1.5328, |
|
"grad_norm": 0.553655207157135, |
|
"learning_rate": 1.1694400000000001e-05, |
|
"loss": 0.2494, |
|
"step": 239500 |
|
}, |
|
{ |
|
"epoch": 1.536, |
|
"grad_norm": 0.5812162160873413, |
|
"learning_rate": 1.16144e-05, |
|
"loss": 0.2494, |
|
"step": 240000 |
|
}, |
|
{ |
|
"epoch": 1.5392000000000001, |
|
"grad_norm": 0.4919438660144806, |
|
"learning_rate": 1.1534400000000001e-05, |
|
"loss": 0.2471, |
|
"step": 240500 |
|
}, |
|
{ |
|
"epoch": 1.5424, |
|
"grad_norm": 0.6260576844215393, |
|
"learning_rate": 1.1454400000000002e-05, |
|
"loss": 0.2518, |
|
"step": 241000 |
|
}, |
|
{ |
|
"epoch": 1.5455999999999999, |
|
"grad_norm": 0.6452062726020813, |
|
"learning_rate": 1.137456e-05, |
|
"loss": 0.253, |
|
"step": 241500 |
|
}, |
|
{ |
|
"epoch": 1.5488, |
|
"grad_norm": 0.5557950139045715, |
|
"learning_rate": 1.1294559999999999e-05, |
|
"loss": 0.2553, |
|
"step": 242000 |
|
}, |
|
{ |
|
"epoch": 1.552, |
|
"grad_norm": 0.6445254683494568, |
|
"learning_rate": 1.121456e-05, |
|
"loss": 0.2513, |
|
"step": 242500 |
|
}, |
|
{ |
|
"epoch": 1.5552000000000001, |
|
"grad_norm": 0.5771984457969666, |
|
"learning_rate": 1.113456e-05, |
|
"loss": 0.2518, |
|
"step": 243000 |
|
}, |
|
{ |
|
"epoch": 1.5584, |
|
"grad_norm": 0.48172786831855774, |
|
"learning_rate": 1.105456e-05, |
|
"loss": 0.2529, |
|
"step": 243500 |
|
}, |
|
{ |
|
"epoch": 1.5615999999999999, |
|
"grad_norm": 0.5962732434272766, |
|
"learning_rate": 1.0974720000000002e-05, |
|
"loss": 0.2552, |
|
"step": 244000 |
|
}, |
|
{ |
|
"epoch": 1.5648, |
|
"grad_norm": 0.5713253617286682, |
|
"learning_rate": 1.089472e-05, |
|
"loss": 0.2518, |
|
"step": 244500 |
|
}, |
|
{ |
|
"epoch": 1.568, |
|
"grad_norm": 0.7049676775932312, |
|
"learning_rate": 1.081472e-05, |
|
"loss": 0.2503, |
|
"step": 245000 |
|
}, |
|
{ |
|
"epoch": 1.5712000000000002, |
|
"grad_norm": 0.5562995076179504, |
|
"learning_rate": 1.073472e-05, |
|
"loss": 0.2534, |
|
"step": 245500 |
|
}, |
|
{ |
|
"epoch": 1.5744, |
|
"grad_norm": 0.5492623448371887, |
|
"learning_rate": 1.065472e-05, |
|
"loss": 0.2508, |
|
"step": 246000 |
|
}, |
|
{ |
|
"epoch": 1.5776, |
|
"grad_norm": 0.6449033617973328, |
|
"learning_rate": 1.057472e-05, |
|
"loss": 0.2506, |
|
"step": 246500 |
|
}, |
|
{ |
|
"epoch": 1.5808, |
|
"grad_norm": 0.5232768058776855, |
|
"learning_rate": 1.049488e-05, |
|
"loss": 0.2497, |
|
"step": 247000 |
|
}, |
|
{ |
|
"epoch": 1.584, |
|
"grad_norm": 0.5512565970420837, |
|
"learning_rate": 1.0414880000000001e-05, |
|
"loss": 0.2534, |
|
"step": 247500 |
|
}, |
|
{ |
|
"epoch": 1.5872000000000002, |
|
"grad_norm": 0.48962149024009705, |
|
"learning_rate": 1.033488e-05, |
|
"loss": 0.2552, |
|
"step": 248000 |
|
}, |
|
{ |
|
"epoch": 1.5904, |
|
"grad_norm": 0.5635197162628174, |
|
"learning_rate": 1.025488e-05, |
|
"loss": 0.251, |
|
"step": 248500 |
|
}, |
|
{ |
|
"epoch": 1.5936, |
|
"grad_norm": 0.5858097076416016, |
|
"learning_rate": 1.0175040000000001e-05, |
|
"loss": 0.2521, |
|
"step": 249000 |
|
}, |
|
{ |
|
"epoch": 1.5968, |
|
"grad_norm": 0.5749566555023193, |
|
"learning_rate": 1.0095200000000001e-05, |
|
"loss": 0.2542, |
|
"step": 249500 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.6057468056678772, |
|
"learning_rate": 1.00152e-05, |
|
"loss": 0.249, |
|
"step": 250000 |
|
}, |
|
{ |
|
"epoch": 1.6032, |
|
"grad_norm": 0.68129962682724, |
|
"learning_rate": 9.9352e-06, |
|
"loss": 0.2496, |
|
"step": 250500 |
|
}, |
|
{ |
|
"epoch": 1.6064, |
|
"grad_norm": 0.5518680810928345, |
|
"learning_rate": 9.8552e-06, |
|
"loss": 0.2483, |
|
"step": 251000 |
|
}, |
|
{ |
|
"epoch": 1.6096, |
|
"grad_norm": 0.7354257702827454, |
|
"learning_rate": 9.775200000000001e-06, |
|
"loss": 0.251, |
|
"step": 251500 |
|
}, |
|
{ |
|
"epoch": 1.6128, |
|
"grad_norm": 0.5537115335464478, |
|
"learning_rate": 9.6952e-06, |
|
"loss": 0.2519, |
|
"step": 252000 |
|
}, |
|
{ |
|
"epoch": 1.616, |
|
"grad_norm": 0.5443572402000427, |
|
"learning_rate": 9.615360000000002e-06, |
|
"loss": 0.2505, |
|
"step": 252500 |
|
}, |
|
{ |
|
"epoch": 1.6192, |
|
"grad_norm": 0.8157851099967957, |
|
"learning_rate": 9.53536e-06, |
|
"loss": 0.2456, |
|
"step": 253000 |
|
}, |
|
{ |
|
"epoch": 1.6223999999999998, |
|
"grad_norm": 0.5709113478660583, |
|
"learning_rate": 9.455520000000001e-06, |
|
"loss": 0.2511, |
|
"step": 253500 |
|
}, |
|
{ |
|
"epoch": 1.6256, |
|
"grad_norm": 0.5266199707984924, |
|
"learning_rate": 9.37552e-06, |
|
"loss": 0.2523, |
|
"step": 254000 |
|
}, |
|
{ |
|
"epoch": 1.6288, |
|
"grad_norm": 0.6796950697898865, |
|
"learning_rate": 9.29552e-06, |
|
"loss": 0.2529, |
|
"step": 254500 |
|
}, |
|
{ |
|
"epoch": 1.6320000000000001, |
|
"grad_norm": 0.5162604451179504, |
|
"learning_rate": 9.215520000000002e-06, |
|
"loss": 0.2486, |
|
"step": 255000 |
|
}, |
|
{ |
|
"epoch": 1.6352, |
|
"grad_norm": 0.5577069520950317, |
|
"learning_rate": 9.13552e-06, |
|
"loss": 0.2501, |
|
"step": 255500 |
|
}, |
|
{ |
|
"epoch": 1.6383999999999999, |
|
"grad_norm": 0.5930905342102051, |
|
"learning_rate": 9.055520000000001e-06, |
|
"loss": 0.2505, |
|
"step": 256000 |
|
}, |
|
{ |
|
"epoch": 1.6416, |
|
"grad_norm": 0.5219632983207703, |
|
"learning_rate": 8.97552e-06, |
|
"loss": 0.2499, |
|
"step": 256500 |
|
}, |
|
{ |
|
"epoch": 1.6448, |
|
"grad_norm": 0.5385752320289612, |
|
"learning_rate": 8.89552e-06, |
|
"loss": 0.2473, |
|
"step": 257000 |
|
}, |
|
{ |
|
"epoch": 1.6480000000000001, |
|
"grad_norm": 0.5498505234718323, |
|
"learning_rate": 8.81552e-06, |
|
"loss": 0.2513, |
|
"step": 257500 |
|
}, |
|
{ |
|
"epoch": 1.6512, |
|
"grad_norm": 0.5780929923057556, |
|
"learning_rate": 8.73552e-06, |
|
"loss": 0.2488, |
|
"step": 258000 |
|
}, |
|
{ |
|
"epoch": 1.6543999999999999, |
|
"grad_norm": 0.6167399883270264, |
|
"learning_rate": 8.65552e-06, |
|
"loss": 0.2501, |
|
"step": 258500 |
|
}, |
|
{ |
|
"epoch": 1.6576, |
|
"grad_norm": 0.6829573512077332, |
|
"learning_rate": 8.575520000000001e-06, |
|
"loss": 0.2477, |
|
"step": 259000 |
|
}, |
|
{ |
|
"epoch": 1.6608, |
|
"grad_norm": 0.4874655604362488, |
|
"learning_rate": 8.49552e-06, |
|
"loss": 0.2507, |
|
"step": 259500 |
|
}, |
|
{ |
|
"epoch": 1.6640000000000001, |
|
"grad_norm": 0.5769158601760864, |
|
"learning_rate": 8.41568e-06, |
|
"loss": 0.2477, |
|
"step": 260000 |
|
}, |
|
{ |
|
"epoch": 1.6672, |
|
"grad_norm": 0.45717403292655945, |
|
"learning_rate": 8.335679999999999e-06, |
|
"loss": 0.2515, |
|
"step": 260500 |
|
}, |
|
{ |
|
"epoch": 1.6703999999999999, |
|
"grad_norm": 0.5851497650146484, |
|
"learning_rate": 8.25568e-06, |
|
"loss": 0.2453, |
|
"step": 261000 |
|
}, |
|
{ |
|
"epoch": 1.6736, |
|
"grad_norm": 0.7223703265190125, |
|
"learning_rate": 8.17568e-06, |
|
"loss": 0.2534, |
|
"step": 261500 |
|
}, |
|
{ |
|
"epoch": 1.6768, |
|
"grad_norm": 0.5290210843086243, |
|
"learning_rate": 8.09584e-06, |
|
"loss": 0.2468, |
|
"step": 262000 |
|
}, |
|
{ |
|
"epoch": 1.6800000000000002, |
|
"grad_norm": 0.5959377884864807, |
|
"learning_rate": 8.015999999999999e-06, |
|
"loss": 0.2498, |
|
"step": 262500 |
|
}, |
|
{ |
|
"epoch": 1.6832, |
|
"grad_norm": 0.5404760241508484, |
|
"learning_rate": 7.936e-06, |
|
"loss": 0.2485, |
|
"step": 263000 |
|
}, |
|
{ |
|
"epoch": 1.6864, |
|
"grad_norm": 0.496378093957901, |
|
"learning_rate": 7.856e-06, |
|
"loss": 0.2474, |
|
"step": 263500 |
|
}, |
|
{ |
|
"epoch": 1.6896, |
|
"grad_norm": 0.54584801197052, |
|
"learning_rate": 7.776e-06, |
|
"loss": 0.2498, |
|
"step": 264000 |
|
}, |
|
{ |
|
"epoch": 1.6928, |
|
"grad_norm": 0.5465365052223206, |
|
"learning_rate": 7.696160000000002e-06, |
|
"loss": 0.2468, |
|
"step": 264500 |
|
}, |
|
{ |
|
"epoch": 1.696, |
|
"grad_norm": 0.5857728719711304, |
|
"learning_rate": 7.61616e-06, |
|
"loss": 0.2485, |
|
"step": 265000 |
|
}, |
|
{ |
|
"epoch": 1.6992, |
|
"grad_norm": 0.5276440382003784, |
|
"learning_rate": 7.5361600000000005e-06, |
|
"loss": 0.2467, |
|
"step": 265500 |
|
}, |
|
{ |
|
"epoch": 1.7024, |
|
"grad_norm": 0.5197107195854187, |
|
"learning_rate": 7.456160000000001e-06, |
|
"loss": 0.2457, |
|
"step": 266000 |
|
}, |
|
{ |
|
"epoch": 1.7056, |
|
"grad_norm": 0.6030395030975342, |
|
"learning_rate": 7.37616e-06, |
|
"loss": 0.2449, |
|
"step": 266500 |
|
}, |
|
{ |
|
"epoch": 1.7088, |
|
"grad_norm": 0.5553884506225586, |
|
"learning_rate": 7.29616e-06, |
|
"loss": 0.2459, |
|
"step": 267000 |
|
}, |
|
{ |
|
"epoch": 1.712, |
|
"grad_norm": 0.6518832445144653, |
|
"learning_rate": 7.216160000000001e-06, |
|
"loss": 0.2469, |
|
"step": 267500 |
|
}, |
|
{ |
|
"epoch": 1.7151999999999998, |
|
"grad_norm": 0.6981451511383057, |
|
"learning_rate": 7.13616e-06, |
|
"loss": 0.2493, |
|
"step": 268000 |
|
}, |
|
{ |
|
"epoch": 1.7184, |
|
"grad_norm": 0.6021608114242554, |
|
"learning_rate": 7.056160000000001e-06, |
|
"loss": 0.2477, |
|
"step": 268500 |
|
}, |
|
{ |
|
"epoch": 1.7216, |
|
"grad_norm": 0.6317922472953796, |
|
"learning_rate": 6.976160000000001e-06, |
|
"loss": 0.2461, |
|
"step": 269000 |
|
}, |
|
{ |
|
"epoch": 1.7248, |
|
"grad_norm": 0.6130341291427612, |
|
"learning_rate": 6.89616e-06, |
|
"loss": 0.2508, |
|
"step": 269500 |
|
}, |
|
{ |
|
"epoch": 1.728, |
|
"grad_norm": 0.6314118504524231, |
|
"learning_rate": 6.816160000000001e-06, |
|
"loss": 0.243, |
|
"step": 270000 |
|
}, |
|
{ |
|
"epoch": 1.7311999999999999, |
|
"grad_norm": 0.6070537567138672, |
|
"learning_rate": 6.73616e-06, |
|
"loss": 0.2472, |
|
"step": 270500 |
|
}, |
|
{ |
|
"epoch": 1.7344, |
|
"grad_norm": 0.5763754844665527, |
|
"learning_rate": 6.6561600000000005e-06, |
|
"loss": 0.2511, |
|
"step": 271000 |
|
}, |
|
{ |
|
"epoch": 1.7376, |
|
"grad_norm": 0.6849692463874817, |
|
"learning_rate": 6.57632e-06, |
|
"loss": 0.2461, |
|
"step": 271500 |
|
}, |
|
{ |
|
"epoch": 1.7408000000000001, |
|
"grad_norm": 0.6505193710327148, |
|
"learning_rate": 6.4963200000000005e-06, |
|
"loss": 0.2472, |
|
"step": 272000 |
|
}, |
|
{ |
|
"epoch": 1.744, |
|
"grad_norm": 0.5150639414787292, |
|
"learning_rate": 6.4163200000000004e-06, |
|
"loss": 0.2456, |
|
"step": 272500 |
|
}, |
|
{ |
|
"epoch": 1.7471999999999999, |
|
"grad_norm": 0.6367226839065552, |
|
"learning_rate": 6.3363199999999995e-06, |
|
"loss": 0.2478, |
|
"step": 273000 |
|
}, |
|
{ |
|
"epoch": 1.7504, |
|
"grad_norm": 0.6016091704368591, |
|
"learning_rate": 6.25632e-06, |
|
"loss": 0.2457, |
|
"step": 273500 |
|
}, |
|
{ |
|
"epoch": 1.7536, |
|
"grad_norm": 0.6344937682151794, |
|
"learning_rate": 6.17632e-06, |
|
"loss": 0.2449, |
|
"step": 274000 |
|
}, |
|
{ |
|
"epoch": 1.7568000000000001, |
|
"grad_norm": 0.6031948924064636, |
|
"learning_rate": 6.09632e-06, |
|
"loss": 0.2474, |
|
"step": 274500 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 0.6515588760375977, |
|
"learning_rate": 6.01648e-06, |
|
"loss": 0.2463, |
|
"step": 275000 |
|
}, |
|
{ |
|
"epoch": 1.7631999999999999, |
|
"grad_norm": 0.562818706035614, |
|
"learning_rate": 5.93648e-06, |
|
"loss": 0.2507, |
|
"step": 275500 |
|
}, |
|
{ |
|
"epoch": 1.7664, |
|
"grad_norm": 0.591066300868988, |
|
"learning_rate": 5.85648e-06, |
|
"loss": 0.2488, |
|
"step": 276000 |
|
}, |
|
{ |
|
"epoch": 1.7696, |
|
"grad_norm": 0.544698178768158, |
|
"learning_rate": 5.77664e-06, |
|
"loss": 0.2445, |
|
"step": 276500 |
|
}, |
|
{ |
|
"epoch": 1.7728000000000002, |
|
"grad_norm": 0.4837886691093445, |
|
"learning_rate": 5.6968e-06, |
|
"loss": 0.2497, |
|
"step": 277000 |
|
}, |
|
{ |
|
"epoch": 1.776, |
|
"grad_norm": 0.5405265092849731, |
|
"learning_rate": 5.6168e-06, |
|
"loss": 0.2489, |
|
"step": 277500 |
|
}, |
|
{ |
|
"epoch": 1.7792, |
|
"grad_norm": 0.560249924659729, |
|
"learning_rate": 5.5368e-06, |
|
"loss": 0.2466, |
|
"step": 278000 |
|
}, |
|
{ |
|
"epoch": 1.7824, |
|
"grad_norm": 0.5680164098739624, |
|
"learning_rate": 5.4568e-06, |
|
"loss": 0.2475, |
|
"step": 278500 |
|
}, |
|
{ |
|
"epoch": 1.7856, |
|
"grad_norm": 0.7152078747749329, |
|
"learning_rate": 5.376800000000001e-06, |
|
"loss": 0.2439, |
|
"step": 279000 |
|
}, |
|
{ |
|
"epoch": 1.7888, |
|
"grad_norm": 0.6013668775558472, |
|
"learning_rate": 5.296800000000001e-06, |
|
"loss": 0.2482, |
|
"step": 279500 |
|
}, |
|
{ |
|
"epoch": 1.792, |
|
"grad_norm": 0.5784064531326294, |
|
"learning_rate": 5.2168e-06, |
|
"loss": 0.2495, |
|
"step": 280000 |
|
}, |
|
{ |
|
"epoch": 1.7952, |
|
"grad_norm": 0.5531567931175232, |
|
"learning_rate": 5.1368e-06, |
|
"loss": 0.2497, |
|
"step": 280500 |
|
}, |
|
{ |
|
"epoch": 1.7984, |
|
"grad_norm": 0.5494315028190613, |
|
"learning_rate": 5.0568000000000004e-06, |
|
"loss": 0.2476, |
|
"step": 281000 |
|
}, |
|
{ |
|
"epoch": 1.8016, |
|
"grad_norm": 0.6254246830940247, |
|
"learning_rate": 4.9768e-06, |
|
"loss": 0.243, |
|
"step": 281500 |
|
}, |
|
{ |
|
"epoch": 1.8048, |
|
"grad_norm": 0.7309369444847107, |
|
"learning_rate": 4.8969600000000005e-06, |
|
"loss": 0.2428, |
|
"step": 282000 |
|
}, |
|
{ |
|
"epoch": 1.808, |
|
"grad_norm": 0.587374210357666, |
|
"learning_rate": 4.81696e-06, |
|
"loss": 0.253, |
|
"step": 282500 |
|
}, |
|
{ |
|
"epoch": 1.8112, |
|
"grad_norm": 0.5197418928146362, |
|
"learning_rate": 4.73696e-06, |
|
"loss": 0.2454, |
|
"step": 283000 |
|
}, |
|
{ |
|
"epoch": 1.8144, |
|
"grad_norm": 0.5393714308738708, |
|
"learning_rate": 4.65696e-06, |
|
"loss": 0.2442, |
|
"step": 283500 |
|
}, |
|
{ |
|
"epoch": 1.8176, |
|
"grad_norm": 0.6797386407852173, |
|
"learning_rate": 4.57696e-06, |
|
"loss": 0.2455, |
|
"step": 284000 |
|
}, |
|
{ |
|
"epoch": 1.8208, |
|
"grad_norm": 0.5192613005638123, |
|
"learning_rate": 4.49696e-06, |
|
"loss": 0.2457, |
|
"step": 284500 |
|
}, |
|
{ |
|
"epoch": 1.8239999999999998, |
|
"grad_norm": 0.5224815607070923, |
|
"learning_rate": 4.416960000000001e-06, |
|
"loss": 0.2442, |
|
"step": 285000 |
|
}, |
|
{ |
|
"epoch": 1.8272, |
|
"grad_norm": 0.5999212861061096, |
|
"learning_rate": 4.336960000000001e-06, |
|
"loss": 0.2465, |
|
"step": 285500 |
|
}, |
|
{ |
|
"epoch": 1.8304, |
|
"grad_norm": 0.6273928880691528, |
|
"learning_rate": 4.25712e-06, |
|
"loss": 0.2451, |
|
"step": 286000 |
|
}, |
|
{ |
|
"epoch": 1.8336000000000001, |
|
"grad_norm": 0.4545860290527344, |
|
"learning_rate": 4.177120000000001e-06, |
|
"loss": 0.2455, |
|
"step": 286500 |
|
}, |
|
{ |
|
"epoch": 1.8368, |
|
"grad_norm": 0.5125412344932556, |
|
"learning_rate": 4.09712e-06, |
|
"loss": 0.2471, |
|
"step": 287000 |
|
}, |
|
{ |
|
"epoch": 1.8399999999999999, |
|
"grad_norm": 0.6210908889770508, |
|
"learning_rate": 4.01712e-06, |
|
"loss": 0.2504, |
|
"step": 287500 |
|
}, |
|
{ |
|
"epoch": 1.8432, |
|
"grad_norm": 0.5454786419868469, |
|
"learning_rate": 3.9371200000000005e-06, |
|
"loss": 0.2459, |
|
"step": 288000 |
|
}, |
|
{ |
|
"epoch": 1.8464, |
|
"grad_norm": 0.4733567535877228, |
|
"learning_rate": 3.8571200000000004e-06, |
|
"loss": 0.2417, |
|
"step": 288500 |
|
}, |
|
{ |
|
"epoch": 1.8496000000000001, |
|
"grad_norm": 0.5461744666099548, |
|
"learning_rate": 3.7772800000000005e-06, |
|
"loss": 0.2448, |
|
"step": 289000 |
|
}, |
|
{ |
|
"epoch": 1.8528, |
|
"grad_norm": 0.5227847695350647, |
|
"learning_rate": 3.69728e-06, |
|
"loss": 0.2491, |
|
"step": 289500 |
|
}, |
|
{ |
|
"epoch": 1.8559999999999999, |
|
"grad_norm": 0.4567984938621521, |
|
"learning_rate": 3.61728e-06, |
|
"loss": 0.2491, |
|
"step": 290000 |
|
}, |
|
{ |
|
"epoch": 1.8592, |
|
"grad_norm": 0.634410560131073, |
|
"learning_rate": 3.5372800000000003e-06, |
|
"loss": 0.249, |
|
"step": 290500 |
|
}, |
|
{ |
|
"epoch": 1.8624, |
|
"grad_norm": 0.5894684195518494, |
|
"learning_rate": 3.45728e-06, |
|
"loss": 0.2468, |
|
"step": 291000 |
|
}, |
|
{ |
|
"epoch": 1.8656000000000001, |
|
"grad_norm": 0.6827466487884521, |
|
"learning_rate": 3.37728e-06, |
|
"loss": 0.2446, |
|
"step": 291500 |
|
}, |
|
{ |
|
"epoch": 1.8688, |
|
"grad_norm": 0.6229824423789978, |
|
"learning_rate": 3.2972799999999996e-06, |
|
"loss": 0.2487, |
|
"step": 292000 |
|
}, |
|
{ |
|
"epoch": 1.8719999999999999, |
|
"grad_norm": 0.5481498837471008, |
|
"learning_rate": 3.2172800000000004e-06, |
|
"loss": 0.2461, |
|
"step": 292500 |
|
}, |
|
{ |
|
"epoch": 1.8752, |
|
"grad_norm": 0.6237102746963501, |
|
"learning_rate": 3.13728e-06, |
|
"loss": 0.2474, |
|
"step": 293000 |
|
}, |
|
{ |
|
"epoch": 1.8784, |
|
"grad_norm": 0.6718310117721558, |
|
"learning_rate": 3.0572800000000002e-06, |
|
"loss": 0.2512, |
|
"step": 293500 |
|
}, |
|
{ |
|
"epoch": 1.8816000000000002, |
|
"grad_norm": 0.5450266003608704, |
|
"learning_rate": 2.97728e-06, |
|
"loss": 0.249, |
|
"step": 294000 |
|
}, |
|
{ |
|
"epoch": 1.8848, |
|
"grad_norm": 0.6037828326225281, |
|
"learning_rate": 2.8974400000000002e-06, |
|
"loss": 0.2447, |
|
"step": 294500 |
|
}, |
|
{ |
|
"epoch": 1.888, |
|
"grad_norm": 0.60282963514328, |
|
"learning_rate": 2.81744e-06, |
|
"loss": 0.2449, |
|
"step": 295000 |
|
}, |
|
{ |
|
"epoch": 1.8912, |
|
"grad_norm": 0.5670416355133057, |
|
"learning_rate": 2.73744e-06, |
|
"loss": 0.2424, |
|
"step": 295500 |
|
}, |
|
{ |
|
"epoch": 1.8944, |
|
"grad_norm": 0.6023501753807068, |
|
"learning_rate": 2.6576e-06, |
|
"loss": 0.2449, |
|
"step": 296000 |
|
}, |
|
{ |
|
"epoch": 1.8976, |
|
"grad_norm": 0.5216399431228638, |
|
"learning_rate": 2.5776e-06, |
|
"loss": 0.243, |
|
"step": 296500 |
|
}, |
|
{ |
|
"epoch": 1.9008, |
|
"grad_norm": 0.49217623472213745, |
|
"learning_rate": 2.4976000000000004e-06, |
|
"loss": 0.2432, |
|
"step": 297000 |
|
}, |
|
{ |
|
"epoch": 1.904, |
|
"grad_norm": 0.46074241399765015, |
|
"learning_rate": 2.4176e-06, |
|
"loss": 0.2434, |
|
"step": 297500 |
|
}, |
|
{ |
|
"epoch": 1.9072, |
|
"grad_norm": 0.5020151734352112, |
|
"learning_rate": 2.3376000000000003e-06, |
|
"loss": 0.2424, |
|
"step": 298000 |
|
}, |
|
{ |
|
"epoch": 1.9104, |
|
"grad_norm": 0.5782959461212158, |
|
"learning_rate": 2.2576e-06, |
|
"loss": 0.246, |
|
"step": 298500 |
|
}, |
|
{ |
|
"epoch": 1.9136, |
|
"grad_norm": 0.5627701282501221, |
|
"learning_rate": 2.1777600000000003e-06, |
|
"loss": 0.2483, |
|
"step": 299000 |
|
}, |
|
{ |
|
"epoch": 1.9167999999999998, |
|
"grad_norm": 0.5413541793823242, |
|
"learning_rate": 2.09776e-06, |
|
"loss": 0.2472, |
|
"step": 299500 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 0.5274430513381958, |
|
"learning_rate": 2.01776e-06, |
|
"loss": 0.2439, |
|
"step": 300000 |
|
}, |
|
{ |
|
"epoch": 1.9232, |
|
"grad_norm": 0.5475273728370667, |
|
"learning_rate": 1.93776e-06, |
|
"loss": 0.2442, |
|
"step": 300500 |
|
}, |
|
{ |
|
"epoch": 1.9264000000000001, |
|
"grad_norm": 0.42543721199035645, |
|
"learning_rate": 1.85776e-06, |
|
"loss": 0.2472, |
|
"step": 301000 |
|
}, |
|
{ |
|
"epoch": 1.9296, |
|
"grad_norm": 0.6353417634963989, |
|
"learning_rate": 1.7777600000000001e-06, |
|
"loss": 0.2478, |
|
"step": 301500 |
|
}, |
|
{ |
|
"epoch": 1.9327999999999999, |
|
"grad_norm": 0.6469547748565674, |
|
"learning_rate": 1.69776e-06, |
|
"loss": 0.2399, |
|
"step": 302000 |
|
}, |
|
{ |
|
"epoch": 1.936, |
|
"grad_norm": 0.5442044734954834, |
|
"learning_rate": 1.6177600000000002e-06, |
|
"loss": 0.2461, |
|
"step": 302500 |
|
}, |
|
{ |
|
"epoch": 1.9392, |
|
"grad_norm": 0.6031491756439209, |
|
"learning_rate": 1.53776e-06, |
|
"loss": 0.2463, |
|
"step": 303000 |
|
}, |
|
{ |
|
"epoch": 1.9424000000000001, |
|
"grad_norm": 0.6695773005485535, |
|
"learning_rate": 1.4579200000000002e-06, |
|
"loss": 0.2475, |
|
"step": 303500 |
|
}, |
|
{ |
|
"epoch": 1.9456, |
|
"grad_norm": 0.5614984631538391, |
|
"learning_rate": 1.37792e-06, |
|
"loss": 0.2442, |
|
"step": 304000 |
|
}, |
|
{ |
|
"epoch": 1.9487999999999999, |
|
"grad_norm": 0.5449358224868774, |
|
"learning_rate": 1.2980800000000002e-06, |
|
"loss": 0.2482, |
|
"step": 304500 |
|
}, |
|
{ |
|
"epoch": 1.952, |
|
"grad_norm": 0.4947919249534607, |
|
"learning_rate": 1.2180800000000001e-06, |
|
"loss": 0.2434, |
|
"step": 305000 |
|
}, |
|
{ |
|
"epoch": 1.9552, |
|
"grad_norm": 0.5914204716682434, |
|
"learning_rate": 1.13808e-06, |
|
"loss": 0.2449, |
|
"step": 305500 |
|
}, |
|
{ |
|
"epoch": 1.9584000000000001, |
|
"grad_norm": 0.5385975241661072, |
|
"learning_rate": 1.05808e-06, |
|
"loss": 0.2484, |
|
"step": 306000 |
|
}, |
|
{ |
|
"epoch": 1.9616, |
|
"grad_norm": 0.5446351170539856, |
|
"learning_rate": 9.780799999999999e-07, |
|
"loss": 0.2434, |
|
"step": 306500 |
|
}, |
|
{ |
|
"epoch": 1.9647999999999999, |
|
"grad_norm": 0.5600497722625732, |
|
"learning_rate": 8.980800000000001e-07, |
|
"loss": 0.2456, |
|
"step": 307000 |
|
}, |
|
{ |
|
"epoch": 1.968, |
|
"grad_norm": 0.46547964215278625, |
|
"learning_rate": 8.1808e-07, |
|
"loss": 0.2467, |
|
"step": 307500 |
|
}, |
|
{ |
|
"epoch": 1.9712, |
|
"grad_norm": 0.5435817241668701, |
|
"learning_rate": 7.380800000000001e-07, |
|
"loss": 0.25, |
|
"step": 308000 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 312500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.50047224233984e+18, |
|
"train_batch_size": 64, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|