|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.1872, |
|
"eval_steps": 500, |
|
"global_step": 185500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0032, |
|
"grad_norm": 1.7230606079101562, |
|
"learning_rate": 4.99208e-05, |
|
"loss": 1.2281, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.0064, |
|
"grad_norm": 3.655383348464966, |
|
"learning_rate": 4.9840800000000006e-05, |
|
"loss": 0.7566, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.0096, |
|
"grad_norm": 1.2925927639007568, |
|
"learning_rate": 4.97608e-05, |
|
"loss": 0.6764, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.0128, |
|
"grad_norm": 1.286004900932312, |
|
"learning_rate": 4.968080000000001e-05, |
|
"loss": 0.6304, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.016, |
|
"grad_norm": 1.2140214443206787, |
|
"learning_rate": 4.96008e-05, |
|
"loss": 0.5981, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.0192, |
|
"grad_norm": 1.2525482177734375, |
|
"learning_rate": 4.95208e-05, |
|
"loss": 0.5767, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.0224, |
|
"grad_norm": 1.2310410737991333, |
|
"learning_rate": 4.94408e-05, |
|
"loss": 0.5597, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.0256, |
|
"grad_norm": 1.1735206842422485, |
|
"learning_rate": 4.9360800000000004e-05, |
|
"loss": 0.5418, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.0288, |
|
"grad_norm": 1.114688754081726, |
|
"learning_rate": 4.9280800000000004e-05, |
|
"loss": 0.5335, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.032, |
|
"grad_norm": 0.8874593377113342, |
|
"learning_rate": 4.9200800000000005e-05, |
|
"loss": 0.5237, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.0352, |
|
"grad_norm": 1.1261299848556519, |
|
"learning_rate": 4.91208e-05, |
|
"loss": 0.5135, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.0384, |
|
"grad_norm": 0.9994556307792664, |
|
"learning_rate": 4.9040800000000007e-05, |
|
"loss": 0.5059, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.0416, |
|
"grad_norm": 1.2349673509597778, |
|
"learning_rate": 4.89608e-05, |
|
"loss": 0.4939, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.0448, |
|
"grad_norm": 0.9770995378494263, |
|
"learning_rate": 4.88808e-05, |
|
"loss": 0.4824, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.048, |
|
"grad_norm": 0.981966495513916, |
|
"learning_rate": 4.88008e-05, |
|
"loss": 0.4875, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.0512, |
|
"grad_norm": 1.0177415609359741, |
|
"learning_rate": 4.87208e-05, |
|
"loss": 0.4785, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.0544, |
|
"grad_norm": 1.0521667003631592, |
|
"learning_rate": 4.8640800000000004e-05, |
|
"loss": 0.4731, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.0576, |
|
"grad_norm": 0.8560615181922913, |
|
"learning_rate": 4.85608e-05, |
|
"loss": 0.4633, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.0608, |
|
"grad_norm": 1.0170217752456665, |
|
"learning_rate": 4.8480800000000005e-05, |
|
"loss": 0.4576, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.064, |
|
"grad_norm": 0.9891325831413269, |
|
"learning_rate": 4.84008e-05, |
|
"loss": 0.4556, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.0672, |
|
"grad_norm": 1.0609711408615112, |
|
"learning_rate": 4.832080000000001e-05, |
|
"loss": 0.4493, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.0704, |
|
"grad_norm": 0.8623799681663513, |
|
"learning_rate": 4.82408e-05, |
|
"loss": 0.4459, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.0736, |
|
"grad_norm": 0.9587870240211487, |
|
"learning_rate": 4.81608e-05, |
|
"loss": 0.4418, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.0768, |
|
"grad_norm": 0.8939447999000549, |
|
"learning_rate": 4.80808e-05, |
|
"loss": 0.4327, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.9886033535003662, |
|
"learning_rate": 4.80008e-05, |
|
"loss": 0.438, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.0832, |
|
"grad_norm": 0.9157513976097107, |
|
"learning_rate": 4.7920800000000004e-05, |
|
"loss": 0.4323, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.0864, |
|
"grad_norm": 0.9085854887962341, |
|
"learning_rate": 4.7840800000000005e-05, |
|
"loss": 0.4303, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.0896, |
|
"grad_norm": 0.9123984575271606, |
|
"learning_rate": 4.77608e-05, |
|
"loss": 0.4247, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.0928, |
|
"grad_norm": 0.839026689529419, |
|
"learning_rate": 4.7680960000000004e-05, |
|
"loss": 0.4233, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.096, |
|
"grad_norm": 0.8110847473144531, |
|
"learning_rate": 4.760096e-05, |
|
"loss": 0.4207, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.0992, |
|
"grad_norm": 0.8462579250335693, |
|
"learning_rate": 4.7520960000000005e-05, |
|
"loss": 0.421, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.1024, |
|
"grad_norm": 0.8980106711387634, |
|
"learning_rate": 4.744096e-05, |
|
"loss": 0.417, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.1056, |
|
"grad_norm": 0.8297702074050903, |
|
"learning_rate": 4.736096000000001e-05, |
|
"loss": 0.4139, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.1088, |
|
"grad_norm": 0.9856173992156982, |
|
"learning_rate": 4.728096e-05, |
|
"loss": 0.419, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.112, |
|
"grad_norm": 0.934256911277771, |
|
"learning_rate": 4.720096e-05, |
|
"loss": 0.4098, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 0.1152, |
|
"grad_norm": 0.9190649390220642, |
|
"learning_rate": 4.712096e-05, |
|
"loss": 0.412, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.1184, |
|
"grad_norm": 0.9078772664070129, |
|
"learning_rate": 4.704096e-05, |
|
"loss": 0.4043, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 0.1216, |
|
"grad_norm": 1.082939624786377, |
|
"learning_rate": 4.696112e-05, |
|
"loss": 0.4045, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.1248, |
|
"grad_norm": 0.9159390926361084, |
|
"learning_rate": 4.688112e-05, |
|
"loss": 0.4098, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"grad_norm": 0.8420547842979431, |
|
"learning_rate": 4.680128e-05, |
|
"loss": 0.4033, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.1312, |
|
"grad_norm": 0.7658286094665527, |
|
"learning_rate": 4.672128e-05, |
|
"loss": 0.4002, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 0.1344, |
|
"grad_norm": 0.9074057340621948, |
|
"learning_rate": 4.664128e-05, |
|
"loss": 0.3964, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.1376, |
|
"grad_norm": 0.6065025329589844, |
|
"learning_rate": 4.656128e-05, |
|
"loss": 0.3984, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 0.1408, |
|
"grad_norm": 0.7523757219314575, |
|
"learning_rate": 4.6481280000000004e-05, |
|
"loss": 0.3959, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.144, |
|
"grad_norm": 0.807826042175293, |
|
"learning_rate": 4.6401280000000004e-05, |
|
"loss": 0.3921, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 0.1472, |
|
"grad_norm": 0.8530682325363159, |
|
"learning_rate": 4.632128e-05, |
|
"loss": 0.4002, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 0.1504, |
|
"grad_norm": 0.8661518692970276, |
|
"learning_rate": 4.6241280000000006e-05, |
|
"loss": 0.3856, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 0.1536, |
|
"grad_norm": 0.7473235130310059, |
|
"learning_rate": 4.616144e-05, |
|
"loss": 0.3854, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.1568, |
|
"grad_norm": 0.7954819202423096, |
|
"learning_rate": 4.6081440000000005e-05, |
|
"loss": 0.3871, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.8758727312088013, |
|
"learning_rate": 4.600144e-05, |
|
"loss": 0.3842, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 0.1632, |
|
"grad_norm": 0.8430293798446655, |
|
"learning_rate": 4.592144000000001e-05, |
|
"loss": 0.3886, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 0.1664, |
|
"grad_norm": 0.6557173728942871, |
|
"learning_rate": 4.584144e-05, |
|
"loss": 0.3854, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 0.1696, |
|
"grad_norm": 0.7791888117790222, |
|
"learning_rate": 4.576144e-05, |
|
"loss": 0.3796, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 0.1728, |
|
"grad_norm": 0.736084520816803, |
|
"learning_rate": 4.56816e-05, |
|
"loss": 0.3806, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.176, |
|
"grad_norm": 0.7714269161224365, |
|
"learning_rate": 4.56016e-05, |
|
"loss": 0.3781, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 0.1792, |
|
"grad_norm": 0.766144335269928, |
|
"learning_rate": 4.552176e-05, |
|
"loss": 0.3766, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 0.1824, |
|
"grad_norm": 0.7035301923751831, |
|
"learning_rate": 4.544176e-05, |
|
"loss": 0.3737, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 0.1856, |
|
"grad_norm": 0.7573793530464172, |
|
"learning_rate": 4.536176e-05, |
|
"loss": 0.3753, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 0.1888, |
|
"grad_norm": 0.8799508213996887, |
|
"learning_rate": 4.528176e-05, |
|
"loss": 0.373, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"grad_norm": 0.8543264269828796, |
|
"learning_rate": 4.520176e-05, |
|
"loss": 0.3735, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.1952, |
|
"grad_norm": 0.6768947243690491, |
|
"learning_rate": 4.512176e-05, |
|
"loss": 0.3697, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 0.1984, |
|
"grad_norm": 0.8239702582359314, |
|
"learning_rate": 4.504176e-05, |
|
"loss": 0.3675, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 0.2016, |
|
"grad_norm": 0.8310449123382568, |
|
"learning_rate": 4.4961760000000004e-05, |
|
"loss": 0.3695, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 0.2048, |
|
"grad_norm": 0.8459475040435791, |
|
"learning_rate": 4.488176e-05, |
|
"loss": 0.3694, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 0.208, |
|
"grad_norm": 0.7346063852310181, |
|
"learning_rate": 4.4801760000000006e-05, |
|
"loss": 0.3646, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 0.2112, |
|
"grad_norm": 0.6958354115486145, |
|
"learning_rate": 4.472176e-05, |
|
"loss": 0.3704, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 0.2144, |
|
"grad_norm": 0.8244686722755432, |
|
"learning_rate": 4.464176000000001e-05, |
|
"loss": 0.3647, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 0.2176, |
|
"grad_norm": 0.7559502124786377, |
|
"learning_rate": 4.456192e-05, |
|
"loss": 0.3665, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 0.2208, |
|
"grad_norm": 0.9046504497528076, |
|
"learning_rate": 4.4481920000000007e-05, |
|
"loss": 0.3637, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 0.224, |
|
"grad_norm": 0.7771899700164795, |
|
"learning_rate": 4.440192e-05, |
|
"loss": 0.3648, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 0.2272, |
|
"grad_norm": 0.6887528300285339, |
|
"learning_rate": 4.432192e-05, |
|
"loss": 0.3562, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 0.2304, |
|
"grad_norm": 0.7471407055854797, |
|
"learning_rate": 4.424192e-05, |
|
"loss": 0.3639, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 0.2336, |
|
"grad_norm": 0.7198163270950317, |
|
"learning_rate": 4.416192e-05, |
|
"loss": 0.3604, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 0.2368, |
|
"grad_norm": 0.7383478879928589, |
|
"learning_rate": 4.4081920000000004e-05, |
|
"loss": 0.3592, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.8052579760551453, |
|
"learning_rate": 4.4001920000000004e-05, |
|
"loss": 0.3563, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 0.2432, |
|
"grad_norm": 0.7765107154846191, |
|
"learning_rate": 4.392224e-05, |
|
"loss": 0.3548, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 0.2464, |
|
"grad_norm": 0.7250288724899292, |
|
"learning_rate": 4.384224e-05, |
|
"loss": 0.3605, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 0.2496, |
|
"grad_norm": 0.6914694309234619, |
|
"learning_rate": 4.376224e-05, |
|
"loss": 0.3551, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 0.2528, |
|
"grad_norm": 0.6636275053024292, |
|
"learning_rate": 4.368224e-05, |
|
"loss": 0.3587, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 0.256, |
|
"grad_norm": 0.710564911365509, |
|
"learning_rate": 4.360224e-05, |
|
"loss": 0.3537, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 0.2592, |
|
"grad_norm": 0.6195800304412842, |
|
"learning_rate": 4.3522240000000004e-05, |
|
"loss": 0.3537, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 0.2624, |
|
"grad_norm": 0.7131514549255371, |
|
"learning_rate": 4.34424e-05, |
|
"loss": 0.3531, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 0.2656, |
|
"grad_norm": 0.6594410538673401, |
|
"learning_rate": 4.336256e-05, |
|
"loss": 0.3518, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 0.2688, |
|
"grad_norm": 0.7651230096817017, |
|
"learning_rate": 4.328256e-05, |
|
"loss": 0.3516, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 0.272, |
|
"grad_norm": 0.756515622138977, |
|
"learning_rate": 4.320256e-05, |
|
"loss": 0.3461, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 0.2752, |
|
"grad_norm": 0.7201528549194336, |
|
"learning_rate": 4.3122560000000003e-05, |
|
"loss": 0.3497, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 0.2784, |
|
"grad_norm": 0.7436856031417847, |
|
"learning_rate": 4.3042560000000004e-05, |
|
"loss": 0.3505, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 0.2816, |
|
"grad_norm": 0.7914199829101562, |
|
"learning_rate": 4.2962560000000005e-05, |
|
"loss": 0.3439, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 0.2848, |
|
"grad_norm": 0.7488194704055786, |
|
"learning_rate": 4.288256e-05, |
|
"loss": 0.349, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 0.288, |
|
"grad_norm": 0.8654124736785889, |
|
"learning_rate": 4.280256e-05, |
|
"loss": 0.3491, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 0.2912, |
|
"grad_norm": 0.6817401647567749, |
|
"learning_rate": 4.272272e-05, |
|
"loss": 0.3447, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 0.2944, |
|
"grad_norm": 0.6439715623855591, |
|
"learning_rate": 4.2642720000000006e-05, |
|
"loss": 0.3453, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 0.2976, |
|
"grad_norm": 1.3840138912200928, |
|
"learning_rate": 4.256272e-05, |
|
"loss": 0.3445, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 0.3008, |
|
"grad_norm": 0.7245766520500183, |
|
"learning_rate": 4.248272e-05, |
|
"loss": 0.3462, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 0.304, |
|
"grad_norm": 0.6877666711807251, |
|
"learning_rate": 4.240288e-05, |
|
"loss": 0.3465, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 0.3072, |
|
"grad_norm": 0.8494886159896851, |
|
"learning_rate": 4.2322880000000006e-05, |
|
"loss": 0.348, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 0.3104, |
|
"grad_norm": 0.6704971790313721, |
|
"learning_rate": 4.224288e-05, |
|
"loss": 0.3403, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 0.3136, |
|
"grad_norm": 0.6239964962005615, |
|
"learning_rate": 4.216288000000001e-05, |
|
"loss": 0.3382, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 0.3168, |
|
"grad_norm": 0.7317768335342407, |
|
"learning_rate": 4.208288e-05, |
|
"loss": 0.3385, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.7397735118865967, |
|
"learning_rate": 4.200288e-05, |
|
"loss": 0.3405, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 0.3232, |
|
"grad_norm": 1.1299536228179932, |
|
"learning_rate": 4.1922880000000003e-05, |
|
"loss": 0.3431, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 0.3264, |
|
"grad_norm": 0.6406556963920593, |
|
"learning_rate": 4.184304e-05, |
|
"loss": 0.3384, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 0.3296, |
|
"grad_norm": 0.8084424734115601, |
|
"learning_rate": 4.17632e-05, |
|
"loss": 0.3365, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 0.3328, |
|
"grad_norm": 0.7525010704994202, |
|
"learning_rate": 4.16832e-05, |
|
"loss": 0.3399, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 0.336, |
|
"grad_norm": 0.7382110953330994, |
|
"learning_rate": 4.16032e-05, |
|
"loss": 0.335, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 0.3392, |
|
"grad_norm": 0.6454793810844421, |
|
"learning_rate": 4.15232e-05, |
|
"loss": 0.3354, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 0.3424, |
|
"grad_norm": 0.639664351940155, |
|
"learning_rate": 4.14432e-05, |
|
"loss": 0.3371, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 0.3456, |
|
"grad_norm": 0.5574499368667603, |
|
"learning_rate": 4.1363200000000004e-05, |
|
"loss": 0.3341, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 0.3488, |
|
"grad_norm": 0.6772671341896057, |
|
"learning_rate": 4.12832e-05, |
|
"loss": 0.3331, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 0.352, |
|
"grad_norm": 0.6943195462226868, |
|
"learning_rate": 4.120336e-05, |
|
"loss": 0.3365, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 0.3552, |
|
"grad_norm": 0.7460485100746155, |
|
"learning_rate": 4.112336e-05, |
|
"loss": 0.3308, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 0.3584, |
|
"grad_norm": 0.7071924805641174, |
|
"learning_rate": 4.1043360000000005e-05, |
|
"loss": 0.3312, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 0.3616, |
|
"grad_norm": 0.6678891181945801, |
|
"learning_rate": 4.0963519999999996e-05, |
|
"loss": 0.3314, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 0.3648, |
|
"grad_norm": 0.7100914120674133, |
|
"learning_rate": 4.0883520000000004e-05, |
|
"loss": 0.3307, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 0.368, |
|
"grad_norm": 0.6085671782493591, |
|
"learning_rate": 4.080352e-05, |
|
"loss": 0.3282, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 0.3712, |
|
"grad_norm": 0.6634243130683899, |
|
"learning_rate": 4.0723520000000005e-05, |
|
"loss": 0.3321, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 0.3744, |
|
"grad_norm": 0.7203409075737, |
|
"learning_rate": 4.064352e-05, |
|
"loss": 0.3318, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 0.3776, |
|
"grad_norm": 0.7934884428977966, |
|
"learning_rate": 4.056352e-05, |
|
"loss": 0.3239, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 0.3808, |
|
"grad_norm": 0.8591666221618652, |
|
"learning_rate": 4.048352e-05, |
|
"loss": 0.3275, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 0.384, |
|
"grad_norm": 0.6306772232055664, |
|
"learning_rate": 4.040352e-05, |
|
"loss": 0.3308, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 0.3872, |
|
"grad_norm": 0.6059302687644958, |
|
"learning_rate": 4.032352e-05, |
|
"loss": 0.3266, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 0.3904, |
|
"grad_norm": 0.6875105500221252, |
|
"learning_rate": 4.024352e-05, |
|
"loss": 0.3265, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 0.3936, |
|
"grad_norm": 0.6397412419319153, |
|
"learning_rate": 4.0163520000000004e-05, |
|
"loss": 0.3268, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 0.3968, |
|
"grad_norm": 0.7801005840301514, |
|
"learning_rate": 4.0083520000000005e-05, |
|
"loss": 0.3314, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.6966884136199951, |
|
"learning_rate": 4.000352e-05, |
|
"loss": 0.3263, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 0.4032, |
|
"grad_norm": 0.7413304448127747, |
|
"learning_rate": 3.9923520000000006e-05, |
|
"loss": 0.3284, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 0.4064, |
|
"grad_norm": 0.7089780569076538, |
|
"learning_rate": 3.984352e-05, |
|
"loss": 0.3252, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 0.4096, |
|
"grad_norm": 0.6669878959655762, |
|
"learning_rate": 3.976352e-05, |
|
"loss": 0.3239, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 0.4128, |
|
"grad_norm": 0.7352403998374939, |
|
"learning_rate": 3.968368e-05, |
|
"loss": 0.3226, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 0.416, |
|
"grad_norm": 0.6916635036468506, |
|
"learning_rate": 3.9603840000000005e-05, |
|
"loss": 0.3234, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 0.4192, |
|
"grad_norm": 0.6800302863121033, |
|
"learning_rate": 3.952384e-05, |
|
"loss": 0.3224, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 0.4224, |
|
"grad_norm": 0.6685224771499634, |
|
"learning_rate": 3.9443840000000006e-05, |
|
"loss": 0.3197, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 0.4256, |
|
"grad_norm": 0.7219159603118896, |
|
"learning_rate": 3.936384e-05, |
|
"loss": 0.3185, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 0.4288, |
|
"grad_norm": 0.5928858518600464, |
|
"learning_rate": 3.928384e-05, |
|
"loss": 0.3291, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 0.432, |
|
"grad_norm": 0.6616542339324951, |
|
"learning_rate": 3.920384e-05, |
|
"loss": 0.3266, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 0.4352, |
|
"grad_norm": 0.5957266092300415, |
|
"learning_rate": 3.912384e-05, |
|
"loss": 0.32, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 0.4384, |
|
"grad_norm": 0.6576407551765442, |
|
"learning_rate": 3.904384e-05, |
|
"loss": 0.3246, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 0.4416, |
|
"grad_norm": 0.6852056384086609, |
|
"learning_rate": 3.896416e-05, |
|
"loss": 0.3268, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 0.4448, |
|
"grad_norm": 0.780893087387085, |
|
"learning_rate": 3.888416e-05, |
|
"loss": 0.3229, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 0.448, |
|
"grad_norm": 0.6741476655006409, |
|
"learning_rate": 3.880416e-05, |
|
"loss": 0.3188, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 0.4512, |
|
"grad_norm": 0.5919800400733948, |
|
"learning_rate": 3.872416e-05, |
|
"loss": 0.3208, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 0.4544, |
|
"grad_norm": 0.6476633548736572, |
|
"learning_rate": 3.864416e-05, |
|
"loss": 0.322, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 0.4576, |
|
"grad_norm": 0.5667979717254639, |
|
"learning_rate": 3.8564159999999996e-05, |
|
"loss": 0.3151, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 0.4608, |
|
"grad_norm": 0.6126554608345032, |
|
"learning_rate": 3.8484160000000004e-05, |
|
"loss": 0.3185, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 0.464, |
|
"grad_norm": 0.7995546460151672, |
|
"learning_rate": 3.840416e-05, |
|
"loss": 0.3174, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 0.4672, |
|
"grad_norm": 0.5964981317520142, |
|
"learning_rate": 3.8324160000000005e-05, |
|
"loss": 0.3187, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 0.4704, |
|
"grad_norm": 0.7718212008476257, |
|
"learning_rate": 3.824416e-05, |
|
"loss": 0.3156, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 0.4736, |
|
"grad_norm": 0.7086686491966248, |
|
"learning_rate": 3.8164320000000005e-05, |
|
"loss": 0.3189, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 0.4768, |
|
"grad_norm": 0.7988029718399048, |
|
"learning_rate": 3.808432e-05, |
|
"loss": 0.3151, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.6092699766159058, |
|
"learning_rate": 3.8004320000000006e-05, |
|
"loss": 0.3153, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 0.4832, |
|
"grad_norm": 0.6181166768074036, |
|
"learning_rate": 3.792432e-05, |
|
"loss": 0.3113, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 0.4864, |
|
"grad_norm": 0.5952243208885193, |
|
"learning_rate": 3.784432e-05, |
|
"loss": 0.3091, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 0.4896, |
|
"grad_norm": 0.5732501745223999, |
|
"learning_rate": 3.776432e-05, |
|
"loss": 0.3169, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 0.4928, |
|
"grad_norm": 0.5866090059280396, |
|
"learning_rate": 3.768432e-05, |
|
"loss": 0.3135, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 0.496, |
|
"grad_norm": 0.6748520135879517, |
|
"learning_rate": 3.760432e-05, |
|
"loss": 0.3134, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 0.4992, |
|
"grad_norm": 0.5922159552574158, |
|
"learning_rate": 3.752448e-05, |
|
"loss": 0.3156, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 0.5024, |
|
"grad_norm": 0.6446545124053955, |
|
"learning_rate": 3.744448e-05, |
|
"loss": 0.3171, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 0.5056, |
|
"grad_norm": 0.6506426334381104, |
|
"learning_rate": 3.736448e-05, |
|
"loss": 0.3138, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 0.5088, |
|
"grad_norm": 0.6826354265213013, |
|
"learning_rate": 3.728448e-05, |
|
"loss": 0.3164, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 0.512, |
|
"grad_norm": 0.6866195797920227, |
|
"learning_rate": 3.72048e-05, |
|
"loss": 0.315, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 0.5152, |
|
"grad_norm": 0.5590147376060486, |
|
"learning_rate": 3.7124960000000005e-05, |
|
"loss": 0.3094, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 0.5184, |
|
"grad_norm": 0.6728788614273071, |
|
"learning_rate": 3.704496e-05, |
|
"loss": 0.3194, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 0.5216, |
|
"grad_norm": 0.6108749508857727, |
|
"learning_rate": 3.696496000000001e-05, |
|
"loss": 0.3128, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 0.5248, |
|
"grad_norm": 0.5888856649398804, |
|
"learning_rate": 3.688496e-05, |
|
"loss": 0.3121, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 0.528, |
|
"grad_norm": 0.727268397808075, |
|
"learning_rate": 3.680496e-05, |
|
"loss": 0.3193, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 0.5312, |
|
"grad_norm": 0.6358634233474731, |
|
"learning_rate": 3.672496e-05, |
|
"loss": 0.3092, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 0.5344, |
|
"grad_norm": 0.6482620239257812, |
|
"learning_rate": 3.664496e-05, |
|
"loss": 0.3098, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 0.5376, |
|
"grad_norm": 0.5968552827835083, |
|
"learning_rate": 3.6564960000000004e-05, |
|
"loss": 0.3108, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 0.5408, |
|
"grad_norm": 0.6621351838111877, |
|
"learning_rate": 3.6484960000000004e-05, |
|
"loss": 0.3065, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 0.544, |
|
"grad_norm": 0.5520649552345276, |
|
"learning_rate": 3.640496e-05, |
|
"loss": 0.3088, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 0.5472, |
|
"grad_norm": 0.6885005831718445, |
|
"learning_rate": 3.632496e-05, |
|
"loss": 0.3075, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 0.5504, |
|
"grad_norm": 0.666653573513031, |
|
"learning_rate": 3.624512e-05, |
|
"loss": 0.3113, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 0.5536, |
|
"grad_norm": 0.6344409584999084, |
|
"learning_rate": 3.6165120000000005e-05, |
|
"loss": 0.3085, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 0.5568, |
|
"grad_norm": 0.5792534947395325, |
|
"learning_rate": 3.608512e-05, |
|
"loss": 0.3132, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.6864989995956421, |
|
"learning_rate": 3.600512e-05, |
|
"loss": 0.3079, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 0.5632, |
|
"grad_norm": 0.6077435612678528, |
|
"learning_rate": 3.592512e-05, |
|
"loss": 0.3095, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 0.5664, |
|
"grad_norm": 0.7073134779930115, |
|
"learning_rate": 3.584512e-05, |
|
"loss": 0.3116, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 0.5696, |
|
"grad_norm": 0.6477733850479126, |
|
"learning_rate": 3.576512e-05, |
|
"loss": 0.3062, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 0.5728, |
|
"grad_norm": 0.7786093354225159, |
|
"learning_rate": 3.568512e-05, |
|
"loss": 0.3017, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 0.576, |
|
"grad_norm": 0.6447868943214417, |
|
"learning_rate": 3.560528e-05, |
|
"loss": 0.3077, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 0.5792, |
|
"grad_norm": 0.6663397550582886, |
|
"learning_rate": 3.552528e-05, |
|
"loss": 0.3089, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 0.5824, |
|
"grad_norm": 0.533214807510376, |
|
"learning_rate": 3.544528e-05, |
|
"loss": 0.3064, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 0.5856, |
|
"grad_norm": 0.6517444849014282, |
|
"learning_rate": 3.5365280000000004e-05, |
|
"loss": 0.3108, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 0.5888, |
|
"grad_norm": 0.7635303735733032, |
|
"learning_rate": 3.528544e-05, |
|
"loss": 0.3028, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 0.592, |
|
"grad_norm": 0.6636632680892944, |
|
"learning_rate": 3.520544e-05, |
|
"loss": 0.3015, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 0.5952, |
|
"grad_norm": 0.7296783924102783, |
|
"learning_rate": 3.5125440000000004e-05, |
|
"loss": 0.305, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 0.5984, |
|
"grad_norm": 0.5089054703712463, |
|
"learning_rate": 3.50456e-05, |
|
"loss": 0.3092, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 0.6016, |
|
"grad_norm": 0.6761330366134644, |
|
"learning_rate": 3.49656e-05, |
|
"loss": 0.3055, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 0.6048, |
|
"grad_norm": 0.6327843070030212, |
|
"learning_rate": 3.4885600000000004e-05, |
|
"loss": 0.3055, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 0.608, |
|
"grad_norm": 0.5940554141998291, |
|
"learning_rate": 3.48056e-05, |
|
"loss": 0.3017, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 0.6112, |
|
"grad_norm": 0.516828179359436, |
|
"learning_rate": 3.4725600000000005e-05, |
|
"loss": 0.3035, |
|
"step": 95500 |
|
}, |
|
{ |
|
"epoch": 0.6144, |
|
"grad_norm": 0.5835782289505005, |
|
"learning_rate": 3.46456e-05, |
|
"loss": 0.2978, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 0.6176, |
|
"grad_norm": 0.5978230237960815, |
|
"learning_rate": 3.456560000000001e-05, |
|
"loss": 0.301, |
|
"step": 96500 |
|
}, |
|
{ |
|
"epoch": 0.6208, |
|
"grad_norm": 0.5460017323493958, |
|
"learning_rate": 3.44856e-05, |
|
"loss": 0.3052, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 0.624, |
|
"grad_norm": 0.6875701546669006, |
|
"learning_rate": 3.44056e-05, |
|
"loss": 0.3028, |
|
"step": 97500 |
|
}, |
|
{ |
|
"epoch": 0.6272, |
|
"grad_norm": 0.5780492424964905, |
|
"learning_rate": 3.43256e-05, |
|
"loss": 0.2988, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 0.6304, |
|
"grad_norm": 0.5191554427146912, |
|
"learning_rate": 3.42456e-05, |
|
"loss": 0.3052, |
|
"step": 98500 |
|
}, |
|
{ |
|
"epoch": 0.6336, |
|
"grad_norm": 0.6811420917510986, |
|
"learning_rate": 3.416576e-05, |
|
"loss": 0.3032, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 0.6368, |
|
"grad_norm": 0.6301366686820984, |
|
"learning_rate": 3.408576e-05, |
|
"loss": 0.2979, |
|
"step": 99500 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.5777577757835388, |
|
"learning_rate": 3.400576e-05, |
|
"loss": 0.2991, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 0.6432, |
|
"grad_norm": 0.6444558501243591, |
|
"learning_rate": 3.392592e-05, |
|
"loss": 0.298, |
|
"step": 100500 |
|
}, |
|
{ |
|
"epoch": 0.6464, |
|
"grad_norm": 0.4793080985546112, |
|
"learning_rate": 3.384592e-05, |
|
"loss": 0.3014, |
|
"step": 101000 |
|
}, |
|
{ |
|
"epoch": 0.6496, |
|
"grad_norm": 0.6691552400588989, |
|
"learning_rate": 3.376608e-05, |
|
"loss": 0.3006, |
|
"step": 101500 |
|
}, |
|
{ |
|
"epoch": 0.6528, |
|
"grad_norm": 0.6318476796150208, |
|
"learning_rate": 3.368608e-05, |
|
"loss": 0.3032, |
|
"step": 102000 |
|
}, |
|
{ |
|
"epoch": 0.656, |
|
"grad_norm": 0.5805894136428833, |
|
"learning_rate": 3.360608e-05, |
|
"loss": 0.3014, |
|
"step": 102500 |
|
}, |
|
{ |
|
"epoch": 0.6592, |
|
"grad_norm": 0.5658220648765564, |
|
"learning_rate": 3.352608e-05, |
|
"loss": 0.3, |
|
"step": 103000 |
|
}, |
|
{ |
|
"epoch": 0.6624, |
|
"grad_norm": 0.6117516160011292, |
|
"learning_rate": 3.3446080000000004e-05, |
|
"loss": 0.3014, |
|
"step": 103500 |
|
}, |
|
{ |
|
"epoch": 0.6656, |
|
"grad_norm": 0.6763502359390259, |
|
"learning_rate": 3.336608e-05, |
|
"loss": 0.3043, |
|
"step": 104000 |
|
}, |
|
{ |
|
"epoch": 0.6688, |
|
"grad_norm": 0.6046746969223022, |
|
"learning_rate": 3.3286080000000005e-05, |
|
"loss": 0.2965, |
|
"step": 104500 |
|
}, |
|
{ |
|
"epoch": 0.672, |
|
"grad_norm": 0.7453213930130005, |
|
"learning_rate": 3.320608e-05, |
|
"loss": 0.2964, |
|
"step": 105000 |
|
}, |
|
{ |
|
"epoch": 0.6752, |
|
"grad_norm": 0.6010546088218689, |
|
"learning_rate": 3.3126080000000007e-05, |
|
"loss": 0.2975, |
|
"step": 105500 |
|
}, |
|
{ |
|
"epoch": 0.6784, |
|
"grad_norm": 0.7377296686172485, |
|
"learning_rate": 3.304608e-05, |
|
"loss": 0.2993, |
|
"step": 106000 |
|
}, |
|
{ |
|
"epoch": 0.6816, |
|
"grad_norm": 0.6612259745597839, |
|
"learning_rate": 3.2966240000000006e-05, |
|
"loss": 0.298, |
|
"step": 106500 |
|
}, |
|
{ |
|
"epoch": 0.6848, |
|
"grad_norm": 0.6570013165473938, |
|
"learning_rate": 3.288624e-05, |
|
"loss": 0.296, |
|
"step": 107000 |
|
}, |
|
{ |
|
"epoch": 0.688, |
|
"grad_norm": 0.633602499961853, |
|
"learning_rate": 3.280624e-05, |
|
"loss": 0.2989, |
|
"step": 107500 |
|
}, |
|
{ |
|
"epoch": 0.6912, |
|
"grad_norm": 0.5594373345375061, |
|
"learning_rate": 3.272624e-05, |
|
"loss": 0.2977, |
|
"step": 108000 |
|
}, |
|
{ |
|
"epoch": 0.6944, |
|
"grad_norm": 0.5643302202224731, |
|
"learning_rate": 3.264624e-05, |
|
"loss": 0.2941, |
|
"step": 108500 |
|
}, |
|
{ |
|
"epoch": 0.6976, |
|
"grad_norm": 0.5127794146537781, |
|
"learning_rate": 3.256624e-05, |
|
"loss": 0.2953, |
|
"step": 109000 |
|
}, |
|
{ |
|
"epoch": 0.7008, |
|
"grad_norm": 0.6273791790008545, |
|
"learning_rate": 3.24864e-05, |
|
"loss": 0.2944, |
|
"step": 109500 |
|
}, |
|
{ |
|
"epoch": 0.704, |
|
"grad_norm": 0.5089157223701477, |
|
"learning_rate": 3.24064e-05, |
|
"loss": 0.3, |
|
"step": 110000 |
|
}, |
|
{ |
|
"epoch": 0.7072, |
|
"grad_norm": 0.5816791653633118, |
|
"learning_rate": 3.232656e-05, |
|
"loss": 0.2957, |
|
"step": 110500 |
|
}, |
|
{ |
|
"epoch": 0.7104, |
|
"grad_norm": 0.6407476663589478, |
|
"learning_rate": 3.224656e-05, |
|
"loss": 0.2974, |
|
"step": 111000 |
|
}, |
|
{ |
|
"epoch": 0.7136, |
|
"grad_norm": 0.46444937586784363, |
|
"learning_rate": 3.216656e-05, |
|
"loss": 0.2969, |
|
"step": 111500 |
|
}, |
|
{ |
|
"epoch": 0.7168, |
|
"grad_norm": 0.4997446835041046, |
|
"learning_rate": 3.2086559999999996e-05, |
|
"loss": 0.2966, |
|
"step": 112000 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.6996490359306335, |
|
"learning_rate": 3.2006560000000003e-05, |
|
"loss": 0.2965, |
|
"step": 112500 |
|
}, |
|
{ |
|
"epoch": 0.7232, |
|
"grad_norm": 0.5806016325950623, |
|
"learning_rate": 3.192672e-05, |
|
"loss": 0.2952, |
|
"step": 113000 |
|
}, |
|
{ |
|
"epoch": 0.7264, |
|
"grad_norm": 0.6140916347503662, |
|
"learning_rate": 3.184672e-05, |
|
"loss": 0.2995, |
|
"step": 113500 |
|
}, |
|
{ |
|
"epoch": 0.7296, |
|
"grad_norm": 0.45879319310188293, |
|
"learning_rate": 3.1766719999999997e-05, |
|
"loss": 0.292, |
|
"step": 114000 |
|
}, |
|
{ |
|
"epoch": 0.7328, |
|
"grad_norm": 0.6141937971115112, |
|
"learning_rate": 3.1686720000000004e-05, |
|
"loss": 0.2945, |
|
"step": 114500 |
|
}, |
|
{ |
|
"epoch": 0.736, |
|
"grad_norm": 0.6565462946891785, |
|
"learning_rate": 3.160672e-05, |
|
"loss": 0.2982, |
|
"step": 115000 |
|
}, |
|
{ |
|
"epoch": 0.7392, |
|
"grad_norm": 0.5997145175933838, |
|
"learning_rate": 3.1526720000000006e-05, |
|
"loss": 0.2957, |
|
"step": 115500 |
|
}, |
|
{ |
|
"epoch": 0.7424, |
|
"grad_norm": 0.736965537071228, |
|
"learning_rate": 3.144672e-05, |
|
"loss": 0.2953, |
|
"step": 116000 |
|
}, |
|
{ |
|
"epoch": 0.7456, |
|
"grad_norm": 0.6587550640106201, |
|
"learning_rate": 3.136672e-05, |
|
"loss": 0.2917, |
|
"step": 116500 |
|
}, |
|
{ |
|
"epoch": 0.7488, |
|
"grad_norm": 0.7265971302986145, |
|
"learning_rate": 3.128672e-05, |
|
"loss": 0.2908, |
|
"step": 117000 |
|
}, |
|
{ |
|
"epoch": 0.752, |
|
"grad_norm": 0.6158114075660706, |
|
"learning_rate": 3.120672e-05, |
|
"loss": 0.2916, |
|
"step": 117500 |
|
}, |
|
{ |
|
"epoch": 0.7552, |
|
"grad_norm": 0.6521216034889221, |
|
"learning_rate": 3.112672e-05, |
|
"loss": 0.2947, |
|
"step": 118000 |
|
}, |
|
{ |
|
"epoch": 0.7584, |
|
"grad_norm": 0.5868868231773376, |
|
"learning_rate": 3.1046720000000004e-05, |
|
"loss": 0.2919, |
|
"step": 118500 |
|
}, |
|
{ |
|
"epoch": 0.7616, |
|
"grad_norm": 0.6495432257652283, |
|
"learning_rate": 3.096672e-05, |
|
"loss": 0.2974, |
|
"step": 119000 |
|
}, |
|
{ |
|
"epoch": 0.7648, |
|
"grad_norm": 0.6204816102981567, |
|
"learning_rate": 3.0886720000000005e-05, |
|
"loss": 0.2945, |
|
"step": 119500 |
|
}, |
|
{ |
|
"epoch": 0.768, |
|
"grad_norm": 0.6333968639373779, |
|
"learning_rate": 3.080672e-05, |
|
"loss": 0.292, |
|
"step": 120000 |
|
}, |
|
{ |
|
"epoch": 0.7712, |
|
"grad_norm": 0.5613961815834045, |
|
"learning_rate": 3.0726880000000004e-05, |
|
"loss": 0.2938, |
|
"step": 120500 |
|
}, |
|
{ |
|
"epoch": 0.7744, |
|
"grad_norm": 0.6623988151550293, |
|
"learning_rate": 3.064688e-05, |
|
"loss": 0.2954, |
|
"step": 121000 |
|
}, |
|
{ |
|
"epoch": 0.7776, |
|
"grad_norm": 0.6134264469146729, |
|
"learning_rate": 3.0566880000000006e-05, |
|
"loss": 0.2915, |
|
"step": 121500 |
|
}, |
|
{ |
|
"epoch": 0.7808, |
|
"grad_norm": 0.6159347891807556, |
|
"learning_rate": 3.048688e-05, |
|
"loss": 0.2887, |
|
"step": 122000 |
|
}, |
|
{ |
|
"epoch": 0.784, |
|
"grad_norm": 0.6079424023628235, |
|
"learning_rate": 3.0407040000000005e-05, |
|
"loss": 0.2915, |
|
"step": 122500 |
|
}, |
|
{ |
|
"epoch": 0.7872, |
|
"grad_norm": 0.7703385353088379, |
|
"learning_rate": 3.0327040000000002e-05, |
|
"loss": 0.2901, |
|
"step": 123000 |
|
}, |
|
{ |
|
"epoch": 0.7904, |
|
"grad_norm": 0.5626256465911865, |
|
"learning_rate": 3.024704e-05, |
|
"loss": 0.2938, |
|
"step": 123500 |
|
}, |
|
{ |
|
"epoch": 0.7936, |
|
"grad_norm": 0.554914653301239, |
|
"learning_rate": 3.016704e-05, |
|
"loss": 0.2913, |
|
"step": 124000 |
|
}, |
|
{ |
|
"epoch": 0.7968, |
|
"grad_norm": 0.6610060930252075, |
|
"learning_rate": 3.008704e-05, |
|
"loss": 0.2912, |
|
"step": 124500 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.6194009780883789, |
|
"learning_rate": 3.0007040000000002e-05, |
|
"loss": 0.2901, |
|
"step": 125000 |
|
}, |
|
{ |
|
"epoch": 0.8032, |
|
"grad_norm": 0.7150211930274963, |
|
"learning_rate": 2.992704e-05, |
|
"loss": 0.2895, |
|
"step": 125500 |
|
}, |
|
{ |
|
"epoch": 0.8064, |
|
"grad_norm": 0.6945148706436157, |
|
"learning_rate": 2.9847040000000003e-05, |
|
"loss": 0.2878, |
|
"step": 126000 |
|
}, |
|
{ |
|
"epoch": 0.8096, |
|
"grad_norm": 0.6546908617019653, |
|
"learning_rate": 2.9767200000000002e-05, |
|
"loss": 0.287, |
|
"step": 126500 |
|
}, |
|
{ |
|
"epoch": 0.8128, |
|
"grad_norm": 0.535040020942688, |
|
"learning_rate": 2.9687360000000004e-05, |
|
"loss": 0.2901, |
|
"step": 127000 |
|
}, |
|
{ |
|
"epoch": 0.816, |
|
"grad_norm": 0.6062806844711304, |
|
"learning_rate": 2.960736e-05, |
|
"loss": 0.2862, |
|
"step": 127500 |
|
}, |
|
{ |
|
"epoch": 0.8192, |
|
"grad_norm": 0.6202298998832703, |
|
"learning_rate": 2.9527360000000005e-05, |
|
"loss": 0.2884, |
|
"step": 128000 |
|
}, |
|
{ |
|
"epoch": 0.8224, |
|
"grad_norm": 0.5966545343399048, |
|
"learning_rate": 2.9447360000000003e-05, |
|
"loss": 0.2877, |
|
"step": 128500 |
|
}, |
|
{ |
|
"epoch": 0.8256, |
|
"grad_norm": 0.5024796724319458, |
|
"learning_rate": 2.936736e-05, |
|
"loss": 0.2882, |
|
"step": 129000 |
|
}, |
|
{ |
|
"epoch": 0.8288, |
|
"grad_norm": 0.5895559191703796, |
|
"learning_rate": 2.9287520000000002e-05, |
|
"loss": 0.288, |
|
"step": 129500 |
|
}, |
|
{ |
|
"epoch": 0.832, |
|
"grad_norm": 0.9302066564559937, |
|
"learning_rate": 2.920752e-05, |
|
"loss": 0.286, |
|
"step": 130000 |
|
}, |
|
{ |
|
"epoch": 0.8352, |
|
"grad_norm": 0.573466956615448, |
|
"learning_rate": 2.9127520000000003e-05, |
|
"loss": 0.2848, |
|
"step": 130500 |
|
}, |
|
{ |
|
"epoch": 0.8384, |
|
"grad_norm": 0.5901783108711243, |
|
"learning_rate": 2.904768e-05, |
|
"loss": 0.2883, |
|
"step": 131000 |
|
}, |
|
{ |
|
"epoch": 0.8416, |
|
"grad_norm": 0.7780030369758606, |
|
"learning_rate": 2.8967680000000002e-05, |
|
"loss": 0.2914, |
|
"step": 131500 |
|
}, |
|
{ |
|
"epoch": 0.8448, |
|
"grad_norm": 0.6630533933639526, |
|
"learning_rate": 2.888768e-05, |
|
"loss": 0.2878, |
|
"step": 132000 |
|
}, |
|
{ |
|
"epoch": 0.848, |
|
"grad_norm": 0.6001667976379395, |
|
"learning_rate": 2.8807680000000004e-05, |
|
"loss": 0.2818, |
|
"step": 132500 |
|
}, |
|
{ |
|
"epoch": 0.8512, |
|
"grad_norm": 0.6324682831764221, |
|
"learning_rate": 2.872768e-05, |
|
"loss": 0.2849, |
|
"step": 133000 |
|
}, |
|
{ |
|
"epoch": 0.8544, |
|
"grad_norm": 0.6814092993736267, |
|
"learning_rate": 2.864768e-05, |
|
"loss": 0.288, |
|
"step": 133500 |
|
}, |
|
{ |
|
"epoch": 0.8576, |
|
"grad_norm": 0.651709794998169, |
|
"learning_rate": 2.8567680000000003e-05, |
|
"loss": 0.2872, |
|
"step": 134000 |
|
}, |
|
{ |
|
"epoch": 0.8608, |
|
"grad_norm": 0.5912330746650696, |
|
"learning_rate": 2.848768e-05, |
|
"loss": 0.2824, |
|
"step": 134500 |
|
}, |
|
{ |
|
"epoch": 0.864, |
|
"grad_norm": 0.5821974277496338, |
|
"learning_rate": 2.8407680000000004e-05, |
|
"loss": 0.2853, |
|
"step": 135000 |
|
}, |
|
{ |
|
"epoch": 0.8672, |
|
"grad_norm": 0.6262611150741577, |
|
"learning_rate": 2.832784e-05, |
|
"loss": 0.2848, |
|
"step": 135500 |
|
}, |
|
{ |
|
"epoch": 0.8704, |
|
"grad_norm": 0.5360976457595825, |
|
"learning_rate": 2.8247840000000004e-05, |
|
"loss": 0.2869, |
|
"step": 136000 |
|
}, |
|
{ |
|
"epoch": 0.8736, |
|
"grad_norm": 0.6523284912109375, |
|
"learning_rate": 2.816784e-05, |
|
"loss": 0.2792, |
|
"step": 136500 |
|
}, |
|
{ |
|
"epoch": 0.8768, |
|
"grad_norm": 0.6329330205917358, |
|
"learning_rate": 2.808784e-05, |
|
"loss": 0.2865, |
|
"step": 137000 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.6053124666213989, |
|
"learning_rate": 2.8007840000000003e-05, |
|
"loss": 0.2844, |
|
"step": 137500 |
|
}, |
|
{ |
|
"epoch": 0.8832, |
|
"grad_norm": 0.6887571811676025, |
|
"learning_rate": 2.7927999999999998e-05, |
|
"loss": 0.288, |
|
"step": 138000 |
|
}, |
|
{ |
|
"epoch": 0.8864, |
|
"grad_norm": 0.7047476172447205, |
|
"learning_rate": 2.7848000000000002e-05, |
|
"loss": 0.2877, |
|
"step": 138500 |
|
}, |
|
{ |
|
"epoch": 0.8896, |
|
"grad_norm": 0.598227858543396, |
|
"learning_rate": 2.7768e-05, |
|
"loss": 0.2867, |
|
"step": 139000 |
|
}, |
|
{ |
|
"epoch": 0.8928, |
|
"grad_norm": 0.5094701051712036, |
|
"learning_rate": 2.7688000000000003e-05, |
|
"loss": 0.2832, |
|
"step": 139500 |
|
}, |
|
{ |
|
"epoch": 0.896, |
|
"grad_norm": 0.5749739408493042, |
|
"learning_rate": 2.7608e-05, |
|
"loss": 0.2821, |
|
"step": 140000 |
|
}, |
|
{ |
|
"epoch": 0.8992, |
|
"grad_norm": 0.4442578852176666, |
|
"learning_rate": 2.7528320000000003e-05, |
|
"loss": 0.282, |
|
"step": 140500 |
|
}, |
|
{ |
|
"epoch": 0.9024, |
|
"grad_norm": 0.5418574213981628, |
|
"learning_rate": 2.744832e-05, |
|
"loss": 0.2816, |
|
"step": 141000 |
|
}, |
|
{ |
|
"epoch": 0.9056, |
|
"grad_norm": 0.5984327793121338, |
|
"learning_rate": 2.736832e-05, |
|
"loss": 0.285, |
|
"step": 141500 |
|
}, |
|
{ |
|
"epoch": 0.9088, |
|
"grad_norm": 0.6572843194007874, |
|
"learning_rate": 2.728832e-05, |
|
"loss": 0.2817, |
|
"step": 142000 |
|
}, |
|
{ |
|
"epoch": 0.912, |
|
"grad_norm": 0.590993344783783, |
|
"learning_rate": 2.7208320000000003e-05, |
|
"loss": 0.288, |
|
"step": 142500 |
|
}, |
|
{ |
|
"epoch": 0.9152, |
|
"grad_norm": 0.6096624135971069, |
|
"learning_rate": 2.712832e-05, |
|
"loss": 0.2861, |
|
"step": 143000 |
|
}, |
|
{ |
|
"epoch": 0.9184, |
|
"grad_norm": 0.5189167261123657, |
|
"learning_rate": 2.7048319999999998e-05, |
|
"loss": 0.2857, |
|
"step": 143500 |
|
}, |
|
{ |
|
"epoch": 0.9216, |
|
"grad_norm": 0.5812899470329285, |
|
"learning_rate": 2.6968320000000002e-05, |
|
"loss": 0.2888, |
|
"step": 144000 |
|
}, |
|
{ |
|
"epoch": 0.9248, |
|
"grad_norm": 0.515201210975647, |
|
"learning_rate": 2.688832e-05, |
|
"loss": 0.2791, |
|
"step": 144500 |
|
}, |
|
{ |
|
"epoch": 0.928, |
|
"grad_norm": 0.6398504972457886, |
|
"learning_rate": 2.6808320000000004e-05, |
|
"loss": 0.282, |
|
"step": 145000 |
|
}, |
|
{ |
|
"epoch": 0.9312, |
|
"grad_norm": 0.5990891456604004, |
|
"learning_rate": 2.672832e-05, |
|
"loss": 0.28, |
|
"step": 145500 |
|
}, |
|
{ |
|
"epoch": 0.9344, |
|
"grad_norm": 0.5883029699325562, |
|
"learning_rate": 2.664832e-05, |
|
"loss": 0.2777, |
|
"step": 146000 |
|
}, |
|
{ |
|
"epoch": 0.9376, |
|
"grad_norm": 0.6432376503944397, |
|
"learning_rate": 2.656848e-05, |
|
"loss": 0.2804, |
|
"step": 146500 |
|
}, |
|
{ |
|
"epoch": 0.9408, |
|
"grad_norm": 0.5375948548316956, |
|
"learning_rate": 2.6488479999999997e-05, |
|
"loss": 0.2807, |
|
"step": 147000 |
|
}, |
|
{ |
|
"epoch": 0.944, |
|
"grad_norm": 0.6207411885261536, |
|
"learning_rate": 2.6408640000000003e-05, |
|
"loss": 0.283, |
|
"step": 147500 |
|
}, |
|
{ |
|
"epoch": 0.9472, |
|
"grad_norm": 0.5854378342628479, |
|
"learning_rate": 2.632864e-05, |
|
"loss": 0.2854, |
|
"step": 148000 |
|
}, |
|
{ |
|
"epoch": 0.9504, |
|
"grad_norm": 0.5260078310966492, |
|
"learning_rate": 2.6248800000000002e-05, |
|
"loss": 0.2836, |
|
"step": 148500 |
|
}, |
|
{ |
|
"epoch": 0.9536, |
|
"grad_norm": 0.6284717917442322, |
|
"learning_rate": 2.61688e-05, |
|
"loss": 0.2824, |
|
"step": 149000 |
|
}, |
|
{ |
|
"epoch": 0.9568, |
|
"grad_norm": 0.6092182397842407, |
|
"learning_rate": 2.608896e-05, |
|
"loss": 0.2804, |
|
"step": 149500 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.6028911471366882, |
|
"learning_rate": 2.600896e-05, |
|
"loss": 0.281, |
|
"step": 150000 |
|
}, |
|
{ |
|
"epoch": 0.9632, |
|
"grad_norm": 0.5008478164672852, |
|
"learning_rate": 2.5928960000000003e-05, |
|
"loss": 0.277, |
|
"step": 150500 |
|
}, |
|
{ |
|
"epoch": 0.9664, |
|
"grad_norm": 0.5233867168426514, |
|
"learning_rate": 2.584896e-05, |
|
"loss": 0.2807, |
|
"step": 151000 |
|
}, |
|
{ |
|
"epoch": 0.9696, |
|
"grad_norm": 0.5762408375740051, |
|
"learning_rate": 2.5768960000000004e-05, |
|
"loss": 0.2831, |
|
"step": 151500 |
|
}, |
|
{ |
|
"epoch": 0.9728, |
|
"grad_norm": 0.6097844243049622, |
|
"learning_rate": 2.568896e-05, |
|
"loss": 0.2803, |
|
"step": 152000 |
|
}, |
|
{ |
|
"epoch": 0.976, |
|
"grad_norm": 0.6696804761886597, |
|
"learning_rate": 2.560896e-05, |
|
"loss": 0.2742, |
|
"step": 152500 |
|
}, |
|
{ |
|
"epoch": 0.9792, |
|
"grad_norm": 0.6028556823730469, |
|
"learning_rate": 2.5528960000000003e-05, |
|
"loss": 0.282, |
|
"step": 153000 |
|
}, |
|
{ |
|
"epoch": 0.9824, |
|
"grad_norm": 0.6651898622512817, |
|
"learning_rate": 2.544896e-05, |
|
"loss": 0.2849, |
|
"step": 153500 |
|
}, |
|
{ |
|
"epoch": 0.9856, |
|
"grad_norm": 0.5219380855560303, |
|
"learning_rate": 2.536896e-05, |
|
"loss": 0.2785, |
|
"step": 154000 |
|
}, |
|
{ |
|
"epoch": 0.9888, |
|
"grad_norm": 0.6161176562309265, |
|
"learning_rate": 2.5288960000000002e-05, |
|
"loss": 0.2808, |
|
"step": 154500 |
|
}, |
|
{ |
|
"epoch": 0.992, |
|
"grad_norm": 0.7915316224098206, |
|
"learning_rate": 2.5208960000000003e-05, |
|
"loss": 0.2777, |
|
"step": 155000 |
|
}, |
|
{ |
|
"epoch": 0.9952, |
|
"grad_norm": 0.7261882424354553, |
|
"learning_rate": 2.512896e-05, |
|
"loss": 0.2767, |
|
"step": 155500 |
|
}, |
|
{ |
|
"epoch": 0.9984, |
|
"grad_norm": 0.5452406406402588, |
|
"learning_rate": 2.5048959999999997e-05, |
|
"loss": 0.2764, |
|
"step": 156000 |
|
}, |
|
{ |
|
"epoch": 1.0016, |
|
"grad_norm": 0.642181396484375, |
|
"learning_rate": 2.4969120000000003e-05, |
|
"loss": 0.2746, |
|
"step": 156500 |
|
}, |
|
{ |
|
"epoch": 1.0048, |
|
"grad_norm": 0.5900291204452515, |
|
"learning_rate": 2.4889120000000003e-05, |
|
"loss": 0.2721, |
|
"step": 157000 |
|
}, |
|
{ |
|
"epoch": 1.008, |
|
"grad_norm": 0.5960043668746948, |
|
"learning_rate": 2.480912e-05, |
|
"loss": 0.265, |
|
"step": 157500 |
|
}, |
|
{ |
|
"epoch": 1.0112, |
|
"grad_norm": 0.582115650177002, |
|
"learning_rate": 2.472912e-05, |
|
"loss": 0.2673, |
|
"step": 158000 |
|
}, |
|
{ |
|
"epoch": 1.0144, |
|
"grad_norm": 0.552392303943634, |
|
"learning_rate": 2.464912e-05, |
|
"loss": 0.2663, |
|
"step": 158500 |
|
}, |
|
{ |
|
"epoch": 1.0176, |
|
"grad_norm": 0.5585765242576599, |
|
"learning_rate": 2.456912e-05, |
|
"loss": 0.2688, |
|
"step": 159000 |
|
}, |
|
{ |
|
"epoch": 1.0208, |
|
"grad_norm": 0.6049332022666931, |
|
"learning_rate": 2.448912e-05, |
|
"loss": 0.266, |
|
"step": 159500 |
|
}, |
|
{ |
|
"epoch": 1.024, |
|
"grad_norm": 0.5749877095222473, |
|
"learning_rate": 2.440912e-05, |
|
"loss": 0.2689, |
|
"step": 160000 |
|
}, |
|
{ |
|
"epoch": 1.0272, |
|
"grad_norm": 0.5832675695419312, |
|
"learning_rate": 2.4329120000000002e-05, |
|
"loss": 0.2703, |
|
"step": 160500 |
|
}, |
|
{ |
|
"epoch": 1.0304, |
|
"grad_norm": 0.8549031019210815, |
|
"learning_rate": 2.424928e-05, |
|
"loss": 0.2623, |
|
"step": 161000 |
|
}, |
|
{ |
|
"epoch": 1.0336, |
|
"grad_norm": 0.5572855472564697, |
|
"learning_rate": 2.416928e-05, |
|
"loss": 0.2711, |
|
"step": 161500 |
|
}, |
|
{ |
|
"epoch": 1.0368, |
|
"grad_norm": 0.6818140745162964, |
|
"learning_rate": 2.408928e-05, |
|
"loss": 0.2652, |
|
"step": 162000 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 0.6900683045387268, |
|
"learning_rate": 2.400928e-05, |
|
"loss": 0.2669, |
|
"step": 162500 |
|
}, |
|
{ |
|
"epoch": 1.0432, |
|
"grad_norm": 0.6015618443489075, |
|
"learning_rate": 2.392944e-05, |
|
"loss": 0.2654, |
|
"step": 163000 |
|
}, |
|
{ |
|
"epoch": 1.0464, |
|
"grad_norm": 0.5343177318572998, |
|
"learning_rate": 2.3849440000000002e-05, |
|
"loss": 0.2656, |
|
"step": 163500 |
|
}, |
|
{ |
|
"epoch": 1.0496, |
|
"grad_norm": 0.6130079627037048, |
|
"learning_rate": 2.3769440000000003e-05, |
|
"loss": 0.2592, |
|
"step": 164000 |
|
}, |
|
{ |
|
"epoch": 1.0528, |
|
"grad_norm": 0.7150599956512451, |
|
"learning_rate": 2.368944e-05, |
|
"loss": 0.2634, |
|
"step": 164500 |
|
}, |
|
{ |
|
"epoch": 1.056, |
|
"grad_norm": 0.6321354508399963, |
|
"learning_rate": 2.360944e-05, |
|
"loss": 0.2683, |
|
"step": 165000 |
|
}, |
|
{ |
|
"epoch": 1.0592, |
|
"grad_norm": 0.6234462857246399, |
|
"learning_rate": 2.352976e-05, |
|
"loss": 0.2628, |
|
"step": 165500 |
|
}, |
|
{ |
|
"epoch": 1.0624, |
|
"grad_norm": 0.6542537808418274, |
|
"learning_rate": 2.344976e-05, |
|
"loss": 0.2618, |
|
"step": 166000 |
|
}, |
|
{ |
|
"epoch": 1.0656, |
|
"grad_norm": 0.6302633881568909, |
|
"learning_rate": 2.3369760000000002e-05, |
|
"loss": 0.2661, |
|
"step": 166500 |
|
}, |
|
{ |
|
"epoch": 1.0688, |
|
"grad_norm": 0.5890353322029114, |
|
"learning_rate": 2.3289760000000002e-05, |
|
"loss": 0.2646, |
|
"step": 167000 |
|
}, |
|
{ |
|
"epoch": 1.072, |
|
"grad_norm": 0.6490179300308228, |
|
"learning_rate": 2.320976e-05, |
|
"loss": 0.2635, |
|
"step": 167500 |
|
}, |
|
{ |
|
"epoch": 1.0752, |
|
"grad_norm": 0.648162305355072, |
|
"learning_rate": 2.312976e-05, |
|
"loss": 0.2646, |
|
"step": 168000 |
|
}, |
|
{ |
|
"epoch": 1.0784, |
|
"grad_norm": 0.675680935382843, |
|
"learning_rate": 2.304976e-05, |
|
"loss": 0.2626, |
|
"step": 168500 |
|
}, |
|
{ |
|
"epoch": 1.0816, |
|
"grad_norm": 0.6192341446876526, |
|
"learning_rate": 2.2969760000000002e-05, |
|
"loss": 0.2641, |
|
"step": 169000 |
|
}, |
|
{ |
|
"epoch": 1.0848, |
|
"grad_norm": 0.7046379446983337, |
|
"learning_rate": 2.288992e-05, |
|
"loss": 0.2643, |
|
"step": 169500 |
|
}, |
|
{ |
|
"epoch": 1.088, |
|
"grad_norm": 0.5477197170257568, |
|
"learning_rate": 2.280992e-05, |
|
"loss": 0.265, |
|
"step": 170000 |
|
}, |
|
{ |
|
"epoch": 1.0912, |
|
"grad_norm": 0.5775583982467651, |
|
"learning_rate": 2.2729920000000002e-05, |
|
"loss": 0.2645, |
|
"step": 170500 |
|
}, |
|
{ |
|
"epoch": 1.0944, |
|
"grad_norm": 0.6389047503471375, |
|
"learning_rate": 2.2649920000000003e-05, |
|
"loss": 0.2634, |
|
"step": 171000 |
|
}, |
|
{ |
|
"epoch": 1.0976, |
|
"grad_norm": 0.6169374585151672, |
|
"learning_rate": 2.256992e-05, |
|
"loss": 0.2642, |
|
"step": 171500 |
|
}, |
|
{ |
|
"epoch": 1.1008, |
|
"grad_norm": 0.5913782715797424, |
|
"learning_rate": 2.2490080000000002e-05, |
|
"loss": 0.2658, |
|
"step": 172000 |
|
}, |
|
{ |
|
"epoch": 1.104, |
|
"grad_norm": 0.7547928690910339, |
|
"learning_rate": 2.241008e-05, |
|
"loss": 0.2674, |
|
"step": 172500 |
|
}, |
|
{ |
|
"epoch": 1.1072, |
|
"grad_norm": 0.6277585625648499, |
|
"learning_rate": 2.233024e-05, |
|
"loss": 0.2686, |
|
"step": 173000 |
|
}, |
|
{ |
|
"epoch": 1.1104, |
|
"grad_norm": 0.6357282996177673, |
|
"learning_rate": 2.225024e-05, |
|
"loss": 0.2639, |
|
"step": 173500 |
|
}, |
|
{ |
|
"epoch": 1.1136, |
|
"grad_norm": 0.5262208580970764, |
|
"learning_rate": 2.2170400000000004e-05, |
|
"loss": 0.2641, |
|
"step": 174000 |
|
}, |
|
{ |
|
"epoch": 1.1168, |
|
"grad_norm": 0.6878075003623962, |
|
"learning_rate": 2.20904e-05, |
|
"loss": 0.2654, |
|
"step": 174500 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 0.5332186222076416, |
|
"learning_rate": 2.2010400000000002e-05, |
|
"loss": 0.2638, |
|
"step": 175000 |
|
}, |
|
{ |
|
"epoch": 1.1232, |
|
"grad_norm": 0.5562476515769958, |
|
"learning_rate": 2.19304e-05, |
|
"loss": 0.2648, |
|
"step": 175500 |
|
}, |
|
{ |
|
"epoch": 1.1264, |
|
"grad_norm": 0.5924221277236938, |
|
"learning_rate": 2.18504e-05, |
|
"loss": 0.2627, |
|
"step": 176000 |
|
}, |
|
{ |
|
"epoch": 1.1296, |
|
"grad_norm": 0.5250386595726013, |
|
"learning_rate": 2.17704e-05, |
|
"loss": 0.2619, |
|
"step": 176500 |
|
}, |
|
{ |
|
"epoch": 1.1328, |
|
"grad_norm": 0.7426069378852844, |
|
"learning_rate": 2.16904e-05, |
|
"loss": 0.2628, |
|
"step": 177000 |
|
}, |
|
{ |
|
"epoch": 1.1360000000000001, |
|
"grad_norm": 0.4925951063632965, |
|
"learning_rate": 2.16104e-05, |
|
"loss": 0.2661, |
|
"step": 177500 |
|
}, |
|
{ |
|
"epoch": 1.1392, |
|
"grad_norm": 0.5707270503044128, |
|
"learning_rate": 2.15304e-05, |
|
"loss": 0.2622, |
|
"step": 178000 |
|
}, |
|
{ |
|
"epoch": 1.1424, |
|
"grad_norm": 0.5793021321296692, |
|
"learning_rate": 2.14504e-05, |
|
"loss": 0.2671, |
|
"step": 178500 |
|
}, |
|
{ |
|
"epoch": 1.1456, |
|
"grad_norm": 0.5736916661262512, |
|
"learning_rate": 2.13704e-05, |
|
"loss": 0.2648, |
|
"step": 179000 |
|
}, |
|
{ |
|
"epoch": 1.1488, |
|
"grad_norm": 0.588550329208374, |
|
"learning_rate": 2.129056e-05, |
|
"loss": 0.2641, |
|
"step": 179500 |
|
}, |
|
{ |
|
"epoch": 1.152, |
|
"grad_norm": 0.5504462122917175, |
|
"learning_rate": 2.121056e-05, |
|
"loss": 0.2643, |
|
"step": 180000 |
|
}, |
|
{ |
|
"epoch": 1.1552, |
|
"grad_norm": 0.5439949035644531, |
|
"learning_rate": 2.113056e-05, |
|
"loss": 0.2639, |
|
"step": 180500 |
|
}, |
|
{ |
|
"epoch": 1.1584, |
|
"grad_norm": 0.6882042288780212, |
|
"learning_rate": 2.105056e-05, |
|
"loss": 0.2595, |
|
"step": 181000 |
|
}, |
|
{ |
|
"epoch": 1.1616, |
|
"grad_norm": 0.6735561490058899, |
|
"learning_rate": 2.097056e-05, |
|
"loss": 0.2624, |
|
"step": 181500 |
|
}, |
|
{ |
|
"epoch": 1.1648, |
|
"grad_norm": 0.5545785427093506, |
|
"learning_rate": 2.089056e-05, |
|
"loss": 0.2625, |
|
"step": 182000 |
|
}, |
|
{ |
|
"epoch": 1.168, |
|
"grad_norm": 0.6497994065284729, |
|
"learning_rate": 2.081056e-05, |
|
"loss": 0.2611, |
|
"step": 182500 |
|
}, |
|
{ |
|
"epoch": 1.1712, |
|
"grad_norm": 0.5887815356254578, |
|
"learning_rate": 2.073056e-05, |
|
"loss": 0.2632, |
|
"step": 183000 |
|
}, |
|
{ |
|
"epoch": 1.1743999999999999, |
|
"grad_norm": 0.6037270426750183, |
|
"learning_rate": 2.0650560000000002e-05, |
|
"loss": 0.2645, |
|
"step": 183500 |
|
}, |
|
{ |
|
"epoch": 1.1776, |
|
"grad_norm": 0.636946439743042, |
|
"learning_rate": 2.057072e-05, |
|
"loss": 0.2628, |
|
"step": 184000 |
|
}, |
|
{ |
|
"epoch": 1.1808, |
|
"grad_norm": 0.5285276770591736, |
|
"learning_rate": 2.049072e-05, |
|
"loss": 0.2629, |
|
"step": 184500 |
|
}, |
|
{ |
|
"epoch": 1.184, |
|
"grad_norm": 0.4634397625923157, |
|
"learning_rate": 2.041072e-05, |
|
"loss": 0.2615, |
|
"step": 185000 |
|
}, |
|
{ |
|
"epoch": 1.1872, |
|
"grad_norm": 0.5693604946136475, |
|
"learning_rate": 2.033072e-05, |
|
"loss": 0.2619, |
|
"step": 185500 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 312500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.0369350959104e+17, |
|
"train_batch_size": 64, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|