|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.4832, |
|
"eval_steps": 500, |
|
"global_step": 75500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0032, |
|
"grad_norm": 1.7230606079101562, |
|
"learning_rate": 4.99208e-05, |
|
"loss": 1.2281, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.0064, |
|
"grad_norm": 3.655383348464966, |
|
"learning_rate": 4.9840800000000006e-05, |
|
"loss": 0.7566, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.0096, |
|
"grad_norm": 1.2925927639007568, |
|
"learning_rate": 4.97608e-05, |
|
"loss": 0.6764, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.0128, |
|
"grad_norm": 1.286004900932312, |
|
"learning_rate": 4.968080000000001e-05, |
|
"loss": 0.6304, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.016, |
|
"grad_norm": 1.2140214443206787, |
|
"learning_rate": 4.96008e-05, |
|
"loss": 0.5981, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.0192, |
|
"grad_norm": 1.2525482177734375, |
|
"learning_rate": 4.95208e-05, |
|
"loss": 0.5767, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.0224, |
|
"grad_norm": 1.2310410737991333, |
|
"learning_rate": 4.94408e-05, |
|
"loss": 0.5597, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.0256, |
|
"grad_norm": 1.1735206842422485, |
|
"learning_rate": 4.9360800000000004e-05, |
|
"loss": 0.5418, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.0288, |
|
"grad_norm": 1.114688754081726, |
|
"learning_rate": 4.9280800000000004e-05, |
|
"loss": 0.5335, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.032, |
|
"grad_norm": 0.8874593377113342, |
|
"learning_rate": 4.9200800000000005e-05, |
|
"loss": 0.5237, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.0352, |
|
"grad_norm": 1.1261299848556519, |
|
"learning_rate": 4.91208e-05, |
|
"loss": 0.5135, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.0384, |
|
"grad_norm": 0.9994556307792664, |
|
"learning_rate": 4.9040800000000007e-05, |
|
"loss": 0.5059, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.0416, |
|
"grad_norm": 1.2349673509597778, |
|
"learning_rate": 4.89608e-05, |
|
"loss": 0.4939, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.0448, |
|
"grad_norm": 0.9770995378494263, |
|
"learning_rate": 4.88808e-05, |
|
"loss": 0.4824, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.048, |
|
"grad_norm": 0.981966495513916, |
|
"learning_rate": 4.88008e-05, |
|
"loss": 0.4875, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.0512, |
|
"grad_norm": 1.0177415609359741, |
|
"learning_rate": 4.87208e-05, |
|
"loss": 0.4785, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.0544, |
|
"grad_norm": 1.0521667003631592, |
|
"learning_rate": 4.8640800000000004e-05, |
|
"loss": 0.4731, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.0576, |
|
"grad_norm": 0.8560615181922913, |
|
"learning_rate": 4.85608e-05, |
|
"loss": 0.4633, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.0608, |
|
"grad_norm": 1.0170217752456665, |
|
"learning_rate": 4.8480800000000005e-05, |
|
"loss": 0.4576, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.064, |
|
"grad_norm": 0.9891325831413269, |
|
"learning_rate": 4.84008e-05, |
|
"loss": 0.4556, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.0672, |
|
"grad_norm": 1.0609711408615112, |
|
"learning_rate": 4.832080000000001e-05, |
|
"loss": 0.4493, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.0704, |
|
"grad_norm": 0.8623799681663513, |
|
"learning_rate": 4.82408e-05, |
|
"loss": 0.4459, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.0736, |
|
"grad_norm": 0.9587870240211487, |
|
"learning_rate": 4.81608e-05, |
|
"loss": 0.4418, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.0768, |
|
"grad_norm": 0.8939447999000549, |
|
"learning_rate": 4.80808e-05, |
|
"loss": 0.4327, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.9886033535003662, |
|
"learning_rate": 4.80008e-05, |
|
"loss": 0.438, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.0832, |
|
"grad_norm": 0.9157513976097107, |
|
"learning_rate": 4.7920800000000004e-05, |
|
"loss": 0.4323, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.0864, |
|
"grad_norm": 0.9085854887962341, |
|
"learning_rate": 4.7840800000000005e-05, |
|
"loss": 0.4303, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.0896, |
|
"grad_norm": 0.9123984575271606, |
|
"learning_rate": 4.77608e-05, |
|
"loss": 0.4247, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.0928, |
|
"grad_norm": 0.839026689529419, |
|
"learning_rate": 4.7680960000000004e-05, |
|
"loss": 0.4233, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.096, |
|
"grad_norm": 0.8110847473144531, |
|
"learning_rate": 4.760096e-05, |
|
"loss": 0.4207, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.0992, |
|
"grad_norm": 0.8462579250335693, |
|
"learning_rate": 4.7520960000000005e-05, |
|
"loss": 0.421, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.1024, |
|
"grad_norm": 0.8980106711387634, |
|
"learning_rate": 4.744096e-05, |
|
"loss": 0.417, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.1056, |
|
"grad_norm": 0.8297702074050903, |
|
"learning_rate": 4.736096000000001e-05, |
|
"loss": 0.4139, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.1088, |
|
"grad_norm": 0.9856173992156982, |
|
"learning_rate": 4.728096e-05, |
|
"loss": 0.419, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.112, |
|
"grad_norm": 0.934256911277771, |
|
"learning_rate": 4.720096e-05, |
|
"loss": 0.4098, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 0.1152, |
|
"grad_norm": 0.9190649390220642, |
|
"learning_rate": 4.712096e-05, |
|
"loss": 0.412, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.1184, |
|
"grad_norm": 0.9078772664070129, |
|
"learning_rate": 4.704096e-05, |
|
"loss": 0.4043, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 0.1216, |
|
"grad_norm": 1.082939624786377, |
|
"learning_rate": 4.696112e-05, |
|
"loss": 0.4045, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.1248, |
|
"grad_norm": 0.9159390926361084, |
|
"learning_rate": 4.688112e-05, |
|
"loss": 0.4098, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"grad_norm": 0.8420547842979431, |
|
"learning_rate": 4.680128e-05, |
|
"loss": 0.4033, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.1312, |
|
"grad_norm": 0.7658286094665527, |
|
"learning_rate": 4.672128e-05, |
|
"loss": 0.4002, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 0.1344, |
|
"grad_norm": 0.9074057340621948, |
|
"learning_rate": 4.664128e-05, |
|
"loss": 0.3964, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.1376, |
|
"grad_norm": 0.6065025329589844, |
|
"learning_rate": 4.656128e-05, |
|
"loss": 0.3984, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 0.1408, |
|
"grad_norm": 0.7523757219314575, |
|
"learning_rate": 4.6481280000000004e-05, |
|
"loss": 0.3959, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.144, |
|
"grad_norm": 0.807826042175293, |
|
"learning_rate": 4.6401280000000004e-05, |
|
"loss": 0.3921, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 0.1472, |
|
"grad_norm": 0.8530682325363159, |
|
"learning_rate": 4.632128e-05, |
|
"loss": 0.4002, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 0.1504, |
|
"grad_norm": 0.8661518692970276, |
|
"learning_rate": 4.6241280000000006e-05, |
|
"loss": 0.3856, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 0.1536, |
|
"grad_norm": 0.7473235130310059, |
|
"learning_rate": 4.616144e-05, |
|
"loss": 0.3854, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.1568, |
|
"grad_norm": 0.7954819202423096, |
|
"learning_rate": 4.6081440000000005e-05, |
|
"loss": 0.3871, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.8758727312088013, |
|
"learning_rate": 4.600144e-05, |
|
"loss": 0.3842, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 0.1632, |
|
"grad_norm": 0.8430293798446655, |
|
"learning_rate": 4.592144000000001e-05, |
|
"loss": 0.3886, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 0.1664, |
|
"grad_norm": 0.6557173728942871, |
|
"learning_rate": 4.584144e-05, |
|
"loss": 0.3854, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 0.1696, |
|
"grad_norm": 0.7791888117790222, |
|
"learning_rate": 4.576144e-05, |
|
"loss": 0.3796, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 0.1728, |
|
"grad_norm": 0.736084520816803, |
|
"learning_rate": 4.56816e-05, |
|
"loss": 0.3806, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.176, |
|
"grad_norm": 0.7714269161224365, |
|
"learning_rate": 4.56016e-05, |
|
"loss": 0.3781, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 0.1792, |
|
"grad_norm": 0.766144335269928, |
|
"learning_rate": 4.552176e-05, |
|
"loss": 0.3766, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 0.1824, |
|
"grad_norm": 0.7035301923751831, |
|
"learning_rate": 4.544176e-05, |
|
"loss": 0.3737, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 0.1856, |
|
"grad_norm": 0.7573793530464172, |
|
"learning_rate": 4.536176e-05, |
|
"loss": 0.3753, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 0.1888, |
|
"grad_norm": 0.8799508213996887, |
|
"learning_rate": 4.528176e-05, |
|
"loss": 0.373, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"grad_norm": 0.8543264269828796, |
|
"learning_rate": 4.520176e-05, |
|
"loss": 0.3735, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.1952, |
|
"grad_norm": 0.6768947243690491, |
|
"learning_rate": 4.512176e-05, |
|
"loss": 0.3697, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 0.1984, |
|
"grad_norm": 0.8239702582359314, |
|
"learning_rate": 4.504176e-05, |
|
"loss": 0.3675, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 0.2016, |
|
"grad_norm": 0.8310449123382568, |
|
"learning_rate": 4.4961760000000004e-05, |
|
"loss": 0.3695, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 0.2048, |
|
"grad_norm": 0.8459475040435791, |
|
"learning_rate": 4.488176e-05, |
|
"loss": 0.3694, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 0.208, |
|
"grad_norm": 0.7346063852310181, |
|
"learning_rate": 4.4801760000000006e-05, |
|
"loss": 0.3646, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 0.2112, |
|
"grad_norm": 0.6958354115486145, |
|
"learning_rate": 4.472176e-05, |
|
"loss": 0.3704, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 0.2144, |
|
"grad_norm": 0.8244686722755432, |
|
"learning_rate": 4.464176000000001e-05, |
|
"loss": 0.3647, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 0.2176, |
|
"grad_norm": 0.7559502124786377, |
|
"learning_rate": 4.456192e-05, |
|
"loss": 0.3665, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 0.2208, |
|
"grad_norm": 0.9046504497528076, |
|
"learning_rate": 4.4481920000000007e-05, |
|
"loss": 0.3637, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 0.224, |
|
"grad_norm": 0.7771899700164795, |
|
"learning_rate": 4.440192e-05, |
|
"loss": 0.3648, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 0.2272, |
|
"grad_norm": 0.6887528300285339, |
|
"learning_rate": 4.432192e-05, |
|
"loss": 0.3562, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 0.2304, |
|
"grad_norm": 0.7471407055854797, |
|
"learning_rate": 4.424192e-05, |
|
"loss": 0.3639, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 0.2336, |
|
"grad_norm": 0.7198163270950317, |
|
"learning_rate": 4.416192e-05, |
|
"loss": 0.3604, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 0.2368, |
|
"grad_norm": 0.7383478879928589, |
|
"learning_rate": 4.4081920000000004e-05, |
|
"loss": 0.3592, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.8052579760551453, |
|
"learning_rate": 4.4001920000000004e-05, |
|
"loss": 0.3563, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 0.2432, |
|
"grad_norm": 0.7765107154846191, |
|
"learning_rate": 4.392224e-05, |
|
"loss": 0.3548, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 0.2464, |
|
"grad_norm": 0.7250288724899292, |
|
"learning_rate": 4.384224e-05, |
|
"loss": 0.3605, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 0.2496, |
|
"grad_norm": 0.6914694309234619, |
|
"learning_rate": 4.376224e-05, |
|
"loss": 0.3551, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 0.2528, |
|
"grad_norm": 0.6636275053024292, |
|
"learning_rate": 4.368224e-05, |
|
"loss": 0.3587, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 0.256, |
|
"grad_norm": 0.710564911365509, |
|
"learning_rate": 4.360224e-05, |
|
"loss": 0.3537, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 0.2592, |
|
"grad_norm": 0.6195800304412842, |
|
"learning_rate": 4.3522240000000004e-05, |
|
"loss": 0.3537, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 0.2624, |
|
"grad_norm": 0.7131514549255371, |
|
"learning_rate": 4.34424e-05, |
|
"loss": 0.3531, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 0.2656, |
|
"grad_norm": 0.6594410538673401, |
|
"learning_rate": 4.336256e-05, |
|
"loss": 0.3518, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 0.2688, |
|
"grad_norm": 0.7651230096817017, |
|
"learning_rate": 4.328256e-05, |
|
"loss": 0.3516, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 0.272, |
|
"grad_norm": 0.756515622138977, |
|
"learning_rate": 4.320256e-05, |
|
"loss": 0.3461, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 0.2752, |
|
"grad_norm": 0.7201528549194336, |
|
"learning_rate": 4.3122560000000003e-05, |
|
"loss": 0.3497, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 0.2784, |
|
"grad_norm": 0.7436856031417847, |
|
"learning_rate": 4.3042560000000004e-05, |
|
"loss": 0.3505, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 0.2816, |
|
"grad_norm": 0.7914199829101562, |
|
"learning_rate": 4.2962560000000005e-05, |
|
"loss": 0.3439, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 0.2848, |
|
"grad_norm": 0.7488194704055786, |
|
"learning_rate": 4.288256e-05, |
|
"loss": 0.349, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 0.288, |
|
"grad_norm": 0.8654124736785889, |
|
"learning_rate": 4.280256e-05, |
|
"loss": 0.3491, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 0.2912, |
|
"grad_norm": 0.6817401647567749, |
|
"learning_rate": 4.272272e-05, |
|
"loss": 0.3447, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 0.2944, |
|
"grad_norm": 0.6439715623855591, |
|
"learning_rate": 4.2642720000000006e-05, |
|
"loss": 0.3453, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 0.2976, |
|
"grad_norm": 1.3840138912200928, |
|
"learning_rate": 4.256272e-05, |
|
"loss": 0.3445, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 0.3008, |
|
"grad_norm": 0.7245766520500183, |
|
"learning_rate": 4.248272e-05, |
|
"loss": 0.3462, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 0.304, |
|
"grad_norm": 0.6877666711807251, |
|
"learning_rate": 4.240288e-05, |
|
"loss": 0.3465, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 0.3072, |
|
"grad_norm": 0.8494886159896851, |
|
"learning_rate": 4.2322880000000006e-05, |
|
"loss": 0.348, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 0.3104, |
|
"grad_norm": 0.6704971790313721, |
|
"learning_rate": 4.224288e-05, |
|
"loss": 0.3403, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 0.3136, |
|
"grad_norm": 0.6239964962005615, |
|
"learning_rate": 4.216288000000001e-05, |
|
"loss": 0.3382, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 0.3168, |
|
"grad_norm": 0.7317768335342407, |
|
"learning_rate": 4.208288e-05, |
|
"loss": 0.3385, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.7397735118865967, |
|
"learning_rate": 4.200288e-05, |
|
"loss": 0.3405, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 0.3232, |
|
"grad_norm": 1.1299536228179932, |
|
"learning_rate": 4.1922880000000003e-05, |
|
"loss": 0.3431, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 0.3264, |
|
"grad_norm": 0.6406556963920593, |
|
"learning_rate": 4.184304e-05, |
|
"loss": 0.3384, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 0.3296, |
|
"grad_norm": 0.8084424734115601, |
|
"learning_rate": 4.17632e-05, |
|
"loss": 0.3365, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 0.3328, |
|
"grad_norm": 0.7525010704994202, |
|
"learning_rate": 4.16832e-05, |
|
"loss": 0.3399, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 0.336, |
|
"grad_norm": 0.7382110953330994, |
|
"learning_rate": 4.16032e-05, |
|
"loss": 0.335, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 0.3392, |
|
"grad_norm": 0.6454793810844421, |
|
"learning_rate": 4.15232e-05, |
|
"loss": 0.3354, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 0.3424, |
|
"grad_norm": 0.639664351940155, |
|
"learning_rate": 4.14432e-05, |
|
"loss": 0.3371, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 0.3456, |
|
"grad_norm": 0.5574499368667603, |
|
"learning_rate": 4.1363200000000004e-05, |
|
"loss": 0.3341, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 0.3488, |
|
"grad_norm": 0.6772671341896057, |
|
"learning_rate": 4.12832e-05, |
|
"loss": 0.3331, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 0.352, |
|
"grad_norm": 0.6943195462226868, |
|
"learning_rate": 4.120336e-05, |
|
"loss": 0.3365, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 0.3552, |
|
"grad_norm": 0.7460485100746155, |
|
"learning_rate": 4.112336e-05, |
|
"loss": 0.3308, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 0.3584, |
|
"grad_norm": 0.7071924805641174, |
|
"learning_rate": 4.1043360000000005e-05, |
|
"loss": 0.3312, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 0.3616, |
|
"grad_norm": 0.6678891181945801, |
|
"learning_rate": 4.0963519999999996e-05, |
|
"loss": 0.3314, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 0.3648, |
|
"grad_norm": 0.7100914120674133, |
|
"learning_rate": 4.0883520000000004e-05, |
|
"loss": 0.3307, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 0.368, |
|
"grad_norm": 0.6085671782493591, |
|
"learning_rate": 4.080352e-05, |
|
"loss": 0.3282, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 0.3712, |
|
"grad_norm": 0.6634243130683899, |
|
"learning_rate": 4.0723520000000005e-05, |
|
"loss": 0.3321, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 0.3744, |
|
"grad_norm": 0.7203409075737, |
|
"learning_rate": 4.064352e-05, |
|
"loss": 0.3318, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 0.3776, |
|
"grad_norm": 0.7934884428977966, |
|
"learning_rate": 4.056352e-05, |
|
"loss": 0.3239, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 0.3808, |
|
"grad_norm": 0.8591666221618652, |
|
"learning_rate": 4.048352e-05, |
|
"loss": 0.3275, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 0.384, |
|
"grad_norm": 0.6306772232055664, |
|
"learning_rate": 4.040352e-05, |
|
"loss": 0.3308, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 0.3872, |
|
"grad_norm": 0.6059302687644958, |
|
"learning_rate": 4.032352e-05, |
|
"loss": 0.3266, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 0.3904, |
|
"grad_norm": 0.6875105500221252, |
|
"learning_rate": 4.024352e-05, |
|
"loss": 0.3265, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 0.3936, |
|
"grad_norm": 0.6397412419319153, |
|
"learning_rate": 4.0163520000000004e-05, |
|
"loss": 0.3268, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 0.3968, |
|
"grad_norm": 0.7801005840301514, |
|
"learning_rate": 4.0083520000000005e-05, |
|
"loss": 0.3314, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.6966884136199951, |
|
"learning_rate": 4.000352e-05, |
|
"loss": 0.3263, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 0.4032, |
|
"grad_norm": 0.7413304448127747, |
|
"learning_rate": 3.9923520000000006e-05, |
|
"loss": 0.3284, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 0.4064, |
|
"grad_norm": 0.7089780569076538, |
|
"learning_rate": 3.984352e-05, |
|
"loss": 0.3252, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 0.4096, |
|
"grad_norm": 0.6669878959655762, |
|
"learning_rate": 3.976352e-05, |
|
"loss": 0.3239, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 0.4128, |
|
"grad_norm": 0.7352403998374939, |
|
"learning_rate": 3.968368e-05, |
|
"loss": 0.3226, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 0.416, |
|
"grad_norm": 0.6916635036468506, |
|
"learning_rate": 3.9603840000000005e-05, |
|
"loss": 0.3234, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 0.4192, |
|
"grad_norm": 0.6800302863121033, |
|
"learning_rate": 3.952384e-05, |
|
"loss": 0.3224, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 0.4224, |
|
"grad_norm": 0.6685224771499634, |
|
"learning_rate": 3.9443840000000006e-05, |
|
"loss": 0.3197, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 0.4256, |
|
"grad_norm": 0.7219159603118896, |
|
"learning_rate": 3.936384e-05, |
|
"loss": 0.3185, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 0.4288, |
|
"grad_norm": 0.5928858518600464, |
|
"learning_rate": 3.928384e-05, |
|
"loss": 0.3291, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 0.432, |
|
"grad_norm": 0.6616542339324951, |
|
"learning_rate": 3.920384e-05, |
|
"loss": 0.3266, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 0.4352, |
|
"grad_norm": 0.5957266092300415, |
|
"learning_rate": 3.912384e-05, |
|
"loss": 0.32, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 0.4384, |
|
"grad_norm": 0.6576407551765442, |
|
"learning_rate": 3.904384e-05, |
|
"loss": 0.3246, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 0.4416, |
|
"grad_norm": 0.6852056384086609, |
|
"learning_rate": 3.896416e-05, |
|
"loss": 0.3268, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 0.4448, |
|
"grad_norm": 0.780893087387085, |
|
"learning_rate": 3.888416e-05, |
|
"loss": 0.3229, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 0.448, |
|
"grad_norm": 0.6741476655006409, |
|
"learning_rate": 3.880416e-05, |
|
"loss": 0.3188, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 0.4512, |
|
"grad_norm": 0.5919800400733948, |
|
"learning_rate": 3.872416e-05, |
|
"loss": 0.3208, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 0.4544, |
|
"grad_norm": 0.6476633548736572, |
|
"learning_rate": 3.864416e-05, |
|
"loss": 0.322, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 0.4576, |
|
"grad_norm": 0.5667979717254639, |
|
"learning_rate": 3.8564159999999996e-05, |
|
"loss": 0.3151, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 0.4608, |
|
"grad_norm": 0.6126554608345032, |
|
"learning_rate": 3.8484160000000004e-05, |
|
"loss": 0.3185, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 0.464, |
|
"grad_norm": 0.7995546460151672, |
|
"learning_rate": 3.840416e-05, |
|
"loss": 0.3174, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 0.4672, |
|
"grad_norm": 0.5964981317520142, |
|
"learning_rate": 3.8324160000000005e-05, |
|
"loss": 0.3187, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 0.4704, |
|
"grad_norm": 0.7718212008476257, |
|
"learning_rate": 3.824416e-05, |
|
"loss": 0.3156, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 0.4736, |
|
"grad_norm": 0.7086686491966248, |
|
"learning_rate": 3.8164320000000005e-05, |
|
"loss": 0.3189, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 0.4768, |
|
"grad_norm": 0.7988029718399048, |
|
"learning_rate": 3.808432e-05, |
|
"loss": 0.3151, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.6092699766159058, |
|
"learning_rate": 3.8004320000000006e-05, |
|
"loss": 0.3153, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 0.4832, |
|
"grad_norm": 0.6181166768074036, |
|
"learning_rate": 3.792432e-05, |
|
"loss": 0.3113, |
|
"step": 75500 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 312500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.6781056589824e+17, |
|
"train_batch_size": 64, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|