adapters-gemma-bf16-QLORA-super_glue-rte/trainer_state-gemma-bf16-QLORA-super_glue-rte-sequence_classification.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 1,
"global_step": 160,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{ | |
"epoch": 0.0625, | |
"grad_norm": 268.0, | |
"learning_rate": 2.5e-05, | |
"loss": 2.5702, | |
"step": 1 | |
}, | |
{ | |
"epoch": 0.0625, | |
"eval_accuracy": 0.508, | |
"eval_loss": 2.8505053520202637, | |
"eval_runtime": 4.8412, | |
"eval_samples_per_second": 51.64, | |
"eval_steps_per_second": 1.652, | |
"step": 1 | |
}, | |
{ | |
"epoch": 0.125, | |
"grad_norm": 227.0, | |
"learning_rate": 5e-05, | |
"loss": 2.6389, | |
"step": 2 | |
}, | |
{ | |
"epoch": 0.125, | |
"eval_accuracy": 0.528, | |
"eval_loss": 2.2588462829589844, | |
"eval_runtime": 4.8967, | |
"eval_samples_per_second": 51.054, | |
"eval_steps_per_second": 1.634, | |
"step": 2 | |
}, | |
{ | |
"epoch": 0.1875, | |
"grad_norm": 236.0, | |
"learning_rate": 4.968354430379747e-05, | |
"loss": 2.3551, | |
"step": 3 | |
}, | |
{ | |
"epoch": 0.1875, | |
"eval_accuracy": 0.536, | |
"eval_loss": 1.4990876913070679, | |
"eval_runtime": 4.8929, | |
"eval_samples_per_second": 51.095, | |
"eval_steps_per_second": 1.635, | |
"step": 3 | |
}, | |
{ | |
"epoch": 0.25, | |
"grad_norm": 89.0, | |
"learning_rate": 4.936708860759494e-05, | |
"loss": 1.5552, | |
"step": 4 | |
}, | |
{ | |
"epoch": 0.25, | |
"eval_accuracy": 0.544, | |
"eval_loss": 1.8495231866836548, | |
"eval_runtime": 4.8705, | |
"eval_samples_per_second": 51.33, | |
"eval_steps_per_second": 1.643, | |
"step": 4 | |
}, | |
{ | |
"epoch": 0.3125, | |
"grad_norm": 130.0, | |
"learning_rate": 4.905063291139241e-05, | |
"loss": 1.6022, | |
"step": 5 | |
}, | |
{ | |
"epoch": 0.3125, | |
"eval_accuracy": 0.552, | |
"eval_loss": 1.5980929136276245, | |
"eval_runtime": 4.8961, | |
"eval_samples_per_second": 51.061, | |
"eval_steps_per_second": 1.634, | |
"step": 5 | |
}, | |
{ | |
"epoch": 0.375, | |
"grad_norm": 190.0, | |
"learning_rate": 4.8734177215189874e-05, | |
"loss": 1.8478, | |
"step": 6 | |
}, | |
{ | |
"epoch": 0.375, | |
"eval_accuracy": 0.62, | |
"eval_loss": 1.179288625717163, | |
"eval_runtime": 4.8667, | |
"eval_samples_per_second": 51.37, | |
"eval_steps_per_second": 1.644, | |
"step": 6 | |
}, | |
{ | |
"epoch": 0.4375, | |
"grad_norm": 92.5, | |
"learning_rate": 4.8417721518987346e-05, | |
"loss": 1.1437, | |
"step": 7 | |
}, | |
{ | |
"epoch": 0.4375, | |
"eval_accuracy": 0.624, | |
"eval_loss": 1.0324233770370483, | |
"eval_runtime": 4.8984, | |
"eval_samples_per_second": 51.037, | |
"eval_steps_per_second": 1.633, | |
"step": 7 | |
}, | |
{ | |
"epoch": 0.5, | |
"grad_norm": 52.25, | |
"learning_rate": 4.810126582278481e-05, | |
"loss": 0.7386, | |
"step": 8 | |
}, | |
{ | |
"epoch": 0.5, | |
"eval_accuracy": 0.628, | |
"eval_loss": 1.0248664617538452, | |
"eval_runtime": 4.8699, | |
"eval_samples_per_second": 51.335, | |
"eval_steps_per_second": 1.643, | |
"step": 8 | |
}, | |
{ | |
"epoch": 0.5625, | |
"grad_norm": 69.0, | |
"learning_rate": 4.778481012658228e-05, | |
"loss": 1.0636, | |
"step": 9 | |
}, | |
{ | |
"epoch": 0.5625, | |
"eval_accuracy": 0.656, | |
"eval_loss": 0.9664216041564941, | |
"eval_runtime": 4.8671, | |
"eval_samples_per_second": 51.365, | |
"eval_steps_per_second": 1.644, | |
"step": 9 | |
}, | |
{ | |
"epoch": 0.625, | |
"grad_norm": 104.0, | |
"learning_rate": 4.7468354430379746e-05, | |
"loss": 0.9956, | |
"step": 10 | |
}, | |
{ | |
"epoch": 0.625, | |
"eval_accuracy": 0.696, | |
"eval_loss": 0.8491688966751099, | |
"eval_runtime": 4.8707, | |
"eval_samples_per_second": 51.328, | |
"eval_steps_per_second": 1.642, | |
"step": 10 | |
}, | |
{ | |
"epoch": 0.6875, | |
"grad_norm": 51.75, | |
"learning_rate": 4.715189873417722e-05, | |
"loss": 0.7195, | |
"step": 11 | |
}, | |
{ | |
"epoch": 0.6875, | |
"eval_accuracy": 0.724, | |
"eval_loss": 0.7724543809890747, | |
"eval_runtime": 4.8634, | |
"eval_samples_per_second": 51.405, | |
"eval_steps_per_second": 1.645, | |
"step": 11 | |
}, | |
{ | |
"epoch": 0.75, | |
"grad_norm": 24.625, | |
"learning_rate": 4.683544303797468e-05, | |
"loss": 0.7704, | |
"step": 12 | |
}, | |
{ | |
"epoch": 0.75, | |
"eval_accuracy": 0.728, | |
"eval_loss": 0.7519556283950806, | |
"eval_runtime": 4.8881, | |
"eval_samples_per_second": 51.144, | |
"eval_steps_per_second": 1.637, | |
"step": 12 | |
}, | |
{ | |
"epoch": 0.8125, | |
"grad_norm": 18.625, | |
"learning_rate": 4.6518987341772154e-05, | |
"loss": 0.4636, | |
"step": 13 | |
}, | |
{ | |
"epoch": 0.8125, | |
"eval_accuracy": 0.744, | |
"eval_loss": 0.7026970386505127, | |
"eval_runtime": 4.8695, | |
"eval_samples_per_second": 51.34, | |
"eval_steps_per_second": 1.643, | |
"step": 13 | |
}, | |
{ | |
"epoch": 0.875, | |
"grad_norm": 22.25, | |
"learning_rate": 4.6202531645569625e-05, | |
"loss": 0.771, | |
"step": 14 | |
}, | |
{ | |
"epoch": 0.875, | |
"eval_accuracy": 0.788, | |
"eval_loss": 0.6045461893081665, | |
"eval_runtime": 4.8902, | |
"eval_samples_per_second": 51.123, | |
"eval_steps_per_second": 1.636, | |
"step": 14 | |
}, | |
{ | |
"epoch": 0.9375, | |
"grad_norm": 15.5, | |
"learning_rate": 4.588607594936709e-05, | |
"loss": 0.7138, | |
"step": 15 | |
}, | |
{ | |
"epoch": 0.9375, | |
"eval_accuracy": 0.82, | |
"eval_loss": 0.5602908730506897, | |
"eval_runtime": 4.8902, | |
"eval_samples_per_second": 51.123, | |
"eval_steps_per_second": 1.636, | |
"step": 15 | |
}, | |
{ | |
"epoch": 1.0, | |
"grad_norm": 30.875, | |
"learning_rate": 4.556962025316456e-05, | |
"loss": 0.7263, | |
"step": 16 | |
}, | |
{ | |
"epoch": 1.0, | |
"eval_accuracy": 0.804, | |
"eval_loss": 0.5666066408157349, | |
"eval_runtime": 4.8404, | |
"eval_samples_per_second": 51.649, | |
"eval_steps_per_second": 1.653, | |
"step": 16 | |
}, | |
{ | |
"epoch": 1.0625, | |
"grad_norm": 77.5, | |
"learning_rate": 4.525316455696203e-05, | |
"loss": 0.3919, | |
"step": 17 | |
}, | |
{ | |
"epoch": 1.0625, | |
"eval_accuracy": 0.808, | |
"eval_loss": 0.5534331202507019, | |
"eval_runtime": 4.816, | |
"eval_samples_per_second": 51.911, | |
"eval_steps_per_second": 1.661, | |
"step": 17 | |
}, | |
{ | |
"epoch": 1.125, | |
"grad_norm": 44.5, | |
"learning_rate": 4.49367088607595e-05, | |
"loss": 0.2448, | |
"step": 18 | |
}, | |
{ | |
"epoch": 1.125, | |
"eval_accuracy": 0.836, | |
"eval_loss": 0.5073856711387634, | |
"eval_runtime": 4.8889, | |
"eval_samples_per_second": 51.137, | |
"eval_steps_per_second": 1.636, | |
"step": 18 | |
}, | |
{ | |
"epoch": 1.1875, | |
"grad_norm": 31.75, | |
"learning_rate": 4.462025316455696e-05, | |
"loss": 0.2112, | |
"step": 19 | |
}, | |
{ | |
"epoch": 1.1875, | |
"eval_accuracy": 0.852, | |
"eval_loss": 0.4510812759399414, | |
"eval_runtime": 4.8632, | |
"eval_samples_per_second": 51.407, | |
"eval_steps_per_second": 1.645, | |
"step": 19 | |
}, | |
{ | |
"epoch": 1.25, | |
"grad_norm": 33.25, | |
"learning_rate": 4.430379746835443e-05, | |
"loss": 0.335, | |
"step": 20 | |
}, | |
{ | |
"epoch": 1.25, | |
"eval_accuracy": 0.828, | |
"eval_loss": 0.4451099932193756, | |
"eval_runtime": 4.8972, | |
"eval_samples_per_second": 51.05, | |
"eval_steps_per_second": 1.634, | |
"step": 20 | |
}, | |
{ | |
"epoch": 1.3125, | |
"grad_norm": 17.375, | |
"learning_rate": 4.3987341772151904e-05, | |
"loss": 0.196, | |
"step": 21 | |
}, | |
{ | |
"epoch": 1.3125, | |
"eval_accuracy": 0.82, | |
"eval_loss": 0.47517192363739014, | |
"eval_runtime": 4.8852, | |
"eval_samples_per_second": 51.175, | |
"eval_steps_per_second": 1.638, | |
"step": 21 | |
}, | |
{ | |
"epoch": 1.375, | |
"grad_norm": 45.75, | |
"learning_rate": 4.367088607594937e-05, | |
"loss": 0.1765, | |
"step": 22 | |
}, | |
{ | |
"epoch": 1.375, | |
"eval_accuracy": 0.82, | |
"eval_loss": 0.4687165319919586, | |
"eval_runtime": 4.8161, | |
"eval_samples_per_second": 51.909, | |
"eval_steps_per_second": 1.661, | |
"step": 22 | |
}, | |
{ | |
"epoch": 1.4375, | |
"grad_norm": 18.625, | |
"learning_rate": 4.3354430379746834e-05, | |
"loss": 0.2245, | |
"step": 23 | |
}, | |
{ | |
"epoch": 1.4375, | |
"eval_accuracy": 0.824, | |
"eval_loss": 0.4411630928516388, | |
"eval_runtime": 4.8861, | |
"eval_samples_per_second": 51.165, | |
"eval_steps_per_second": 1.637, | |
"step": 23 | |
}, | |
{ | |
"epoch": 1.5, | |
"grad_norm": 10.0, | |
"learning_rate": 4.3037974683544305e-05, | |
"loss": 0.137, | |
"step": 24 | |
}, | |
{ | |
"epoch": 1.5, | |
"eval_accuracy": 0.844, | |
"eval_loss": 0.4266666769981384, | |
"eval_runtime": 4.8898, | |
"eval_samples_per_second": 51.127, | |
"eval_steps_per_second": 1.636, | |
"step": 24 | |
}, | |
{ | |
"epoch": 1.5625, | |
"grad_norm": 13.125, | |
"learning_rate": 4.2721518987341776e-05, | |
"loss": 0.1957, | |
"step": 25 | |
}, | |
{ | |
"epoch": 1.5625, | |
"eval_accuracy": 0.86, | |
"eval_loss": 0.42926496267318726, | |
"eval_runtime": 4.8436, | |
"eval_samples_per_second": 51.614, | |
"eval_steps_per_second": 1.652, | |
"step": 25 | |
}, | |
{ | |
"epoch": 1.625, | |
"grad_norm": 62.0, | |
"learning_rate": 4.240506329113924e-05, | |
"loss": 0.3881, | |
"step": 26 | |
}, | |
{ | |
"epoch": 1.625, | |
"eval_accuracy": 0.864, | |
"eval_loss": 0.43251192569732666, | |
"eval_runtime": 4.8871, | |
"eval_samples_per_second": 51.155, | |
"eval_steps_per_second": 1.637, | |
"step": 26 | |
}, | |
{ | |
"epoch": 1.6875, | |
"grad_norm": 7.9375, | |
"learning_rate": 4.208860759493671e-05, | |
"loss": 0.1298, | |
"step": 27 | |
}, | |
{ | |
"epoch": 1.6875, | |
"eval_accuracy": 0.856, | |
"eval_loss": 0.46340441703796387, | |
"eval_runtime": 4.8777, | |
"eval_samples_per_second": 51.254, | |
"eval_steps_per_second": 1.64, | |
"step": 27 | |
}, | |
{ | |
"epoch": 1.75, | |
"grad_norm": 13.625, | |
"learning_rate": 4.177215189873418e-05, | |
"loss": 0.1124, | |
"step": 28 | |
}, | |
{ | |
"epoch": 1.75, | |
"eval_accuracy": 0.86, | |
"eval_loss": 0.46482232213020325, | |
"eval_runtime": 4.8682, | |
"eval_samples_per_second": 51.353, | |
"eval_steps_per_second": 1.643, | |
"step": 28 | |
}, | |
{ | |
"epoch": 1.8125, | |
"grad_norm": 59.25, | |
"learning_rate": 4.145569620253165e-05, | |
"loss": 0.2744, | |
"step": 29 | |
}, | |
{ | |
"epoch": 1.8125, | |
"eval_accuracy": 0.848, | |
"eval_loss": 0.43966910243034363, | |
"eval_runtime": 4.8941, | |
"eval_samples_per_second": 51.082, | |
"eval_steps_per_second": 1.635, | |
"step": 29 | |
}, | |
{ | |
"epoch": 1.875, | |
"grad_norm": 5.28125, | |
"learning_rate": 4.113924050632912e-05, | |
"loss": 0.0496, | |
"step": 30 | |
}, | |
{ | |
"epoch": 1.875, | |
"eval_accuracy": 0.844, | |
"eval_loss": 0.4173106849193573, | |
"eval_runtime": 4.8998, | |
"eval_samples_per_second": 51.023, | |
"eval_steps_per_second": 1.633, | |
"step": 30 | |
}, | |
{ | |
"epoch": 1.9375, | |
"grad_norm": 29.625, | |
"learning_rate": 4.0822784810126584e-05, | |
"loss": 0.1658, | |
"step": 31 | |
}, | |
{ | |
"epoch": 1.9375, | |
"eval_accuracy": 0.832, | |
"eval_loss": 0.41856226325035095, | |
"eval_runtime": 4.8693, | |
"eval_samples_per_second": 51.343, | |
"eval_steps_per_second": 1.643, | |
"step": 31 | |
}, | |
{ | |
"epoch": 2.0, | |
"grad_norm": 12.625, | |
"learning_rate": 4.050632911392405e-05, | |
"loss": 0.1718, | |
"step": 32 | |
}, | |
{ | |
"epoch": 2.0, | |
"eval_accuracy": 0.836, | |
"eval_loss": 0.4255797863006592, | |
"eval_runtime": 4.8685, | |
"eval_samples_per_second": 51.35, | |
"eval_steps_per_second": 1.643, | |
"step": 32 | |
}, | |
{ | |
"epoch": 2.0625, | |
"grad_norm": 14.125, | |
"learning_rate": 4.018987341772152e-05, | |
"loss": 0.0979, | |
"step": 33 | |
}, | |
{ | |
"epoch": 2.0625, | |
"eval_accuracy": 0.836, | |
"eval_loss": 0.4206787645816803, | |
"eval_runtime": 4.8232, | |
"eval_samples_per_second": 51.833, | |
"eval_steps_per_second": 1.659, | |
"step": 33 | |
}, | |
{ | |
"epoch": 2.125, | |
"grad_norm": 4.65625, | |
"learning_rate": 3.987341772151899e-05, | |
"loss": 0.0284, | |
"step": 34 | |
}, | |
{ | |
"epoch": 2.125, | |
"eval_accuracy": 0.844, | |
"eval_loss": 0.41741058230400085, | |
"eval_runtime": 4.8675, | |
"eval_samples_per_second": 51.361, | |
"eval_steps_per_second": 1.644, | |
"step": 34 | |
}, | |
{ | |
"epoch": 2.1875, | |
"grad_norm": 8.875, | |
"learning_rate": 3.9556962025316456e-05, | |
"loss": 0.0558, | |
"step": 35 | |
}, | |
{ | |
"epoch": 2.1875, | |
"eval_accuracy": 0.844, | |
"eval_loss": 0.4207935929298401, | |
"eval_runtime": 4.7688, | |
"eval_samples_per_second": 52.424, | |
"eval_steps_per_second": 1.678, | |
"step": 35 | |
}, | |
{ | |
"epoch": 2.25, | |
"grad_norm": 4.75, | |
"learning_rate": 3.924050632911392e-05, | |
"loss": 0.1725, | |
"step": 36 | |
}, | |
{ | |
"epoch": 2.25, | |
"eval_accuracy": 0.84, | |
"eval_loss": 0.43104878067970276, | |
"eval_runtime": 4.8634, | |
"eval_samples_per_second": 51.404, | |
"eval_steps_per_second": 1.645, | |
"step": 36 | |
}, | |
{ | |
"epoch": 2.3125, | |
"grad_norm": 11.25, | |
"learning_rate": 3.89240506329114e-05, | |
"loss": 0.0363, | |
"step": 37 | |
}, | |
{ | |
"epoch": 2.3125, | |
"eval_accuracy": 0.852, | |
"eval_loss": 0.4319431185722351, | |
"eval_runtime": 4.8918, | |
"eval_samples_per_second": 51.106, | |
"eval_steps_per_second": 1.635, | |
"step": 37 | |
}, | |
{ | |
"epoch": 2.375, | |
"grad_norm": 10.125, | |
"learning_rate": 3.8607594936708864e-05, | |
"loss": 0.0429, | |
"step": 38 | |
}, | |
{ | |
"epoch": 2.375, | |
"eval_accuracy": 0.856, | |
"eval_loss": 0.4509444534778595, | |
"eval_runtime": 4.8964, | |
"eval_samples_per_second": 51.058, | |
"eval_steps_per_second": 1.634, | |
"step": 38 | |
}, | |
{ | |
"epoch": 2.4375, | |
"grad_norm": 8.3125, | |
"learning_rate": 3.829113924050633e-05, | |
"loss": 0.0468, | |
"step": 39 | |
}, | |
{ | |
"epoch": 2.4375, | |
"eval_accuracy": 0.856, | |
"eval_loss": 0.4592805802822113, | |
"eval_runtime": 4.8908, | |
"eval_samples_per_second": 51.117, | |
"eval_steps_per_second": 1.636, | |
"step": 39 | |
}, | |
{ | |
"epoch": 2.5, | |
"grad_norm": 16.5, | |
"learning_rate": 3.79746835443038e-05, | |
"loss": 0.0525, | |
"step": 40 | |
}, | |
{ | |
"epoch": 2.5, | |
"eval_accuracy": 0.864, | |
"eval_loss": 0.4509994387626648, | |
"eval_runtime": 4.8936, | |
"eval_samples_per_second": 51.088, | |
"eval_steps_per_second": 1.635, | |
"step": 40 | |
}, | |
{ | |
"epoch": 2.5625, | |
"grad_norm": 3.609375, | |
"learning_rate": 3.765822784810127e-05, | |
"loss": 0.0232, | |
"step": 41 | |
}, | |
{ | |
"epoch": 2.5625, | |
"eval_accuracy": 0.852, | |
"eval_loss": 0.45242950320243835, | |
"eval_runtime": 4.8925, | |
"eval_samples_per_second": 51.099, | |
"eval_steps_per_second": 1.635, | |
"step": 41 | |
}, | |
{ | |
"epoch": 2.625, | |
"grad_norm": 1.734375, | |
"learning_rate": 3.7341772151898736e-05, | |
"loss": 0.0137, | |
"step": 42 | |
}, | |
{ | |
"epoch": 2.625, | |
"eval_accuracy": 0.852, | |
"eval_loss": 0.46332496404647827, | |
"eval_runtime": 4.8687, | |
"eval_samples_per_second": 51.349, | |
"eval_steps_per_second": 1.643, | |
"step": 42 | |
}, | |
{ | |
"epoch": 2.6875, | |
"grad_norm": 4.75, | |
"learning_rate": 3.70253164556962e-05, | |
"loss": 0.0202, | |
"step": 43 | |
}, | |
{ | |
"epoch": 2.6875, | |
"eval_accuracy": 0.86, | |
"eval_loss": 0.5035133957862854, | |
"eval_runtime": 4.8899, | |
"eval_samples_per_second": 51.126, | |
"eval_steps_per_second": 1.636, | |
"step": 43 | |
}, | |
{ | |
"epoch": 2.75, | |
"grad_norm": 15.6875, | |
"learning_rate": 3.670886075949367e-05, | |
"loss": 0.0383, | |
"step": 44 | |
}, | |
{ | |
"epoch": 2.75, | |
"eval_accuracy": 0.852, | |
"eval_loss": 0.5293290615081787, | |
"eval_runtime": 4.8882, | |
"eval_samples_per_second": 51.144, | |
"eval_steps_per_second": 1.637, | |
"step": 44 | |
}, | |
{ | |
"epoch": 2.8125, | |
"grad_norm": 24.75, | |
"learning_rate": 3.639240506329114e-05, | |
"loss": 0.073, | |
"step": 45 | |
}, | |
{ | |
"epoch": 2.8125, | |
"eval_accuracy": 0.856, | |
"eval_loss": 0.5254361033439636, | |
"eval_runtime": 4.865, | |
"eval_samples_per_second": 51.387, | |
"eval_steps_per_second": 1.644, | |
"step": 45 | |
}, | |
{ | |
"epoch": 2.875, | |
"grad_norm": 28.25, | |
"learning_rate": 3.607594936708861e-05, | |
"loss": 0.0725, | |
"step": 46 | |
}, | |
{ | |
"epoch": 2.875, | |
"eval_accuracy": 0.864, | |
"eval_loss": 0.5004922151565552, | |
"eval_runtime": 4.895, | |
"eval_samples_per_second": 51.072, | |
"eval_steps_per_second": 1.634, | |
"step": 46 | |
}, | |
{ | |
"epoch": 2.9375, | |
"grad_norm": 16.375, | |
"learning_rate": 3.575949367088608e-05, | |
"loss": 0.0516, | |
"step": 47 | |
}, | |
{ | |
"epoch": 2.9375, | |
"eval_accuracy": 0.864, | |
"eval_loss": 0.463413268327713, | |
"eval_runtime": 4.8708, | |
"eval_samples_per_second": 51.326, | |
"eval_steps_per_second": 1.642, | |
"step": 47 | |
}, | |
{ | |
"epoch": 3.0, | |
"grad_norm": 10.875, | |
"learning_rate": 3.5443037974683544e-05, | |
"loss": 0.0255, | |
"step": 48 | |
}, | |
{ | |
"epoch": 3.0, | |
"eval_accuracy": 0.868, | |
"eval_loss": 0.4455827474594116, | |
"eval_runtime": 4.8928, | |
"eval_samples_per_second": 51.096, | |
"eval_steps_per_second": 1.635, | |
"step": 48 | |
}, | |
{ | |
"epoch": 3.0625, | |
"grad_norm": 7.9375, | |
"learning_rate": 3.5126582278481015e-05, | |
"loss": 0.0191, | |
"step": 49 | |
}, | |
{ | |
"epoch": 3.0625, | |
"eval_accuracy": 0.868, | |
"eval_loss": 0.45068883895874023, | |
"eval_runtime": 4.8384, | |
"eval_samples_per_second": 51.67, | |
"eval_steps_per_second": 1.653, | |
"step": 49 | |
}, | |
{ | |
"epoch": 3.125, | |
"grad_norm": 0.23828125, | |
"learning_rate": 3.4810126582278487e-05, | |
"loss": 0.0021, | |
"step": 50 | |
}, | |
{ | |
"epoch": 3.125, | |
"eval_accuracy": 0.868, | |
"eval_loss": 0.4584948718547821, | |
"eval_runtime": 4.838, | |
"eval_samples_per_second": 51.675, | |
"eval_steps_per_second": 1.654, | |
"step": 50 | |
}, | |
{ | |
"epoch": 3.1875, | |
"grad_norm": 1.4453125, | |
"learning_rate": 3.449367088607595e-05, | |
"loss": 0.0034, | |
"step": 51 | |
}, | |
{ | |
"epoch": 3.1875, | |
"eval_accuracy": 0.864, | |
"eval_loss": 0.4626566767692566, | |
"eval_runtime": 4.8918, | |
"eval_samples_per_second": 51.105, | |
"eval_steps_per_second": 1.635, | |
"step": 51 | |
}, | |
{ | |
"epoch": 3.25, | |
"grad_norm": 1.2890625, | |
"learning_rate": 3.4177215189873416e-05, | |
"loss": 0.0057, | |
"step": 52 | |
}, | |
{ | |
"epoch": 3.25, | |
"eval_accuracy": 0.864, | |
"eval_loss": 0.46148645877838135, | |
"eval_runtime": 4.8885, | |
"eval_samples_per_second": 51.141, | |
"eval_steps_per_second": 1.637, | |
"step": 52 | |
}, | |
{ | |
"epoch": 3.3125, | |
"grad_norm": 2.984375, | |
"learning_rate": 3.386075949367089e-05, | |
"loss": 0.0123, | |
"step": 53 | |
}, | |
{ | |
"epoch": 3.3125, | |
"eval_accuracy": 0.852, | |
"eval_loss": 0.4723862409591675, | |
"eval_runtime": 4.8923, | |
"eval_samples_per_second": 51.101, | |
"eval_steps_per_second": 1.635, | |
"step": 53 | |
}, | |
{ | |
"epoch": 3.375, | |
"grad_norm": 5.75, | |
"learning_rate": 3.354430379746836e-05, | |
"loss": 0.0111, | |
"step": 54 | |
}, | |
{ | |
"epoch": 3.375, | |
"eval_accuracy": 0.856, | |
"eval_loss": 0.463609904050827, | |
"eval_runtime": 4.8347, | |
"eval_samples_per_second": 51.71, | |
"eval_steps_per_second": 1.655, | |
"step": 54 | |
}, | |
{ | |
"epoch": 3.4375, | |
"grad_norm": 4.21875, | |
"learning_rate": 3.322784810126582e-05, | |
"loss": 0.0072, | |
"step": 55 | |
}, | |
{ | |
"epoch": 3.4375, | |
"eval_accuracy": 0.88, | |
"eval_loss": 0.44373881816864014, | |
"eval_runtime": 4.8346, | |
"eval_samples_per_second": 51.71, | |
"eval_steps_per_second": 1.655, | |
"step": 55 | |
}, | |
{ | |
"epoch": 3.5, | |
"grad_norm": 2.6875, | |
"learning_rate": 3.291139240506329e-05, | |
"loss": 0.0044, | |
"step": 56 | |
}, | |
{ | |
"epoch": 3.5, | |
"eval_accuracy": 0.896, | |
"eval_loss": 0.4286611080169678, | |
"eval_runtime": 4.8885, | |
"eval_samples_per_second": 51.14, | |
"eval_steps_per_second": 1.636, | |
"step": 56 | |
}, | |
{ | |
"epoch": 3.5625, | |
"grad_norm": 0.34375, | |
"learning_rate": 3.2594936708860766e-05, | |
"loss": 0.0013, | |
"step": 57 | |
}, | |
{ | |
"epoch": 3.5625, | |
"eval_accuracy": 0.892, | |
"eval_loss": 0.4324968755245209, | |
"eval_runtime": 4.882, | |
"eval_samples_per_second": 51.209, | |
"eval_steps_per_second": 1.639, | |
"step": 57 | |
}, | |
{ | |
"epoch": 3.625, | |
"grad_norm": 0.111328125, | |
"learning_rate": 3.227848101265823e-05, | |
"loss": 0.0005, | |
"step": 58 | |
}, | |
{ | |
"epoch": 3.625, | |
"eval_accuracy": 0.896, | |
"eval_loss": 0.45419201254844666, | |
"eval_runtime": 4.8691, | |
"eval_samples_per_second": 51.344, | |
"eval_steps_per_second": 1.643, | |
"step": 58 | |
}, | |
{ | |
"epoch": 3.6875, | |
"grad_norm": 1.3984375, | |
"learning_rate": 3.1962025316455695e-05, | |
"loss": 0.0045, | |
"step": 59 | |
}, | |
{ | |
"epoch": 3.6875, | |
"eval_accuracy": 0.892, | |
"eval_loss": 0.4625495672225952, | |
"eval_runtime": 4.8885, | |
"eval_samples_per_second": 51.14, | |
"eval_steps_per_second": 1.636, | |
"step": 59 | |
}, | |
{ | |
"epoch": 3.75, | |
"grad_norm": 0.287109375, | |
"learning_rate": 3.1645569620253167e-05, | |
"loss": 0.0018, | |
"step": 60 | |
}, | |
{ | |
"epoch": 3.75, | |
"eval_accuracy": 0.888, | |
"eval_loss": 0.4779915511608124, | |
"eval_runtime": 4.8965, | |
"eval_samples_per_second": 51.057, | |
"eval_steps_per_second": 1.634, | |
"step": 60 | |
}, | |
{ | |
"epoch": 3.8125, | |
"grad_norm": 2.109375, | |
"learning_rate": 3.132911392405064e-05, | |
"loss": 0.0039, | |
"step": 61 | |
}, | |
{ | |
"epoch": 3.8125, | |
"eval_accuracy": 0.892, | |
"eval_loss": 0.47442150115966797, | |
"eval_runtime": 4.8755, | |
"eval_samples_per_second": 51.277, | |
"eval_steps_per_second": 1.641, | |
"step": 61 | |
}, | |
{ | |
"epoch": 3.875, | |
"grad_norm": 0.201171875, | |
"learning_rate": 3.10126582278481e-05, | |
"loss": 0.0007, | |
"step": 62 | |
}, | |
{ | |
"epoch": 3.875, | |
"eval_accuracy": 0.896, | |
"eval_loss": 0.47328320145606995, | |
"eval_runtime": 4.8693, | |
"eval_samples_per_second": 51.342, | |
"eval_steps_per_second": 1.643, | |
"step": 62 | |
}, | |
{ | |
"epoch": 3.9375, | |
"grad_norm": 0.56640625, | |
"learning_rate": 3.0696202531645574e-05, | |
"loss": 0.0009, | |
"step": 63 | |
}, | |
{ | |
"epoch": 3.9375, | |
"eval_accuracy": 0.896, | |
"eval_loss": 0.4680858850479126, | |
"eval_runtime": 4.8481, | |
"eval_samples_per_second": 51.567, | |
"eval_steps_per_second": 1.65, | |
"step": 63 | |
}, | |
{ | |
"epoch": 4.0, | |
"grad_norm": 0.259765625, | |
"learning_rate": 3.0379746835443042e-05, | |
"loss": 0.0011, | |
"step": 64 | |
}, | |
{ | |
"epoch": 4.0, | |
"eval_accuracy": 0.9, | |
"eval_loss": 0.46620070934295654, | |
"eval_runtime": 4.8197, | |
"eval_samples_per_second": 51.871, | |
"eval_steps_per_second": 1.66, | |
"step": 64 | |
}, | |
{ | |
"epoch": 4.0625, | |
"grad_norm": 0.01483154296875, | |
"learning_rate": 3.0063291139240506e-05, | |
"loss": 0.0001, | |
"step": 65 | |
}, | |
{ | |
"epoch": 4.0625, | |
"eval_accuracy": 0.904, | |
"eval_loss": 0.47483450174331665, | |
"eval_runtime": 4.8902, | |
"eval_samples_per_second": 51.123, | |
"eval_steps_per_second": 1.636, | |
"step": 65 | |
}, | |
{ | |
"epoch": 4.125, | |
"grad_norm": 0.197265625, | |
"learning_rate": 2.9746835443037974e-05, | |
"loss": 0.0004, | |
"step": 66 | |
}, | |
{ | |
"epoch": 4.125, | |
"eval_accuracy": 0.9, | |
"eval_loss": 0.4847821593284607, | |
"eval_runtime": 4.8909, | |
"eval_samples_per_second": 51.116, | |
"eval_steps_per_second": 1.636, | |
"step": 66 | |
}, | |
{ | |
"epoch": 4.1875, | |
"grad_norm": 0.283203125, | |
"learning_rate": 2.9430379746835446e-05, | |
"loss": 0.0008, | |
"step": 67 | |
}, | |
{ | |
"epoch": 4.1875, | |
"eval_accuracy": 0.9, | |
"eval_loss": 0.49961230158805847, | |
"eval_runtime": 4.8722, | |
"eval_samples_per_second": 51.312, | |
"eval_steps_per_second": 1.642, | |
"step": 67 | |
}, | |
{ | |
"epoch": 4.25, | |
"grad_norm": 0.06103515625, | |
"learning_rate": 2.9113924050632914e-05, | |
"loss": 0.0001, | |
"step": 68 | |
}, | |
{ | |
"epoch": 4.25, | |
"eval_accuracy": 0.904, | |
"eval_loss": 0.5119830965995789, | |
"eval_runtime": 4.872, | |
"eval_samples_per_second": 51.314, | |
"eval_steps_per_second": 1.642, | |
"step": 68 | |
}, | |
{ | |
"epoch": 4.3125, | |
"grad_norm": 0.10302734375, | |
"learning_rate": 2.879746835443038e-05, | |
"loss": 0.0002, | |
"step": 69 | |
}, | |
{ | |
"epoch": 4.3125, | |
"eval_accuracy": 0.904, | |
"eval_loss": 0.5265066623687744, | |
"eval_runtime": 4.8698, | |
"eval_samples_per_second": 51.337, | |
"eval_steps_per_second": 1.643, | |
"step": 69 | |
}, | |
{ | |
"epoch": 4.375, | |
"grad_norm": 0.1904296875, | |
"learning_rate": 2.848101265822785e-05, | |
"loss": 0.0003, | |
"step": 70 | |
}, | |
{ | |
"epoch": 4.375, | |
"eval_accuracy": 0.904, | |
"eval_loss": 0.53618323802948, | |
"eval_runtime": 4.904, | |
"eval_samples_per_second": 50.978, | |
"eval_steps_per_second": 1.631, | |
"step": 70 | |
}, | |
{ | |
"epoch": 4.4375, | |
"grad_norm": 0.31640625, | |
"learning_rate": 2.8164556962025318e-05, | |
"loss": 0.0005, | |
"step": 71 | |
}, | |
{ | |
"epoch": 4.4375, | |
"eval_accuracy": 0.9, | |
"eval_loss": 0.5419101119041443, | |
"eval_runtime": 4.8958, | |
"eval_samples_per_second": 51.064, | |
"eval_steps_per_second": 1.634, | |
"step": 71 | |
}, | |
{ | |
"epoch": 4.5, | |
"grad_norm": 0.55859375, | |
"learning_rate": 2.7848101265822786e-05, | |
"loss": 0.001, | |
"step": 72 | |
}, | |
{ | |
"epoch": 4.5, | |
"eval_accuracy": 0.9, | |
"eval_loss": 0.5409899950027466, | |
"eval_runtime": 4.8722, | |
"eval_samples_per_second": 51.311, | |
"eval_steps_per_second": 1.642, | |
"step": 72 | |
}, | |
{ | |
"epoch": 4.5625, | |
"grad_norm": 0.1376953125, | |
"learning_rate": 2.7531645569620257e-05, | |
"loss": 0.0002, | |
"step": 73 | |
}, | |
{ | |
"epoch": 4.5625, | |
"eval_accuracy": 0.904, | |
"eval_loss": 0.5402071475982666, | |
"eval_runtime": 4.8891, | |
"eval_samples_per_second": 51.135, | |
"eval_steps_per_second": 1.636, | |
"step": 73 | |
}, | |
{ | |
"epoch": 4.625, | |
"grad_norm": 0.107421875, | |
"learning_rate": 2.7215189873417722e-05, | |
"loss": 0.0002, | |
"step": 74 | |
}, | |
{ | |
"epoch": 4.625, | |
"eval_accuracy": 0.904, | |
"eval_loss": 0.5370295643806458, | |
"eval_runtime": 4.8731, | |
"eval_samples_per_second": 51.303, | |
"eval_steps_per_second": 1.642, | |
"step": 74 | |
}, | |
{ | |
"epoch": 4.6875, | |
"grad_norm": 0.34765625, | |
"learning_rate": 2.689873417721519e-05, | |
"loss": 0.0005, | |
"step": 75 | |
}, | |
{ | |
"epoch": 4.6875, | |
"eval_accuracy": 0.904, | |
"eval_loss": 0.5307853817939758, | |
"eval_runtime": 4.868, | |
"eval_samples_per_second": 51.355, | |
"eval_steps_per_second": 1.643, | |
"step": 75 | |
}, | |
{ | |
"epoch": 4.75, | |
"grad_norm": 0.0732421875, | |
"learning_rate": 2.6582278481012658e-05, | |
"loss": 0.0001, | |
"step": 76 | |
}, | |
{ | |
"epoch": 4.75, | |
"eval_accuracy": 0.9, | |
"eval_loss": 0.5289453268051147, | |
"eval_runtime": 4.8663, | |
"eval_samples_per_second": 51.374, | |
"eval_steps_per_second": 1.644, | |
"step": 76 | |
}, | |
{ | |
"epoch": 4.8125, | |
"grad_norm": 0.038818359375, | |
"learning_rate": 2.626582278481013e-05, | |
"loss": 0.0002, | |
"step": 77 | |
}, | |
{ | |
"epoch": 4.8125, | |
"eval_accuracy": 0.896, | |
"eval_loss": 0.5269708037376404, | |
"eval_runtime": 4.9075, | |
"eval_samples_per_second": 50.942, | |
"eval_steps_per_second": 1.63, | |
"step": 77 | |
}, | |
{ | |
"epoch": 4.875, | |
"grad_norm": 0.015380859375, | |
"learning_rate": 2.5949367088607597e-05, | |
"loss": 0.0, | |
"step": 78 | |
}, | |
{ | |
"epoch": 4.875, | |
"eval_accuracy": 0.896, | |
"eval_loss": 0.5278732776641846, | |
"eval_runtime": 4.899, | |
"eval_samples_per_second": 51.031, | |
"eval_steps_per_second": 1.633, | |
"step": 78 | |
}, | |
{ | |
"epoch": 4.9375, | |
"grad_norm": 0.056884765625, | |
"learning_rate": 2.5632911392405062e-05, | |
"loss": 0.0001, | |
"step": 79 | |
}, | |
{ | |
"epoch": 4.9375, | |
"eval_accuracy": 0.896, | |
"eval_loss": 0.5320713520050049, | |
"eval_runtime": 4.824, | |
"eval_samples_per_second": 51.825, | |
"eval_steps_per_second": 1.658, | |
"step": 79 | |
}, | |
{ | |
"epoch": 5.0, | |
"grad_norm": 0.01025390625, | |
"learning_rate": 2.5316455696202533e-05, | |
"loss": 0.0, | |
"step": 80 | |
}, | |
{ | |
"epoch": 5.0, | |
"eval_accuracy": 0.896, | |
"eval_loss": 0.5385279059410095, | |
"eval_runtime": 4.8713, | |
"eval_samples_per_second": 51.321, | |
"eval_steps_per_second": 1.642, | |
"step": 80 | |
}, | |
{ | |
"epoch": 5.0625, | |
"grad_norm": 0.03076171875, | |
"learning_rate": 2.5e-05, | |
"loss": 0.0001, | |
"step": 81 | |
}, | |
{ | |
"epoch": 5.0625, | |
"eval_accuracy": 0.9, | |
"eval_loss": 0.5427132248878479, | |
"eval_runtime": 4.8863, | |
"eval_samples_per_second": 51.163, | |
"eval_steps_per_second": 1.637, | |
"step": 81 | |
}, | |
{ | |
"epoch": 5.125, | |
"grad_norm": 0.010986328125, | |
"learning_rate": 2.468354430379747e-05, | |
"loss": 0.0, | |
"step": 82 | |
}, | |
{ | |
"epoch": 5.125, | |
"eval_accuracy": 0.9, | |
"eval_loss": 0.5424291491508484, | |
"eval_runtime": 4.869, | |
"eval_samples_per_second": 51.345, | |
"eval_steps_per_second": 1.643, | |
"step": 82 | |
}, | |
{ | |
"epoch": 5.1875, | |
"grad_norm": 0.00860595703125, | |
"learning_rate": 2.4367088607594937e-05, | |
"loss": 0.0, | |
"step": 83 | |
}, | |
{ | |
"epoch": 5.1875, | |
"eval_accuracy": 0.904, | |
"eval_loss": 0.5434779524803162, | |
"eval_runtime": 4.8938, | |
"eval_samples_per_second": 51.085, | |
"eval_steps_per_second": 1.635, | |
"step": 83 | |
}, | |
{ | |
"epoch": 5.25, | |
"grad_norm": 0.046630859375, | |
"learning_rate": 2.4050632911392405e-05, | |
"loss": 0.0001, | |
"step": 84 | |
}, | |
{ | |
"epoch": 5.25, | |
"eval_accuracy": 0.9, | |
"eval_loss": 0.552955687046051, | |
"eval_runtime": 4.8992, | |
"eval_samples_per_second": 51.028, | |
"eval_steps_per_second": 1.633, | |
"step": 84 | |
}, | |
{ | |
"epoch": 5.3125, | |
"grad_norm": 0.0242919921875, | |
"learning_rate": 2.3734177215189873e-05, | |
"loss": 0.0001, | |
"step": 85 | |
}, | |
{ | |
"epoch": 5.3125, | |
"eval_accuracy": 0.908, | |
"eval_loss": 0.5524942874908447, | |
"eval_runtime": 4.8893, | |
"eval_samples_per_second": 51.132, | |
"eval_steps_per_second": 1.636, | |
"step": 85 | |
}, | |
{ | |
"epoch": 5.375, | |
"grad_norm": 0.10302734375, | |
"learning_rate": 2.341772151898734e-05, | |
"loss": 0.0002, | |
"step": 86 | |
}, | |
{ | |
"epoch": 5.375, | |
"eval_accuracy": 0.904, | |
"eval_loss": 0.5556187629699707, | |
"eval_runtime": 4.8844, | |
"eval_samples_per_second": 51.184, | |
"eval_steps_per_second": 1.638, | |
"step": 86 | |
}, | |
{ | |
"epoch": 5.4375, | |
"grad_norm": 0.058837890625, | |
"learning_rate": 2.3101265822784813e-05, | |
"loss": 0.0001, | |
"step": 87 | |
}, | |
{ | |
"epoch": 5.4375, | |
"eval_accuracy": 0.904, | |
"eval_loss": 0.5552673935890198, | |
"eval_runtime": 4.8868, | |
"eval_samples_per_second": 51.158, | |
"eval_steps_per_second": 1.637, | |
"step": 87 | |
}, | |
{ | |
"epoch": 5.5, | |
"grad_norm": 0.049560546875, | |
"learning_rate": 2.278481012658228e-05, | |
"loss": 0.0001, | |
"step": 88 | |
}, | |
{ | |
"epoch": 5.5, | |
"eval_accuracy": 0.908, | |
"eval_loss": 0.5524196624755859, | |
"eval_runtime": 4.8881, | |
"eval_samples_per_second": 51.145, | |
"eval_steps_per_second": 1.637, | |
"step": 88 | |
}, | |
{ | |
"epoch": 5.5625, | |
"grad_norm": 0.1201171875, | |
"learning_rate": 2.246835443037975e-05, | |
"loss": 0.0002, | |
"step": 89 | |
}, | |
{ | |
"epoch": 5.5625, | |
"eval_accuracy": 0.904, | |
"eval_loss": 0.5546653866767883, | |
"eval_runtime": 4.888, | |
"eval_samples_per_second": 51.145, | |
"eval_steps_per_second": 1.637, | |
"step": 89 | |
}, | |
{ | |
"epoch": 5.625, | |
"grad_norm": 0.011474609375, | |
"learning_rate": 2.2151898734177217e-05, | |
"loss": 0.0, | |
"step": 90 | |
}, | |
{ | |
"epoch": 5.625, | |
"eval_accuracy": 0.908, | |
"eval_loss": 0.5581926107406616, | |
"eval_runtime": 4.8965, | |
"eval_samples_per_second": 51.056, | |
"eval_steps_per_second": 1.634, | |
"step": 90 | |
}, | |
{ | |
"epoch": 5.6875, | |
"grad_norm": 0.00494384765625, | |
"learning_rate": 2.1835443037974685e-05, | |
"loss": 0.0, | |
"step": 91 | |
}, | |
{ | |
"epoch": 5.6875, | |
"eval_accuracy": 0.904, | |
"eval_loss": 0.5577874779701233, | |
"eval_runtime": 4.8978, | |
"eval_samples_per_second": 51.043, | |
"eval_steps_per_second": 1.633, | |
"step": 91 | |
}, | |
{ | |
"epoch": 5.75, | |
"grad_norm": 0.001556396484375, | |
"learning_rate": 2.1518987341772153e-05, | |
"loss": 0.0, | |
"step": 92 | |
}, | |
{ | |
"epoch": 5.75, | |
"eval_accuracy": 0.908, | |
"eval_loss": 0.5564088821411133, | |
"eval_runtime": 4.9024, | |
"eval_samples_per_second": 50.995, | |
"eval_steps_per_second": 1.632, | |
"step": 92 | |
}, | |
{ | |
"epoch": 5.8125, | |
"grad_norm": 0.04248046875, | |
"learning_rate": 2.120253164556962e-05, | |
"loss": 0.0001, | |
"step": 93 | |
}, | |
{ | |
"epoch": 5.8125, | |
"eval_accuracy": 0.904, | |
"eval_loss": 0.5549535155296326, | |
"eval_runtime": 4.868, | |
"eval_samples_per_second": 51.355, | |
"eval_steps_per_second": 1.643, | |
"step": 93 | |
}, | |
{ | |
"epoch": 5.875, | |
"grad_norm": 0.0031890869140625, | |
"learning_rate": 2.088607594936709e-05, | |
"loss": 0.0, | |
"step": 94 | |
}, | |
{ | |
"epoch": 5.875, | |
"eval_accuracy": 0.908, | |
"eval_loss": 0.5585831999778748, | |
"eval_runtime": 4.8902, | |
"eval_samples_per_second": 51.123, | |
"eval_steps_per_second": 1.636, | |
"step": 94 | |
}, | |
{ | |
"epoch": 5.9375, | |
"grad_norm": 0.0191650390625, | |
"learning_rate": 2.056962025316456e-05, | |
"loss": 0.0, | |
"step": 95 | |
}, | |
{ | |
"epoch": 5.9375, | |
"eval_accuracy": 0.9, | |
"eval_loss": 0.5561901330947876, | |
"eval_runtime": 4.899, | |
"eval_samples_per_second": 51.031, | |
"eval_steps_per_second": 1.633, | |
"step": 95 | |
}, | |
{ | |
"epoch": 6.0, | |
"grad_norm": 0.0301513671875, | |
"learning_rate": 2.0253164556962025e-05, | |
"loss": 0.0001, | |
"step": 96 | |
}, | |
{ | |
"epoch": 6.0, | |
"eval_accuracy": 0.908, | |
"eval_loss": 0.557356595993042, | |
"eval_runtime": 4.8952, | |
"eval_samples_per_second": 51.071, | |
"eval_steps_per_second": 1.634, | |
"step": 96 | |
}, | |
{ | |
"epoch": 6.0625, | |
"grad_norm": 0.0029754638671875, | |
"learning_rate": 1.9936708860759496e-05, | |
"loss": 0.0, | |
"step": 97 | |
}, | |
{ | |
"epoch": 6.0625, | |
"eval_accuracy": 0.9, | |
"eval_loss": 0.557080090045929, | |
"eval_runtime": 4.8905, | |
"eval_samples_per_second": 51.119, | |
"eval_steps_per_second": 1.636, | |
"step": 97 | |
}, | |
{ | |
"epoch": 6.125, | |
"grad_norm": 0.013427734375, | |
"learning_rate": 1.962025316455696e-05, | |
"loss": 0.0, | |
"step": 98 | |
}, | |
{ | |
"epoch": 6.125, | |
"eval_accuracy": 0.9, | |
"eval_loss": 0.5583979487419128, | |
"eval_runtime": 4.8684, | |
"eval_samples_per_second": 51.352, | |
"eval_steps_per_second": 1.643, | |
"step": 98 | |
}, | |
{ | |
"epoch": 6.1875, | |
"grad_norm": 0.0205078125, | |
"learning_rate": 1.9303797468354432e-05, | |
"loss": 0.0, | |
"step": 99 | |
}, | |
{ | |
"epoch": 6.1875, | |
"eval_accuracy": 0.904, | |
"eval_loss": 0.5552465319633484, | |
"eval_runtime": 4.8752, | |
"eval_samples_per_second": 51.28, | |
"eval_steps_per_second": 1.641, | |
"step": 99 | |
}, | |
{ | |
"epoch": 6.25, | |
"grad_norm": 0.03857421875, | |
"learning_rate": 1.89873417721519e-05, | |
"loss": 0.0001, | |
"step": 100 | |
}, | |
{ | |
"epoch": 6.25, | |
"eval_accuracy": 0.904, | |
"eval_loss": 0.5585227608680725, | |
"eval_runtime": 4.8896, | |
"eval_samples_per_second": 51.129, | |
"eval_steps_per_second": 1.636, | |
"step": 100 | |
}, | |
{ | |
"epoch": 6.3125, | |
"grad_norm": 0.047607421875, | |
"learning_rate": 1.8670886075949368e-05, | |
"loss": 0.0001, | |
"step": 101 | |
}, | |
{ | |
"epoch": 6.3125, | |
"eval_accuracy": 0.9, | |
"eval_loss": 0.5532352328300476, | |
"eval_runtime": 4.8702, | |
"eval_samples_per_second": 51.332, | |
"eval_steps_per_second": 1.643, | |
"step": 101 | |
}, | |
{ | |
"epoch": 6.375, | |
"grad_norm": 0.00836181640625, | |
"learning_rate": 1.8354430379746836e-05, | |
"loss": 0.0, | |
"step": 102 | |
}, | |
{ | |
"epoch": 6.375, | |
"eval_accuracy": 0.896, | |
"eval_loss": 0.555249035358429, | |
"eval_runtime": 4.8964, | |
"eval_samples_per_second": 51.058, | |
"eval_steps_per_second": 1.634, | |
"step": 102 | |
}, | |
{ | |
"epoch": 6.4375, | |
"grad_norm": 0.01055908203125, | |
"learning_rate": 1.8037974683544304e-05, | |
"loss": 0.0, | |
"step": 103 | |
}, | |
{ | |
"epoch": 6.4375, | |
"eval_accuracy": 0.896, | |
"eval_loss": 0.556434690952301, | |
"eval_runtime": 4.9004, | |
"eval_samples_per_second": 51.016, | |
"eval_steps_per_second": 1.633, | |
"step": 103 | |
}, | |
{ | |
"epoch": 6.5, | |
"grad_norm": 0.022216796875, | |
"learning_rate": 1.7721518987341772e-05, | |
"loss": 0.0, | |
"step": 104 | |
}, | |
{ | |
"epoch": 6.5, | |
"eval_accuracy": 0.9, | |
"eval_loss": 0.5526236891746521, | |
"eval_runtime": 4.87, | |
"eval_samples_per_second": 51.335, | |
"eval_steps_per_second": 1.643, | |
"step": 104 | |
}, | |
{ | |
"epoch": 6.5625, | |
"grad_norm": 0.0027618408203125, | |
"learning_rate": 1.7405063291139243e-05, | |
"loss": 0.0, | |
"step": 105 | |
}, | |
{ | |
"epoch": 6.5625, | |
"eval_accuracy": 0.904, | |
"eval_loss": 0.5563039779663086, | |
"eval_runtime": 4.892, | |
"eval_samples_per_second": 51.104, | |
"eval_steps_per_second": 1.635, | |
"step": 105 | |
}, | |
{ | |
"epoch": 6.625, | |
"grad_norm": 0.007110595703125, | |
"learning_rate": 1.7088607594936708e-05, | |
"loss": 0.0, | |
"step": 106 | |
}, | |
{ | |
"epoch": 6.625, | |
"eval_accuracy": 0.892, | |
"eval_loss": 0.5540825724601746, | |
"eval_runtime": 4.8978, | |
"eval_samples_per_second": 51.043, | |
"eval_steps_per_second": 1.633, | |
"step": 106 | |
}, | |
{ | |
"epoch": 6.6875, | |
"grad_norm": 0.0031890869140625, | |
"learning_rate": 1.677215189873418e-05, | |
"loss": 0.0, | |
"step": 107 | |
}, | |
{ | |
"epoch": 6.6875, | |
"eval_accuracy": 0.9, | |
"eval_loss": 0.5545496940612793, | |
"eval_runtime": 4.8688, | |
"eval_samples_per_second": 51.347, | |
"eval_steps_per_second": 1.643, | |
"step": 107 | |
}, | |
{ | |
"epoch": 6.75, | |
"grad_norm": 0.0223388671875, | |
"learning_rate": 1.6455696202531644e-05, | |
"loss": 0.0, | |
"step": 108 | |
}, | |
{ | |
"epoch": 6.75, | |
"eval_accuracy": 0.904, | |
"eval_loss": 0.5558124780654907, | |
"eval_runtime": 4.8649, | |
"eval_samples_per_second": 51.389, | |
"eval_steps_per_second": 1.644, | |
"step": 108 | |
}, | |
{ | |
"epoch": 6.8125, | |
"grad_norm": 0.007354736328125, | |
"learning_rate": 1.6139240506329115e-05, | |
"loss": 0.0, | |
"step": 109 | |
}, | |
{ | |
"epoch": 6.8125, | |
"eval_accuracy": 0.904, | |
"eval_loss": 0.5552881956100464, | |
"eval_runtime": 4.8707, | |
"eval_samples_per_second": 51.327, | |
"eval_steps_per_second": 1.642, | |
"step": 109 | |
}, | |
{ | |
"epoch": 6.875, | |
"grad_norm": 0.006011962890625, | |
"learning_rate": 1.5822784810126583e-05, | |
"loss": 0.0, | |
"step": 110 | |
}, | |
{ | |
"epoch": 6.875, | |
"eval_accuracy": 0.896, | |
"eval_loss": 0.5591185092926025, | |
"eval_runtime": 4.8653, | |
"eval_samples_per_second": 51.384, | |
"eval_steps_per_second": 1.644, | |
"step": 110 | |
}, | |
{ | |
"epoch": 6.9375, | |
"grad_norm": 0.03759765625, | |
"learning_rate": 1.550632911392405e-05, | |
"loss": 0.0001, | |
"step": 111 | |
}, | |
{ | |
"epoch": 6.9375, | |
"eval_accuracy": 0.904, | |
"eval_loss": 0.5544008612632751, | |
"eval_runtime": 4.8902, | |
"eval_samples_per_second": 51.122, | |
"eval_steps_per_second": 1.636, | |
"step": 111 | |
}, | |
{ | |
"epoch": 7.0, | |
"grad_norm": 0.00213623046875, | |
"learning_rate": 1.5189873417721521e-05, | |
"loss": 0.0, | |
"step": 112 | |
}, | |
{ | |
"epoch": 7.0, | |
"eval_accuracy": 0.896, | |
"eval_loss": 0.5563123822212219, | |
"eval_runtime": 4.8882, | |
"eval_samples_per_second": 51.143, | |
"eval_steps_per_second": 1.637, | |
"step": 112 | |
}, | |
{ | |
"epoch": 7.0625, | |
"grad_norm": 0.018798828125, | |
"learning_rate": 1.4873417721518987e-05, | |
"loss": 0.0, | |
"step": 113 | |
}, | |
{ | |
"epoch": 7.0625, | |
"eval_accuracy": 0.896, | |
"eval_loss": 0.55401611328125, | |
"eval_runtime": 4.8635, | |
"eval_samples_per_second": 51.403, | |
"eval_steps_per_second": 1.645, | |
"step": 113 | |
}, | |
{ | |
"epoch": 7.125, | |
"grad_norm": 0.0458984375, | |
"learning_rate": 1.4556962025316457e-05, | |
"loss": 0.0001, | |
"step": 114 | |
}, | |
{ | |
"epoch": 7.125, | |
"eval_accuracy": 0.896, | |
"eval_loss": 0.5561829805374146, | |
"eval_runtime": 4.8843, | |
"eval_samples_per_second": 51.184, | |
"eval_steps_per_second": 1.638, | |
"step": 114 | |
}, | |
{ | |
"epoch": 7.1875, | |
"grad_norm": 0.01336669921875, | |
"learning_rate": 1.4240506329113925e-05, | |
"loss": 0.0, | |
"step": 115 | |
}, | |
{ | |
"epoch": 7.1875, | |
"eval_accuracy": 0.9, | |
"eval_loss": 0.5537428855895996, | |
"eval_runtime": 4.8887, | |
"eval_samples_per_second": 51.138, | |
"eval_steps_per_second": 1.636, | |
"step": 115 | |
}, | |
{ | |
"epoch": 7.25, | |
"grad_norm": 0.0113525390625, | |
"learning_rate": 1.3924050632911393e-05, | |
"loss": 0.0, | |
"step": 116 | |
}, | |
{ | |
"epoch": 7.25, | |
"eval_accuracy": 0.896, | |
"eval_loss": 0.5549358129501343, | |
"eval_runtime": 4.8909, | |
"eval_samples_per_second": 51.115, | |
"eval_steps_per_second": 1.636, | |
"step": 116 | |
}, | |
{ | |
"epoch": 7.3125, | |
"grad_norm": 0.00762939453125, | |
"learning_rate": 1.3607594936708861e-05, | |
"loss": 0.0, | |
"step": 117 | |
}, | |
{ | |
"epoch": 7.3125, | |
"eval_accuracy": 0.904, | |
"eval_loss": 0.5518209338188171, | |
"eval_runtime": 4.8812, | |
"eval_samples_per_second": 51.217, | |
"eval_steps_per_second": 1.639, | |
"step": 117 | |
}, | |
{ | |
"epoch": 7.375, | |
"grad_norm": 0.0038604736328125, | |
"learning_rate": 1.3291139240506329e-05, | |
"loss": 0.0, | |
"step": 118 | |
}, | |
{ | |
"epoch": 7.375, | |
"eval_accuracy": 0.9, | |
"eval_loss": 0.5559548139572144, | |
"eval_runtime": 4.889, | |
"eval_samples_per_second": 51.135, | |
"eval_steps_per_second": 1.636, | |
"step": 118 | |
}, | |
{ | |
"epoch": 7.4375, | |
"grad_norm": 0.0196533203125, | |
"learning_rate": 1.2974683544303799e-05, | |
"loss": 0.0, | |
"step": 119 | |
}, | |
{ | |
"epoch": 7.4375, | |
"eval_accuracy": 0.896, | |
"eval_loss": 0.5553652048110962, | |
"eval_runtime": 4.8764, | |
"eval_samples_per_second": 51.268, | |
"eval_steps_per_second": 1.641, | |
"step": 119 | |
}, | |
{ | |
"epoch": 7.5, | |
"grad_norm": 0.00958251953125, | |
"learning_rate": 1.2658227848101267e-05, | |
"loss": 0.0, | |
"step": 120 | |
}, | |
{ | |
"epoch": 7.5, | |
"eval_accuracy": 0.9, | |
"eval_loss": 0.5554091930389404, | |
"eval_runtime": 4.8866, | |
"eval_samples_per_second": 51.161, | |
"eval_steps_per_second": 1.637, | |
"step": 120 | |
}, | |
{ | |
"epoch": 7.5625, | |
"grad_norm": 0.0029754638671875, | |
"learning_rate": 1.2341772151898735e-05, | |
"loss": 0.0, | |
"step": 121 | |
}, | |
{ | |
"epoch": 7.5625, | |
"eval_accuracy": 0.9, | |
"eval_loss": 0.5547875761985779, | |
"eval_runtime": 4.8661, | |
"eval_samples_per_second": 51.376, | |
"eval_steps_per_second": 1.644, | |
"step": 121 | |
}, | |
{ | |
"epoch": 7.625, | |
"grad_norm": 0.001556396484375, | |
"learning_rate": 1.2025316455696203e-05, | |
"loss": 0.0, | |
"step": 122 | |
}, | |
{ | |
"epoch": 7.625, | |
"eval_accuracy": 0.9, | |
"eval_loss": 0.55680251121521, | |
"eval_runtime": 4.8915, | |
"eval_samples_per_second": 51.109, | |
"eval_steps_per_second": 1.635, | |
"step": 122 | |
}, | |
{ | |
"epoch": 7.6875, | |
"grad_norm": 0.0174560546875, | |
"learning_rate": 1.170886075949367e-05, | |
"loss": 0.0001, | |
"step": 123 | |
}, | |
{ | |
"epoch": 7.6875, | |
"eval_accuracy": 0.9, | |
"eval_loss": 0.5555019378662109, | |
"eval_runtime": 4.8666, | |
"eval_samples_per_second": 51.371, | |
"eval_steps_per_second": 1.644, | |
"step": 123 | |
}, | |
{ | |
"epoch": 7.75, | |
"grad_norm": 0.02734375, | |
"learning_rate": 1.139240506329114e-05, | |
"loss": 0.0001, | |
"step": 124 | |
}, | |
{ | |
"epoch": 7.75, | |
"eval_accuracy": 0.896, | |
"eval_loss": 0.5568622946739197, | |
"eval_runtime": 4.865, | |
"eval_samples_per_second": 51.387, | |
"eval_steps_per_second": 1.644, | |
"step": 124 | |
}, | |
{ | |
"epoch": 7.8125, | |
"grad_norm": 0.0242919921875, | |
"learning_rate": 1.1075949367088608e-05, | |
"loss": 0.0, | |
"step": 125 | |
}, | |
{ | |
"epoch": 7.8125, | |
"eval_accuracy": 0.9, | |
"eval_loss": 0.5577536821365356, | |
"eval_runtime": 4.8877, | |
"eval_samples_per_second": 51.149, | |
"eval_steps_per_second": 1.637, | |
"step": 125 | |
}, | |
{ | |
"epoch": 7.875, | |
"grad_norm": 0.0020904541015625, | |
"learning_rate": 1.0759493670886076e-05, | |
"loss": 0.0, | |
"step": 126 | |
}, | |
{ | |
"epoch": 7.875, | |
"eval_accuracy": 0.896, | |
"eval_loss": 0.5592789649963379, | |
"eval_runtime": 4.8614, | |
"eval_samples_per_second": 51.425, | |
"eval_steps_per_second": 1.646, | |
"step": 126 | |
}, | |
{ | |
"epoch": 7.9375, | |
"grad_norm": 0.00732421875, | |
"learning_rate": 1.0443037974683544e-05, | |
"loss": 0.0, | |
"step": 127 | |
}, | |
{ | |
"epoch": 7.9375, | |
"eval_accuracy": 0.9, | |
"eval_loss": 0.556476891040802, | |
"eval_runtime": 4.892, | |
"eval_samples_per_second": 51.104, | |
"eval_steps_per_second": 1.635, | |
"step": 127 | |
}, | |
{ | |
"epoch": 8.0, | |
"grad_norm": 0.001861572265625, | |
"learning_rate": 1.0126582278481012e-05, | |
"loss": 0.0, | |
"step": 128 | |
}, | |
{ | |
"epoch": 8.0, | |
"eval_accuracy": 0.9, | |
"eval_loss": 0.555403470993042, | |
"eval_runtime": 4.8901, | |
"eval_samples_per_second": 51.123, | |
"eval_steps_per_second": 1.636, | |
"step": 128 | |
}, | |
{ | |
"epoch": 8.0625, | |
"grad_norm": 0.0111083984375, | |
"learning_rate": 9.81012658227848e-06, | |
"loss": 0.0, | |
"step": 129 | |
}, | |
{ | |
"epoch": 8.0625, | |
"eval_accuracy": 0.904, | |
"eval_loss": 0.5534031987190247, | |
"eval_runtime": 4.8858, | |
"eval_samples_per_second": 51.169, | |
"eval_steps_per_second": 1.637, | |
"step": 129 | |
}, | |
{ | |
"epoch": 8.125, | |
"grad_norm": 0.00604248046875, | |
"learning_rate": 9.49367088607595e-06, | |
"loss": 0.0, | |
"step": 130 | |
}, | |
{ | |
"epoch": 8.125, | |
"eval_accuracy": 0.9, | |
"eval_loss": 0.5557398796081543, | |
"eval_runtime": 4.8638, | |
"eval_samples_per_second": 51.4, | |
"eval_steps_per_second": 1.645, | |
"step": 130 | |
}, | |
{ | |
"epoch": 8.1875, | |
"grad_norm": 0.008544921875, | |
"learning_rate": 9.177215189873418e-06, | |
"loss": 0.0, | |
"step": 131 | |
}, | |
{ | |
"epoch": 8.1875, | |
"eval_accuracy": 0.9, | |
"eval_loss": 0.5564188361167908, | |
"eval_runtime": 4.8832, | |
"eval_samples_per_second": 51.196, | |
"eval_steps_per_second": 1.638, | |
"step": 131 | |
}, | |
{ | |
"epoch": 8.25, | |
"grad_norm": 0.003875732421875, | |
"learning_rate": 8.860759493670886e-06, | |
"loss": 0.0, | |
"step": 132 | |
}, | |
{ | |
"epoch": 8.25, | |
"eval_accuracy": 0.9, | |
"eval_loss": 0.5545617938041687, | |
"eval_runtime": 4.8916, | |
"eval_samples_per_second": 51.108, | |
"eval_steps_per_second": 1.635, | |
"step": 132 | |
}, | |
{ | |
"epoch": 8.3125, | |
"grad_norm": 0.0322265625, | |
"learning_rate": 8.544303797468354e-06, | |
"loss": 0.0001, | |
"step": 133 | |
}, | |
{ | |
"epoch": 8.3125, | |
"eval_accuracy": 0.9, | |
"eval_loss": 0.5550633668899536, | |
"eval_runtime": 4.8868, | |
"eval_samples_per_second": 51.158, | |
"eval_steps_per_second": 1.637, | |
"step": 133 | |
}, | |
{ | |
"epoch": 8.375, | |
"grad_norm": 0.0296630859375, | |
"learning_rate": 8.227848101265822e-06, | |
"loss": 0.0001, | |
"step": 134 | |
}, | |
{ | |
"epoch": 8.375, | |
"eval_accuracy": 0.9, | |
"eval_loss": 0.553980827331543, | |
"eval_runtime": 4.8628, | |
"eval_samples_per_second": 51.411, | |
"eval_steps_per_second": 1.645, | |
"step": 134 | |
}, | |
{ | |
"epoch": 8.4375, | |
"grad_norm": 0.00396728515625, | |
"learning_rate": 7.911392405063292e-06, | |
"loss": 0.0, | |
"step": 135 | |
}, | |
{ | |
"epoch": 8.4375, | |
"eval_accuracy": 0.9, | |
"eval_loss": 0.5577096939086914, | |
"eval_runtime": 4.8664, | |
"eval_samples_per_second": 51.372, | |
"eval_steps_per_second": 1.644, | |
"step": 135 | |
}, | |
{ | |
"epoch": 8.5, | |
"grad_norm": 0.00089263916015625, | |
"learning_rate": 7.5949367088607605e-06, | |
"loss": 0.0, | |
"step": 136 | |
}, | |
{ | |
"epoch": 8.5, | |
"eval_accuracy": 0.9, | |
"eval_loss": 0.5528184771537781, | |
"eval_runtime": 4.8633, | |
"eval_samples_per_second": 51.405, | |
"eval_steps_per_second": 1.645, | |
"step": 136 | |
}, | |
{ | |
"epoch": 8.5625, | |
"grad_norm": 0.01080322265625, | |
"learning_rate": 7.2784810126582285e-06, | |
"loss": 0.0, | |
"step": 137 | |
}, | |
{ | |
"epoch": 8.5625, | |
"eval_accuracy": 0.896, | |
"eval_loss": 0.5544995069503784, | |
"eval_runtime": 4.8638, | |
"eval_samples_per_second": 51.4, | |
"eval_steps_per_second": 1.645, | |
"step": 137 | |
}, | |
{ | |
"epoch": 8.625, | |
"grad_norm": 0.006561279296875, | |
"learning_rate": 6.9620253164556965e-06, | |
"loss": 0.0, | |
"step": 138 | |
}, | |
{ | |
"epoch": 8.625, | |
"eval_accuracy": 0.896, | |
"eval_loss": 0.5562748908996582, | |
"eval_runtime": 4.8677, | |
"eval_samples_per_second": 51.359, | |
"eval_steps_per_second": 1.643, | |
"step": 138 | |
}, | |
{ | |
"epoch": 8.6875, | |
"grad_norm": 0.01007080078125, | |
"learning_rate": 6.6455696202531645e-06, | |
"loss": 0.0, | |
"step": 139 | |
}, | |
{ | |
"epoch": 8.6875, | |
"eval_accuracy": 0.9, | |
"eval_loss": 0.5547692775726318, | |
"eval_runtime": 4.8897, | |
"eval_samples_per_second": 51.128, | |
"eval_steps_per_second": 1.636, | |
"step": 139 | |
}, | |
{ | |
"epoch": 8.75, | |
"grad_norm": 0.01171875, | |
"learning_rate": 6.329113924050633e-06, | |
"loss": 0.0, | |
"step": 140 | |
}, | |
{ | |
"epoch": 8.75, | |
"eval_accuracy": 0.9, | |
"eval_loss": 0.5561813712120056, | |
"eval_runtime": 4.8875, | |
"eval_samples_per_second": 51.151, | |
"eval_steps_per_second": 1.637, | |
"step": 140 | |
}, | |
{ | |
"epoch": 8.8125, | |
"grad_norm": 0.0166015625, | |
"learning_rate": 6.012658227848101e-06, | |
"loss": 0.0, | |
"step": 141 | |
}, | |
{ | |
"epoch": 8.8125, | |
"eval_accuracy": 0.896, | |
"eval_loss": 0.5564705729484558, | |
"eval_runtime": 4.8825, | |
"eval_samples_per_second": 51.203, | |
"eval_steps_per_second": 1.638, | |
"step": 141 | |
}, | |
{ | |
"epoch": 8.875, | |
"grad_norm": 0.002777099609375, | |
"learning_rate": 5.69620253164557e-06, | |
"loss": 0.0, | |
"step": 142 | |
}, | |
{ | |
"epoch": 8.875, | |
"eval_accuracy": 0.896, | |
"eval_loss": 0.5576943159103394, | |
"eval_runtime": 4.8931, | |
"eval_samples_per_second": 51.092, | |
"eval_steps_per_second": 1.635, | |
"step": 142 | |
}, | |
{ | |
"epoch": 8.9375, | |
"grad_norm": 0.048583984375, | |
"learning_rate": 5.379746835443038e-06, | |
"loss": 0.0001, | |
"step": 143 | |
}, | |
{ | |
"epoch": 8.9375, | |
"eval_accuracy": 0.9, | |
"eval_loss": 0.5525559186935425, | |
"eval_runtime": 4.8612, | |
"eval_samples_per_second": 51.427, | |
"eval_steps_per_second": 1.646, | |
"step": 143 | |
}, | |
{ | |
"epoch": 9.0, | |
"grad_norm": 0.0019683837890625, | |
"learning_rate": 5.063291139240506e-06, | |
"loss": 0.0, | |
"step": 144 | |
}, | |
{ | |
"epoch": 9.0, | |
"eval_accuracy": 0.9, | |
"eval_loss": 0.5553069710731506, | |
"eval_runtime": 4.8923, | |
"eval_samples_per_second": 51.101, | |
"eval_steps_per_second": 1.635, | |
"step": 144 | |
}, | |
{ | |
"epoch": 9.0625, | |
"grad_norm": 0.0279541015625, | |
"learning_rate": 4.746835443037975e-06, | |
"loss": 0.0, | |
"step": 145 | |
}, | |
{ | |
"epoch": 9.0625, | |
"eval_accuracy": 0.9, | |
"eval_loss": 0.5583046078681946, | |
"eval_runtime": 4.8879, | |
"eval_samples_per_second": 51.147, | |
"eval_steps_per_second": 1.637, | |
"step": 145 | |
}, | |
{ | |
"epoch": 9.125, | |
"grad_norm": 0.0130615234375, | |
"learning_rate": 4.430379746835443e-06, | |
"loss": 0.0, | |
"step": 146 | |
}, | |
{ | |
"epoch": 9.125, | |
"eval_accuracy": 0.9, | |
"eval_loss": 0.5578604936599731, | |
"eval_runtime": 4.8946, | |
"eval_samples_per_second": 51.077, | |
"eval_steps_per_second": 1.634, | |
"step": 146 | |
}, | |
{ | |
"epoch": 9.1875, | |
"grad_norm": 0.00732421875, | |
"learning_rate": 4.113924050632911e-06, | |
"loss": 0.0, | |
"step": 147 | |
}, | |
{ | |
"epoch": 9.1875, | |
"eval_accuracy": 0.9, | |
"eval_loss": 0.5572697520256042, | |
"eval_runtime": 4.8664, | |
"eval_samples_per_second": 51.372, | |
"eval_steps_per_second": 1.644, | |
"step": 147 | |
}, | |
{ | |
"epoch": 9.25, | |
"grad_norm": 0.004364013671875, | |
"learning_rate": 3.7974683544303802e-06, | |
"loss": 0.0, | |
"step": 148 | |
}, | |
{ | |
"epoch": 9.25, | |
"eval_accuracy": 0.9, | |
"eval_loss": 0.5586851835250854, | |
"eval_runtime": 4.8653, | |
"eval_samples_per_second": 51.384, | |
"eval_steps_per_second": 1.644, | |
"step": 148 | |
}, | |
{ | |
"epoch": 9.3125, | |
"grad_norm": 0.010986328125, | |
"learning_rate": 3.4810126582278482e-06, | |
"loss": 0.0, | |
"step": 149 | |
}, | |
{ | |
"epoch": 9.3125, | |
"eval_accuracy": 0.9, | |
"eval_loss": 0.5570804476737976, | |
"eval_runtime": 4.8876, | |
"eval_samples_per_second": 51.15, | |
"eval_steps_per_second": 1.637, | |
"step": 149 | |
}, | |
{ | |
"epoch": 9.375, | |
"grad_norm": 0.0025787353515625, | |
"learning_rate": 3.1645569620253167e-06, | |
"loss": 0.0, | |
"step": 150 | |
}, | |
{ | |
"epoch": 9.375, | |
"eval_accuracy": 0.9, | |
"eval_loss": 0.5560451745986938, | |
"eval_runtime": 4.8867, | |
"eval_samples_per_second": 51.159, | |
"eval_steps_per_second": 1.637, | |
"step": 150 | |
}, | |
{ | |
"epoch": 9.4375, | |
"grad_norm": 0.00616455078125, | |
"learning_rate": 2.848101265822785e-06, | |
"loss": 0.0, | |
"step": 151 | |
}, | |
{ | |
"epoch": 9.4375, | |
"eval_accuracy": 0.896, | |
"eval_loss": 0.5584925413131714, | |
"eval_runtime": 4.8614, | |
"eval_samples_per_second": 51.425, | |
"eval_steps_per_second": 1.646, | |
"step": 151 | |
}, | |
{ | |
"epoch": 9.5, | |
"grad_norm": 0.01312255859375, | |
"learning_rate": 2.531645569620253e-06, | |
"loss": 0.0, | |
"step": 152 | |
}, | |
{ | |
"epoch": 9.5, | |
"eval_accuracy": 0.9, | |
"eval_loss": 0.5539237260818481, | |
"eval_runtime": 4.8841, | |
"eval_samples_per_second": 51.186, | |
"eval_steps_per_second": 1.638, | |
"step": 152 | |
}, | |
{ | |
"epoch": 9.5625, | |
"grad_norm": 0.00604248046875, | |
"learning_rate": 2.2151898734177215e-06, | |
"loss": 0.0, | |
"step": 153 | |
}, | |
{ | |
"epoch": 9.5625, | |
"eval_accuracy": 0.904, | |
"eval_loss": 0.5538834929466248, | |
"eval_runtime": 4.8602, | |
"eval_samples_per_second": 51.438, | |
"eval_steps_per_second": 1.646, | |
"step": 153 | |
}, | |
{ | |
"epoch": 9.625, | |
"grad_norm": 0.032958984375, | |
"learning_rate": 1.8987341772151901e-06, | |
"loss": 0.0001, | |
"step": 154 | |
}, | |
{ | |
"epoch": 9.625, | |
"eval_accuracy": 0.9, | |
"eval_loss": 0.5534281134605408, | |
"eval_runtime": 4.8627, | |
"eval_samples_per_second": 51.412, | |
"eval_steps_per_second": 1.645, | |
"step": 154 | |
}, | |
{ | |
"epoch": 9.6875, | |
"grad_norm": 0.007354736328125, | |
"learning_rate": 1.5822784810126583e-06, | |
"loss": 0.0, | |
"step": 155 | |
}, | |
{ | |
"epoch": 9.6875, | |
"eval_accuracy": 0.9, | |
"eval_loss": 0.5582576990127563, | |
"eval_runtime": 4.8653, | |
"eval_samples_per_second": 51.384, | |
"eval_steps_per_second": 1.644, | |
"step": 155 | |
}, | |
{ | |
"epoch": 9.75, | |
"grad_norm": 0.00421142578125, | |
"learning_rate": 1.2658227848101265e-06, | |
"loss": 0.0, | |
"step": 156 | |
}, | |
{ | |
"epoch": 9.75, | |
"eval_accuracy": 0.896, | |
"eval_loss": 0.5575215816497803, | |
"eval_runtime": 4.878, | |
"eval_samples_per_second": 51.251, | |
"eval_steps_per_second": 1.64, | |
"step": 156 | |
}, | |
{ | |
"epoch": 9.8125, | |
"grad_norm": 0.00482177734375, | |
"learning_rate": 9.493670886075951e-07, | |
"loss": 0.0, | |
"step": 157 | |
}, | |
{ | |
"epoch": 9.8125, | |
"eval_accuracy": 0.896, | |
"eval_loss": 0.5540234446525574, | |
"eval_runtime": 4.8905, | |
"eval_samples_per_second": 51.119, | |
"eval_steps_per_second": 1.636, | |
"step": 157 | |
}, | |
{ | |
"epoch": 9.875, | |
"grad_norm": 0.0147705078125, | |
"learning_rate": 6.329113924050633e-07, | |
"loss": 0.0, | |
"step": 158 | |
}, | |
{ | |
"epoch": 9.875, | |
"eval_accuracy": 0.896, | |
"eval_loss": 0.55516517162323, | |
"eval_runtime": 4.8871, | |
"eval_samples_per_second": 51.156, | |
"eval_steps_per_second": 1.637, | |
"step": 158 | |
}, | |
{ | |
"epoch": 9.9375, | |
"grad_norm": 0.0033416748046875, | |
"learning_rate": 3.1645569620253163e-07, | |
"loss": 0.0, | |
"step": 159 | |
}, | |
{ | |
"epoch": 9.9375, | |
"eval_accuracy": 0.896, | |
"eval_loss": 0.5555429458618164, | |
"eval_runtime": 4.8568, | |
"eval_samples_per_second": 51.474, | |
"eval_steps_per_second": 1.647, | |
"step": 159 | |
}, | |
{ | |
"epoch": 10.0, | |
"grad_norm": 0.0185546875, | |
"learning_rate": 0.0, | |
"loss": 0.0, | |
"step": 160 | |
}, | |
{ | |
"epoch": 10.0, | |
"eval_accuracy": 0.896, | |
"eval_loss": 0.557336688041687, | |
"eval_runtime": 4.8633, | |
"eval_samples_per_second": 51.405, | |
"eval_steps_per_second": 1.645, | |
"step": 160 | |
}, | |
{
"epoch": 10.0,
"step": 160,
"total_flos": 7.024653098391962e+16,
"train_loss": 0.1563696864293888,
"train_runtime": 1332.2795,
"train_samples_per_second": 7.506,
"train_steps_per_second": 0.12
}
],
"logging_steps": 1,
"max_steps": 160,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7.024653098391962e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}