{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 1, "global_step": 160, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0625, "grad_norm": 268.0, "learning_rate": 2.5e-05, "loss": 2.5702, "step": 1 }, { "epoch": 0.0625, "eval_accuracy": 0.508, "eval_loss": 2.8505053520202637, "eval_runtime": 4.8412, "eval_samples_per_second": 51.64, "eval_steps_per_second": 1.652, "step": 1 }, { "epoch": 0.125, "grad_norm": 227.0, "learning_rate": 5e-05, "loss": 2.6389, "step": 2 }, { "epoch": 0.125, "eval_accuracy": 0.528, "eval_loss": 2.2588462829589844, "eval_runtime": 4.8967, "eval_samples_per_second": 51.054, "eval_steps_per_second": 1.634, "step": 2 }, { "epoch": 0.1875, "grad_norm": 236.0, "learning_rate": 4.968354430379747e-05, "loss": 2.3551, "step": 3 }, { "epoch": 0.1875, "eval_accuracy": 0.536, "eval_loss": 1.4990876913070679, "eval_runtime": 4.8929, "eval_samples_per_second": 51.095, "eval_steps_per_second": 1.635, "step": 3 }, { "epoch": 0.25, "grad_norm": 89.0, "learning_rate": 4.936708860759494e-05, "loss": 1.5552, "step": 4 }, { "epoch": 0.25, "eval_accuracy": 0.544, "eval_loss": 1.8495231866836548, "eval_runtime": 4.8705, "eval_samples_per_second": 51.33, "eval_steps_per_second": 1.643, "step": 4 }, { "epoch": 0.3125, "grad_norm": 130.0, "learning_rate": 4.905063291139241e-05, "loss": 1.6022, "step": 5 }, { "epoch": 0.3125, "eval_accuracy": 0.552, "eval_loss": 1.5980929136276245, "eval_runtime": 4.8961, "eval_samples_per_second": 51.061, "eval_steps_per_second": 1.634, "step": 5 }, { "epoch": 0.375, "grad_norm": 190.0, "learning_rate": 4.8734177215189874e-05, "loss": 1.8478, "step": 6 }, { "epoch": 0.375, "eval_accuracy": 0.62, "eval_loss": 1.179288625717163, "eval_runtime": 4.8667, "eval_samples_per_second": 51.37, "eval_steps_per_second": 1.644, "step": 6 }, { "epoch": 0.4375, "grad_norm": 92.5, "learning_rate": 4.8417721518987346e-05, "loss": 1.1437, "step": 7 }, { "epoch": 0.4375, "eval_accuracy": 0.624, "eval_loss": 1.0324233770370483, "eval_runtime": 4.8984, "eval_samples_per_second": 51.037, "eval_steps_per_second": 1.633, "step": 7 }, { "epoch": 0.5, "grad_norm": 52.25, "learning_rate": 4.810126582278481e-05, "loss": 0.7386, "step": 8 }, { "epoch": 0.5, "eval_accuracy": 0.628, "eval_loss": 1.0248664617538452, "eval_runtime": 4.8699, "eval_samples_per_second": 51.335, "eval_steps_per_second": 1.643, "step": 8 }, { "epoch": 0.5625, "grad_norm": 69.0, "learning_rate": 4.778481012658228e-05, "loss": 1.0636, "step": 9 }, { "epoch": 0.5625, "eval_accuracy": 0.656, "eval_loss": 0.9664216041564941, "eval_runtime": 4.8671, "eval_samples_per_second": 51.365, "eval_steps_per_second": 1.644, "step": 9 }, { "epoch": 0.625, "grad_norm": 104.0, "learning_rate": 4.7468354430379746e-05, "loss": 0.9956, "step": 10 }, { "epoch": 0.625, "eval_accuracy": 0.696, "eval_loss": 0.8491688966751099, "eval_runtime": 4.8707, "eval_samples_per_second": 51.328, "eval_steps_per_second": 1.642, "step": 10 }, { "epoch": 0.6875, "grad_norm": 51.75, "learning_rate": 4.715189873417722e-05, "loss": 0.7195, "step": 11 }, { "epoch": 0.6875, "eval_accuracy": 0.724, "eval_loss": 0.7724543809890747, "eval_runtime": 4.8634, "eval_samples_per_second": 51.405, "eval_steps_per_second": 1.645, "step": 11 }, { "epoch": 0.75, "grad_norm": 24.625, "learning_rate": 4.683544303797468e-05, "loss": 0.7704, "step": 12 }, { "epoch": 0.75, "eval_accuracy": 0.728, "eval_loss": 0.7519556283950806, "eval_runtime": 4.8881, "eval_samples_per_second": 51.144, "eval_steps_per_second": 1.637, "step": 12 }, { "epoch": 0.8125, "grad_norm": 18.625, "learning_rate": 4.6518987341772154e-05, "loss": 0.4636, "step": 13 }, { "epoch": 0.8125, "eval_accuracy": 0.744, "eval_loss": 0.7026970386505127, "eval_runtime": 4.8695, "eval_samples_per_second": 51.34, "eval_steps_per_second": 1.643, "step": 13 }, { "epoch": 0.875, "grad_norm": 22.25, "learning_rate": 4.6202531645569625e-05, "loss": 0.771, "step": 14 }, { "epoch": 0.875, "eval_accuracy": 0.788, "eval_loss": 0.6045461893081665, "eval_runtime": 4.8902, "eval_samples_per_second": 51.123, "eval_steps_per_second": 1.636, "step": 14 }, { "epoch": 0.9375, "grad_norm": 15.5, "learning_rate": 4.588607594936709e-05, "loss": 0.7138, "step": 15 }, { "epoch": 0.9375, "eval_accuracy": 0.82, "eval_loss": 0.5602908730506897, "eval_runtime": 4.8902, "eval_samples_per_second": 51.123, "eval_steps_per_second": 1.636, "step": 15 }, { "epoch": 1.0, "grad_norm": 30.875, "learning_rate": 4.556962025316456e-05, "loss": 0.7263, "step": 16 }, { "epoch": 1.0, "eval_accuracy": 0.804, "eval_loss": 0.5666066408157349, "eval_runtime": 4.8404, "eval_samples_per_second": 51.649, "eval_steps_per_second": 1.653, "step": 16 }, { "epoch": 1.0625, "grad_norm": 77.5, "learning_rate": 4.525316455696203e-05, "loss": 0.3919, "step": 17 }, { "epoch": 1.0625, "eval_accuracy": 0.808, "eval_loss": 0.5534331202507019, "eval_runtime": 4.816, "eval_samples_per_second": 51.911, "eval_steps_per_second": 1.661, "step": 17 }, { "epoch": 1.125, "grad_norm": 44.5, "learning_rate": 4.49367088607595e-05, "loss": 0.2448, "step": 18 }, { "epoch": 1.125, "eval_accuracy": 0.836, "eval_loss": 0.5073856711387634, "eval_runtime": 4.8889, "eval_samples_per_second": 51.137, "eval_steps_per_second": 1.636, "step": 18 }, { "epoch": 1.1875, "grad_norm": 31.75, "learning_rate": 4.462025316455696e-05, "loss": 0.2112, "step": 19 }, { "epoch": 1.1875, "eval_accuracy": 0.852, "eval_loss": 0.4510812759399414, "eval_runtime": 4.8632, "eval_samples_per_second": 51.407, "eval_steps_per_second": 1.645, "step": 19 }, { "epoch": 1.25, "grad_norm": 33.25, "learning_rate": 4.430379746835443e-05, "loss": 0.335, "step": 20 }, { "epoch": 1.25, "eval_accuracy": 0.828, "eval_loss": 0.4451099932193756, "eval_runtime": 4.8972, "eval_samples_per_second": 51.05, "eval_steps_per_second": 1.634, "step": 20 }, { "epoch": 1.3125, "grad_norm": 17.375, "learning_rate": 4.3987341772151904e-05, "loss": 0.196, "step": 21 }, { "epoch": 1.3125, "eval_accuracy": 0.82, "eval_loss": 0.47517192363739014, "eval_runtime": 4.8852, "eval_samples_per_second": 51.175, "eval_steps_per_second": 1.638, "step": 21 }, { "epoch": 1.375, "grad_norm": 45.75, "learning_rate": 4.367088607594937e-05, "loss": 0.1765, "step": 22 }, { "epoch": 1.375, "eval_accuracy": 0.82, "eval_loss": 0.4687165319919586, "eval_runtime": 4.8161, "eval_samples_per_second": 51.909, "eval_steps_per_second": 1.661, "step": 22 }, { "epoch": 1.4375, "grad_norm": 18.625, "learning_rate": 4.3354430379746834e-05, "loss": 0.2245, "step": 23 }, { "epoch": 1.4375, "eval_accuracy": 0.824, "eval_loss": 0.4411630928516388, "eval_runtime": 4.8861, "eval_samples_per_second": 51.165, "eval_steps_per_second": 1.637, "step": 23 }, { "epoch": 1.5, "grad_norm": 10.0, "learning_rate": 4.3037974683544305e-05, "loss": 0.137, "step": 24 }, { "epoch": 1.5, "eval_accuracy": 0.844, "eval_loss": 0.4266666769981384, "eval_runtime": 4.8898, "eval_samples_per_second": 51.127, "eval_steps_per_second": 1.636, "step": 24 }, { "epoch": 1.5625, "grad_norm": 13.125, "learning_rate": 4.2721518987341776e-05, "loss": 0.1957, "step": 25 }, { "epoch": 1.5625, "eval_accuracy": 0.86, "eval_loss": 0.42926496267318726, "eval_runtime": 4.8436, "eval_samples_per_second": 51.614, "eval_steps_per_second": 1.652, "step": 25 }, { "epoch": 1.625, "grad_norm": 62.0, "learning_rate": 4.240506329113924e-05, "loss": 0.3881, "step": 26 }, { "epoch": 1.625, "eval_accuracy": 0.864, "eval_loss": 0.43251192569732666, "eval_runtime": 4.8871, "eval_samples_per_second": 51.155, "eval_steps_per_second": 1.637, "step": 26 }, { "epoch": 1.6875, "grad_norm": 7.9375, "learning_rate": 4.208860759493671e-05, "loss": 0.1298, "step": 27 }, { "epoch": 1.6875, "eval_accuracy": 0.856, "eval_loss": 0.46340441703796387, "eval_runtime": 4.8777, "eval_samples_per_second": 51.254, "eval_steps_per_second": 1.64, "step": 27 }, { "epoch": 1.75, "grad_norm": 13.625, "learning_rate": 4.177215189873418e-05, "loss": 0.1124, "step": 28 }, { "epoch": 1.75, "eval_accuracy": 0.86, "eval_loss": 0.46482232213020325, "eval_runtime": 4.8682, "eval_samples_per_second": 51.353, "eval_steps_per_second": 1.643, "step": 28 }, { "epoch": 1.8125, "grad_norm": 59.25, "learning_rate": 4.145569620253165e-05, "loss": 0.2744, "step": 29 }, { "epoch": 1.8125, "eval_accuracy": 0.848, "eval_loss": 0.43966910243034363, "eval_runtime": 4.8941, "eval_samples_per_second": 51.082, "eval_steps_per_second": 1.635, "step": 29 }, { "epoch": 1.875, "grad_norm": 5.28125, "learning_rate": 4.113924050632912e-05, "loss": 0.0496, "step": 30 }, { "epoch": 1.875, "eval_accuracy": 0.844, "eval_loss": 0.4173106849193573, "eval_runtime": 4.8998, "eval_samples_per_second": 51.023, "eval_steps_per_second": 1.633, "step": 30 }, { "epoch": 1.9375, "grad_norm": 29.625, "learning_rate": 4.0822784810126584e-05, "loss": 0.1658, "step": 31 }, { "epoch": 1.9375, "eval_accuracy": 0.832, "eval_loss": 0.41856226325035095, "eval_runtime": 4.8693, "eval_samples_per_second": 51.343, "eval_steps_per_second": 1.643, "step": 31 }, { "epoch": 2.0, "grad_norm": 12.625, "learning_rate": 4.050632911392405e-05, "loss": 0.1718, "step": 32 }, { "epoch": 2.0, "eval_accuracy": 0.836, "eval_loss": 0.4255797863006592, "eval_runtime": 4.8685, "eval_samples_per_second": 51.35, "eval_steps_per_second": 1.643, "step": 32 }, { "epoch": 2.0625, "grad_norm": 14.125, "learning_rate": 4.018987341772152e-05, "loss": 0.0979, "step": 33 }, { "epoch": 2.0625, "eval_accuracy": 0.836, "eval_loss": 0.4206787645816803, "eval_runtime": 4.8232, "eval_samples_per_second": 51.833, "eval_steps_per_second": 1.659, "step": 33 }, { "epoch": 2.125, "grad_norm": 4.65625, "learning_rate": 3.987341772151899e-05, "loss": 0.0284, "step": 34 }, { "epoch": 2.125, "eval_accuracy": 0.844, "eval_loss": 0.41741058230400085, "eval_runtime": 4.8675, "eval_samples_per_second": 51.361, "eval_steps_per_second": 1.644, "step": 34 }, { "epoch": 2.1875, "grad_norm": 8.875, "learning_rate": 3.9556962025316456e-05, "loss": 0.0558, "step": 35 }, { "epoch": 2.1875, "eval_accuracy": 0.844, "eval_loss": 0.4207935929298401, "eval_runtime": 4.7688, "eval_samples_per_second": 52.424, "eval_steps_per_second": 1.678, "step": 35 }, { "epoch": 2.25, "grad_norm": 4.75, "learning_rate": 3.924050632911392e-05, "loss": 0.1725, "step": 36 }, { "epoch": 2.25, "eval_accuracy": 0.84, "eval_loss": 0.43104878067970276, "eval_runtime": 4.8634, "eval_samples_per_second": 51.404, "eval_steps_per_second": 1.645, "step": 36 }, { "epoch": 2.3125, "grad_norm": 11.25, "learning_rate": 3.89240506329114e-05, "loss": 0.0363, "step": 37 }, { "epoch": 2.3125, "eval_accuracy": 0.852, "eval_loss": 0.4319431185722351, "eval_runtime": 4.8918, "eval_samples_per_second": 51.106, "eval_steps_per_second": 1.635, "step": 37 }, { "epoch": 2.375, "grad_norm": 10.125, "learning_rate": 3.8607594936708864e-05, "loss": 0.0429, "step": 38 }, { "epoch": 2.375, "eval_accuracy": 0.856, "eval_loss": 0.4509444534778595, "eval_runtime": 4.8964, "eval_samples_per_second": 51.058, "eval_steps_per_second": 1.634, "step": 38 }, { "epoch": 2.4375, "grad_norm": 8.3125, "learning_rate": 3.829113924050633e-05, "loss": 0.0468, "step": 39 }, { "epoch": 2.4375, "eval_accuracy": 0.856, "eval_loss": 0.4592805802822113, "eval_runtime": 4.8908, "eval_samples_per_second": 51.117, "eval_steps_per_second": 1.636, "step": 39 }, { "epoch": 2.5, "grad_norm": 16.5, "learning_rate": 3.79746835443038e-05, "loss": 0.0525, "step": 40 }, { "epoch": 2.5, "eval_accuracy": 0.864, "eval_loss": 0.4509994387626648, "eval_runtime": 4.8936, "eval_samples_per_second": 51.088, "eval_steps_per_second": 1.635, "step": 40 }, { "epoch": 2.5625, "grad_norm": 3.609375, "learning_rate": 3.765822784810127e-05, "loss": 0.0232, "step": 41 }, { "epoch": 2.5625, "eval_accuracy": 0.852, "eval_loss": 0.45242950320243835, "eval_runtime": 4.8925, "eval_samples_per_second": 51.099, "eval_steps_per_second": 1.635, "step": 41 }, { "epoch": 2.625, "grad_norm": 1.734375, "learning_rate": 3.7341772151898736e-05, "loss": 0.0137, "step": 42 }, { "epoch": 2.625, "eval_accuracy": 0.852, "eval_loss": 0.46332496404647827, "eval_runtime": 4.8687, "eval_samples_per_second": 51.349, "eval_steps_per_second": 1.643, "step": 42 }, { "epoch": 2.6875, "grad_norm": 4.75, "learning_rate": 3.70253164556962e-05, "loss": 0.0202, "step": 43 }, { "epoch": 2.6875, "eval_accuracy": 0.86, "eval_loss": 0.5035133957862854, "eval_runtime": 4.8899, "eval_samples_per_second": 51.126, "eval_steps_per_second": 1.636, "step": 43 }, { "epoch": 2.75, "grad_norm": 15.6875, "learning_rate": 3.670886075949367e-05, "loss": 0.0383, "step": 44 }, { "epoch": 2.75, "eval_accuracy": 0.852, "eval_loss": 0.5293290615081787, "eval_runtime": 4.8882, "eval_samples_per_second": 51.144, "eval_steps_per_second": 1.637, "step": 44 }, { "epoch": 2.8125, "grad_norm": 24.75, "learning_rate": 3.639240506329114e-05, "loss": 0.073, "step": 45 }, { "epoch": 2.8125, "eval_accuracy": 0.856, "eval_loss": 0.5254361033439636, "eval_runtime": 4.865, "eval_samples_per_second": 51.387, "eval_steps_per_second": 1.644, "step": 45 }, { "epoch": 2.875, "grad_norm": 28.25, "learning_rate": 3.607594936708861e-05, "loss": 0.0725, "step": 46 }, { "epoch": 2.875, "eval_accuracy": 0.864, "eval_loss": 0.5004922151565552, "eval_runtime": 4.895, "eval_samples_per_second": 51.072, "eval_steps_per_second": 1.634, "step": 46 }, { "epoch": 2.9375, "grad_norm": 16.375, "learning_rate": 3.575949367088608e-05, "loss": 0.0516, "step": 47 }, { "epoch": 2.9375, "eval_accuracy": 0.864, "eval_loss": 0.463413268327713, "eval_runtime": 4.8708, "eval_samples_per_second": 51.326, "eval_steps_per_second": 1.642, "step": 47 }, { "epoch": 3.0, "grad_norm": 10.875, "learning_rate": 3.5443037974683544e-05, "loss": 0.0255, "step": 48 }, { "epoch": 3.0, "eval_accuracy": 0.868, "eval_loss": 0.4455827474594116, "eval_runtime": 4.8928, "eval_samples_per_second": 51.096, "eval_steps_per_second": 1.635, "step": 48 }, { "epoch": 3.0625, "grad_norm": 7.9375, "learning_rate": 3.5126582278481015e-05, "loss": 0.0191, "step": 49 }, { "epoch": 3.0625, "eval_accuracy": 0.868, "eval_loss": 0.45068883895874023, "eval_runtime": 4.8384, "eval_samples_per_second": 51.67, "eval_steps_per_second": 1.653, "step": 49 }, { "epoch": 3.125, "grad_norm": 0.23828125, "learning_rate": 3.4810126582278487e-05, "loss": 0.0021, "step": 50 }, { "epoch": 3.125, "eval_accuracy": 0.868, "eval_loss": 0.4584948718547821, "eval_runtime": 4.838, "eval_samples_per_second": 51.675, "eval_steps_per_second": 1.654, "step": 50 }, { "epoch": 3.1875, "grad_norm": 1.4453125, "learning_rate": 3.449367088607595e-05, "loss": 0.0034, "step": 51 }, { "epoch": 3.1875, "eval_accuracy": 0.864, "eval_loss": 0.4626566767692566, "eval_runtime": 4.8918, "eval_samples_per_second": 51.105, "eval_steps_per_second": 1.635, "step": 51 }, { "epoch": 3.25, "grad_norm": 1.2890625, "learning_rate": 3.4177215189873416e-05, "loss": 0.0057, "step": 52 }, { "epoch": 3.25, "eval_accuracy": 0.864, "eval_loss": 0.46148645877838135, "eval_runtime": 4.8885, "eval_samples_per_second": 51.141, "eval_steps_per_second": 1.637, "step": 52 }, { "epoch": 3.3125, "grad_norm": 2.984375, "learning_rate": 3.386075949367089e-05, "loss": 0.0123, "step": 53 }, { "epoch": 3.3125, "eval_accuracy": 0.852, "eval_loss": 0.4723862409591675, "eval_runtime": 4.8923, "eval_samples_per_second": 51.101, "eval_steps_per_second": 1.635, "step": 53 }, { "epoch": 3.375, "grad_norm": 5.75, "learning_rate": 3.354430379746836e-05, "loss": 0.0111, "step": 54 }, { "epoch": 3.375, "eval_accuracy": 0.856, "eval_loss": 0.463609904050827, "eval_runtime": 4.8347, "eval_samples_per_second": 51.71, "eval_steps_per_second": 1.655, "step": 54 }, { "epoch": 3.4375, "grad_norm": 4.21875, "learning_rate": 3.322784810126582e-05, "loss": 0.0072, "step": 55 }, { "epoch": 3.4375, "eval_accuracy": 0.88, "eval_loss": 0.44373881816864014, "eval_runtime": 4.8346, "eval_samples_per_second": 51.71, "eval_steps_per_second": 1.655, "step": 55 }, { "epoch": 3.5, "grad_norm": 2.6875, "learning_rate": 3.291139240506329e-05, "loss": 0.0044, "step": 56 }, { "epoch": 3.5, "eval_accuracy": 0.896, "eval_loss": 0.4286611080169678, "eval_runtime": 4.8885, "eval_samples_per_second": 51.14, "eval_steps_per_second": 1.636, "step": 56 }, { "epoch": 3.5625, "grad_norm": 0.34375, "learning_rate": 3.2594936708860766e-05, "loss": 0.0013, "step": 57 }, { "epoch": 3.5625, "eval_accuracy": 0.892, "eval_loss": 0.4324968755245209, "eval_runtime": 4.882, "eval_samples_per_second": 51.209, "eval_steps_per_second": 1.639, "step": 57 }, { "epoch": 3.625, "grad_norm": 0.111328125, "learning_rate": 3.227848101265823e-05, "loss": 0.0005, "step": 58 }, { "epoch": 3.625, "eval_accuracy": 0.896, "eval_loss": 0.45419201254844666, "eval_runtime": 4.8691, "eval_samples_per_second": 51.344, "eval_steps_per_second": 1.643, "step": 58 }, { "epoch": 3.6875, "grad_norm": 1.3984375, "learning_rate": 3.1962025316455695e-05, "loss": 0.0045, "step": 59 }, { "epoch": 3.6875, "eval_accuracy": 0.892, "eval_loss": 0.4625495672225952, "eval_runtime": 4.8885, "eval_samples_per_second": 51.14, "eval_steps_per_second": 1.636, "step": 59 }, { "epoch": 3.75, "grad_norm": 0.287109375, "learning_rate": 3.1645569620253167e-05, "loss": 0.0018, "step": 60 }, { "epoch": 3.75, "eval_accuracy": 0.888, "eval_loss": 0.4779915511608124, "eval_runtime": 4.8965, "eval_samples_per_second": 51.057, "eval_steps_per_second": 1.634, "step": 60 }, { "epoch": 3.8125, "grad_norm": 2.109375, "learning_rate": 3.132911392405064e-05, "loss": 0.0039, "step": 61 }, { "epoch": 3.8125, "eval_accuracy": 0.892, "eval_loss": 0.47442150115966797, "eval_runtime": 4.8755, "eval_samples_per_second": 51.277, "eval_steps_per_second": 1.641, "step": 61 }, { "epoch": 3.875, "grad_norm": 0.201171875, "learning_rate": 3.10126582278481e-05, "loss": 0.0007, "step": 62 }, { "epoch": 3.875, "eval_accuracy": 0.896, "eval_loss": 0.47328320145606995, "eval_runtime": 4.8693, "eval_samples_per_second": 51.342, "eval_steps_per_second": 1.643, "step": 62 }, { "epoch": 3.9375, "grad_norm": 0.56640625, "learning_rate": 3.0696202531645574e-05, "loss": 0.0009, "step": 63 }, { "epoch": 3.9375, "eval_accuracy": 0.896, "eval_loss": 0.4680858850479126, "eval_runtime": 4.8481, "eval_samples_per_second": 51.567, "eval_steps_per_second": 1.65, "step": 63 }, { "epoch": 4.0, "grad_norm": 0.259765625, "learning_rate": 3.0379746835443042e-05, "loss": 0.0011, "step": 64 }, { "epoch": 4.0, "eval_accuracy": 0.9, "eval_loss": 0.46620070934295654, "eval_runtime": 4.8197, "eval_samples_per_second": 51.871, "eval_steps_per_second": 1.66, "step": 64 }, { "epoch": 4.0625, "grad_norm": 0.01483154296875, "learning_rate": 3.0063291139240506e-05, "loss": 0.0001, "step": 65 }, { "epoch": 4.0625, "eval_accuracy": 0.904, "eval_loss": 0.47483450174331665, "eval_runtime": 4.8902, "eval_samples_per_second": 51.123, "eval_steps_per_second": 1.636, "step": 65 }, { "epoch": 4.125, "grad_norm": 0.197265625, "learning_rate": 2.9746835443037974e-05, "loss": 0.0004, "step": 66 }, { "epoch": 4.125, "eval_accuracy": 0.9, "eval_loss": 0.4847821593284607, "eval_runtime": 4.8909, "eval_samples_per_second": 51.116, "eval_steps_per_second": 1.636, "step": 66 }, { "epoch": 4.1875, "grad_norm": 0.283203125, "learning_rate": 2.9430379746835446e-05, "loss": 0.0008, "step": 67 }, { "epoch": 4.1875, "eval_accuracy": 0.9, "eval_loss": 0.49961230158805847, "eval_runtime": 4.8722, "eval_samples_per_second": 51.312, "eval_steps_per_second": 1.642, "step": 67 }, { "epoch": 4.25, "grad_norm": 0.06103515625, "learning_rate": 2.9113924050632914e-05, "loss": 0.0001, "step": 68 }, { "epoch": 4.25, "eval_accuracy": 0.904, "eval_loss": 0.5119830965995789, "eval_runtime": 4.872, "eval_samples_per_second": 51.314, "eval_steps_per_second": 1.642, "step": 68 }, { "epoch": 4.3125, "grad_norm": 0.10302734375, "learning_rate": 2.879746835443038e-05, "loss": 0.0002, "step": 69 }, { "epoch": 4.3125, "eval_accuracy": 0.904, "eval_loss": 0.5265066623687744, "eval_runtime": 4.8698, "eval_samples_per_second": 51.337, "eval_steps_per_second": 1.643, "step": 69 }, { "epoch": 4.375, "grad_norm": 0.1904296875, "learning_rate": 2.848101265822785e-05, "loss": 0.0003, "step": 70 }, { "epoch": 4.375, "eval_accuracy": 0.904, "eval_loss": 0.53618323802948, "eval_runtime": 4.904, "eval_samples_per_second": 50.978, "eval_steps_per_second": 1.631, "step": 70 }, { "epoch": 4.4375, "grad_norm": 0.31640625, "learning_rate": 2.8164556962025318e-05, "loss": 0.0005, "step": 71 }, { "epoch": 4.4375, "eval_accuracy": 0.9, "eval_loss": 0.5419101119041443, "eval_runtime": 4.8958, "eval_samples_per_second": 51.064, "eval_steps_per_second": 1.634, "step": 71 }, { "epoch": 4.5, "grad_norm": 0.55859375, "learning_rate": 2.7848101265822786e-05, "loss": 0.001, "step": 72 }, { "epoch": 4.5, "eval_accuracy": 0.9, "eval_loss": 0.5409899950027466, "eval_runtime": 4.8722, "eval_samples_per_second": 51.311, "eval_steps_per_second": 1.642, "step": 72 }, { "epoch": 4.5625, "grad_norm": 0.1376953125, "learning_rate": 2.7531645569620257e-05, "loss": 0.0002, "step": 73 }, { "epoch": 4.5625, "eval_accuracy": 0.904, "eval_loss": 0.5402071475982666, "eval_runtime": 4.8891, "eval_samples_per_second": 51.135, "eval_steps_per_second": 1.636, "step": 73 }, { "epoch": 4.625, "grad_norm": 0.107421875, "learning_rate": 2.7215189873417722e-05, "loss": 0.0002, "step": 74 }, { "epoch": 4.625, "eval_accuracy": 0.904, "eval_loss": 0.5370295643806458, "eval_runtime": 4.8731, "eval_samples_per_second": 51.303, "eval_steps_per_second": 1.642, "step": 74 }, { "epoch": 4.6875, "grad_norm": 0.34765625, "learning_rate": 2.689873417721519e-05, "loss": 0.0005, "step": 75 }, { "epoch": 4.6875, "eval_accuracy": 0.904, "eval_loss": 0.5307853817939758, "eval_runtime": 4.868, "eval_samples_per_second": 51.355, "eval_steps_per_second": 1.643, "step": 75 }, { "epoch": 4.75, "grad_norm": 0.0732421875, "learning_rate": 2.6582278481012658e-05, "loss": 0.0001, "step": 76 }, { "epoch": 4.75, "eval_accuracy": 0.9, "eval_loss": 0.5289453268051147, "eval_runtime": 4.8663, "eval_samples_per_second": 51.374, "eval_steps_per_second": 1.644, "step": 76 }, { "epoch": 4.8125, "grad_norm": 0.038818359375, "learning_rate": 2.626582278481013e-05, "loss": 0.0002, "step": 77 }, { "epoch": 4.8125, "eval_accuracy": 0.896, "eval_loss": 0.5269708037376404, "eval_runtime": 4.9075, "eval_samples_per_second": 50.942, "eval_steps_per_second": 1.63, "step": 77 }, { "epoch": 4.875, "grad_norm": 0.015380859375, "learning_rate": 2.5949367088607597e-05, "loss": 0.0, "step": 78 }, { "epoch": 4.875, "eval_accuracy": 0.896, "eval_loss": 0.5278732776641846, "eval_runtime": 4.899, "eval_samples_per_second": 51.031, "eval_steps_per_second": 1.633, "step": 78 }, { "epoch": 4.9375, "grad_norm": 0.056884765625, "learning_rate": 2.5632911392405062e-05, "loss": 0.0001, "step": 79 }, { "epoch": 4.9375, "eval_accuracy": 0.896, "eval_loss": 0.5320713520050049, "eval_runtime": 4.824, "eval_samples_per_second": 51.825, "eval_steps_per_second": 1.658, "step": 79 }, { "epoch": 5.0, "grad_norm": 0.01025390625, "learning_rate": 2.5316455696202533e-05, "loss": 0.0, "step": 80 }, { "epoch": 5.0, "eval_accuracy": 0.896, "eval_loss": 0.5385279059410095, "eval_runtime": 4.8713, "eval_samples_per_second": 51.321, "eval_steps_per_second": 1.642, "step": 80 }, { "epoch": 5.0625, "grad_norm": 0.03076171875, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 81 }, { "epoch": 5.0625, "eval_accuracy": 0.9, "eval_loss": 0.5427132248878479, "eval_runtime": 4.8863, "eval_samples_per_second": 51.163, "eval_steps_per_second": 1.637, "step": 81 }, { "epoch": 5.125, "grad_norm": 0.010986328125, "learning_rate": 2.468354430379747e-05, "loss": 0.0, "step": 82 }, { "epoch": 5.125, "eval_accuracy": 0.9, "eval_loss": 0.5424291491508484, "eval_runtime": 4.869, "eval_samples_per_second": 51.345, "eval_steps_per_second": 1.643, "step": 82 }, { "epoch": 5.1875, "grad_norm": 0.00860595703125, "learning_rate": 2.4367088607594937e-05, "loss": 0.0, "step": 83 }, { "epoch": 5.1875, "eval_accuracy": 0.904, "eval_loss": 0.5434779524803162, "eval_runtime": 4.8938, "eval_samples_per_second": 51.085, "eval_steps_per_second": 1.635, "step": 83 }, { "epoch": 5.25, "grad_norm": 0.046630859375, "learning_rate": 2.4050632911392405e-05, "loss": 0.0001, "step": 84 }, { "epoch": 5.25, "eval_accuracy": 0.9, "eval_loss": 0.552955687046051, "eval_runtime": 4.8992, "eval_samples_per_second": 51.028, "eval_steps_per_second": 1.633, "step": 84 }, { "epoch": 5.3125, "grad_norm": 0.0242919921875, "learning_rate": 2.3734177215189873e-05, "loss": 0.0001, "step": 85 }, { "epoch": 5.3125, "eval_accuracy": 0.908, "eval_loss": 0.5524942874908447, "eval_runtime": 4.8893, "eval_samples_per_second": 51.132, "eval_steps_per_second": 1.636, "step": 85 }, { "epoch": 5.375, "grad_norm": 0.10302734375, "learning_rate": 2.341772151898734e-05, "loss": 0.0002, "step": 86 }, { "epoch": 5.375, "eval_accuracy": 0.904, "eval_loss": 0.5556187629699707, "eval_runtime": 4.8844, "eval_samples_per_second": 51.184, "eval_steps_per_second": 1.638, "step": 86 }, { "epoch": 5.4375, "grad_norm": 0.058837890625, "learning_rate": 2.3101265822784813e-05, "loss": 0.0001, "step": 87 }, { "epoch": 5.4375, "eval_accuracy": 0.904, "eval_loss": 0.5552673935890198, "eval_runtime": 4.8868, "eval_samples_per_second": 51.158, "eval_steps_per_second": 1.637, "step": 87 }, { "epoch": 5.5, "grad_norm": 0.049560546875, "learning_rate": 2.278481012658228e-05, "loss": 0.0001, "step": 88 }, { "epoch": 5.5, "eval_accuracy": 0.908, "eval_loss": 0.5524196624755859, "eval_runtime": 4.8881, "eval_samples_per_second": 51.145, "eval_steps_per_second": 1.637, "step": 88 }, { "epoch": 5.5625, "grad_norm": 0.1201171875, "learning_rate": 2.246835443037975e-05, "loss": 0.0002, "step": 89 }, { "epoch": 5.5625, "eval_accuracy": 0.904, "eval_loss": 0.5546653866767883, "eval_runtime": 4.888, "eval_samples_per_second": 51.145, "eval_steps_per_second": 1.637, "step": 89 }, { "epoch": 5.625, "grad_norm": 0.011474609375, "learning_rate": 2.2151898734177217e-05, "loss": 0.0, "step": 90 }, { "epoch": 5.625, "eval_accuracy": 0.908, "eval_loss": 0.5581926107406616, "eval_runtime": 4.8965, "eval_samples_per_second": 51.056, "eval_steps_per_second": 1.634, "step": 90 }, { "epoch": 5.6875, "grad_norm": 0.00494384765625, "learning_rate": 2.1835443037974685e-05, "loss": 0.0, "step": 91 }, { "epoch": 5.6875, "eval_accuracy": 0.904, "eval_loss": 0.5577874779701233, "eval_runtime": 4.8978, "eval_samples_per_second": 51.043, "eval_steps_per_second": 1.633, "step": 91 }, { "epoch": 5.75, "grad_norm": 0.001556396484375, "learning_rate": 2.1518987341772153e-05, "loss": 0.0, "step": 92 }, { "epoch": 5.75, "eval_accuracy": 0.908, "eval_loss": 0.5564088821411133, "eval_runtime": 4.9024, "eval_samples_per_second": 50.995, "eval_steps_per_second": 1.632, "step": 92 }, { "epoch": 5.8125, "grad_norm": 0.04248046875, "learning_rate": 2.120253164556962e-05, "loss": 0.0001, "step": 93 }, { "epoch": 5.8125, "eval_accuracy": 0.904, "eval_loss": 0.5549535155296326, "eval_runtime": 4.868, "eval_samples_per_second": 51.355, "eval_steps_per_second": 1.643, "step": 93 }, { "epoch": 5.875, "grad_norm": 0.0031890869140625, "learning_rate": 2.088607594936709e-05, "loss": 0.0, "step": 94 }, { "epoch": 5.875, "eval_accuracy": 0.908, "eval_loss": 0.5585831999778748, "eval_runtime": 4.8902, "eval_samples_per_second": 51.123, "eval_steps_per_second": 1.636, "step": 94 }, { "epoch": 5.9375, "grad_norm": 0.0191650390625, "learning_rate": 2.056962025316456e-05, "loss": 0.0, "step": 95 }, { "epoch": 5.9375, "eval_accuracy": 0.9, "eval_loss": 0.5561901330947876, "eval_runtime": 4.899, "eval_samples_per_second": 51.031, "eval_steps_per_second": 1.633, "step": 95 }, { "epoch": 6.0, "grad_norm": 0.0301513671875, "learning_rate": 2.0253164556962025e-05, "loss": 0.0001, "step": 96 }, { "epoch": 6.0, "eval_accuracy": 0.908, "eval_loss": 0.557356595993042, "eval_runtime": 4.8952, "eval_samples_per_second": 51.071, "eval_steps_per_second": 1.634, "step": 96 }, { "epoch": 6.0625, "grad_norm": 0.0029754638671875, "learning_rate": 1.9936708860759496e-05, "loss": 0.0, "step": 97 }, { "epoch": 6.0625, "eval_accuracy": 0.9, "eval_loss": 0.557080090045929, "eval_runtime": 4.8905, "eval_samples_per_second": 51.119, "eval_steps_per_second": 1.636, "step": 97 }, { "epoch": 6.125, "grad_norm": 0.013427734375, "learning_rate": 1.962025316455696e-05, "loss": 0.0, "step": 98 }, { "epoch": 6.125, "eval_accuracy": 0.9, "eval_loss": 0.5583979487419128, "eval_runtime": 4.8684, "eval_samples_per_second": 51.352, "eval_steps_per_second": 1.643, "step": 98 }, { "epoch": 6.1875, "grad_norm": 0.0205078125, "learning_rate": 1.9303797468354432e-05, "loss": 0.0, "step": 99 }, { "epoch": 6.1875, "eval_accuracy": 0.904, "eval_loss": 0.5552465319633484, "eval_runtime": 4.8752, "eval_samples_per_second": 51.28, "eval_steps_per_second": 1.641, "step": 99 }, { "epoch": 6.25, "grad_norm": 0.03857421875, "learning_rate": 1.89873417721519e-05, "loss": 0.0001, "step": 100 }, { "epoch": 6.25, "eval_accuracy": 0.904, "eval_loss": 0.5585227608680725, "eval_runtime": 4.8896, "eval_samples_per_second": 51.129, "eval_steps_per_second": 1.636, "step": 100 }, { "epoch": 6.3125, "grad_norm": 0.047607421875, "learning_rate": 1.8670886075949368e-05, "loss": 0.0001, "step": 101 }, { "epoch": 6.3125, "eval_accuracy": 0.9, "eval_loss": 0.5532352328300476, "eval_runtime": 4.8702, "eval_samples_per_second": 51.332, "eval_steps_per_second": 1.643, "step": 101 }, { "epoch": 6.375, "grad_norm": 0.00836181640625, "learning_rate": 1.8354430379746836e-05, "loss": 0.0, "step": 102 }, { "epoch": 6.375, "eval_accuracy": 0.896, "eval_loss": 0.555249035358429, "eval_runtime": 4.8964, "eval_samples_per_second": 51.058, "eval_steps_per_second": 1.634, "step": 102 }, { "epoch": 6.4375, "grad_norm": 0.01055908203125, "learning_rate": 1.8037974683544304e-05, "loss": 0.0, "step": 103 }, { "epoch": 6.4375, "eval_accuracy": 0.896, "eval_loss": 0.556434690952301, "eval_runtime": 4.9004, "eval_samples_per_second": 51.016, "eval_steps_per_second": 1.633, "step": 103 }, { "epoch": 6.5, "grad_norm": 0.022216796875, "learning_rate": 1.7721518987341772e-05, "loss": 0.0, "step": 104 }, { "epoch": 6.5, "eval_accuracy": 0.9, "eval_loss": 0.5526236891746521, "eval_runtime": 4.87, "eval_samples_per_second": 51.335, "eval_steps_per_second": 1.643, "step": 104 }, { "epoch": 6.5625, "grad_norm": 0.0027618408203125, "learning_rate": 1.7405063291139243e-05, "loss": 0.0, "step": 105 }, { "epoch": 6.5625, "eval_accuracy": 0.904, "eval_loss": 0.5563039779663086, "eval_runtime": 4.892, "eval_samples_per_second": 51.104, "eval_steps_per_second": 1.635, "step": 105 }, { "epoch": 6.625, "grad_norm": 0.007110595703125, "learning_rate": 1.7088607594936708e-05, "loss": 0.0, "step": 106 }, { "epoch": 6.625, "eval_accuracy": 0.892, "eval_loss": 0.5540825724601746, "eval_runtime": 4.8978, "eval_samples_per_second": 51.043, "eval_steps_per_second": 1.633, "step": 106 }, { "epoch": 6.6875, "grad_norm": 0.0031890869140625, "learning_rate": 1.677215189873418e-05, "loss": 0.0, "step": 107 }, { "epoch": 6.6875, "eval_accuracy": 0.9, "eval_loss": 0.5545496940612793, "eval_runtime": 4.8688, "eval_samples_per_second": 51.347, "eval_steps_per_second": 1.643, "step": 107 }, { "epoch": 6.75, "grad_norm": 0.0223388671875, "learning_rate": 1.6455696202531644e-05, "loss": 0.0, "step": 108 }, { "epoch": 6.75, "eval_accuracy": 0.904, "eval_loss": 0.5558124780654907, "eval_runtime": 4.8649, "eval_samples_per_second": 51.389, "eval_steps_per_second": 1.644, "step": 108 }, { "epoch": 6.8125, "grad_norm": 0.007354736328125, "learning_rate": 1.6139240506329115e-05, "loss": 0.0, "step": 109 }, { "epoch": 6.8125, "eval_accuracy": 0.904, "eval_loss": 0.5552881956100464, "eval_runtime": 4.8707, "eval_samples_per_second": 51.327, "eval_steps_per_second": 1.642, "step": 109 }, { "epoch": 6.875, "grad_norm": 0.006011962890625, "learning_rate": 1.5822784810126583e-05, "loss": 0.0, "step": 110 }, { "epoch": 6.875, "eval_accuracy": 0.896, "eval_loss": 0.5591185092926025, "eval_runtime": 4.8653, "eval_samples_per_second": 51.384, "eval_steps_per_second": 1.644, "step": 110 }, { "epoch": 6.9375, "grad_norm": 0.03759765625, "learning_rate": 1.550632911392405e-05, "loss": 0.0001, "step": 111 }, { "epoch": 6.9375, "eval_accuracy": 0.904, "eval_loss": 0.5544008612632751, "eval_runtime": 4.8902, "eval_samples_per_second": 51.122, "eval_steps_per_second": 1.636, "step": 111 }, { "epoch": 7.0, "grad_norm": 0.00213623046875, "learning_rate": 1.5189873417721521e-05, "loss": 0.0, "step": 112 }, { "epoch": 7.0, "eval_accuracy": 0.896, "eval_loss": 0.5563123822212219, "eval_runtime": 4.8882, "eval_samples_per_second": 51.143, "eval_steps_per_second": 1.637, "step": 112 }, { "epoch": 7.0625, "grad_norm": 0.018798828125, "learning_rate": 1.4873417721518987e-05, "loss": 0.0, "step": 113 }, { "epoch": 7.0625, "eval_accuracy": 0.896, "eval_loss": 0.55401611328125, "eval_runtime": 4.8635, "eval_samples_per_second": 51.403, "eval_steps_per_second": 1.645, "step": 113 }, { "epoch": 7.125, "grad_norm": 0.0458984375, "learning_rate": 1.4556962025316457e-05, "loss": 0.0001, "step": 114 }, { "epoch": 7.125, "eval_accuracy": 0.896, "eval_loss": 0.5561829805374146, "eval_runtime": 4.8843, "eval_samples_per_second": 51.184, "eval_steps_per_second": 1.638, "step": 114 }, { "epoch": 7.1875, "grad_norm": 0.01336669921875, "learning_rate": 1.4240506329113925e-05, "loss": 0.0, "step": 115 }, { "epoch": 7.1875, "eval_accuracy": 0.9, "eval_loss": 0.5537428855895996, "eval_runtime": 4.8887, "eval_samples_per_second": 51.138, "eval_steps_per_second": 1.636, "step": 115 }, { "epoch": 7.25, "grad_norm": 0.0113525390625, "learning_rate": 1.3924050632911393e-05, "loss": 0.0, "step": 116 }, { "epoch": 7.25, "eval_accuracy": 0.896, "eval_loss": 0.5549358129501343, "eval_runtime": 4.8909, "eval_samples_per_second": 51.115, "eval_steps_per_second": 1.636, "step": 116 }, { "epoch": 7.3125, "grad_norm": 0.00762939453125, "learning_rate": 1.3607594936708861e-05, "loss": 0.0, "step": 117 }, { "epoch": 7.3125, "eval_accuracy": 0.904, "eval_loss": 0.5518209338188171, "eval_runtime": 4.8812, "eval_samples_per_second": 51.217, "eval_steps_per_second": 1.639, "step": 117 }, { "epoch": 7.375, "grad_norm": 0.0038604736328125, "learning_rate": 1.3291139240506329e-05, "loss": 0.0, "step": 118 }, { "epoch": 7.375, "eval_accuracy": 0.9, "eval_loss": 0.5559548139572144, "eval_runtime": 4.889, "eval_samples_per_second": 51.135, "eval_steps_per_second": 1.636, "step": 118 }, { "epoch": 7.4375, "grad_norm": 0.0196533203125, "learning_rate": 1.2974683544303799e-05, "loss": 0.0, "step": 119 }, { "epoch": 7.4375, "eval_accuracy": 0.896, "eval_loss": 0.5553652048110962, "eval_runtime": 4.8764, "eval_samples_per_second": 51.268, "eval_steps_per_second": 1.641, "step": 119 }, { "epoch": 7.5, "grad_norm": 0.00958251953125, "learning_rate": 1.2658227848101267e-05, "loss": 0.0, "step": 120 }, { "epoch": 7.5, "eval_accuracy": 0.9, "eval_loss": 0.5554091930389404, "eval_runtime": 4.8866, "eval_samples_per_second": 51.161, "eval_steps_per_second": 1.637, "step": 120 }, { "epoch": 7.5625, "grad_norm": 0.0029754638671875, "learning_rate": 1.2341772151898735e-05, "loss": 0.0, "step": 121 }, { "epoch": 7.5625, "eval_accuracy": 0.9, "eval_loss": 0.5547875761985779, "eval_runtime": 4.8661, "eval_samples_per_second": 51.376, "eval_steps_per_second": 1.644, "step": 121 }, { "epoch": 7.625, "grad_norm": 0.001556396484375, "learning_rate": 1.2025316455696203e-05, "loss": 0.0, "step": 122 }, { "epoch": 7.625, "eval_accuracy": 0.9, "eval_loss": 0.55680251121521, "eval_runtime": 4.8915, "eval_samples_per_second": 51.109, "eval_steps_per_second": 1.635, "step": 122 }, { "epoch": 7.6875, "grad_norm": 0.0174560546875, "learning_rate": 1.170886075949367e-05, "loss": 0.0001, "step": 123 }, { "epoch": 7.6875, "eval_accuracy": 0.9, "eval_loss": 0.5555019378662109, "eval_runtime": 4.8666, "eval_samples_per_second": 51.371, "eval_steps_per_second": 1.644, "step": 123 }, { "epoch": 7.75, "grad_norm": 0.02734375, "learning_rate": 1.139240506329114e-05, "loss": 0.0001, "step": 124 }, { "epoch": 7.75, "eval_accuracy": 0.896, "eval_loss": 0.5568622946739197, "eval_runtime": 4.865, "eval_samples_per_second": 51.387, "eval_steps_per_second": 1.644, "step": 124 }, { "epoch": 7.8125, "grad_norm": 0.0242919921875, "learning_rate": 1.1075949367088608e-05, "loss": 0.0, "step": 125 }, { "epoch": 7.8125, "eval_accuracy": 0.9, "eval_loss": 0.5577536821365356, "eval_runtime": 4.8877, "eval_samples_per_second": 51.149, "eval_steps_per_second": 1.637, "step": 125 }, { "epoch": 7.875, "grad_norm": 0.0020904541015625, "learning_rate": 1.0759493670886076e-05, "loss": 0.0, "step": 126 }, { "epoch": 7.875, "eval_accuracy": 0.896, "eval_loss": 0.5592789649963379, "eval_runtime": 4.8614, "eval_samples_per_second": 51.425, "eval_steps_per_second": 1.646, "step": 126 }, { "epoch": 7.9375, "grad_norm": 0.00732421875, "learning_rate": 1.0443037974683544e-05, "loss": 0.0, "step": 127 }, { "epoch": 7.9375, "eval_accuracy": 0.9, "eval_loss": 0.556476891040802, "eval_runtime": 4.892, "eval_samples_per_second": 51.104, "eval_steps_per_second": 1.635, "step": 127 }, { "epoch": 8.0, "grad_norm": 0.001861572265625, "learning_rate": 1.0126582278481012e-05, "loss": 0.0, "step": 128 }, { "epoch": 8.0, "eval_accuracy": 0.9, "eval_loss": 0.555403470993042, "eval_runtime": 4.8901, "eval_samples_per_second": 51.123, "eval_steps_per_second": 1.636, "step": 128 }, { "epoch": 8.0625, "grad_norm": 0.0111083984375, "learning_rate": 9.81012658227848e-06, "loss": 0.0, "step": 129 }, { "epoch": 8.0625, "eval_accuracy": 0.904, "eval_loss": 0.5534031987190247, "eval_runtime": 4.8858, "eval_samples_per_second": 51.169, "eval_steps_per_second": 1.637, "step": 129 }, { "epoch": 8.125, "grad_norm": 0.00604248046875, "learning_rate": 9.49367088607595e-06, "loss": 0.0, "step": 130 }, { "epoch": 8.125, "eval_accuracy": 0.9, "eval_loss": 0.5557398796081543, "eval_runtime": 4.8638, "eval_samples_per_second": 51.4, "eval_steps_per_second": 1.645, "step": 130 }, { "epoch": 8.1875, "grad_norm": 0.008544921875, "learning_rate": 9.177215189873418e-06, "loss": 0.0, "step": 131 }, { "epoch": 8.1875, "eval_accuracy": 0.9, "eval_loss": 0.5564188361167908, "eval_runtime": 4.8832, "eval_samples_per_second": 51.196, "eval_steps_per_second": 1.638, "step": 131 }, { "epoch": 8.25, "grad_norm": 0.003875732421875, "learning_rate": 8.860759493670886e-06, "loss": 0.0, "step": 132 }, { "epoch": 8.25, "eval_accuracy": 0.9, "eval_loss": 0.5545617938041687, "eval_runtime": 4.8916, "eval_samples_per_second": 51.108, "eval_steps_per_second": 1.635, "step": 132 }, { "epoch": 8.3125, "grad_norm": 0.0322265625, "learning_rate": 8.544303797468354e-06, "loss": 0.0001, "step": 133 }, { "epoch": 8.3125, "eval_accuracy": 0.9, "eval_loss": 0.5550633668899536, "eval_runtime": 4.8868, "eval_samples_per_second": 51.158, "eval_steps_per_second": 1.637, "step": 133 }, { "epoch": 8.375, "grad_norm": 0.0296630859375, "learning_rate": 8.227848101265822e-06, "loss": 0.0001, "step": 134 }, { "epoch": 8.375, "eval_accuracy": 0.9, "eval_loss": 0.553980827331543, "eval_runtime": 4.8628, "eval_samples_per_second": 51.411, "eval_steps_per_second": 1.645, "step": 134 }, { "epoch": 8.4375, "grad_norm": 0.00396728515625, "learning_rate": 7.911392405063292e-06, "loss": 0.0, "step": 135 }, { "epoch": 8.4375, "eval_accuracy": 0.9, "eval_loss": 0.5577096939086914, "eval_runtime": 4.8664, "eval_samples_per_second": 51.372, "eval_steps_per_second": 1.644, "step": 135 }, { "epoch": 8.5, "grad_norm": 0.00089263916015625, "learning_rate": 7.5949367088607605e-06, "loss": 0.0, "step": 136 }, { "epoch": 8.5, "eval_accuracy": 0.9, "eval_loss": 0.5528184771537781, "eval_runtime": 4.8633, "eval_samples_per_second": 51.405, "eval_steps_per_second": 1.645, "step": 136 }, { "epoch": 8.5625, "grad_norm": 0.01080322265625, "learning_rate": 7.2784810126582285e-06, "loss": 0.0, "step": 137 }, { "epoch": 8.5625, "eval_accuracy": 0.896, "eval_loss": 0.5544995069503784, "eval_runtime": 4.8638, "eval_samples_per_second": 51.4, "eval_steps_per_second": 1.645, "step": 137 }, { "epoch": 8.625, "grad_norm": 0.006561279296875, "learning_rate": 6.9620253164556965e-06, "loss": 0.0, "step": 138 }, { "epoch": 8.625, "eval_accuracy": 0.896, "eval_loss": 0.5562748908996582, "eval_runtime": 4.8677, "eval_samples_per_second": 51.359, "eval_steps_per_second": 1.643, "step": 138 }, { "epoch": 8.6875, "grad_norm": 0.01007080078125, "learning_rate": 6.6455696202531645e-06, "loss": 0.0, "step": 139 }, { "epoch": 8.6875, "eval_accuracy": 0.9, "eval_loss": 0.5547692775726318, "eval_runtime": 4.8897, "eval_samples_per_second": 51.128, "eval_steps_per_second": 1.636, "step": 139 }, { "epoch": 8.75, "grad_norm": 0.01171875, "learning_rate": 6.329113924050633e-06, "loss": 0.0, "step": 140 }, { "epoch": 8.75, "eval_accuracy": 0.9, "eval_loss": 0.5561813712120056, "eval_runtime": 4.8875, "eval_samples_per_second": 51.151, "eval_steps_per_second": 1.637, "step": 140 }, { "epoch": 8.8125, "grad_norm": 0.0166015625, "learning_rate": 6.012658227848101e-06, "loss": 0.0, "step": 141 }, { "epoch": 8.8125, "eval_accuracy": 0.896, "eval_loss": 0.5564705729484558, "eval_runtime": 4.8825, "eval_samples_per_second": 51.203, "eval_steps_per_second": 1.638, "step": 141 }, { "epoch": 8.875, "grad_norm": 0.002777099609375, "learning_rate": 5.69620253164557e-06, "loss": 0.0, "step": 142 }, { "epoch": 8.875, "eval_accuracy": 0.896, "eval_loss": 0.5576943159103394, "eval_runtime": 4.8931, "eval_samples_per_second": 51.092, "eval_steps_per_second": 1.635, "step": 142 }, { "epoch": 8.9375, "grad_norm": 0.048583984375, "learning_rate": 5.379746835443038e-06, "loss": 0.0001, "step": 143 }, { "epoch": 8.9375, "eval_accuracy": 0.9, "eval_loss": 0.5525559186935425, "eval_runtime": 4.8612, "eval_samples_per_second": 51.427, "eval_steps_per_second": 1.646, "step": 143 }, { "epoch": 9.0, "grad_norm": 0.0019683837890625, "learning_rate": 5.063291139240506e-06, "loss": 0.0, "step": 144 }, { "epoch": 9.0, "eval_accuracy": 0.9, "eval_loss": 0.5553069710731506, "eval_runtime": 4.8923, "eval_samples_per_second": 51.101, "eval_steps_per_second": 1.635, "step": 144 }, { "epoch": 9.0625, "grad_norm": 0.0279541015625, "learning_rate": 4.746835443037975e-06, "loss": 0.0, "step": 145 }, { "epoch": 9.0625, "eval_accuracy": 0.9, "eval_loss": 0.5583046078681946, "eval_runtime": 4.8879, "eval_samples_per_second": 51.147, "eval_steps_per_second": 1.637, "step": 145 }, { "epoch": 9.125, "grad_norm": 0.0130615234375, "learning_rate": 4.430379746835443e-06, "loss": 0.0, "step": 146 }, { "epoch": 9.125, "eval_accuracy": 0.9, "eval_loss": 0.5578604936599731, "eval_runtime": 4.8946, "eval_samples_per_second": 51.077, "eval_steps_per_second": 1.634, "step": 146 }, { "epoch": 9.1875, "grad_norm": 0.00732421875, "learning_rate": 4.113924050632911e-06, "loss": 0.0, "step": 147 }, { "epoch": 9.1875, "eval_accuracy": 0.9, "eval_loss": 0.5572697520256042, "eval_runtime": 4.8664, "eval_samples_per_second": 51.372, "eval_steps_per_second": 1.644, "step": 147 }, { "epoch": 9.25, "grad_norm": 0.004364013671875, "learning_rate": 3.7974683544303802e-06, "loss": 0.0, "step": 148 }, { "epoch": 9.25, "eval_accuracy": 0.9, "eval_loss": 0.5586851835250854, "eval_runtime": 4.8653, "eval_samples_per_second": 51.384, "eval_steps_per_second": 1.644, "step": 148 }, { "epoch": 9.3125, "grad_norm": 0.010986328125, "learning_rate": 3.4810126582278482e-06, "loss": 0.0, "step": 149 }, { "epoch": 9.3125, "eval_accuracy": 0.9, "eval_loss": 0.5570804476737976, "eval_runtime": 4.8876, "eval_samples_per_second": 51.15, "eval_steps_per_second": 1.637, "step": 149 }, { "epoch": 9.375, "grad_norm": 0.0025787353515625, "learning_rate": 3.1645569620253167e-06, "loss": 0.0, "step": 150 }, { "epoch": 9.375, "eval_accuracy": 0.9, "eval_loss": 0.5560451745986938, "eval_runtime": 4.8867, "eval_samples_per_second": 51.159, "eval_steps_per_second": 1.637, "step": 150 }, { "epoch": 9.4375, "grad_norm": 0.00616455078125, "learning_rate": 2.848101265822785e-06, "loss": 0.0, "step": 151 }, { "epoch": 9.4375, "eval_accuracy": 0.896, "eval_loss": 0.5584925413131714, "eval_runtime": 4.8614, "eval_samples_per_second": 51.425, "eval_steps_per_second": 1.646, "step": 151 }, { "epoch": 9.5, "grad_norm": 0.01312255859375, "learning_rate": 2.531645569620253e-06, "loss": 0.0, "step": 152 }, { "epoch": 9.5, "eval_accuracy": 0.9, "eval_loss": 0.5539237260818481, "eval_runtime": 4.8841, "eval_samples_per_second": 51.186, "eval_steps_per_second": 1.638, "step": 152 }, { "epoch": 9.5625, "grad_norm": 0.00604248046875, "learning_rate": 2.2151898734177215e-06, "loss": 0.0, "step": 153 }, { "epoch": 9.5625, "eval_accuracy": 0.904, "eval_loss": 0.5538834929466248, "eval_runtime": 4.8602, "eval_samples_per_second": 51.438, "eval_steps_per_second": 1.646, "step": 153 }, { "epoch": 9.625, "grad_norm": 0.032958984375, "learning_rate": 1.8987341772151901e-06, "loss": 0.0001, "step": 154 }, { "epoch": 9.625, "eval_accuracy": 0.9, "eval_loss": 0.5534281134605408, "eval_runtime": 4.8627, "eval_samples_per_second": 51.412, "eval_steps_per_second": 1.645, "step": 154 }, { "epoch": 9.6875, "grad_norm": 0.007354736328125, "learning_rate": 1.5822784810126583e-06, "loss": 0.0, "step": 155 }, { "epoch": 9.6875, "eval_accuracy": 0.9, "eval_loss": 0.5582576990127563, "eval_runtime": 4.8653, "eval_samples_per_second": 51.384, "eval_steps_per_second": 1.644, "step": 155 }, { "epoch": 9.75, "grad_norm": 0.00421142578125, "learning_rate": 1.2658227848101265e-06, "loss": 0.0, "step": 156 }, { "epoch": 9.75, "eval_accuracy": 0.896, "eval_loss": 0.5575215816497803, "eval_runtime": 4.878, "eval_samples_per_second": 51.251, "eval_steps_per_second": 1.64, "step": 156 }, { "epoch": 9.8125, "grad_norm": 0.00482177734375, "learning_rate": 9.493670886075951e-07, "loss": 0.0, "step": 157 }, { "epoch": 9.8125, "eval_accuracy": 0.896, "eval_loss": 0.5540234446525574, "eval_runtime": 4.8905, "eval_samples_per_second": 51.119, "eval_steps_per_second": 1.636, "step": 157 }, { "epoch": 9.875, "grad_norm": 0.0147705078125, "learning_rate": 6.329113924050633e-07, "loss": 0.0, "step": 158 }, { "epoch": 9.875, "eval_accuracy": 0.896, "eval_loss": 0.55516517162323, "eval_runtime": 4.8871, "eval_samples_per_second": 51.156, "eval_steps_per_second": 1.637, "step": 158 }, { "epoch": 9.9375, "grad_norm": 0.0033416748046875, "learning_rate": 3.1645569620253163e-07, "loss": 0.0, "step": 159 }, { "epoch": 9.9375, "eval_accuracy": 0.896, "eval_loss": 0.5555429458618164, "eval_runtime": 4.8568, "eval_samples_per_second": 51.474, "eval_steps_per_second": 1.647, "step": 159 }, { "epoch": 10.0, "grad_norm": 0.0185546875, "learning_rate": 0.0, "loss": 0.0, "step": 160 }, { "epoch": 10.0, "eval_accuracy": 0.896, "eval_loss": 0.557336688041687, "eval_runtime": 4.8633, "eval_samples_per_second": 51.405, "eval_steps_per_second": 1.645, "step": 160 }, { "epoch": 10.0, "step": 160, "total_flos": 7.024653098391962e+16, "train_loss": 0.1563696864293888, "train_runtime": 1332.2795, "train_samples_per_second": 7.506, "train_steps_per_second": 0.12 } ], "logging_steps": 1, "max_steps": 160, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.024653098391962e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }