{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.984, "eval_steps": 1, "global_step": 124, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016, "grad_norm": 58.75, "learning_rate": 2.5e-05, "loss": 1.6327, "step": 1 }, { "epoch": 0.016, "eval_accuracy": 0.344, "eval_loss": 1.8050549030303955, "eval_runtime": 8.5979, "eval_samples_per_second": 29.077, "eval_steps_per_second": 3.722, "step": 1 }, { "epoch": 0.032, "grad_norm": 173.0, "learning_rate": 5e-05, "loss": 1.2182, "step": 2 }, { "epoch": 0.032, "eval_accuracy": 0.368, "eval_loss": 1.5831865072250366, "eval_runtime": 8.6593, "eval_samples_per_second": 28.871, "eval_steps_per_second": 3.695, "step": 2 }, { "epoch": 0.048, "grad_norm": 78.5, "learning_rate": 4.959016393442623e-05, "loss": 1.0166, "step": 3 }, { "epoch": 0.048, "eval_accuracy": 0.48, "eval_loss": 1.2497016191482544, "eval_runtime": 8.6547, "eval_samples_per_second": 28.886, "eval_steps_per_second": 3.697, "step": 3 }, { "epoch": 0.064, "grad_norm": 205.0, "learning_rate": 4.918032786885246e-05, "loss": 1.1151, "step": 4 }, { "epoch": 0.064, "eval_accuracy": 0.592, "eval_loss": 0.9809591174125671, "eval_runtime": 8.6606, "eval_samples_per_second": 28.866, "eval_steps_per_second": 3.695, "step": 4 }, { "epoch": 0.08, "grad_norm": 92.0, "learning_rate": 4.8770491803278687e-05, "loss": 1.1203, "step": 5 }, { "epoch": 0.08, "eval_accuracy": 0.616, "eval_loss": 0.9002195000648499, "eval_runtime": 8.6562, "eval_samples_per_second": 28.881, "eval_steps_per_second": 3.697, "step": 5 }, { "epoch": 0.096, "grad_norm": 39.5, "learning_rate": 4.836065573770492e-05, "loss": 0.3129, "step": 6 }, { "epoch": 0.096, "eval_accuracy": 0.692, "eval_loss": 0.8504685759544373, "eval_runtime": 8.6632, "eval_samples_per_second": 28.858, "eval_steps_per_second": 3.694, "step": 6 }, { "epoch": 0.112, "grad_norm": 93.0, "learning_rate": 4.795081967213115e-05, "loss": 0.989, "step": 7 }, { "epoch": 0.112, "eval_accuracy": 0.72, "eval_loss": 0.8811690807342529, "eval_runtime": 8.6664, "eval_samples_per_second": 28.847, "eval_steps_per_second": 3.692, "step": 7 }, { "epoch": 0.128, "grad_norm": 69.5, "learning_rate": 4.754098360655738e-05, "loss": 0.6991, "step": 8 }, { "epoch": 0.128, "eval_accuracy": 0.68, "eval_loss": 1.079397439956665, "eval_runtime": 8.6622, "eval_samples_per_second": 28.861, "eval_steps_per_second": 3.694, "step": 8 }, { "epoch": 0.144, "grad_norm": 161.0, "learning_rate": 4.713114754098361e-05, "loss": 1.2626, "step": 9 }, { "epoch": 0.144, "eval_accuracy": 0.688, "eval_loss": 1.0678237676620483, "eval_runtime": 8.6644, "eval_samples_per_second": 28.854, "eval_steps_per_second": 3.693, "step": 9 }, { "epoch": 0.16, "grad_norm": 155.0, "learning_rate": 4.672131147540984e-05, "loss": 0.7883, "step": 10 }, { "epoch": 0.16, "eval_accuracy": 0.696, "eval_loss": 0.88979172706604, "eval_runtime": 8.6685, "eval_samples_per_second": 28.84, "eval_steps_per_second": 3.692, "step": 10 }, { "epoch": 0.176, "grad_norm": 71.5, "learning_rate": 4.631147540983607e-05, "loss": 0.2973, "step": 11 }, { "epoch": 0.176, "eval_accuracy": 0.768, "eval_loss": 0.7034730315208435, "eval_runtime": 8.6628, "eval_samples_per_second": 28.859, "eval_steps_per_second": 3.694, "step": 11 }, { "epoch": 0.192, "grad_norm": 32.5, "learning_rate": 4.59016393442623e-05, "loss": 0.3976, "step": 12 }, { "epoch": 0.192, "eval_accuracy": 0.772, "eval_loss": 0.64277583360672, "eval_runtime": 8.6669, "eval_samples_per_second": 28.845, "eval_steps_per_second": 3.692, "step": 12 }, { "epoch": 0.208, "grad_norm": 85.0, "learning_rate": 4.549180327868853e-05, "loss": 0.8966, "step": 13 }, { "epoch": 0.208, "eval_accuracy": 0.776, "eval_loss": 0.5894673466682434, "eval_runtime": 8.6718, "eval_samples_per_second": 28.829, "eval_steps_per_second": 3.69, "step": 13 }, { "epoch": 0.224, "grad_norm": 93.0, "learning_rate": 4.508196721311476e-05, "loss": 0.3748, "step": 14 }, { "epoch": 0.224, "eval_accuracy": 0.748, "eval_loss": 0.6436864137649536, "eval_runtime": 8.6619, "eval_samples_per_second": 28.862, "eval_steps_per_second": 3.694, "step": 14 }, { "epoch": 0.24, "grad_norm": 102.0, "learning_rate": 4.467213114754098e-05, "loss": 0.6883, "step": 15 }, { "epoch": 0.24, "eval_accuracy": 0.74, "eval_loss": 0.6454311609268188, "eval_runtime": 8.6684, "eval_samples_per_second": 28.841, "eval_steps_per_second": 3.692, "step": 15 }, { "epoch": 0.256, "grad_norm": 40.5, "learning_rate": 4.426229508196721e-05, "loss": 0.3292, "step": 16 }, { "epoch": 0.256, "eval_accuracy": 0.708, "eval_loss": 0.8357064127922058, "eval_runtime": 8.6666, "eval_samples_per_second": 28.846, "eval_steps_per_second": 3.692, "step": 16 }, { "epoch": 0.272, "grad_norm": 138.0, "learning_rate": 4.3852459016393444e-05, "loss": 1.0341, "step": 17 }, { "epoch": 0.272, "eval_accuracy": 0.692, "eval_loss": 0.920940101146698, "eval_runtime": 8.6644, "eval_samples_per_second": 28.854, "eval_steps_per_second": 3.693, "step": 17 }, { "epoch": 0.288, "grad_norm": 97.5, "learning_rate": 4.3442622950819674e-05, "loss": 0.8867, "step": 18 }, { "epoch": 0.288, "eval_accuracy": 0.708, "eval_loss": 0.8621469736099243, "eval_runtime": 8.6638, "eval_samples_per_second": 28.856, "eval_steps_per_second": 3.694, "step": 18 }, { "epoch": 0.304, "grad_norm": 176.0, "learning_rate": 4.3032786885245904e-05, "loss": 1.2041, "step": 19 }, { "epoch": 0.304, "eval_accuracy": 0.744, "eval_loss": 0.67635178565979, "eval_runtime": 8.6671, "eval_samples_per_second": 28.845, "eval_steps_per_second": 3.692, "step": 19 }, { "epoch": 0.32, "grad_norm": 150.0, "learning_rate": 4.262295081967213e-05, "loss": 0.9002, "step": 20 }, { "epoch": 0.32, "eval_accuracy": 0.732, "eval_loss": 0.5985668301582336, "eval_runtime": 8.6631, "eval_samples_per_second": 28.858, "eval_steps_per_second": 3.694, "step": 20 }, { "epoch": 0.336, "grad_norm": 53.0, "learning_rate": 4.2213114754098365e-05, "loss": 0.8948, "step": 21 }, { "epoch": 0.336, "eval_accuracy": 0.716, "eval_loss": 0.652230978012085, "eval_runtime": 8.6655, "eval_samples_per_second": 28.85, "eval_steps_per_second": 3.693, "step": 21 }, { "epoch": 0.352, "grad_norm": 174.0, "learning_rate": 4.1803278688524595e-05, "loss": 0.86, "step": 22 }, { "epoch": 0.352, "eval_accuracy": 0.728, "eval_loss": 0.6597179174423218, "eval_runtime": 8.6672, "eval_samples_per_second": 28.844, "eval_steps_per_second": 3.692, "step": 22 }, { "epoch": 0.368, "grad_norm": 156.0, "learning_rate": 4.1393442622950826e-05, "loss": 0.6364, "step": 23 }, { "epoch": 0.368, "eval_accuracy": 0.744, "eval_loss": 0.5796850919723511, "eval_runtime": 8.664, "eval_samples_per_second": 28.855, "eval_steps_per_second": 3.693, "step": 23 }, { "epoch": 0.384, "grad_norm": 53.25, "learning_rate": 4.098360655737705e-05, "loss": 0.2094, "step": 24 }, { "epoch": 0.384, "eval_accuracy": 0.748, "eval_loss": 0.5883631706237793, "eval_runtime": 8.6686, "eval_samples_per_second": 28.84, "eval_steps_per_second": 3.692, "step": 24 }, { "epoch": 0.4, "grad_norm": 87.5, "learning_rate": 4.057377049180328e-05, "loss": 0.4607, "step": 25 }, { "epoch": 0.4, "eval_accuracy": 0.768, "eval_loss": 0.5390456318855286, "eval_runtime": 8.6866, "eval_samples_per_second": 28.78, "eval_steps_per_second": 3.684, "step": 25 }, { "epoch": 0.416, "grad_norm": 155.0, "learning_rate": 4.016393442622951e-05, "loss": 0.814, "step": 26 }, { "epoch": 0.416, "eval_accuracy": 0.78, "eval_loss": 0.4743637144565582, "eval_runtime": 8.6531, "eval_samples_per_second": 28.892, "eval_steps_per_second": 3.698, "step": 26 }, { "epoch": 0.432, "grad_norm": 41.0, "learning_rate": 3.975409836065574e-05, "loss": 0.5358, "step": 27 }, { "epoch": 0.432, "eval_accuracy": 0.776, "eval_loss": 0.4668542146682739, "eval_runtime": 8.6595, "eval_samples_per_second": 28.87, "eval_steps_per_second": 3.695, "step": 27 }, { "epoch": 0.448, "grad_norm": 131.0, "learning_rate": 3.934426229508197e-05, "loss": 0.5556, "step": 28 }, { "epoch": 0.448, "eval_accuracy": 0.736, "eval_loss": 0.6067003011703491, "eval_runtime": 8.6518, "eval_samples_per_second": 28.896, "eval_steps_per_second": 3.699, "step": 28 }, { "epoch": 0.464, "grad_norm": 126.5, "learning_rate": 3.89344262295082e-05, "loss": 0.505, "step": 29 }, { "epoch": 0.464, "eval_accuracy": 0.712, "eval_loss": 0.7375366687774658, "eval_runtime": 8.6519, "eval_samples_per_second": 28.895, "eval_steps_per_second": 3.699, "step": 29 }, { "epoch": 0.48, "grad_norm": 171.0, "learning_rate": 3.8524590163934424e-05, "loss": 0.9589, "step": 30 }, { "epoch": 0.48, "eval_accuracy": 0.704, "eval_loss": 0.7679601311683655, "eval_runtime": 8.6582, "eval_samples_per_second": 28.874, "eval_steps_per_second": 3.696, "step": 30 }, { "epoch": 0.496, "grad_norm": 150.0, "learning_rate": 3.8114754098360655e-05, "loss": 0.74, "step": 31 }, { "epoch": 0.496, "eval_accuracy": 0.732, "eval_loss": 0.6937733888626099, "eval_runtime": 8.6569, "eval_samples_per_second": 28.879, "eval_steps_per_second": 3.696, "step": 31 }, { "epoch": 0.512, "grad_norm": 79.5, "learning_rate": 3.7704918032786885e-05, "loss": 0.5474, "step": 32 }, { "epoch": 0.512, "eval_accuracy": 0.748, "eval_loss": 0.5756805539131165, "eval_runtime": 8.6562, "eval_samples_per_second": 28.881, "eval_steps_per_second": 3.697, "step": 32 }, { "epoch": 0.528, "grad_norm": 112.5, "learning_rate": 3.729508196721312e-05, "loss": 0.4916, "step": 33 }, { "epoch": 0.528, "eval_accuracy": 0.792, "eval_loss": 0.47289371490478516, "eval_runtime": 8.6581, "eval_samples_per_second": 28.875, "eval_steps_per_second": 3.696, "step": 33 }, { "epoch": 0.544, "grad_norm": 33.0, "learning_rate": 3.6885245901639346e-05, "loss": 0.8822, "step": 34 }, { "epoch": 0.544, "eval_accuracy": 0.82, "eval_loss": 0.4487142264842987, "eval_runtime": 8.6584, "eval_samples_per_second": 28.874, "eval_steps_per_second": 3.696, "step": 34 }, { "epoch": 0.56, "grad_norm": 84.5, "learning_rate": 3.6475409836065576e-05, "loss": 0.7691, "step": 35 }, { "epoch": 0.56, "eval_accuracy": 0.812, "eval_loss": 0.45519721508026123, "eval_runtime": 8.6547, "eval_samples_per_second": 28.886, "eval_steps_per_second": 3.697, "step": 35 }, { "epoch": 0.576, "grad_norm": 28.625, "learning_rate": 3.6065573770491806e-05, "loss": 0.4743, "step": 36 }, { "epoch": 0.576, "eval_accuracy": 0.764, "eval_loss": 0.5331873893737793, "eval_runtime": 8.6566, "eval_samples_per_second": 28.88, "eval_steps_per_second": 3.697, "step": 36 }, { "epoch": 0.592, "grad_norm": 23.875, "learning_rate": 3.5655737704918037e-05, "loss": 0.3101, "step": 37 }, { "epoch": 0.592, "eval_accuracy": 0.744, "eval_loss": 0.6849313974380493, "eval_runtime": 8.6571, "eval_samples_per_second": 28.878, "eval_steps_per_second": 3.696, "step": 37 }, { "epoch": 0.608, "grad_norm": 103.5, "learning_rate": 3.524590163934427e-05, "loss": 0.962, "step": 38 }, { "epoch": 0.608, "eval_accuracy": 0.724, "eval_loss": 0.7783421874046326, "eval_runtime": 8.6583, "eval_samples_per_second": 28.874, "eval_steps_per_second": 3.696, "step": 38 }, { "epoch": 0.624, "grad_norm": 133.0, "learning_rate": 3.483606557377049e-05, "loss": 0.5671, "step": 39 }, { "epoch": 0.624, "eval_accuracy": 0.712, "eval_loss": 0.7919518947601318, "eval_runtime": 8.661, "eval_samples_per_second": 28.865, "eval_steps_per_second": 3.695, "step": 39 }, { "epoch": 0.64, "grad_norm": 193.0, "learning_rate": 3.442622950819672e-05, "loss": 0.7741, "step": 40 }, { "epoch": 0.64, "eval_accuracy": 0.724, "eval_loss": 0.7195008397102356, "eval_runtime": 8.6644, "eval_samples_per_second": 28.854, "eval_steps_per_second": 3.693, "step": 40 }, { "epoch": 0.656, "grad_norm": 236.0, "learning_rate": 3.401639344262295e-05, "loss": 0.9336, "step": 41 }, { "epoch": 0.656, "eval_accuracy": 0.784, "eval_loss": 0.5999830365180969, "eval_runtime": 8.6611, "eval_samples_per_second": 28.865, "eval_steps_per_second": 3.695, "step": 41 }, { "epoch": 0.672, "grad_norm": 194.0, "learning_rate": 3.360655737704918e-05, "loss": 0.9252, "step": 42 }, { "epoch": 0.672, "eval_accuracy": 0.812, "eval_loss": 0.4787631928920746, "eval_runtime": 8.6643, "eval_samples_per_second": 28.854, "eval_steps_per_second": 3.693, "step": 42 }, { "epoch": 0.688, "grad_norm": 102.0, "learning_rate": 3.319672131147541e-05, "loss": 0.2934, "step": 43 }, { "epoch": 0.688, "eval_accuracy": 0.812, "eval_loss": 0.41090723872184753, "eval_runtime": 8.6614, "eval_samples_per_second": 28.864, "eval_steps_per_second": 3.695, "step": 43 }, { "epoch": 0.704, "grad_norm": 87.5, "learning_rate": 3.2786885245901635e-05, "loss": 0.4936, "step": 44 }, { "epoch": 0.704, "eval_accuracy": 0.78, "eval_loss": 0.46753987669944763, "eval_runtime": 8.6615, "eval_samples_per_second": 28.863, "eval_steps_per_second": 3.694, "step": 44 }, { "epoch": 0.72, "grad_norm": 18.375, "learning_rate": 3.237704918032787e-05, "loss": 0.3223, "step": 45 }, { "epoch": 0.72, "eval_accuracy": 0.748, "eval_loss": 0.5864301919937134, "eval_runtime": 8.6597, "eval_samples_per_second": 28.869, "eval_steps_per_second": 3.695, "step": 45 }, { "epoch": 0.736, "grad_norm": 121.0, "learning_rate": 3.19672131147541e-05, "loss": 0.408, "step": 46 }, { "epoch": 0.736, "eval_accuracy": 0.728, "eval_loss": 0.6596755981445312, "eval_runtime": 8.663, "eval_samples_per_second": 28.858, "eval_steps_per_second": 3.694, "step": 46 }, { "epoch": 0.752, "grad_norm": 54.0, "learning_rate": 3.155737704918033e-05, "loss": 0.759, "step": 47 }, { "epoch": 0.752, "eval_accuracy": 0.728, "eval_loss": 0.6460751891136169, "eval_runtime": 8.6565, "eval_samples_per_second": 28.88, "eval_steps_per_second": 3.697, "step": 47 }, { "epoch": 0.768, "grad_norm": 114.0, "learning_rate": 3.114754098360656e-05, "loss": 0.6628, "step": 48 }, { "epoch": 0.768, "eval_accuracy": 0.744, "eval_loss": 0.5938560962677002, "eval_runtime": 8.6567, "eval_samples_per_second": 28.879, "eval_steps_per_second": 3.697, "step": 48 }, { "epoch": 0.784, "grad_norm": 111.0, "learning_rate": 3.073770491803279e-05, "loss": 0.761, "step": 49 }, { "epoch": 0.784, "eval_accuracy": 0.804, "eval_loss": 0.5164662003517151, "eval_runtime": 8.6557, "eval_samples_per_second": 28.883, "eval_steps_per_second": 3.697, "step": 49 }, { "epoch": 0.8, "grad_norm": 32.0, "learning_rate": 3.0327868852459017e-05, "loss": 0.308, "step": 50 }, { "epoch": 0.8, "eval_accuracy": 0.836, "eval_loss": 0.43705108761787415, "eval_runtime": 8.6584, "eval_samples_per_second": 28.874, "eval_steps_per_second": 3.696, "step": 50 }, { "epoch": 0.816, "grad_norm": 78.0, "learning_rate": 2.9918032786885248e-05, "loss": 0.4859, "step": 51 }, { "epoch": 0.816, "eval_accuracy": 0.856, "eval_loss": 0.3826155364513397, "eval_runtime": 8.6539, "eval_samples_per_second": 28.889, "eval_steps_per_second": 3.698, "step": 51 }, { "epoch": 0.832, "grad_norm": 24.5, "learning_rate": 2.9508196721311478e-05, "loss": 0.6841, "step": 52 }, { "epoch": 0.832, "eval_accuracy": 0.828, "eval_loss": 0.3742530345916748, "eval_runtime": 8.6541, "eval_samples_per_second": 28.888, "eval_steps_per_second": 3.698, "step": 52 }, { "epoch": 0.848, "grad_norm": 37.5, "learning_rate": 2.9098360655737705e-05, "loss": 0.7852, "step": 53 }, { "epoch": 0.848, "eval_accuracy": 0.8, "eval_loss": 0.43144190311431885, "eval_runtime": 8.653, "eval_samples_per_second": 28.892, "eval_steps_per_second": 3.698, "step": 53 }, { "epoch": 0.864, "grad_norm": 91.0, "learning_rate": 2.8688524590163935e-05, "loss": 0.3388, "step": 54 }, { "epoch": 0.864, "eval_accuracy": 0.792, "eval_loss": 0.501422107219696, "eval_runtime": 8.6518, "eval_samples_per_second": 28.896, "eval_steps_per_second": 3.699, "step": 54 }, { "epoch": 0.88, "grad_norm": 17.625, "learning_rate": 2.8278688524590162e-05, "loss": 0.3829, "step": 55 }, { "epoch": 0.88, "eval_accuracy": 0.768, "eval_loss": 0.5729050040245056, "eval_runtime": 8.6468, "eval_samples_per_second": 28.912, "eval_steps_per_second": 3.701, "step": 55 }, { "epoch": 0.896, "grad_norm": 93.5, "learning_rate": 2.7868852459016392e-05, "loss": 0.6144, "step": 56 }, { "epoch": 0.896, "eval_accuracy": 0.764, "eval_loss": 0.6807990074157715, "eval_runtime": 8.6452, "eval_samples_per_second": 28.918, "eval_steps_per_second": 3.701, "step": 56 }, { "epoch": 0.912, "grad_norm": 28.5, "learning_rate": 2.7459016393442626e-05, "loss": 0.3515, "step": 57 }, { "epoch": 0.912, "eval_accuracy": 0.756, "eval_loss": 0.7396586537361145, "eval_runtime": 8.6535, "eval_samples_per_second": 28.89, "eval_steps_per_second": 3.698, "step": 57 }, { "epoch": 0.928, "grad_norm": 112.5, "learning_rate": 2.7049180327868856e-05, "loss": 0.3028, "step": 58 }, { "epoch": 0.928, "eval_accuracy": 0.756, "eval_loss": 0.745948314666748, "eval_runtime": 8.6584, "eval_samples_per_second": 28.874, "eval_steps_per_second": 3.696, "step": 58 }, { "epoch": 0.944, "grad_norm": 164.0, "learning_rate": 2.6639344262295087e-05, "loss": 0.6729, "step": 59 }, { "epoch": 0.944, "eval_accuracy": 0.752, "eval_loss": 0.7118371725082397, "eval_runtime": 8.6567, "eval_samples_per_second": 28.879, "eval_steps_per_second": 3.697, "step": 59 }, { "epoch": 0.96, "grad_norm": 131.0, "learning_rate": 2.6229508196721314e-05, "loss": 0.4634, "step": 60 }, { "epoch": 0.96, "eval_accuracy": 0.76, "eval_loss": 0.6441870331764221, "eval_runtime": 8.6557, "eval_samples_per_second": 28.883, "eval_steps_per_second": 3.697, "step": 60 }, { "epoch": 0.976, "grad_norm": 127.5, "learning_rate": 2.5819672131147544e-05, "loss": 0.5924, "step": 61 }, { "epoch": 0.976, "eval_accuracy": 0.776, "eval_loss": 0.5635260939598083, "eval_runtime": 8.6577, "eval_samples_per_second": 28.876, "eval_steps_per_second": 3.696, "step": 61 }, { "epoch": 0.992, "grad_norm": 130.0, "learning_rate": 2.540983606557377e-05, "loss": 0.5527, "step": 62 }, { "epoch": 0.992, "eval_accuracy": 0.796, "eval_loss": 0.4781284034252167, "eval_runtime": 8.6636, "eval_samples_per_second": 28.856, "eval_steps_per_second": 3.694, "step": 62 }, { "epoch": 1.008, "grad_norm": 43.0, "learning_rate": 2.5e-05, "loss": 0.1542, "step": 63 }, { "epoch": 1.008, "eval_accuracy": 0.82, "eval_loss": 0.4085061252117157, "eval_runtime": 8.6612, "eval_samples_per_second": 28.864, "eval_steps_per_second": 3.695, "step": 63 }, { "epoch": 1.024, "grad_norm": 147.0, "learning_rate": 2.459016393442623e-05, "loss": 0.3714, "step": 64 }, { "epoch": 1.024, "eval_accuracy": 0.848, "eval_loss": 0.37276288866996765, "eval_runtime": 8.6673, "eval_samples_per_second": 28.844, "eval_steps_per_second": 3.692, "step": 64 }, { "epoch": 1.04, "grad_norm": 34.5, "learning_rate": 2.418032786885246e-05, "loss": 0.1124, "step": 65 }, { "epoch": 1.04, "eval_accuracy": 0.848, "eval_loss": 0.36895105242729187, "eval_runtime": 8.6746, "eval_samples_per_second": 28.82, "eval_steps_per_second": 3.689, "step": 65 }, { "epoch": 1.056, "grad_norm": 50.25, "learning_rate": 2.377049180327869e-05, "loss": 0.1433, "step": 66 }, { "epoch": 1.056, "eval_accuracy": 0.844, "eval_loss": 0.3762807548046112, "eval_runtime": 8.6794, "eval_samples_per_second": 28.804, "eval_steps_per_second": 3.687, "step": 66 }, { "epoch": 1.072, "grad_norm": 85.5, "learning_rate": 2.336065573770492e-05, "loss": 0.2446, "step": 67 }, { "epoch": 1.072, "eval_accuracy": 0.84, "eval_loss": 0.38033661246299744, "eval_runtime": 8.6709, "eval_samples_per_second": 28.832, "eval_steps_per_second": 3.691, "step": 67 }, { "epoch": 1.088, "grad_norm": 120.5, "learning_rate": 2.295081967213115e-05, "loss": 0.6573, "step": 68 }, { "epoch": 1.088, "eval_accuracy": 0.848, "eval_loss": 0.37577661871910095, "eval_runtime": 8.6746, "eval_samples_per_second": 28.82, "eval_steps_per_second": 3.689, "step": 68 }, { "epoch": 1.104, "grad_norm": 32.25, "learning_rate": 2.254098360655738e-05, "loss": 0.1509, "step": 69 }, { "epoch": 1.104, "eval_accuracy": 0.848, "eval_loss": 0.36732277274131775, "eval_runtime": 8.6668, "eval_samples_per_second": 28.846, "eval_steps_per_second": 3.692, "step": 69 }, { "epoch": 1.12, "grad_norm": 36.0, "learning_rate": 2.2131147540983607e-05, "loss": 0.2131, "step": 70 }, { "epoch": 1.12, "eval_accuracy": 0.856, "eval_loss": 0.36693572998046875, "eval_runtime": 8.667, "eval_samples_per_second": 28.845, "eval_steps_per_second": 3.692, "step": 70 }, { "epoch": 1.1360000000000001, "grad_norm": 35.0, "learning_rate": 2.1721311475409837e-05, "loss": 0.077, "step": 71 }, { "epoch": 1.1360000000000001, "eval_accuracy": 0.836, "eval_loss": 0.3619978427886963, "eval_runtime": 8.671, "eval_samples_per_second": 28.832, "eval_steps_per_second": 3.69, "step": 71 }, { "epoch": 1.152, "grad_norm": 21.625, "learning_rate": 2.1311475409836064e-05, "loss": 0.2332, "step": 72 }, { "epoch": 1.152, "eval_accuracy": 0.832, "eval_loss": 0.36414313316345215, "eval_runtime": 8.6706, "eval_samples_per_second": 28.833, "eval_steps_per_second": 3.691, "step": 72 }, { "epoch": 1.168, "grad_norm": 69.5, "learning_rate": 2.0901639344262298e-05, "loss": 0.2056, "step": 73 }, { "epoch": 1.168, "eval_accuracy": 0.836, "eval_loss": 0.36293938755989075, "eval_runtime": 8.6724, "eval_samples_per_second": 28.827, "eval_steps_per_second": 3.69, "step": 73 }, { "epoch": 1.184, "grad_norm": 9.5, "learning_rate": 2.0491803278688525e-05, "loss": 0.1412, "step": 74 }, { "epoch": 1.184, "eval_accuracy": 0.844, "eval_loss": 0.3655231297016144, "eval_runtime": 8.6711, "eval_samples_per_second": 28.831, "eval_steps_per_second": 3.69, "step": 74 }, { "epoch": 1.2, "grad_norm": 35.25, "learning_rate": 2.0081967213114755e-05, "loss": 0.1982, "step": 75 }, { "epoch": 1.2, "eval_accuracy": 0.84, "eval_loss": 0.3644102215766907, "eval_runtime": 8.6641, "eval_samples_per_second": 28.855, "eval_steps_per_second": 3.693, "step": 75 }, { "epoch": 1.216, "grad_norm": 12.875, "learning_rate": 1.9672131147540985e-05, "loss": 0.2003, "step": 76 }, { "epoch": 1.216, "eval_accuracy": 0.84, "eval_loss": 0.3651863932609558, "eval_runtime": 8.6665, "eval_samples_per_second": 28.847, "eval_steps_per_second": 3.692, "step": 76 }, { "epoch": 1.232, "grad_norm": 7.28125, "learning_rate": 1.9262295081967212e-05, "loss": 0.0934, "step": 77 }, { "epoch": 1.232, "eval_accuracy": 0.84, "eval_loss": 0.3709143102169037, "eval_runtime": 8.6583, "eval_samples_per_second": 28.874, "eval_steps_per_second": 3.696, "step": 77 }, { "epoch": 1.248, "grad_norm": 42.25, "learning_rate": 1.8852459016393442e-05, "loss": 0.1577, "step": 78 }, { "epoch": 1.248, "eval_accuracy": 0.836, "eval_loss": 0.37103718519210815, "eval_runtime": 8.6594, "eval_samples_per_second": 28.87, "eval_steps_per_second": 3.695, "step": 78 }, { "epoch": 1.264, "grad_norm": 25.25, "learning_rate": 1.8442622950819673e-05, "loss": 0.3063, "step": 79 }, { "epoch": 1.264, "eval_accuracy": 0.832, "eval_loss": 0.3689051866531372, "eval_runtime": 8.6658, "eval_samples_per_second": 28.849, "eval_steps_per_second": 3.693, "step": 79 }, { "epoch": 1.28, "grad_norm": 31.625, "learning_rate": 1.8032786885245903e-05, "loss": 0.2724, "step": 80 }, { "epoch": 1.28, "eval_accuracy": 0.832, "eval_loss": 0.3685128688812256, "eval_runtime": 8.6623, "eval_samples_per_second": 28.861, "eval_steps_per_second": 3.694, "step": 80 }, { "epoch": 1.296, "grad_norm": 34.75, "learning_rate": 1.7622950819672133e-05, "loss": 0.4324, "step": 81 }, { "epoch": 1.296, "eval_accuracy": 0.836, "eval_loss": 0.3717711567878723, "eval_runtime": 8.6564, "eval_samples_per_second": 28.88, "eval_steps_per_second": 3.697, "step": 81 }, { "epoch": 1.312, "grad_norm": 33.0, "learning_rate": 1.721311475409836e-05, "loss": 0.1911, "step": 82 }, { "epoch": 1.312, "eval_accuracy": 0.84, "eval_loss": 0.3723936080932617, "eval_runtime": 8.6687, "eval_samples_per_second": 28.839, "eval_steps_per_second": 3.691, "step": 82 }, { "epoch": 1.328, "grad_norm": 16.125, "learning_rate": 1.680327868852459e-05, "loss": 0.1936, "step": 83 }, { "epoch": 1.328, "eval_accuracy": 0.84, "eval_loss": 0.3704240024089813, "eval_runtime": 8.6668, "eval_samples_per_second": 28.846, "eval_steps_per_second": 3.692, "step": 83 }, { "epoch": 1.3439999999999999, "grad_norm": 34.75, "learning_rate": 1.6393442622950818e-05, "loss": 0.0839, "step": 84 }, { "epoch": 1.3439999999999999, "eval_accuracy": 0.832, "eval_loss": 0.36510899662971497, "eval_runtime": 8.661, "eval_samples_per_second": 28.865, "eval_steps_per_second": 3.695, "step": 84 }, { "epoch": 1.3599999999999999, "grad_norm": 40.0, "learning_rate": 1.598360655737705e-05, "loss": 0.2661, "step": 85 }, { "epoch": 1.3599999999999999, "eval_accuracy": 0.84, "eval_loss": 0.3661534786224365, "eval_runtime": 8.6702, "eval_samples_per_second": 28.834, "eval_steps_per_second": 3.691, "step": 85 }, { "epoch": 1.376, "grad_norm": 52.5, "learning_rate": 1.557377049180328e-05, "loss": 0.1679, "step": 86 }, { "epoch": 1.376, "eval_accuracy": 0.848, "eval_loss": 0.36859577894210815, "eval_runtime": 8.6649, "eval_samples_per_second": 28.852, "eval_steps_per_second": 3.693, "step": 86 }, { "epoch": 1.392, "grad_norm": 12.75, "learning_rate": 1.5163934426229509e-05, "loss": 0.0698, "step": 87 }, { "epoch": 1.392, "eval_accuracy": 0.852, "eval_loss": 0.3691750466823578, "eval_runtime": 8.6861, "eval_samples_per_second": 28.782, "eval_steps_per_second": 3.684, "step": 87 }, { "epoch": 1.408, "grad_norm": 39.25, "learning_rate": 1.4754098360655739e-05, "loss": 0.1173, "step": 88 }, { "epoch": 1.408, "eval_accuracy": 0.856, "eval_loss": 0.3779418170452118, "eval_runtime": 8.6673, "eval_samples_per_second": 28.844, "eval_steps_per_second": 3.692, "step": 88 }, { "epoch": 1.424, "grad_norm": 21.5, "learning_rate": 1.4344262295081968e-05, "loss": 0.3727, "step": 89 }, { "epoch": 1.424, "eval_accuracy": 0.86, "eval_loss": 0.38709089159965515, "eval_runtime": 8.6636, "eval_samples_per_second": 28.856, "eval_steps_per_second": 3.694, "step": 89 }, { "epoch": 1.44, "grad_norm": 18.25, "learning_rate": 1.3934426229508196e-05, "loss": 0.3828, "step": 90 }, { "epoch": 1.44, "eval_accuracy": 0.86, "eval_loss": 0.3986479640007019, "eval_runtime": 8.6565, "eval_samples_per_second": 28.88, "eval_steps_per_second": 3.697, "step": 90 }, { "epoch": 1.456, "grad_norm": 29.875, "learning_rate": 1.3524590163934428e-05, "loss": 0.0911, "step": 91 }, { "epoch": 1.456, "eval_accuracy": 0.84, "eval_loss": 0.4078799784183502, "eval_runtime": 8.654, "eval_samples_per_second": 28.888, "eval_steps_per_second": 3.698, "step": 91 }, { "epoch": 1.472, "grad_norm": 40.75, "learning_rate": 1.3114754098360657e-05, "loss": 0.1798, "step": 92 }, { "epoch": 1.472, "eval_accuracy": 0.832, "eval_loss": 0.4203779399394989, "eval_runtime": 8.6654, "eval_samples_per_second": 28.85, "eval_steps_per_second": 3.693, "step": 92 }, { "epoch": 1.488, "grad_norm": 15.6875, "learning_rate": 1.2704918032786885e-05, "loss": 0.0851, "step": 93 }, { "epoch": 1.488, "eval_accuracy": 0.832, "eval_loss": 0.4253535568714142, "eval_runtime": 8.6605, "eval_samples_per_second": 28.867, "eval_steps_per_second": 3.695, "step": 93 }, { "epoch": 1.504, "grad_norm": 21.0, "learning_rate": 1.2295081967213116e-05, "loss": 0.0962, "step": 94 }, { "epoch": 1.504, "eval_accuracy": 0.832, "eval_loss": 0.42336249351501465, "eval_runtime": 8.6599, "eval_samples_per_second": 28.869, "eval_steps_per_second": 3.695, "step": 94 }, { "epoch": 1.52, "grad_norm": 111.0, "learning_rate": 1.1885245901639344e-05, "loss": 0.3427, "step": 95 }, { "epoch": 1.52, "eval_accuracy": 0.828, "eval_loss": 0.4188750684261322, "eval_runtime": 8.6648, "eval_samples_per_second": 28.852, "eval_steps_per_second": 3.693, "step": 95 }, { "epoch": 1.536, "grad_norm": 27.5, "learning_rate": 1.1475409836065575e-05, "loss": 0.0881, "step": 96 }, { "epoch": 1.536, "eval_accuracy": 0.84, "eval_loss": 0.4100199043750763, "eval_runtime": 8.6603, "eval_samples_per_second": 28.867, "eval_steps_per_second": 3.695, "step": 96 }, { "epoch": 1.552, "grad_norm": 69.0, "learning_rate": 1.1065573770491803e-05, "loss": 0.1664, "step": 97 }, { "epoch": 1.552, "eval_accuracy": 0.844, "eval_loss": 0.39891311526298523, "eval_runtime": 8.6581, "eval_samples_per_second": 28.875, "eval_steps_per_second": 3.696, "step": 97 }, { "epoch": 1.568, "grad_norm": 46.25, "learning_rate": 1.0655737704918032e-05, "loss": 0.6757, "step": 98 }, { "epoch": 1.568, "eval_accuracy": 0.856, "eval_loss": 0.3860258162021637, "eval_runtime": 8.66, "eval_samples_per_second": 28.868, "eval_steps_per_second": 3.695, "step": 98 }, { "epoch": 1.584, "grad_norm": 5.875, "learning_rate": 1.0245901639344262e-05, "loss": 0.0751, "step": 99 }, { "epoch": 1.584, "eval_accuracy": 0.86, "eval_loss": 0.3817059397697449, "eval_runtime": 8.6608, "eval_samples_per_second": 28.866, "eval_steps_per_second": 3.695, "step": 99 }, { "epoch": 1.6, "grad_norm": 64.0, "learning_rate": 9.836065573770493e-06, "loss": 0.1923, "step": 100 }, { "epoch": 1.6, "eval_accuracy": 0.856, "eval_loss": 0.37669360637664795, "eval_runtime": 8.6602, "eval_samples_per_second": 28.868, "eval_steps_per_second": 3.695, "step": 100 }, { "epoch": 1.616, "grad_norm": 11.75, "learning_rate": 9.426229508196721e-06, "loss": 0.0365, "step": 101 }, { "epoch": 1.616, "eval_accuracy": 0.848, "eval_loss": 0.3779665231704712, "eval_runtime": 8.6661, "eval_samples_per_second": 28.848, "eval_steps_per_second": 3.693, "step": 101 }, { "epoch": 1.6320000000000001, "grad_norm": 13.75, "learning_rate": 9.016393442622952e-06, "loss": 0.0895, "step": 102 }, { "epoch": 1.6320000000000001, "eval_accuracy": 0.848, "eval_loss": 0.3783411383628845, "eval_runtime": 8.6625, "eval_samples_per_second": 28.86, "eval_steps_per_second": 3.694, "step": 102 }, { "epoch": 1.6480000000000001, "grad_norm": 87.0, "learning_rate": 8.60655737704918e-06, "loss": 0.3337, "step": 103 }, { "epoch": 1.6480000000000001, "eval_accuracy": 0.852, "eval_loss": 0.3828529417514801, "eval_runtime": 8.6587, "eval_samples_per_second": 28.873, "eval_steps_per_second": 3.696, "step": 103 }, { "epoch": 1.6640000000000001, "grad_norm": 29.375, "learning_rate": 8.196721311475409e-06, "loss": 0.1525, "step": 104 }, { "epoch": 1.6640000000000001, "eval_accuracy": 0.852, "eval_loss": 0.38398581743240356, "eval_runtime": 8.6634, "eval_samples_per_second": 28.857, "eval_steps_per_second": 3.694, "step": 104 }, { "epoch": 1.6800000000000002, "grad_norm": 10.0625, "learning_rate": 7.78688524590164e-06, "loss": 0.0973, "step": 105 }, { "epoch": 1.6800000000000002, "eval_accuracy": 0.844, "eval_loss": 0.3848567605018616, "eval_runtime": 8.6582, "eval_samples_per_second": 28.874, "eval_steps_per_second": 3.696, "step": 105 }, { "epoch": 1.696, "grad_norm": 77.5, "learning_rate": 7.3770491803278695e-06, "loss": 0.2626, "step": 106 }, { "epoch": 1.696, "eval_accuracy": 0.852, "eval_loss": 0.385408878326416, "eval_runtime": 8.6577, "eval_samples_per_second": 28.876, "eval_steps_per_second": 3.696, "step": 106 }, { "epoch": 1.712, "grad_norm": 9.5, "learning_rate": 6.967213114754098e-06, "loss": 0.0585, "step": 107 }, { "epoch": 1.712, "eval_accuracy": 0.848, "eval_loss": 0.38454535603523254, "eval_runtime": 8.6544, "eval_samples_per_second": 28.887, "eval_steps_per_second": 3.698, "step": 107 }, { "epoch": 1.728, "grad_norm": 55.0, "learning_rate": 6.557377049180328e-06, "loss": 0.2257, "step": 108 }, { "epoch": 1.728, "eval_accuracy": 0.852, "eval_loss": 0.38379326462745667, "eval_runtime": 8.6523, "eval_samples_per_second": 28.894, "eval_steps_per_second": 3.698, "step": 108 }, { "epoch": 1.744, "grad_norm": 32.5, "learning_rate": 6.147540983606558e-06, "loss": 0.1137, "step": 109 }, { "epoch": 1.744, "eval_accuracy": 0.852, "eval_loss": 0.3833220303058624, "eval_runtime": 8.652, "eval_samples_per_second": 28.895, "eval_steps_per_second": 3.699, "step": 109 }, { "epoch": 1.76, "grad_norm": 15.4375, "learning_rate": 5.737704918032787e-06, "loss": 0.283, "step": 110 }, { "epoch": 1.76, "eval_accuracy": 0.852, "eval_loss": 0.37939703464508057, "eval_runtime": 8.6539, "eval_samples_per_second": 28.889, "eval_steps_per_second": 3.698, "step": 110 }, { "epoch": 1.776, "grad_norm": 27.375, "learning_rate": 5.327868852459016e-06, "loss": 0.1111, "step": 111 }, { "epoch": 1.776, "eval_accuracy": 0.852, "eval_loss": 0.3771066665649414, "eval_runtime": 8.6565, "eval_samples_per_second": 28.88, "eval_steps_per_second": 3.697, "step": 111 }, { "epoch": 1.792, "grad_norm": 26.125, "learning_rate": 4.918032786885246e-06, "loss": 0.1367, "step": 112 }, { "epoch": 1.792, "eval_accuracy": 0.852, "eval_loss": 0.3757225275039673, "eval_runtime": 8.6575, "eval_samples_per_second": 28.877, "eval_steps_per_second": 3.696, "step": 112 }, { "epoch": 1.808, "grad_norm": 24.875, "learning_rate": 4.508196721311476e-06, "loss": 0.0762, "step": 113 }, { "epoch": 1.808, "eval_accuracy": 0.852, "eval_loss": 0.3756250739097595, "eval_runtime": 8.6535, "eval_samples_per_second": 28.89, "eval_steps_per_second": 3.698, "step": 113 }, { "epoch": 1.8239999999999998, "grad_norm": 47.5, "learning_rate": 4.098360655737704e-06, "loss": 0.133, "step": 114 }, { "epoch": 1.8239999999999998, "eval_accuracy": 0.852, "eval_loss": 0.37420740723609924, "eval_runtime": 8.6587, "eval_samples_per_second": 28.873, "eval_steps_per_second": 3.696, "step": 114 }, { "epoch": 1.8399999999999999, "grad_norm": 22.625, "learning_rate": 3.6885245901639347e-06, "loss": 0.2904, "step": 115 }, { "epoch": 1.8399999999999999, "eval_accuracy": 0.852, "eval_loss": 0.372751921415329, "eval_runtime": 8.6548, "eval_samples_per_second": 28.886, "eval_steps_per_second": 3.697, "step": 115 }, { "epoch": 1.8559999999999999, "grad_norm": 16.75, "learning_rate": 3.278688524590164e-06, "loss": 0.1686, "step": 116 }, { "epoch": 1.8559999999999999, "eval_accuracy": 0.848, "eval_loss": 0.3734797239303589, "eval_runtime": 8.6629, "eval_samples_per_second": 28.859, "eval_steps_per_second": 3.694, "step": 116 }, { "epoch": 1.8719999999999999, "grad_norm": 35.25, "learning_rate": 2.8688524590163937e-06, "loss": 0.0737, "step": 117 }, { "epoch": 1.8719999999999999, "eval_accuracy": 0.848, "eval_loss": 0.3713564872741699, "eval_runtime": 8.6633, "eval_samples_per_second": 28.857, "eval_steps_per_second": 3.694, "step": 117 }, { "epoch": 1.888, "grad_norm": 100.0, "learning_rate": 2.459016393442623e-06, "loss": 0.2758, "step": 118 }, { "epoch": 1.888, "eval_accuracy": 0.848, "eval_loss": 0.3682093322277069, "eval_runtime": 8.6631, "eval_samples_per_second": 28.858, "eval_steps_per_second": 3.694, "step": 118 }, { "epoch": 1.904, "grad_norm": 22.5, "learning_rate": 2.049180327868852e-06, "loss": 0.0542, "step": 119 }, { "epoch": 1.904, "eval_accuracy": 0.848, "eval_loss": 0.3716946542263031, "eval_runtime": 8.6618, "eval_samples_per_second": 28.862, "eval_steps_per_second": 3.694, "step": 119 }, { "epoch": 1.92, "grad_norm": 69.0, "learning_rate": 1.639344262295082e-06, "loss": 0.1646, "step": 120 }, { "epoch": 1.92, "eval_accuracy": 0.848, "eval_loss": 0.3682910203933716, "eval_runtime": 8.6617, "eval_samples_per_second": 28.863, "eval_steps_per_second": 3.694, "step": 120 }, { "epoch": 1.936, "grad_norm": 31.125, "learning_rate": 1.2295081967213116e-06, "loss": 0.4908, "step": 121 }, { "epoch": 1.936, "eval_accuracy": 0.848, "eval_loss": 0.3708224594593048, "eval_runtime": 8.6585, "eval_samples_per_second": 28.873, "eval_steps_per_second": 3.696, "step": 121 }, { "epoch": 1.952, "grad_norm": 55.75, "learning_rate": 8.19672131147541e-07, "loss": 0.3249, "step": 122 }, { "epoch": 1.952, "eval_accuracy": 0.848, "eval_loss": 0.36828938126564026, "eval_runtime": 8.6603, "eval_samples_per_second": 28.867, "eval_steps_per_second": 3.695, "step": 122 }, { "epoch": 1.968, "grad_norm": 22.375, "learning_rate": 4.098360655737705e-07, "loss": 0.1096, "step": 123 }, { "epoch": 1.968, "eval_accuracy": 0.852, "eval_loss": 0.3706204891204834, "eval_runtime": 8.6631, "eval_samples_per_second": 28.858, "eval_steps_per_second": 3.694, "step": 123 }, { "epoch": 1.984, "grad_norm": 52.5, "learning_rate": 0.0, "loss": 0.1758, "step": 124 }, { "epoch": 1.984, "eval_accuracy": 0.852, "eval_loss": 0.36876150965690613, "eval_runtime": 8.6609, "eval_samples_per_second": 28.865, "eval_steps_per_second": 3.695, "step": 124 }, { "epoch": 1.984, "step": 124, "total_flos": 1.693315531538432e+16, "train_loss": 0.4407010670871504, "train_runtime": 1267.2396, "train_samples_per_second": 1.578, "train_steps_per_second": 0.098 } ], "logging_steps": 1, "max_steps": 124, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.693315531538432e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }