{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.6, "eval_steps": 1, "global_step": 120, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08, "grad_norm": 136.0, "learning_rate": 2.5e-05, "loss": 1.0785, "step": 1 }, { "epoch": 0.08, "eval_accuracy": 0.516, "eval_loss": 1.023781657218933, "eval_runtime": 2.0454, "eval_samples_per_second": 122.227, "eval_steps_per_second": 3.422, "step": 1 }, { "epoch": 0.16, "grad_norm": 185.0, "learning_rate": 5e-05, "loss": 1.2776, "step": 2 }, { "epoch": 0.16, "eval_accuracy": 0.504, "eval_loss": 1.0729936361312866, "eval_runtime": 2.1411, "eval_samples_per_second": 116.764, "eval_steps_per_second": 3.269, "step": 2 }, { "epoch": 0.24, "grad_norm": 162.0, "learning_rate": 4.957627118644068e-05, "loss": 1.1542, "step": 3 }, { "epoch": 0.24, "eval_accuracy": 0.516, "eval_loss": 1.3786845207214355, "eval_runtime": 2.1405, "eval_samples_per_second": 116.796, "eval_steps_per_second": 3.27, "step": 3 }, { "epoch": 0.32, "grad_norm": 262.0, "learning_rate": 4.915254237288136e-05, "loss": 1.6628, "step": 4 }, { "epoch": 0.32, "eval_accuracy": 0.552, "eval_loss": 0.850824236869812, "eval_runtime": 2.1406, "eval_samples_per_second": 116.792, "eval_steps_per_second": 3.27, "step": 4 }, { "epoch": 0.4, "grad_norm": 86.5, "learning_rate": 4.8728813559322034e-05, "loss": 1.0706, "step": 5 }, { "epoch": 0.4, "eval_accuracy": 0.512, "eval_loss": 1.2434474229812622, "eval_runtime": 2.1419, "eval_samples_per_second": 116.717, "eval_steps_per_second": 3.268, "step": 5 }, { "epoch": 0.48, "grad_norm": 221.0, "learning_rate": 4.8305084745762714e-05, "loss": 1.4146, "step": 6 }, { "epoch": 0.48, "eval_accuracy": 0.488, "eval_loss": 1.4112776517868042, "eval_runtime": 2.1428, "eval_samples_per_second": 116.672, "eval_steps_per_second": 3.267, "step": 6 }, { "epoch": 0.56, "grad_norm": 239.0, "learning_rate": 4.788135593220339e-05, "loss": 1.4855, "step": 7 }, { "epoch": 0.56, "eval_accuracy": 0.564, "eval_loss": 0.9813472032546997, "eval_runtime": 2.1413, "eval_samples_per_second": 116.75, "eval_steps_per_second": 3.269, "step": 7 }, { "epoch": 0.64, "grad_norm": 158.0, "learning_rate": 4.745762711864407e-05, "loss": 1.0638, "step": 8 }, { "epoch": 0.64, "eval_accuracy": 0.624, "eval_loss": 0.7241154909133911, "eval_runtime": 2.1408, "eval_samples_per_second": 116.78, "eval_steps_per_second": 3.27, "step": 8 }, { "epoch": 0.72, "grad_norm": 70.0, "learning_rate": 4.703389830508475e-05, "loss": 0.6901, "step": 9 }, { "epoch": 0.72, "eval_accuracy": 0.628, "eval_loss": 0.7069931626319885, "eval_runtime": 2.1404, "eval_samples_per_second": 116.801, "eval_steps_per_second": 3.27, "step": 9 }, { "epoch": 0.8, "grad_norm": 87.0, "learning_rate": 4.6610169491525425e-05, "loss": 0.7535, "step": 10 }, { "epoch": 0.8, "eval_accuracy": 0.64, "eval_loss": 0.6583124995231628, "eval_runtime": 2.1407, "eval_samples_per_second": 116.785, "eval_steps_per_second": 3.27, "step": 10 }, { "epoch": 0.88, "grad_norm": 52.75, "learning_rate": 4.6186440677966104e-05, "loss": 0.6397, "step": 11 }, { "epoch": 0.88, "eval_accuracy": 0.608, "eval_loss": 0.7135878801345825, "eval_runtime": 2.14, "eval_samples_per_second": 116.822, "eval_steps_per_second": 3.271, "step": 11 }, { "epoch": 0.96, "grad_norm": 34.0, "learning_rate": 4.5762711864406784e-05, "loss": 0.6484, "step": 12 }, { "epoch": 0.96, "eval_accuracy": 0.616, "eval_loss": 0.6957533955574036, "eval_runtime": 2.141, "eval_samples_per_second": 116.766, "eval_steps_per_second": 3.269, "step": 12 }, { "epoch": 1.04, "grad_norm": 13.6875, "learning_rate": 4.533898305084746e-05, "loss": 0.66, "step": 13 }, { "epoch": 1.04, "eval_accuracy": 0.652, "eval_loss": 0.6610312461853027, "eval_runtime": 2.1413, "eval_samples_per_second": 116.749, "eval_steps_per_second": 3.269, "step": 13 }, { "epoch": 1.12, "grad_norm": 7.90625, "learning_rate": 4.491525423728814e-05, "loss": 0.6312, "step": 14 }, { "epoch": 1.12, "eval_accuracy": 0.66, "eval_loss": 0.6188369393348694, "eval_runtime": 2.1393, "eval_samples_per_second": 116.863, "eval_steps_per_second": 3.272, "step": 14 }, { "epoch": 1.2, "grad_norm": 44.25, "learning_rate": 4.4491525423728816e-05, "loss": 0.4925, "step": 15 }, { "epoch": 1.2, "eval_accuracy": 0.7, "eval_loss": 0.6158857345581055, "eval_runtime": 2.1409, "eval_samples_per_second": 116.773, "eval_steps_per_second": 3.27, "step": 15 }, { "epoch": 1.28, "grad_norm": 101.0, "learning_rate": 4.4067796610169495e-05, "loss": 0.613, "step": 16 }, { "epoch": 1.28, "eval_accuracy": 0.708, "eval_loss": 0.5807080268859863, "eval_runtime": 2.1421, "eval_samples_per_second": 116.71, "eval_steps_per_second": 3.268, "step": 16 }, { "epoch": 1.3599999999999999, "grad_norm": 6.03125, "learning_rate": 4.3644067796610175e-05, "loss": 0.47, "step": 17 }, { "epoch": 1.3599999999999999, "eval_accuracy": 0.688, "eval_loss": 0.5807793140411377, "eval_runtime": 2.0911, "eval_samples_per_second": 119.555, "eval_steps_per_second": 3.348, "step": 17 }, { "epoch": 1.44, "grad_norm": 41.25, "learning_rate": 4.3220338983050854e-05, "loss": 0.5137, "step": 18 }, { "epoch": 1.44, "eval_accuracy": 0.704, "eval_loss": 0.5797407031059265, "eval_runtime": 2.1399, "eval_samples_per_second": 116.829, "eval_steps_per_second": 3.271, "step": 18 }, { "epoch": 1.52, "grad_norm": 32.25, "learning_rate": 4.279661016949153e-05, "loss": 0.5839, "step": 19 }, { "epoch": 1.52, "eval_accuracy": 0.704, "eval_loss": 0.628451406955719, "eval_runtime": 2.1414, "eval_samples_per_second": 116.747, "eval_steps_per_second": 3.269, "step": 19 }, { "epoch": 1.6, "grad_norm": 32.75, "learning_rate": 4.2372881355932206e-05, "loss": 0.4523, "step": 20 }, { "epoch": 1.6, "eval_accuracy": 0.692, "eval_loss": 0.6559668183326721, "eval_runtime": 2.1409, "eval_samples_per_second": 116.776, "eval_steps_per_second": 3.27, "step": 20 }, { "epoch": 1.6800000000000002, "grad_norm": 110.5, "learning_rate": 4.1949152542372886e-05, "loss": 0.6259, "step": 21 }, { "epoch": 1.6800000000000002, "eval_accuracy": 0.716, "eval_loss": 0.6154900193214417, "eval_runtime": 2.1418, "eval_samples_per_second": 116.724, "eval_steps_per_second": 3.268, "step": 21 }, { "epoch": 1.76, "grad_norm": 52.75, "learning_rate": 4.152542372881356e-05, "loss": 0.5299, "step": 22 }, { "epoch": 1.76, "eval_accuracy": 0.712, "eval_loss": 0.5984148979187012, "eval_runtime": 2.1432, "eval_samples_per_second": 116.647, "eval_steps_per_second": 3.266, "step": 22 }, { "epoch": 1.8399999999999999, "grad_norm": 49.0, "learning_rate": 4.110169491525424e-05, "loss": 0.5721, "step": 23 }, { "epoch": 1.8399999999999999, "eval_accuracy": 0.712, "eval_loss": 0.6128161549568176, "eval_runtime": 2.1427, "eval_samples_per_second": 116.677, "eval_steps_per_second": 3.267, "step": 23 }, { "epoch": 1.92, "grad_norm": 97.5, "learning_rate": 4.067796610169492e-05, "loss": 0.7519, "step": 24 }, { "epoch": 1.92, "eval_accuracy": 0.728, "eval_loss": 0.5905612707138062, "eval_runtime": 2.1425, "eval_samples_per_second": 116.687, "eval_steps_per_second": 3.267, "step": 24 }, { "epoch": 2.0, "grad_norm": 17.0, "learning_rate": 4.025423728813559e-05, "loss": 0.4132, "step": 25 }, { "epoch": 2.0, "eval_accuracy": 0.716, "eval_loss": 0.6202563643455505, "eval_runtime": 2.1404, "eval_samples_per_second": 116.802, "eval_steps_per_second": 3.27, "step": 25 }, { "epoch": 2.08, "grad_norm": 69.5, "learning_rate": 3.983050847457627e-05, "loss": 0.4021, "step": 26 }, { "epoch": 2.08, "eval_accuracy": 0.708, "eval_loss": 0.6416750550270081, "eval_runtime": 2.1429, "eval_samples_per_second": 116.663, "eval_steps_per_second": 3.267, "step": 26 }, { "epoch": 2.16, "grad_norm": 73.5, "learning_rate": 3.940677966101695e-05, "loss": 0.4837, "step": 27 }, { "epoch": 2.16, "eval_accuracy": 0.724, "eval_loss": 0.609797477722168, "eval_runtime": 2.1417, "eval_samples_per_second": 116.727, "eval_steps_per_second": 3.268, "step": 27 }, { "epoch": 2.24, "grad_norm": 31.0, "learning_rate": 3.898305084745763e-05, "loss": 0.382, "step": 28 }, { "epoch": 2.24, "eval_accuracy": 0.732, "eval_loss": 0.5821030139923096, "eval_runtime": 2.1399, "eval_samples_per_second": 116.826, "eval_steps_per_second": 3.271, "step": 28 }, { "epoch": 2.32, "grad_norm": 49.25, "learning_rate": 3.855932203389831e-05, "loss": 0.4402, "step": 29 }, { "epoch": 2.32, "eval_accuracy": 0.74, "eval_loss": 0.5874293446540833, "eval_runtime": 2.1406, "eval_samples_per_second": 116.787, "eval_steps_per_second": 3.27, "step": 29 }, { "epoch": 2.4, "grad_norm": 26.5, "learning_rate": 3.813559322033898e-05, "loss": 0.3664, "step": 30 }, { "epoch": 2.4, "eval_accuracy": 0.732, "eval_loss": 0.5800690650939941, "eval_runtime": 2.1453, "eval_samples_per_second": 116.536, "eval_steps_per_second": 3.263, "step": 30 }, { "epoch": 2.48, "grad_norm": 34.5, "learning_rate": 3.771186440677966e-05, "loss": 0.313, "step": 31 }, { "epoch": 2.48, "eval_accuracy": 0.728, "eval_loss": 0.6099223494529724, "eval_runtime": 2.1422, "eval_samples_per_second": 116.701, "eval_steps_per_second": 3.268, "step": 31 }, { "epoch": 2.56, "grad_norm": 42.5, "learning_rate": 3.728813559322034e-05, "loss": 0.4822, "step": 32 }, { "epoch": 2.56, "eval_accuracy": 0.716, "eval_loss": 0.626610517501831, "eval_runtime": 2.1438, "eval_samples_per_second": 116.615, "eval_steps_per_second": 3.265, "step": 32 }, { "epoch": 2.64, "grad_norm": 48.75, "learning_rate": 3.686440677966102e-05, "loss": 0.3374, "step": 33 }, { "epoch": 2.64, "eval_accuracy": 0.74, "eval_loss": 0.6000227332115173, "eval_runtime": 2.144, "eval_samples_per_second": 116.603, "eval_steps_per_second": 3.265, "step": 33 }, { "epoch": 2.7199999999999998, "grad_norm": 4.59375, "learning_rate": 3.644067796610169e-05, "loss": 0.2605, "step": 34 }, { "epoch": 2.7199999999999998, "eval_accuracy": 0.748, "eval_loss": 0.5826870203018188, "eval_runtime": 2.1443, "eval_samples_per_second": 116.588, "eval_steps_per_second": 3.264, "step": 34 }, { "epoch": 2.8, "grad_norm": 46.25, "learning_rate": 3.601694915254237e-05, "loss": 0.4195, "step": 35 }, { "epoch": 2.8, "eval_accuracy": 0.748, "eval_loss": 0.5882580280303955, "eval_runtime": 2.0917, "eval_samples_per_second": 119.522, "eval_steps_per_second": 3.347, "step": 35 }, { "epoch": 2.88, "grad_norm": 23.125, "learning_rate": 3.559322033898305e-05, "loss": 0.3838, "step": 36 }, { "epoch": 2.88, "eval_accuracy": 0.728, "eval_loss": 0.5986873507499695, "eval_runtime": 2.1402, "eval_samples_per_second": 116.811, "eval_steps_per_second": 3.271, "step": 36 }, { "epoch": 2.96, "grad_norm": 25.375, "learning_rate": 3.516949152542373e-05, "loss": 0.4504, "step": 37 }, { "epoch": 2.96, "eval_accuracy": 0.74, "eval_loss": 0.599513828754425, "eval_runtime": 2.1411, "eval_samples_per_second": 116.761, "eval_steps_per_second": 3.269, "step": 37 }, { "epoch": 3.04, "grad_norm": 12.625, "learning_rate": 3.474576271186441e-05, "loss": 0.3745, "step": 38 }, { "epoch": 3.04, "eval_accuracy": 0.744, "eval_loss": 0.602383017539978, "eval_runtime": 2.1404, "eval_samples_per_second": 116.803, "eval_steps_per_second": 3.27, "step": 38 }, { "epoch": 3.12, "grad_norm": 38.5, "learning_rate": 3.432203389830508e-05, "loss": 0.2068, "step": 39 }, { "epoch": 3.12, "eval_accuracy": 0.728, "eval_loss": 0.6505258083343506, "eval_runtime": 2.1409, "eval_samples_per_second": 116.772, "eval_steps_per_second": 3.27, "step": 39 }, { "epoch": 3.2, "grad_norm": 57.25, "learning_rate": 3.389830508474576e-05, "loss": 0.354, "step": 40 }, { "epoch": 3.2, "eval_accuracy": 0.724, "eval_loss": 0.681692361831665, "eval_runtime": 2.1411, "eval_samples_per_second": 116.762, "eval_steps_per_second": 3.269, "step": 40 }, { "epoch": 3.2800000000000002, "grad_norm": 20.25, "learning_rate": 3.347457627118644e-05, "loss": 0.2488, "step": 41 }, { "epoch": 3.2800000000000002, "eval_accuracy": 0.744, "eval_loss": 0.6564323306083679, "eval_runtime": 2.1392, "eval_samples_per_second": 116.864, "eval_steps_per_second": 3.272, "step": 41 }, { "epoch": 3.36, "grad_norm": 28.625, "learning_rate": 3.305084745762712e-05, "loss": 0.267, "step": 42 }, { "epoch": 3.36, "eval_accuracy": 0.744, "eval_loss": 0.6272720694541931, "eval_runtime": 2.1431, "eval_samples_per_second": 116.651, "eval_steps_per_second": 3.266, "step": 42 }, { "epoch": 3.44, "grad_norm": 6.90625, "learning_rate": 3.26271186440678e-05, "loss": 0.2534, "step": 43 }, { "epoch": 3.44, "eval_accuracy": 0.72, "eval_loss": 0.679269015789032, "eval_runtime": 2.1434, "eval_samples_per_second": 116.636, "eval_steps_per_second": 3.266, "step": 43 }, { "epoch": 3.52, "grad_norm": 67.0, "learning_rate": 3.2203389830508473e-05, "loss": 0.3636, "step": 44 }, { "epoch": 3.52, "eval_accuracy": 0.72, "eval_loss": 0.7287683486938477, "eval_runtime": 2.1422, "eval_samples_per_second": 116.7, "eval_steps_per_second": 3.268, "step": 44 }, { "epoch": 3.6, "grad_norm": 68.5, "learning_rate": 3.177966101694915e-05, "loss": 0.3003, "step": 45 }, { "epoch": 3.6, "eval_accuracy": 0.72, "eval_loss": 0.7087514400482178, "eval_runtime": 2.1436, "eval_samples_per_second": 116.624, "eval_steps_per_second": 3.265, "step": 45 }, { "epoch": 3.68, "grad_norm": 84.0, "learning_rate": 3.135593220338983e-05, "loss": 0.3875, "step": 46 }, { "epoch": 3.68, "eval_accuracy": 0.74, "eval_loss": 0.6582326292991638, "eval_runtime": 2.142, "eval_samples_per_second": 116.713, "eval_steps_per_second": 3.268, "step": 46 }, { "epoch": 3.76, "grad_norm": 72.5, "learning_rate": 3.093220338983051e-05, "loss": 0.3178, "step": 47 }, { "epoch": 3.76, "eval_accuracy": 0.732, "eval_loss": 0.6546837687492371, "eval_runtime": 2.1409, "eval_samples_per_second": 116.776, "eval_steps_per_second": 3.27, "step": 47 }, { "epoch": 3.84, "grad_norm": 29.0, "learning_rate": 3.050847457627119e-05, "loss": 0.3452, "step": 48 }, { "epoch": 3.84, "eval_accuracy": 0.74, "eval_loss": 0.6847663521766663, "eval_runtime": 2.1429, "eval_samples_per_second": 116.662, "eval_steps_per_second": 3.267, "step": 48 }, { "epoch": 3.92, "grad_norm": 24.625, "learning_rate": 3.0084745762711864e-05, "loss": 0.2246, "step": 49 }, { "epoch": 3.92, "eval_accuracy": 0.732, "eval_loss": 0.6859081983566284, "eval_runtime": 2.1464, "eval_samples_per_second": 116.477, "eval_steps_per_second": 3.261, "step": 49 }, { "epoch": 4.0, "grad_norm": 50.5, "learning_rate": 2.9661016949152544e-05, "loss": 0.3548, "step": 50 }, { "epoch": 4.0, "eval_accuracy": 0.732, "eval_loss": 0.654508650302887, "eval_runtime": 2.1438, "eval_samples_per_second": 116.615, "eval_steps_per_second": 3.265, "step": 50 }, { "epoch": 4.08, "grad_norm": 18.25, "learning_rate": 2.9237288135593223e-05, "loss": 0.1941, "step": 51 }, { "epoch": 4.08, "eval_accuracy": 0.756, "eval_loss": 0.6475279331207275, "eval_runtime": 2.1439, "eval_samples_per_second": 116.612, "eval_steps_per_second": 3.265, "step": 51 }, { "epoch": 4.16, "grad_norm": 28.5, "learning_rate": 2.88135593220339e-05, "loss": 0.2195, "step": 52 }, { "epoch": 4.16, "eval_accuracy": 0.752, "eval_loss": 0.6630749702453613, "eval_runtime": 2.1424, "eval_samples_per_second": 116.691, "eval_steps_per_second": 3.267, "step": 52 }, { "epoch": 4.24, "grad_norm": 33.25, "learning_rate": 2.838983050847458e-05, "loss": 0.2597, "step": 53 }, { "epoch": 4.24, "eval_accuracy": 0.752, "eval_loss": 0.6564229726791382, "eval_runtime": 2.1436, "eval_samples_per_second": 116.628, "eval_steps_per_second": 3.266, "step": 53 }, { "epoch": 4.32, "grad_norm": 46.0, "learning_rate": 2.7966101694915255e-05, "loss": 0.1398, "step": 54 }, { "epoch": 4.32, "eval_accuracy": 0.744, "eval_loss": 0.646704912185669, "eval_runtime": 2.1406, "eval_samples_per_second": 116.789, "eval_steps_per_second": 3.27, "step": 54 }, { "epoch": 4.4, "grad_norm": 10.0625, "learning_rate": 2.754237288135593e-05, "loss": 0.1407, "step": 55 }, { "epoch": 4.4, "eval_accuracy": 0.72, "eval_loss": 0.6935080289840698, "eval_runtime": 2.145, "eval_samples_per_second": 116.552, "eval_steps_per_second": 3.263, "step": 55 }, { "epoch": 4.48, "grad_norm": 37.25, "learning_rate": 2.711864406779661e-05, "loss": 0.2388, "step": 56 }, { "epoch": 4.48, "eval_accuracy": 0.712, "eval_loss": 0.7364786267280579, "eval_runtime": 2.0957, "eval_samples_per_second": 119.291, "eval_steps_per_second": 3.34, "step": 56 }, { "epoch": 4.5600000000000005, "grad_norm": 35.0, "learning_rate": 2.669491525423729e-05, "loss": 0.2071, "step": 57 }, { "epoch": 4.5600000000000005, "eval_accuracy": 0.716, "eval_loss": 0.7429905533790588, "eval_runtime": 2.1434, "eval_samples_per_second": 116.637, "eval_steps_per_second": 3.266, "step": 57 }, { "epoch": 4.64, "grad_norm": 24.25, "learning_rate": 2.627118644067797e-05, "loss": 0.1246, "step": 58 }, { "epoch": 4.64, "eval_accuracy": 0.716, "eval_loss": 0.7141672372817993, "eval_runtime": 2.1431, "eval_samples_per_second": 116.652, "eval_steps_per_second": 3.266, "step": 58 }, { "epoch": 4.72, "grad_norm": 46.5, "learning_rate": 2.5847457627118642e-05, "loss": 0.2217, "step": 59 }, { "epoch": 4.72, "eval_accuracy": 0.736, "eval_loss": 0.6908359527587891, "eval_runtime": 2.1425, "eval_samples_per_second": 116.687, "eval_steps_per_second": 3.267, "step": 59 }, { "epoch": 4.8, "grad_norm": 6.0, "learning_rate": 2.5423728813559322e-05, "loss": 0.1806, "step": 60 }, { "epoch": 4.8, "eval_accuracy": 0.736, "eval_loss": 0.7421625852584839, "eval_runtime": 2.1421, "eval_samples_per_second": 116.709, "eval_steps_per_second": 3.268, "step": 60 }, { "epoch": 4.88, "grad_norm": 62.25, "learning_rate": 2.5e-05, "loss": 0.2022, "step": 61 }, { "epoch": 4.88, "eval_accuracy": 0.74, "eval_loss": 0.8047211170196533, "eval_runtime": 2.1436, "eval_samples_per_second": 116.625, "eval_steps_per_second": 3.265, "step": 61 }, { "epoch": 4.96, "grad_norm": 61.25, "learning_rate": 2.457627118644068e-05, "loss": 0.3026, "step": 62 }, { "epoch": 4.96, "eval_accuracy": 0.744, "eval_loss": 0.8158798813819885, "eval_runtime": 2.1428, "eval_samples_per_second": 116.671, "eval_steps_per_second": 3.267, "step": 62 }, { "epoch": 5.04, "grad_norm": 89.0, "learning_rate": 2.4152542372881357e-05, "loss": 0.3628, "step": 63 }, { "epoch": 5.04, "eval_accuracy": 0.732, "eval_loss": 0.7880058884620667, "eval_runtime": 2.142, "eval_samples_per_second": 116.711, "eval_steps_per_second": 3.268, "step": 63 }, { "epoch": 5.12, "grad_norm": 52.75, "learning_rate": 2.3728813559322036e-05, "loss": 0.1568, "step": 64 }, { "epoch": 5.12, "eval_accuracy": 0.752, "eval_loss": 0.7631772756576538, "eval_runtime": 2.1423, "eval_samples_per_second": 116.698, "eval_steps_per_second": 3.268, "step": 64 }, { "epoch": 5.2, "grad_norm": 3.625, "learning_rate": 2.3305084745762712e-05, "loss": 0.056, "step": 65 }, { "epoch": 5.2, "eval_accuracy": 0.74, "eval_loss": 0.8128260374069214, "eval_runtime": 2.1446, "eval_samples_per_second": 116.571, "eval_steps_per_second": 3.264, "step": 65 }, { "epoch": 5.28, "grad_norm": 2.78125, "learning_rate": 2.2881355932203392e-05, "loss": 0.0508, "step": 66 }, { "epoch": 5.28, "eval_accuracy": 0.72, "eval_loss": 0.9758080244064331, "eval_runtime": 2.1435, "eval_samples_per_second": 116.629, "eval_steps_per_second": 3.266, "step": 66 }, { "epoch": 5.36, "grad_norm": 48.25, "learning_rate": 2.245762711864407e-05, "loss": 0.2033, "step": 67 }, { "epoch": 5.36, "eval_accuracy": 0.704, "eval_loss": 1.1122255325317383, "eval_runtime": 2.1416, "eval_samples_per_second": 116.736, "eval_steps_per_second": 3.269, "step": 67 }, { "epoch": 5.44, "grad_norm": 64.0, "learning_rate": 2.2033898305084748e-05, "loss": 0.1902, "step": 68 }, { "epoch": 5.44, "eval_accuracy": 0.7, "eval_loss": 1.154529333114624, "eval_runtime": 2.1433, "eval_samples_per_second": 116.641, "eval_steps_per_second": 3.266, "step": 68 }, { "epoch": 5.52, "grad_norm": 80.5, "learning_rate": 2.1610169491525427e-05, "loss": 0.4264, "step": 69 }, { "epoch": 5.52, "eval_accuracy": 0.708, "eval_loss": 1.1109943389892578, "eval_runtime": 2.1436, "eval_samples_per_second": 116.624, "eval_steps_per_second": 3.265, "step": 69 }, { "epoch": 5.6, "grad_norm": 65.5, "learning_rate": 2.1186440677966103e-05, "loss": 0.2435, "step": 70 }, { "epoch": 5.6, "eval_accuracy": 0.736, "eval_loss": 1.0071001052856445, "eval_runtime": 2.1399, "eval_samples_per_second": 116.827, "eval_steps_per_second": 3.271, "step": 70 }, { "epoch": 5.68, "grad_norm": 46.25, "learning_rate": 2.076271186440678e-05, "loss": 0.1582, "step": 71 }, { "epoch": 5.68, "eval_accuracy": 0.736, "eval_loss": 0.920242965221405, "eval_runtime": 2.1407, "eval_samples_per_second": 116.783, "eval_steps_per_second": 3.27, "step": 71 }, { "epoch": 5.76, "grad_norm": 4.59375, "learning_rate": 2.033898305084746e-05, "loss": 0.0614, "step": 72 }, { "epoch": 5.76, "eval_accuracy": 0.744, "eval_loss": 0.9244736433029175, "eval_runtime": 2.1425, "eval_samples_per_second": 116.688, "eval_steps_per_second": 3.267, "step": 72 }, { "epoch": 5.84, "grad_norm": 8.0625, "learning_rate": 1.9915254237288135e-05, "loss": 0.1134, "step": 73 }, { "epoch": 5.84, "eval_accuracy": 0.74, "eval_loss": 0.9344038367271423, "eval_runtime": 2.1415, "eval_samples_per_second": 116.738, "eval_steps_per_second": 3.269, "step": 73 }, { "epoch": 5.92, "grad_norm": 17.375, "learning_rate": 1.9491525423728814e-05, "loss": 0.0692, "step": 74 }, { "epoch": 5.92, "eval_accuracy": 0.756, "eval_loss": 0.9447259902954102, "eval_runtime": 2.1434, "eval_samples_per_second": 116.639, "eval_steps_per_second": 3.266, "step": 74 }, { "epoch": 6.0, "grad_norm": 8.8125, "learning_rate": 1.906779661016949e-05, "loss": 0.054, "step": 75 }, { "epoch": 6.0, "eval_accuracy": 0.748, "eval_loss": 0.96815425157547, "eval_runtime": 2.142, "eval_samples_per_second": 116.713, "eval_steps_per_second": 3.268, "step": 75 }, { "epoch": 6.08, "grad_norm": 4.34375, "learning_rate": 1.864406779661017e-05, "loss": 0.0653, "step": 76 }, { "epoch": 6.08, "eval_accuracy": 0.748, "eval_loss": 0.9931323528289795, "eval_runtime": 2.1425, "eval_samples_per_second": 116.685, "eval_steps_per_second": 3.267, "step": 76 }, { "epoch": 6.16, "grad_norm": 1.1875, "learning_rate": 1.8220338983050846e-05, "loss": 0.0175, "step": 77 }, { "epoch": 6.16, "eval_accuracy": 0.736, "eval_loss": 1.0591909885406494, "eval_runtime": 2.1405, "eval_samples_per_second": 116.796, "eval_steps_per_second": 3.27, "step": 77 }, { "epoch": 6.24, "grad_norm": 13.75, "learning_rate": 1.7796610169491526e-05, "loss": 0.0699, "step": 78 }, { "epoch": 6.24, "eval_accuracy": 0.736, "eval_loss": 1.0900299549102783, "eval_runtime": 2.1428, "eval_samples_per_second": 116.671, "eval_steps_per_second": 3.267, "step": 78 }, { "epoch": 6.32, "grad_norm": 16.5, "learning_rate": 1.7372881355932205e-05, "loss": 0.0761, "step": 79 }, { "epoch": 6.32, "eval_accuracy": 0.736, "eval_loss": 1.0893088579177856, "eval_runtime": 2.141, "eval_samples_per_second": 116.769, "eval_steps_per_second": 3.27, "step": 79 }, { "epoch": 6.4, "grad_norm": 12.6875, "learning_rate": 1.694915254237288e-05, "loss": 0.1584, "step": 80 }, { "epoch": 6.4, "eval_accuracy": 0.736, "eval_loss": 1.1014509201049805, "eval_runtime": 2.0982, "eval_samples_per_second": 119.149, "eval_steps_per_second": 3.336, "step": 80 }, { "epoch": 6.48, "grad_norm": 6.78125, "learning_rate": 1.652542372881356e-05, "loss": 0.0631, "step": 81 }, { "epoch": 6.48, "eval_accuracy": 0.748, "eval_loss": 1.0965150594711304, "eval_runtime": 2.1439, "eval_samples_per_second": 116.612, "eval_steps_per_second": 3.265, "step": 81 }, { "epoch": 6.5600000000000005, "grad_norm": 7.53125, "learning_rate": 1.6101694915254237e-05, "loss": 0.0382, "step": 82 }, { "epoch": 6.5600000000000005, "eval_accuracy": 0.748, "eval_loss": 1.0999393463134766, "eval_runtime": 2.1423, "eval_samples_per_second": 116.698, "eval_steps_per_second": 3.268, "step": 82 }, { "epoch": 6.64, "grad_norm": 11.0625, "learning_rate": 1.5677966101694916e-05, "loss": 0.0582, "step": 83 }, { "epoch": 6.64, "eval_accuracy": 0.744, "eval_loss": 1.0906572341918945, "eval_runtime": 2.143, "eval_samples_per_second": 116.66, "eval_steps_per_second": 3.266, "step": 83 }, { "epoch": 6.72, "grad_norm": 14.75, "learning_rate": 1.5254237288135596e-05, "loss": 0.0316, "step": 84 }, { "epoch": 6.72, "eval_accuracy": 0.748, "eval_loss": 1.1019198894500732, "eval_runtime": 2.1453, "eval_samples_per_second": 116.535, "eval_steps_per_second": 3.263, "step": 84 }, { "epoch": 6.8, "grad_norm": 17.375, "learning_rate": 1.4830508474576272e-05, "loss": 0.0509, "step": 85 }, { "epoch": 6.8, "eval_accuracy": 0.744, "eval_loss": 1.0971815586090088, "eval_runtime": 2.1409, "eval_samples_per_second": 116.772, "eval_steps_per_second": 3.27, "step": 85 }, { "epoch": 6.88, "grad_norm": 8.5625, "learning_rate": 1.440677966101695e-05, "loss": 0.0787, "step": 86 }, { "epoch": 6.88, "eval_accuracy": 0.748, "eval_loss": 1.094210147857666, "eval_runtime": 2.0923, "eval_samples_per_second": 119.488, "eval_steps_per_second": 3.346, "step": 86 }, { "epoch": 6.96, "grad_norm": 10.5, "learning_rate": 1.3983050847457627e-05, "loss": 0.0346, "step": 87 }, { "epoch": 6.96, "eval_accuracy": 0.744, "eval_loss": 1.0973331928253174, "eval_runtime": 2.1418, "eval_samples_per_second": 116.723, "eval_steps_per_second": 3.268, "step": 87 }, { "epoch": 7.04, "grad_norm": 5.71875, "learning_rate": 1.3559322033898305e-05, "loss": 0.0242, "step": 88 }, { "epoch": 7.04, "eval_accuracy": 0.744, "eval_loss": 1.096313714981079, "eval_runtime": 2.1419, "eval_samples_per_second": 116.72, "eval_steps_per_second": 3.268, "step": 88 }, { "epoch": 7.12, "grad_norm": 2.296875, "learning_rate": 1.3135593220338985e-05, "loss": 0.0089, "step": 89 }, { "epoch": 7.12, "eval_accuracy": 0.74, "eval_loss": 1.108156442642212, "eval_runtime": 2.1448, "eval_samples_per_second": 116.562, "eval_steps_per_second": 3.264, "step": 89 }, { "epoch": 7.2, "grad_norm": 2.90625, "learning_rate": 1.2711864406779661e-05, "loss": 0.0143, "step": 90 }, { "epoch": 7.2, "eval_accuracy": 0.736, "eval_loss": 1.1125974655151367, "eval_runtime": 2.1429, "eval_samples_per_second": 116.666, "eval_steps_per_second": 3.267, "step": 90 }, { "epoch": 7.28, "grad_norm": 1.671875, "learning_rate": 1.228813559322034e-05, "loss": 0.007, "step": 91 }, { "epoch": 7.28, "eval_accuracy": 0.736, "eval_loss": 1.134128451347351, "eval_runtime": 2.1433, "eval_samples_per_second": 116.64, "eval_steps_per_second": 3.266, "step": 91 }, { "epoch": 7.36, "grad_norm": 2.796875, "learning_rate": 1.1864406779661018e-05, "loss": 0.0159, "step": 92 }, { "epoch": 7.36, "eval_accuracy": 0.724, "eval_loss": 1.1692942380905151, "eval_runtime": 2.093, "eval_samples_per_second": 119.446, "eval_steps_per_second": 3.344, "step": 92 }, { "epoch": 7.44, "grad_norm": 6.3125, "learning_rate": 1.1440677966101696e-05, "loss": 0.0148, "step": 93 }, { "epoch": 7.44, "eval_accuracy": 0.72, "eval_loss": 1.187640905380249, "eval_runtime": 2.1424, "eval_samples_per_second": 116.694, "eval_steps_per_second": 3.267, "step": 93 }, { "epoch": 7.52, "grad_norm": 8.0625, "learning_rate": 1.1016949152542374e-05, "loss": 0.0343, "step": 94 }, { "epoch": 7.52, "eval_accuracy": 0.728, "eval_loss": 1.1934490203857422, "eval_runtime": 2.1413, "eval_samples_per_second": 116.753, "eval_steps_per_second": 3.269, "step": 94 }, { "epoch": 7.6, "grad_norm": 2.296875, "learning_rate": 1.0593220338983052e-05, "loss": 0.0124, "step": 95 }, { "epoch": 7.6, "eval_accuracy": 0.728, "eval_loss": 1.1945278644561768, "eval_runtime": 2.1425, "eval_samples_per_second": 116.687, "eval_steps_per_second": 3.267, "step": 95 }, { "epoch": 7.68, "grad_norm": 7.78125, "learning_rate": 1.016949152542373e-05, "loss": 0.0975, "step": 96 }, { "epoch": 7.68, "eval_accuracy": 0.736, "eval_loss": 1.1970733404159546, "eval_runtime": 2.145, "eval_samples_per_second": 116.551, "eval_steps_per_second": 3.263, "step": 96 }, { "epoch": 7.76, "grad_norm": 3.484375, "learning_rate": 9.745762711864407e-06, "loss": 0.0192, "step": 97 }, { "epoch": 7.76, "eval_accuracy": 0.744, "eval_loss": 1.195726752281189, "eval_runtime": 2.145, "eval_samples_per_second": 116.549, "eval_steps_per_second": 3.263, "step": 97 }, { "epoch": 7.84, "grad_norm": 5.78125, "learning_rate": 9.322033898305085e-06, "loss": 0.0314, "step": 98 }, { "epoch": 7.84, "eval_accuracy": 0.744, "eval_loss": 1.2016874551773071, "eval_runtime": 2.1454, "eval_samples_per_second": 116.529, "eval_steps_per_second": 3.263, "step": 98 }, { "epoch": 7.92, "grad_norm": 1.9921875, "learning_rate": 8.898305084745763e-06, "loss": 0.0094, "step": 99 }, { "epoch": 7.92, "eval_accuracy": 0.748, "eval_loss": 1.212010383605957, "eval_runtime": 2.1446, "eval_samples_per_second": 116.572, "eval_steps_per_second": 3.264, "step": 99 }, { "epoch": 8.0, "grad_norm": 1.046875, "learning_rate": 8.47457627118644e-06, "loss": 0.0098, "step": 100 }, { "epoch": 8.0, "eval_accuracy": 0.748, "eval_loss": 1.2213313579559326, "eval_runtime": 2.1436, "eval_samples_per_second": 116.624, "eval_steps_per_second": 3.265, "step": 100 }, { "epoch": 8.08, "grad_norm": 1.1875, "learning_rate": 8.050847457627118e-06, "loss": 0.0056, "step": 101 }, { "epoch": 8.08, "eval_accuracy": 0.748, "eval_loss": 1.2390106916427612, "eval_runtime": 2.1453, "eval_samples_per_second": 116.536, "eval_steps_per_second": 3.263, "step": 101 }, { "epoch": 8.16, "grad_norm": 0.6640625, "learning_rate": 7.627118644067798e-06, "loss": 0.003, "step": 102 }, { "epoch": 8.16, "eval_accuracy": 0.748, "eval_loss": 1.253604531288147, "eval_runtime": 2.1442, "eval_samples_per_second": 116.594, "eval_steps_per_second": 3.265, "step": 102 }, { "epoch": 8.24, "grad_norm": 0.578125, "learning_rate": 7.203389830508475e-06, "loss": 0.005, "step": 103 }, { "epoch": 8.24, "eval_accuracy": 0.74, "eval_loss": 1.2792062759399414, "eval_runtime": 2.143, "eval_samples_per_second": 116.656, "eval_steps_per_second": 3.266, "step": 103 }, { "epoch": 8.32, "grad_norm": 12.125, "learning_rate": 6.779661016949153e-06, "loss": 0.0821, "step": 104 }, { "epoch": 8.32, "eval_accuracy": 0.74, "eval_loss": 1.2985644340515137, "eval_runtime": 2.1417, "eval_samples_per_second": 116.731, "eval_steps_per_second": 3.268, "step": 104 }, { "epoch": 8.4, "grad_norm": 0.435546875, "learning_rate": 6.3559322033898304e-06, "loss": 0.0038, "step": 105 }, { "epoch": 8.4, "eval_accuracy": 0.736, "eval_loss": 1.3192646503448486, "eval_runtime": 2.1413, "eval_samples_per_second": 116.753, "eval_steps_per_second": 3.269, "step": 105 }, { "epoch": 8.48, "grad_norm": 1.4453125, "learning_rate": 5.932203389830509e-06, "loss": 0.0035, "step": 106 }, { "epoch": 8.48, "eval_accuracy": 0.728, "eval_loss": 1.340459942817688, "eval_runtime": 2.1419, "eval_samples_per_second": 116.717, "eval_steps_per_second": 3.268, "step": 106 }, { "epoch": 8.56, "grad_norm": 0.50390625, "learning_rate": 5.508474576271187e-06, "loss": 0.0031, "step": 107 }, { "epoch": 8.56, "eval_accuracy": 0.728, "eval_loss": 1.360876202583313, "eval_runtime": 2.143, "eval_samples_per_second": 116.659, "eval_steps_per_second": 3.266, "step": 107 }, { "epoch": 8.64, "grad_norm": 0.96875, "learning_rate": 5.084745762711865e-06, "loss": 0.0035, "step": 108 }, { "epoch": 8.64, "eval_accuracy": 0.724, "eval_loss": 1.3767642974853516, "eval_runtime": 2.1447, "eval_samples_per_second": 116.569, "eval_steps_per_second": 3.264, "step": 108 }, { "epoch": 8.72, "grad_norm": 0.41015625, "learning_rate": 4.6610169491525425e-06, "loss": 0.003, "step": 109 }, { "epoch": 8.72, "eval_accuracy": 0.724, "eval_loss": 1.3948369026184082, "eval_runtime": 2.1416, "eval_samples_per_second": 116.735, "eval_steps_per_second": 3.269, "step": 109 }, { "epoch": 8.8, "grad_norm": 0.36328125, "learning_rate": 4.23728813559322e-06, "loss": 0.0021, "step": 110 }, { "epoch": 8.8, "eval_accuracy": 0.724, "eval_loss": 1.4067965745925903, "eval_runtime": 2.1439, "eval_samples_per_second": 116.612, "eval_steps_per_second": 3.265, "step": 110 }, { "epoch": 8.88, "grad_norm": 0.59765625, "learning_rate": 3.813559322033899e-06, "loss": 0.0017, "step": 111 }, { "epoch": 8.88, "eval_accuracy": 0.724, "eval_loss": 1.4231809377670288, "eval_runtime": 2.1423, "eval_samples_per_second": 116.695, "eval_steps_per_second": 3.267, "step": 111 }, { "epoch": 8.96, "grad_norm": 0.53125, "learning_rate": 3.3898305084745763e-06, "loss": 0.0037, "step": 112 }, { "epoch": 8.96, "eval_accuracy": 0.724, "eval_loss": 1.4200129508972168, "eval_runtime": 2.1433, "eval_samples_per_second": 116.644, "eval_steps_per_second": 3.266, "step": 112 }, { "epoch": 9.04, "grad_norm": 1.375, "learning_rate": 2.9661016949152545e-06, "loss": 0.0027, "step": 113 }, { "epoch": 9.04, "eval_accuracy": 0.724, "eval_loss": 1.4301310777664185, "eval_runtime": 2.1447, "eval_samples_per_second": 116.568, "eval_steps_per_second": 3.264, "step": 113 }, { "epoch": 9.12, "grad_norm": 0.44140625, "learning_rate": 2.5423728813559323e-06, "loss": 0.0019, "step": 114 }, { "epoch": 9.12, "eval_accuracy": 0.728, "eval_loss": 1.4320718050003052, "eval_runtime": 2.094, "eval_samples_per_second": 119.389, "eval_steps_per_second": 3.343, "step": 114 }, { "epoch": 9.2, "grad_norm": 0.30078125, "learning_rate": 2.11864406779661e-06, "loss": 0.0016, "step": 115 }, { "epoch": 9.2, "eval_accuracy": 0.728, "eval_loss": 1.4348891973495483, "eval_runtime": 2.1448, "eval_samples_per_second": 116.559, "eval_steps_per_second": 3.264, "step": 115 }, { "epoch": 9.28, "grad_norm": 1.9375, "learning_rate": 1.6949152542372882e-06, "loss": 0.0103, "step": 116 }, { "epoch": 9.28, "eval_accuracy": 0.728, "eval_loss": 1.433516263961792, "eval_runtime": 2.1453, "eval_samples_per_second": 116.536, "eval_steps_per_second": 3.263, "step": 116 }, { "epoch": 9.36, "grad_norm": 0.55859375, "learning_rate": 1.2711864406779662e-06, "loss": 0.0024, "step": 117 }, { "epoch": 9.36, "eval_accuracy": 0.728, "eval_loss": 1.4311394691467285, "eval_runtime": 2.1427, "eval_samples_per_second": 116.676, "eval_steps_per_second": 3.267, "step": 117 }, { "epoch": 9.44, "grad_norm": 0.55859375, "learning_rate": 8.474576271186441e-07, "loss": 0.0023, "step": 118 }, { "epoch": 9.44, "eval_accuracy": 0.728, "eval_loss": 1.4323029518127441, "eval_runtime": 2.1437, "eval_samples_per_second": 116.618, "eval_steps_per_second": 3.265, "step": 118 }, { "epoch": 9.52, "grad_norm": 0.408203125, "learning_rate": 4.2372881355932204e-07, "loss": 0.0028, "step": 119 }, { "epoch": 9.52, "eval_accuracy": 0.728, "eval_loss": 1.4346427917480469, "eval_runtime": 2.1431, "eval_samples_per_second": 116.651, "eval_steps_per_second": 3.266, "step": 119 }, { "epoch": 9.6, "grad_norm": 0.06982421875, "learning_rate": 0.0, "loss": 0.0004, "step": 120 }, { "epoch": 9.6, "eval_accuracy": 0.728, "eval_loss": 1.4318400621414185, "eval_runtime": 2.1424, "eval_samples_per_second": 116.691, "eval_steps_per_second": 3.267, "step": 120 }, { "epoch": 9.6, "step": 120, "total_flos": 2.803887199223808e+16, "train_loss": 0.28943174814849043, "train_runtime": 477.0534, "train_samples_per_second": 20.962, "train_steps_per_second": 0.252 } ], "logging_steps": 1, "max_steps": 120, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.803887199223808e+16, "train_batch_size": 10, "trial_name": null, "trial_params": null }