adapters-gemma-bf16-QLORA-super_glue-wic/trainer_state-gemma-bf16-QLORA-super_glue-wic-sequence_classification.json
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 9.6,
  "eval_steps": 1,
  "global_step": 120,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.08,
      "grad_norm": 136.0,
      "learning_rate": 2.5e-05,
      "loss": 1.0785,
      "step": 1
    },
    {
      "epoch": 0.08,
      "eval_accuracy": 0.516,
      "eval_loss": 1.023781657218933,
      "eval_runtime": 2.0454,
      "eval_samples_per_second": 122.227,
      "eval_steps_per_second": 3.422,
      "step": 1
    },
    {
      "epoch": 0.16,
      "grad_norm": 185.0,
      "learning_rate": 5e-05,
      "loss": 1.2776,
      "step": 2
    },
    {
      "epoch": 0.16,
      "eval_accuracy": 0.504,
      "eval_loss": 1.0729936361312866,
      "eval_runtime": 2.1411,
      "eval_samples_per_second": 116.764,
      "eval_steps_per_second": 3.269,
      "step": 2
    },
    {
      "epoch": 0.24,
      "grad_norm": 162.0,
      "learning_rate": 4.957627118644068e-05,
      "loss": 1.1542,
      "step": 3
    },
    {
      "epoch": 0.24,
      "eval_accuracy": 0.516,
      "eval_loss": 1.3786845207214355,
      "eval_runtime": 2.1405,
      "eval_samples_per_second": 116.796,
      "eval_steps_per_second": 3.27,
      "step": 3
    },
    {
      "epoch": 0.32,
      "grad_norm": 262.0,
      "learning_rate": 4.915254237288136e-05,
      "loss": 1.6628,
      "step": 4
    },
    {
      "epoch": 0.32,
      "eval_accuracy": 0.552,
      "eval_loss": 0.850824236869812,
      "eval_runtime": 2.1406,
      "eval_samples_per_second": 116.792,
      "eval_steps_per_second": 3.27,
      "step": 4
    },
    {
      "epoch": 0.4,
      "grad_norm": 86.5,
      "learning_rate": 4.8728813559322034e-05,
      "loss": 1.0706,
      "step": 5
    },
    {
      "epoch": 0.4,
      "eval_accuracy": 0.512,
      "eval_loss": 1.2434474229812622,
      "eval_runtime": 2.1419,
      "eval_samples_per_second": 116.717,
      "eval_steps_per_second": 3.268,
      "step": 5
    },
    {
      "epoch": 0.48,
      "grad_norm": 221.0,
      "learning_rate": 4.8305084745762714e-05,
      "loss": 1.4146,
      "step": 6
    },
    {
      "epoch": 0.48,
      "eval_accuracy": 0.488,
      "eval_loss": 1.4112776517868042,
      "eval_runtime": 2.1428,
      "eval_samples_per_second": 116.672,
      "eval_steps_per_second": 3.267,
      "step": 6
    },
    {
      "epoch": 0.56,
      "grad_norm": 239.0,
      "learning_rate": 4.788135593220339e-05,
      "loss": 1.4855,
      "step": 7
    },
    {
      "epoch": 0.56,
      "eval_accuracy": 0.564,
      "eval_loss": 0.9813472032546997,
      "eval_runtime": 2.1413,
      "eval_samples_per_second": 116.75,
      "eval_steps_per_second": 3.269,
      "step": 7
    },
    {
      "epoch": 0.64,
      "grad_norm": 158.0,
      "learning_rate": 4.745762711864407e-05,
      "loss": 1.0638,
      "step": 8
    },
    {
      "epoch": 0.64,
      "eval_accuracy": 0.624,
      "eval_loss": 0.7241154909133911,
      "eval_runtime": 2.1408,
      "eval_samples_per_second": 116.78,
      "eval_steps_per_second": 3.27,
      "step": 8
    },
    {
      "epoch": 0.72,
      "grad_norm": 70.0,
      "learning_rate": 4.703389830508475e-05,
      "loss": 0.6901,
      "step": 9
    },
    {
      "epoch": 0.72,
      "eval_accuracy": 0.628,
      "eval_loss": 0.7069931626319885,
      "eval_runtime": 2.1404,
      "eval_samples_per_second": 116.801,
      "eval_steps_per_second": 3.27,
      "step": 9
    },
    {
      "epoch": 0.8,
      "grad_norm": 87.0,
      "learning_rate": 4.6610169491525425e-05,
      "loss": 0.7535,
      "step": 10
    },
    {
      "epoch": 0.8,
      "eval_accuracy": 0.64,
      "eval_loss": 0.6583124995231628,
      "eval_runtime": 2.1407,
      "eval_samples_per_second": 116.785,
      "eval_steps_per_second": 3.27,
      "step": 10
    },
    {
      "epoch": 0.88,
      "grad_norm": 52.75,
      "learning_rate": 4.6186440677966104e-05,
      "loss": 0.6397,
      "step": 11
    },
    {
      "epoch": 0.88,
      "eval_accuracy": 0.608,
      "eval_loss": 0.7135878801345825,
      "eval_runtime": 2.14,
      "eval_samples_per_second": 116.822,
      "eval_steps_per_second": 3.271,
      "step": 11
    },
    {
      "epoch": 0.96,
      "grad_norm": 34.0,
      "learning_rate": 4.5762711864406784e-05,
      "loss": 0.6484,
      "step": 12
    },
    {
      "epoch": 0.96,
      "eval_accuracy": 0.616,
      "eval_loss": 0.6957533955574036,
      "eval_runtime": 2.141,
      "eval_samples_per_second": 116.766,
      "eval_steps_per_second": 3.269,
      "step": 12
    },
    {
      "epoch": 1.04,
      "grad_norm": 13.6875,
      "learning_rate": 4.533898305084746e-05,
      "loss": 0.66,
      "step": 13
    },
    {
      "epoch": 1.04,
      "eval_accuracy": 0.652,
      "eval_loss": 0.6610312461853027,
      "eval_runtime": 2.1413,
      "eval_samples_per_second": 116.749,
      "eval_steps_per_second": 3.269,
      "step": 13
    },
    {
      "epoch": 1.12,
      "grad_norm": 7.90625,
      "learning_rate": 4.491525423728814e-05,
      "loss": 0.6312,
      "step": 14
    },
    {
      "epoch": 1.12,
      "eval_accuracy": 0.66,
      "eval_loss": 0.6188369393348694,
      "eval_runtime": 2.1393,
      "eval_samples_per_second": 116.863,
      "eval_steps_per_second": 3.272,
      "step": 14
    },
    {
      "epoch": 1.2,
      "grad_norm": 44.25,
      "learning_rate": 4.4491525423728816e-05,
      "loss": 0.4925,
      "step": 15
    },
    {
      "epoch": 1.2,
      "eval_accuracy": 0.7,
      "eval_loss": 0.6158857345581055,
      "eval_runtime": 2.1409,
      "eval_samples_per_second": 116.773,
      "eval_steps_per_second": 3.27,
      "step": 15
    },
    {
      "epoch": 1.28,
      "grad_norm": 101.0,
      "learning_rate": 4.4067796610169495e-05,
      "loss": 0.613,
      "step": 16
    },
    {
      "epoch": 1.28,
      "eval_accuracy": 0.708,
      "eval_loss": 0.5807080268859863,
      "eval_runtime": 2.1421,
      "eval_samples_per_second": 116.71,
      "eval_steps_per_second": 3.268,
      "step": 16
    },
    {
      "epoch": 1.3599999999999999,
      "grad_norm": 6.03125,
      "learning_rate": 4.3644067796610175e-05,
      "loss": 0.47,
      "step": 17
    },
    {
      "epoch": 1.3599999999999999,
      "eval_accuracy": 0.688,
      "eval_loss": 0.5807793140411377,
      "eval_runtime": 2.0911,
      "eval_samples_per_second": 119.555,
      "eval_steps_per_second": 3.348,
      "step": 17
    },
    {
      "epoch": 1.44,
      "grad_norm": 41.25,
      "learning_rate": 4.3220338983050854e-05,
      "loss": 0.5137,
      "step": 18
    },
    {
      "epoch": 1.44,
      "eval_accuracy": 0.704,
      "eval_loss": 0.5797407031059265,
      "eval_runtime": 2.1399,
      "eval_samples_per_second": 116.829,
      "eval_steps_per_second": 3.271,
      "step": 18
    },
    {
      "epoch": 1.52,
      "grad_norm": 32.25,
      "learning_rate": 4.279661016949153e-05,
      "loss": 0.5839,
      "step": 19
    },
    {
      "epoch": 1.52,
      "eval_accuracy": 0.704,
      "eval_loss": 0.628451406955719,
      "eval_runtime": 2.1414,
      "eval_samples_per_second": 116.747,
      "eval_steps_per_second": 3.269,
      "step": 19
    },
    {
      "epoch": 1.6,
      "grad_norm": 32.75,
      "learning_rate": 4.2372881355932206e-05,
      "loss": 0.4523,
      "step": 20
    },
    {
      "epoch": 1.6,
      "eval_accuracy": 0.692,
      "eval_loss": 0.6559668183326721,
      "eval_runtime": 2.1409,
      "eval_samples_per_second": 116.776,
      "eval_steps_per_second": 3.27,
      "step": 20
    },
    {
      "epoch": 1.6800000000000002,
      "grad_norm": 110.5,
      "learning_rate": 4.1949152542372886e-05,
      "loss": 0.6259,
      "step": 21
    },
    {
      "epoch": 1.6800000000000002,
      "eval_accuracy": 0.716,
      "eval_loss": 0.6154900193214417,
      "eval_runtime": 2.1418,
      "eval_samples_per_second": 116.724,
      "eval_steps_per_second": 3.268,
      "step": 21
    },
    {
      "epoch": 1.76,
      "grad_norm": 52.75,
      "learning_rate": 4.152542372881356e-05,
      "loss": 0.5299,
      "step": 22
    },
    {
      "epoch": 1.76,
      "eval_accuracy": 0.712,
      "eval_loss": 0.5984148979187012,
      "eval_runtime": 2.1432,
      "eval_samples_per_second": 116.647,
      "eval_steps_per_second": 3.266,
      "step": 22
    },
    {
      "epoch": 1.8399999999999999,
      "grad_norm": 49.0,
      "learning_rate": 4.110169491525424e-05,
      "loss": 0.5721,
      "step": 23
    },
    {
      "epoch": 1.8399999999999999,
      "eval_accuracy": 0.712,
      "eval_loss": 0.6128161549568176,
      "eval_runtime": 2.1427,
      "eval_samples_per_second": 116.677,
      "eval_steps_per_second": 3.267,
      "step": 23
    },
    {
      "epoch": 1.92,
      "grad_norm": 97.5,
      "learning_rate": 4.067796610169492e-05,
      "loss": 0.7519,
      "step": 24
    },
    {
      "epoch": 1.92,
      "eval_accuracy": 0.728,
      "eval_loss": 0.5905612707138062,
      "eval_runtime": 2.1425,
      "eval_samples_per_second": 116.687,
      "eval_steps_per_second": 3.267,
      "step": 24
    },
    {
      "epoch": 2.0,
      "grad_norm": 17.0,
      "learning_rate": 4.025423728813559e-05,
      "loss": 0.4132,
      "step": 25
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.716,
      "eval_loss": 0.6202563643455505,
      "eval_runtime": 2.1404,
      "eval_samples_per_second": 116.802,
      "eval_steps_per_second": 3.27,
      "step": 25
    },
    {
      "epoch": 2.08,
      "grad_norm": 69.5,
      "learning_rate": 3.983050847457627e-05,
      "loss": 0.4021,
      "step": 26
    },
    {
      "epoch": 2.08,
      "eval_accuracy": 0.708,
      "eval_loss": 0.6416750550270081,
      "eval_runtime": 2.1429,
      "eval_samples_per_second": 116.663,
      "eval_steps_per_second": 3.267,
      "step": 26
    },
    {
      "epoch": 2.16,
      "grad_norm": 73.5,
      "learning_rate": 3.940677966101695e-05,
      "loss": 0.4837,
      "step": 27
    },
    {
      "epoch": 2.16,
      "eval_accuracy": 0.724,
      "eval_loss": 0.609797477722168,
      "eval_runtime": 2.1417,
      "eval_samples_per_second": 116.727,
      "eval_steps_per_second": 3.268,
      "step": 27
    },
    {
      "epoch": 2.24,
      "grad_norm": 31.0,
      "learning_rate": 3.898305084745763e-05,
      "loss": 0.382,
      "step": 28
    },
    {
      "epoch": 2.24,
      "eval_accuracy": 0.732,
      "eval_loss": 0.5821030139923096,
      "eval_runtime": 2.1399,
      "eval_samples_per_second": 116.826,
      "eval_steps_per_second": 3.271,
      "step": 28
    },
    {
      "epoch": 2.32,
      "grad_norm": 49.25,
      "learning_rate": 3.855932203389831e-05,
      "loss": 0.4402,
      "step": 29
    },
    {
      "epoch": 2.32,
      "eval_accuracy": 0.74,
      "eval_loss": 0.5874293446540833,
      "eval_runtime": 2.1406,
      "eval_samples_per_second": 116.787,
      "eval_steps_per_second": 3.27,
      "step": 29
    },
    {
      "epoch": 2.4,
      "grad_norm": 26.5,
      "learning_rate": 3.813559322033898e-05,
      "loss": 0.3664,
      "step": 30
    },
    {
      "epoch": 2.4,
      "eval_accuracy": 0.732,
      "eval_loss": 0.5800690650939941,
      "eval_runtime": 2.1453,
      "eval_samples_per_second": 116.536,
      "eval_steps_per_second": 3.263,
      "step": 30
    },
    {
      "epoch": 2.48,
      "grad_norm": 34.5,
      "learning_rate": 3.771186440677966e-05,
      "loss": 0.313,
      "step": 31
    },
    {
      "epoch": 2.48,
      "eval_accuracy": 0.728,
      "eval_loss": 0.6099223494529724,
      "eval_runtime": 2.1422,
      "eval_samples_per_second": 116.701,
      "eval_steps_per_second": 3.268,
      "step": 31
    },
    {
      "epoch": 2.56,
      "grad_norm": 42.5,
      "learning_rate": 3.728813559322034e-05,
      "loss": 0.4822,
      "step": 32
    },
    {
      "epoch": 2.56,
      "eval_accuracy": 0.716,
      "eval_loss": 0.626610517501831,
      "eval_runtime": 2.1438,
      "eval_samples_per_second": 116.615,
      "eval_steps_per_second": 3.265,
      "step": 32
    },
    {
      "epoch": 2.64,
      "grad_norm": 48.75,
      "learning_rate": 3.686440677966102e-05,
      "loss": 0.3374,
      "step": 33
    },
    {
      "epoch": 2.64,
      "eval_accuracy": 0.74,
      "eval_loss": 0.6000227332115173,
      "eval_runtime": 2.144,
      "eval_samples_per_second": 116.603,
      "eval_steps_per_second": 3.265,
      "step": 33
    },
    {
      "epoch": 2.7199999999999998,
      "grad_norm": 4.59375,
      "learning_rate": 3.644067796610169e-05,
      "loss": 0.2605,
      "step": 34
    },
    {
      "epoch": 2.7199999999999998,
      "eval_accuracy": 0.748,
      "eval_loss": 0.5826870203018188,
      "eval_runtime": 2.1443,
      "eval_samples_per_second": 116.588,
      "eval_steps_per_second": 3.264,
      "step": 34
    },
    {
      "epoch": 2.8,
      "grad_norm": 46.25,
      "learning_rate": 3.601694915254237e-05,
      "loss": 0.4195,
      "step": 35
    },
    {
      "epoch": 2.8,
      "eval_accuracy": 0.748,
      "eval_loss": 0.5882580280303955,
      "eval_runtime": 2.0917,
      "eval_samples_per_second": 119.522,
      "eval_steps_per_second": 3.347,
      "step": 35
    },
    {
      "epoch": 2.88,
      "grad_norm": 23.125,
      "learning_rate": 3.559322033898305e-05,
      "loss": 0.3838,
      "step": 36
    },
    {
      "epoch": 2.88,
      "eval_accuracy": 0.728,
      "eval_loss": 0.5986873507499695,
      "eval_runtime": 2.1402,
      "eval_samples_per_second": 116.811,
      "eval_steps_per_second": 3.271,
      "step": 36
    },
    {
      "epoch": 2.96,
      "grad_norm": 25.375,
      "learning_rate": 3.516949152542373e-05,
      "loss": 0.4504,
      "step": 37
    },
    {
      "epoch": 2.96,
      "eval_accuracy": 0.74,
      "eval_loss": 0.599513828754425,
      "eval_runtime": 2.1411,
      "eval_samples_per_second": 116.761,
      "eval_steps_per_second": 3.269,
      "step": 37
    },
    {
      "epoch": 3.04,
      "grad_norm": 12.625,
      "learning_rate": 3.474576271186441e-05,
      "loss": 0.3745,
      "step": 38
    },
    {
      "epoch": 3.04,
      "eval_accuracy": 0.744,
      "eval_loss": 0.602383017539978,
      "eval_runtime": 2.1404,
      "eval_samples_per_second": 116.803,
      "eval_steps_per_second": 3.27,
      "step": 38
    },
    {
      "epoch": 3.12,
      "grad_norm": 38.5,
      "learning_rate": 3.432203389830508e-05,
      "loss": 0.2068,
      "step": 39
    },
    {
      "epoch": 3.12,
      "eval_accuracy": 0.728,
      "eval_loss": 0.6505258083343506,
      "eval_runtime": 2.1409,
      "eval_samples_per_second": 116.772,
      "eval_steps_per_second": 3.27,
      "step": 39
    },
    {
      "epoch": 3.2,
      "grad_norm": 57.25,
      "learning_rate": 3.389830508474576e-05,
      "loss": 0.354,
      "step": 40
    },
    {
      "epoch": 3.2,
      "eval_accuracy": 0.724,
      "eval_loss": 0.681692361831665,
      "eval_runtime": 2.1411,
      "eval_samples_per_second": 116.762,
      "eval_steps_per_second": 3.269,
      "step": 40
    },
    {
      "epoch": 3.2800000000000002,
      "grad_norm": 20.25,
      "learning_rate": 3.347457627118644e-05,
      "loss": 0.2488,
      "step": 41
    },
    {
      "epoch": 3.2800000000000002,
      "eval_accuracy": 0.744,
      "eval_loss": 0.6564323306083679,
      "eval_runtime": 2.1392,
      "eval_samples_per_second": 116.864,
      "eval_steps_per_second": 3.272,
      "step": 41
    },
    {
      "epoch": 3.36,
      "grad_norm": 28.625,
      "learning_rate": 3.305084745762712e-05,
      "loss": 0.267,
      "step": 42
    },
    {
      "epoch": 3.36,
      "eval_accuracy": 0.744,
      "eval_loss": 0.6272720694541931,
      "eval_runtime": 2.1431,
      "eval_samples_per_second": 116.651,
      "eval_steps_per_second": 3.266,
      "step": 42
    },
    {
      "epoch": 3.44,
      "grad_norm": 6.90625,
      "learning_rate": 3.26271186440678e-05,
      "loss": 0.2534,
      "step": 43
    },
    {
      "epoch": 3.44,
      "eval_accuracy": 0.72,
      "eval_loss": 0.679269015789032,
      "eval_runtime": 2.1434,
      "eval_samples_per_second": 116.636,
      "eval_steps_per_second": 3.266,
      "step": 43
    },
    {
      "epoch": 3.52,
      "grad_norm": 67.0,
      "learning_rate": 3.2203389830508473e-05,
      "loss": 0.3636,
      "step": 44
    },
    {
      "epoch": 3.52,
      "eval_accuracy": 0.72,
      "eval_loss": 0.7287683486938477,
      "eval_runtime": 2.1422,
      "eval_samples_per_second": 116.7,
      "eval_steps_per_second": 3.268,
      "step": 44
    },
    {
      "epoch": 3.6,
      "grad_norm": 68.5,
      "learning_rate": 3.177966101694915e-05,
      "loss": 0.3003,
      "step": 45
    },
    {
      "epoch": 3.6,
      "eval_accuracy": 0.72,
      "eval_loss": 0.7087514400482178,
      "eval_runtime": 2.1436,
      "eval_samples_per_second": 116.624,
      "eval_steps_per_second": 3.265,
      "step": 45
    },
    {
      "epoch": 3.68,
      "grad_norm": 84.0,
      "learning_rate": 3.135593220338983e-05,
      "loss": 0.3875,
      "step": 46
    },
    {
      "epoch": 3.68,
      "eval_accuracy": 0.74,
      "eval_loss": 0.6582326292991638,
      "eval_runtime": 2.142,
      "eval_samples_per_second": 116.713,
      "eval_steps_per_second": 3.268,
      "step": 46
    },
    {
      "epoch": 3.76,
      "grad_norm": 72.5,
      "learning_rate": 3.093220338983051e-05,
      "loss": 0.3178,
      "step": 47
    },
    {
      "epoch": 3.76,
      "eval_accuracy": 0.732,
      "eval_loss": 0.6546837687492371,
      "eval_runtime": 2.1409,
      "eval_samples_per_second": 116.776,
      "eval_steps_per_second": 3.27,
      "step": 47
    },
    {
      "epoch": 3.84,
      "grad_norm": 29.0,
      "learning_rate": 3.050847457627119e-05,
      "loss": 0.3452,
      "step": 48
    },
    {
      "epoch": 3.84,
      "eval_accuracy": 0.74,
      "eval_loss": 0.6847663521766663,
      "eval_runtime": 2.1429,
      "eval_samples_per_second": 116.662,
      "eval_steps_per_second": 3.267,
      "step": 48
    },
    {
      "epoch": 3.92,
      "grad_norm": 24.625,
      "learning_rate": 3.0084745762711864e-05,
      "loss": 0.2246,
      "step": 49
    },
    {
      "epoch": 3.92,
      "eval_accuracy": 0.732,
      "eval_loss": 0.6859081983566284,
      "eval_runtime": 2.1464,
      "eval_samples_per_second": 116.477,
      "eval_steps_per_second": 3.261,
      "step": 49
    },
    {
      "epoch": 4.0,
      "grad_norm": 50.5,
      "learning_rate": 2.9661016949152544e-05,
      "loss": 0.3548,
      "step": 50
    },
    {
      "epoch": 4.0,
      "eval_accuracy": 0.732,
      "eval_loss": 0.654508650302887,
      "eval_runtime": 2.1438,
      "eval_samples_per_second": 116.615,
      "eval_steps_per_second": 3.265,
      "step": 50
    },
    {
      "epoch": 4.08,
      "grad_norm": 18.25,
      "learning_rate": 2.9237288135593223e-05,
      "loss": 0.1941,
      "step": 51
    },
    {
      "epoch": 4.08,
      "eval_accuracy": 0.756,
      "eval_loss": 0.6475279331207275,
      "eval_runtime": 2.1439,
      "eval_samples_per_second": 116.612,
      "eval_steps_per_second": 3.265,
      "step": 51
    },
    {
      "epoch": 4.16,
      "grad_norm": 28.5,
      "learning_rate": 2.88135593220339e-05,
      "loss": 0.2195,
      "step": 52
    },
    {
      "epoch": 4.16,
      "eval_accuracy": 0.752,
      "eval_loss": 0.6630749702453613,
      "eval_runtime": 2.1424,
      "eval_samples_per_second": 116.691,
      "eval_steps_per_second": 3.267,
      "step": 52
    },
    {
      "epoch": 4.24,
      "grad_norm": 33.25,
      "learning_rate": 2.838983050847458e-05,
      "loss": 0.2597,
      "step": 53
    },
    {
      "epoch": 4.24,
      "eval_accuracy": 0.752,
      "eval_loss": 0.6564229726791382,
      "eval_runtime": 2.1436,
      "eval_samples_per_second": 116.628,
      "eval_steps_per_second": 3.266,
      "step": 53
    },
    {
      "epoch": 4.32,
      "grad_norm": 46.0,
      "learning_rate": 2.7966101694915255e-05,
      "loss": 0.1398,
      "step": 54
    },
    {
      "epoch": 4.32,
      "eval_accuracy": 0.744,
      "eval_loss": 0.646704912185669,
      "eval_runtime": 2.1406,
      "eval_samples_per_second": 116.789,
      "eval_steps_per_second": 3.27,
      "step": 54
    },
    {
      "epoch": 4.4,
      "grad_norm": 10.0625,
      "learning_rate": 2.754237288135593e-05,
      "loss": 0.1407,
      "step": 55
    },
    {
      "epoch": 4.4,
      "eval_accuracy": 0.72,
      "eval_loss": 0.6935080289840698,
      "eval_runtime": 2.145,
      "eval_samples_per_second": 116.552,
      "eval_steps_per_second": 3.263,
      "step": 55
    },
    {
      "epoch": 4.48,
      "grad_norm": 37.25,
      "learning_rate": 2.711864406779661e-05,
      "loss": 0.2388,
      "step": 56
    },
    {
      "epoch": 4.48,
      "eval_accuracy": 0.712,
      "eval_loss": 0.7364786267280579,
      "eval_runtime": 2.0957,
      "eval_samples_per_second": 119.291,
      "eval_steps_per_second": 3.34,
      "step": 56
    },
    {
      "epoch": 4.5600000000000005,
      "grad_norm": 35.0,
      "learning_rate": 2.669491525423729e-05,
      "loss": 0.2071,
      "step": 57
    },
    {
      "epoch": 4.5600000000000005,
      "eval_accuracy": 0.716,
      "eval_loss": 0.7429905533790588,
      "eval_runtime": 2.1434,
      "eval_samples_per_second": 116.637,
      "eval_steps_per_second": 3.266,
      "step": 57
    },
    {
      "epoch": 4.64,
      "grad_norm": 24.25,
      "learning_rate": 2.627118644067797e-05,
      "loss": 0.1246,
      "step": 58
    },
    {
      "epoch": 4.64,
      "eval_accuracy": 0.716,
      "eval_loss": 0.7141672372817993,
      "eval_runtime": 2.1431,
      "eval_samples_per_second": 116.652,
      "eval_steps_per_second": 3.266,
      "step": 58
    },
    {
      "epoch": 4.72,
      "grad_norm": 46.5,
      "learning_rate": 2.5847457627118642e-05,
      "loss": 0.2217,
      "step": 59
    },
    {
      "epoch": 4.72,
      "eval_accuracy": 0.736,
      "eval_loss": 0.6908359527587891,
      "eval_runtime": 2.1425,
      "eval_samples_per_second": 116.687,
      "eval_steps_per_second": 3.267,
      "step": 59
    },
    {
      "epoch": 4.8,
      "grad_norm": 6.0,
      "learning_rate": 2.5423728813559322e-05,
      "loss": 0.1806,
      "step": 60
    },
    {
      "epoch": 4.8,
      "eval_accuracy": 0.736,
      "eval_loss": 0.7421625852584839,
      "eval_runtime": 2.1421,
      "eval_samples_per_second": 116.709,
      "eval_steps_per_second": 3.268,
      "step": 60
    },
    {
      "epoch": 4.88,
      "grad_norm": 62.25,
      "learning_rate": 2.5e-05,
      "loss": 0.2022,
      "step": 61
    },
    {
      "epoch": 4.88,
      "eval_accuracy": 0.74,
      "eval_loss": 0.8047211170196533,
      "eval_runtime": 2.1436,
      "eval_samples_per_second": 116.625,
      "eval_steps_per_second": 3.265,
      "step": 61
    },
    {
      "epoch": 4.96,
      "grad_norm": 61.25,
      "learning_rate": 2.457627118644068e-05,
      "loss": 0.3026,
      "step": 62
    },
    {
      "epoch": 4.96,
      "eval_accuracy": 0.744,
      "eval_loss": 0.8158798813819885,
      "eval_runtime": 2.1428,
      "eval_samples_per_second": 116.671,
      "eval_steps_per_second": 3.267,
      "step": 62
    },
    {
      "epoch": 5.04,
      "grad_norm": 89.0,
      "learning_rate": 2.4152542372881357e-05,
      "loss": 0.3628,
      "step": 63
    },
    {
      "epoch": 5.04,
      "eval_accuracy": 0.732,
      "eval_loss": 0.7880058884620667,
      "eval_runtime": 2.142,
      "eval_samples_per_second": 116.711,
      "eval_steps_per_second": 3.268,
      "step": 63
    },
    {
      "epoch": 5.12,
      "grad_norm": 52.75,
      "learning_rate": 2.3728813559322036e-05,
      "loss": 0.1568,
      "step": 64
    },
    {
      "epoch": 5.12,
      "eval_accuracy": 0.752,
      "eval_loss": 0.7631772756576538,
      "eval_runtime": 2.1423,
      "eval_samples_per_second": 116.698,
      "eval_steps_per_second": 3.268,
      "step": 64
    },
    {
      "epoch": 5.2,
      "grad_norm": 3.625,
      "learning_rate": 2.3305084745762712e-05,
      "loss": 0.056,
      "step": 65
    },
    {
      "epoch": 5.2,
      "eval_accuracy": 0.74,
      "eval_loss": 0.8128260374069214,
      "eval_runtime": 2.1446,
      "eval_samples_per_second": 116.571,
      "eval_steps_per_second": 3.264,
      "step": 65
    },
    {
      "epoch": 5.28,
      "grad_norm": 2.78125,
      "learning_rate": 2.2881355932203392e-05,
      "loss": 0.0508,
      "step": 66
    },
    {
      "epoch": 5.28,
      "eval_accuracy": 0.72,
      "eval_loss": 0.9758080244064331,
      "eval_runtime": 2.1435,
      "eval_samples_per_second": 116.629,
      "eval_steps_per_second": 3.266,
      "step": 66
    },
    {
      "epoch": 5.36,
      "grad_norm": 48.25,
      "learning_rate": 2.245762711864407e-05,
      "loss": 0.2033,
      "step": 67
    },
    {
      "epoch": 5.36,
      "eval_accuracy": 0.704,
      "eval_loss": 1.1122255325317383,
      "eval_runtime": 2.1416,
      "eval_samples_per_second": 116.736,
      "eval_steps_per_second": 3.269,
      "step": 67
    },
    {
      "epoch": 5.44,
      "grad_norm": 64.0,
      "learning_rate": 2.2033898305084748e-05,
      "loss": 0.1902,
      "step": 68
    },
    {
      "epoch": 5.44,
      "eval_accuracy": 0.7,
      "eval_loss": 1.154529333114624,
      "eval_runtime": 2.1433,
      "eval_samples_per_second": 116.641,
      "eval_steps_per_second": 3.266,
      "step": 68
    },
    {
      "epoch": 5.52,
      "grad_norm": 80.5,
      "learning_rate": 2.1610169491525427e-05,
      "loss": 0.4264,
      "step": 69
    },
    {
      "epoch": 5.52,
      "eval_accuracy": 0.708,
      "eval_loss": 1.1109943389892578,
      "eval_runtime": 2.1436,
      "eval_samples_per_second": 116.624,
      "eval_steps_per_second": 3.265,
      "step": 69
    },
    {
      "epoch": 5.6,
      "grad_norm": 65.5,
      "learning_rate": 2.1186440677966103e-05,
      "loss": 0.2435,
      "step": 70
    },
    {
      "epoch": 5.6,
      "eval_accuracy": 0.736,
      "eval_loss": 1.0071001052856445,
      "eval_runtime": 2.1399,
      "eval_samples_per_second": 116.827,
      "eval_steps_per_second": 3.271,
      "step": 70
    },
    {
      "epoch": 5.68,
      "grad_norm": 46.25,
      "learning_rate": 2.076271186440678e-05,
      "loss": 0.1582,
      "step": 71
    },
    {
      "epoch": 5.68,
      "eval_accuracy": 0.736,
      "eval_loss": 0.920242965221405,
      "eval_runtime": 2.1407,
      "eval_samples_per_second": 116.783,
      "eval_steps_per_second": 3.27,
      "step": 71
    },
    {
      "epoch": 5.76,
      "grad_norm": 4.59375,
      "learning_rate": 2.033898305084746e-05,
      "loss": 0.0614,
      "step": 72
    },
    {
      "epoch": 5.76,
      "eval_accuracy": 0.744,
      "eval_loss": 0.9244736433029175,
      "eval_runtime": 2.1425,
      "eval_samples_per_second": 116.688,
      "eval_steps_per_second": 3.267,
      "step": 72
    },
    {
      "epoch": 5.84,
      "grad_norm": 8.0625,
      "learning_rate": 1.9915254237288135e-05,
      "loss": 0.1134,
      "step": 73
    },
    {
      "epoch": 5.84,
      "eval_accuracy": 0.74,
      "eval_loss": 0.9344038367271423,
      "eval_runtime": 2.1415,
      "eval_samples_per_second": 116.738,
      "eval_steps_per_second": 3.269,
      "step": 73
    },
    {
      "epoch": 5.92,
      "grad_norm": 17.375,
      "learning_rate": 1.9491525423728814e-05,
      "loss": 0.0692,
      "step": 74
    },
    {
      "epoch": 5.92,
      "eval_accuracy": 0.756,
      "eval_loss": 0.9447259902954102,
      "eval_runtime": 2.1434,
      "eval_samples_per_second": 116.639,
      "eval_steps_per_second": 3.266,
      "step": 74
    },
    {
      "epoch": 6.0,
      "grad_norm": 8.8125,
      "learning_rate": 1.906779661016949e-05,
      "loss": 0.054,
      "step": 75
    },
    {
      "epoch": 6.0,
      "eval_accuracy": 0.748,
      "eval_loss": 0.96815425157547,
      "eval_runtime": 2.142,
      "eval_samples_per_second": 116.713,
      "eval_steps_per_second": 3.268,
      "step": 75
    },
    {
      "epoch": 6.08,
      "grad_norm": 4.34375,
      "learning_rate": 1.864406779661017e-05,
      "loss": 0.0653,
      "step": 76
    },
    {
      "epoch": 6.08,
      "eval_accuracy": 0.748,
      "eval_loss": 0.9931323528289795,
      "eval_runtime": 2.1425,
      "eval_samples_per_second": 116.685,
      "eval_steps_per_second": 3.267,
      "step": 76
    },
    {
      "epoch": 6.16,
      "grad_norm": 1.1875,
      "learning_rate": 1.8220338983050846e-05,
      "loss": 0.0175,
      "step": 77
    },
    {
      "epoch": 6.16,
      "eval_accuracy": 0.736,
      "eval_loss": 1.0591909885406494,
      "eval_runtime": 2.1405,
      "eval_samples_per_second": 116.796,
      "eval_steps_per_second": 3.27,
      "step": 77
    },
    {
      "epoch": 6.24,
      "grad_norm": 13.75,
      "learning_rate": 1.7796610169491526e-05,
      "loss": 0.0699,
      "step": 78
    },
    {
      "epoch": 6.24,
      "eval_accuracy": 0.736,
      "eval_loss": 1.0900299549102783,
      "eval_runtime": 2.1428,
      "eval_samples_per_second": 116.671,
      "eval_steps_per_second": 3.267,
      "step": 78
    },
    {
      "epoch": 6.32,
      "grad_norm": 16.5,
      "learning_rate": 1.7372881355932205e-05,
      "loss": 0.0761,
      "step": 79
    },
    {
      "epoch": 6.32,
      "eval_accuracy": 0.736,
      "eval_loss": 1.0893088579177856,
      "eval_runtime": 2.141,
      "eval_samples_per_second": 116.769,
      "eval_steps_per_second": 3.27,
      "step": 79
    },
    {
      "epoch": 6.4,
      "grad_norm": 12.6875,
      "learning_rate": 1.694915254237288e-05,
      "loss": 0.1584,
      "step": 80
    },
    {
      "epoch": 6.4,
      "eval_accuracy": 0.736,
      "eval_loss": 1.1014509201049805,
      "eval_runtime": 2.0982,
      "eval_samples_per_second": 119.149,
      "eval_steps_per_second": 3.336,
      "step": 80
    },
    {
      "epoch": 6.48,
      "grad_norm": 6.78125,
      "learning_rate": 1.652542372881356e-05,
      "loss": 0.0631,
      "step": 81
    },
    {
      "epoch": 6.48,
      "eval_accuracy": 0.748,
      "eval_loss": 1.0965150594711304,
      "eval_runtime": 2.1439,
      "eval_samples_per_second": 116.612,
      "eval_steps_per_second": 3.265,
      "step": 81
    },
    {
      "epoch": 6.5600000000000005,
      "grad_norm": 7.53125,
      "learning_rate": 1.6101694915254237e-05,
      "loss": 0.0382,
      "step": 82
    },
    {
      "epoch": 6.5600000000000005,
      "eval_accuracy": 0.748,
      "eval_loss": 1.0999393463134766,
      "eval_runtime": 2.1423,
      "eval_samples_per_second": 116.698,
      "eval_steps_per_second": 3.268,
      "step": 82
    },
    {
      "epoch": 6.64,
      "grad_norm": 11.0625,
      "learning_rate": 1.5677966101694916e-05,
      "loss": 0.0582,
      "step": 83
    },
    {
      "epoch": 6.64,
      "eval_accuracy": 0.744,
      "eval_loss": 1.0906572341918945,
      "eval_runtime": 2.143,
      "eval_samples_per_second": 116.66,
      "eval_steps_per_second": 3.266,
      "step": 83
    },
    {
      "epoch": 6.72,
      "grad_norm": 14.75,
      "learning_rate": 1.5254237288135596e-05,
      "loss": 0.0316,
      "step": 84
    },
    {
      "epoch": 6.72,
      "eval_accuracy": 0.748,
      "eval_loss": 1.1019198894500732,
      "eval_runtime": 2.1453,
      "eval_samples_per_second": 116.535,
      "eval_steps_per_second": 3.263,
      "step": 84
    },
    {
      "epoch": 6.8,
      "grad_norm": 17.375,
      "learning_rate": 1.4830508474576272e-05,
      "loss": 0.0509,
      "step": 85
    },
    {
      "epoch": 6.8,
      "eval_accuracy": 0.744,
      "eval_loss": 1.0971815586090088,
      "eval_runtime": 2.1409,
      "eval_samples_per_second": 116.772,
      "eval_steps_per_second": 3.27,
      "step": 85
    },
    {
      "epoch": 6.88,
      "grad_norm": 8.5625,
      "learning_rate": 1.440677966101695e-05,
      "loss": 0.0787,
      "step": 86
    },
    {
      "epoch": 6.88,
      "eval_accuracy": 0.748,
      "eval_loss": 1.094210147857666,
      "eval_runtime": 2.0923,
      "eval_samples_per_second": 119.488,
      "eval_steps_per_second": 3.346,
      "step": 86
    },
    {
      "epoch": 6.96,
      "grad_norm": 10.5,
      "learning_rate": 1.3983050847457627e-05,
      "loss": 0.0346,
      "step": 87
    },
    {
      "epoch": 6.96,
      "eval_accuracy": 0.744,
      "eval_loss": 1.0973331928253174,
      "eval_runtime": 2.1418,
      "eval_samples_per_second": 116.723,
      "eval_steps_per_second": 3.268,
      "step": 87
    },
    {
      "epoch": 7.04,
      "grad_norm": 5.71875,
      "learning_rate": 1.3559322033898305e-05,
      "loss": 0.0242,
      "step": 88
    },
    {
      "epoch": 7.04,
      "eval_accuracy": 0.744,
      "eval_loss": 1.096313714981079,
      "eval_runtime": 2.1419,
      "eval_samples_per_second": 116.72,
      "eval_steps_per_second": 3.268,
      "step": 88
    },
    {
      "epoch": 7.12,
      "grad_norm": 2.296875,
      "learning_rate": 1.3135593220338985e-05,
      "loss": 0.0089,
      "step": 89
    },
    {
      "epoch": 7.12,
      "eval_accuracy": 0.74,
      "eval_loss": 1.108156442642212,
      "eval_runtime": 2.1448,
      "eval_samples_per_second": 116.562,
      "eval_steps_per_second": 3.264,
      "step": 89
    },
    {
      "epoch": 7.2,
      "grad_norm": 2.90625,
      "learning_rate": 1.2711864406779661e-05,
      "loss": 0.0143,
      "step": 90
    },
    {
      "epoch": 7.2,
      "eval_accuracy": 0.736,
      "eval_loss": 1.1125974655151367,
      "eval_runtime": 2.1429,
      "eval_samples_per_second": 116.666,
      "eval_steps_per_second": 3.267,
      "step": 90
    },
    {
      "epoch": 7.28,
      "grad_norm": 1.671875,
      "learning_rate": 1.228813559322034e-05,
      "loss": 0.007,
      "step": 91
    },
    {
      "epoch": 7.28,
      "eval_accuracy": 0.736,
      "eval_loss": 1.134128451347351,
      "eval_runtime": 2.1433,
      "eval_samples_per_second": 116.64,
      "eval_steps_per_second": 3.266,
      "step": 91
    },
    {
      "epoch": 7.36,
      "grad_norm": 2.796875,
      "learning_rate": 1.1864406779661018e-05,
      "loss": 0.0159,
      "step": 92
    },
    {
      "epoch": 7.36,
      "eval_accuracy": 0.724,
      "eval_loss": 1.1692942380905151,
      "eval_runtime": 2.093,
      "eval_samples_per_second": 119.446,
      "eval_steps_per_second": 3.344,
      "step": 92
    },
    {
      "epoch": 7.44,
      "grad_norm": 6.3125,
      "learning_rate": 1.1440677966101696e-05,
      "loss": 0.0148,
      "step": 93
    },
    {
      "epoch": 7.44,
      "eval_accuracy": 0.72,
      "eval_loss": 1.187640905380249,
      "eval_runtime": 2.1424,
      "eval_samples_per_second": 116.694,
      "eval_steps_per_second": 3.267,
      "step": 93
    },
    {
      "epoch": 7.52,
      "grad_norm": 8.0625,
      "learning_rate": 1.1016949152542374e-05,
      "loss": 0.0343,
      "step": 94
    },
    {
      "epoch": 7.52,
      "eval_accuracy": 0.728,
      "eval_loss": 1.1934490203857422,
      "eval_runtime": 2.1413,
      "eval_samples_per_second": 116.753,
      "eval_steps_per_second": 3.269,
      "step": 94
    },
    {
      "epoch": 7.6,
      "grad_norm": 2.296875,
      "learning_rate": 1.0593220338983052e-05,
      "loss": 0.0124,
      "step": 95
    },
    {
      "epoch": 7.6,
      "eval_accuracy": 0.728,
      "eval_loss": 1.1945278644561768,
      "eval_runtime": 2.1425,
      "eval_samples_per_second": 116.687,
      "eval_steps_per_second": 3.267,
      "step": 95
    },
    {
      "epoch": 7.68,
      "grad_norm": 7.78125,
      "learning_rate": 1.016949152542373e-05,
      "loss": 0.0975,
      "step": 96
    },
    {
      "epoch": 7.68,
      "eval_accuracy": 0.736,
      "eval_loss": 1.1970733404159546,
      "eval_runtime": 2.145,
      "eval_samples_per_second": 116.551,
      "eval_steps_per_second": 3.263,
      "step": 96
    },
    {
      "epoch": 7.76,
      "grad_norm": 3.484375,
      "learning_rate": 9.745762711864407e-06,
      "loss": 0.0192,
      "step": 97
    },
    {
      "epoch": 7.76,
      "eval_accuracy": 0.744,
      "eval_loss": 1.195726752281189,
      "eval_runtime": 2.145,
      "eval_samples_per_second": 116.549,
      "eval_steps_per_second": 3.263,
      "step": 97
    },
    {
      "epoch": 7.84,
      "grad_norm": 5.78125,
      "learning_rate": 9.322033898305085e-06,
      "loss": 0.0314,
      "step": 98
    },
    {
      "epoch": 7.84,
      "eval_accuracy": 0.744,
      "eval_loss": 1.2016874551773071,
      "eval_runtime": 2.1454,
      "eval_samples_per_second": 116.529,
      "eval_steps_per_second": 3.263,
      "step": 98
    },
    {
      "epoch": 7.92,
      "grad_norm": 1.9921875,
      "learning_rate": 8.898305084745763e-06,
      "loss": 0.0094,
      "step": 99
    },
    {
      "epoch": 7.92,
      "eval_accuracy": 0.748,
      "eval_loss": 1.212010383605957,
      "eval_runtime": 2.1446,
      "eval_samples_per_second": 116.572,
      "eval_steps_per_second": 3.264,
      "step": 99
    },
    {
      "epoch": 8.0,
      "grad_norm": 1.046875,
      "learning_rate": 8.47457627118644e-06,
      "loss": 0.0098,
      "step": 100
    },
    {
      "epoch": 8.0,
      "eval_accuracy": 0.748,
      "eval_loss": 1.2213313579559326,
      "eval_runtime": 2.1436,
      "eval_samples_per_second": 116.624,
      "eval_steps_per_second": 3.265,
      "step": 100
    },
    {
      "epoch": 8.08,
      "grad_norm": 1.1875,
      "learning_rate": 8.050847457627118e-06,
      "loss": 0.0056,
      "step": 101
    },
    {
      "epoch": 8.08,
      "eval_accuracy": 0.748,
      "eval_loss": 1.2390106916427612,
      "eval_runtime": 2.1453,
      "eval_samples_per_second": 116.536,
      "eval_steps_per_second": 3.263,
      "step": 101
    },
    {
      "epoch": 8.16,
      "grad_norm": 0.6640625,
      "learning_rate": 7.627118644067798e-06,
      "loss": 0.003,
      "step": 102
    },
    {
      "epoch": 8.16,
      "eval_accuracy": 0.748,
      "eval_loss": 1.253604531288147,
      "eval_runtime": 2.1442,
      "eval_samples_per_second": 116.594,
      "eval_steps_per_second": 3.265,
      "step": 102
    },
    {
      "epoch": 8.24,
      "grad_norm": 0.578125,
      "learning_rate": 7.203389830508475e-06,
      "loss": 0.005,
      "step": 103
    },
    {
      "epoch": 8.24,
      "eval_accuracy": 0.74,
      "eval_loss": 1.2792062759399414,
      "eval_runtime": 2.143,
      "eval_samples_per_second": 116.656,
      "eval_steps_per_second": 3.266,
      "step": 103
    },
    {
      "epoch": 8.32,
      "grad_norm": 12.125,
      "learning_rate": 6.779661016949153e-06,
      "loss": 0.0821,
      "step": 104
    },
    {
      "epoch": 8.32,
      "eval_accuracy": 0.74,
      "eval_loss": 1.2985644340515137,
      "eval_runtime": 2.1417,
      "eval_samples_per_second": 116.731,
      "eval_steps_per_second": 3.268,
      "step": 104
    },
    {
      "epoch": 8.4,
      "grad_norm": 0.435546875,
      "learning_rate": 6.3559322033898304e-06,
      "loss": 0.0038,
      "step": 105
    },
    {
      "epoch": 8.4,
      "eval_accuracy": 0.736,
      "eval_loss": 1.3192646503448486,
      "eval_runtime": 2.1413,
      "eval_samples_per_second": 116.753,
      "eval_steps_per_second": 3.269,
      "step": 105
    },
    {
      "epoch": 8.48,
      "grad_norm": 1.4453125,
      "learning_rate": 5.932203389830509e-06,
      "loss": 0.0035,
      "step": 106
    },
    {
      "epoch": 8.48,
      "eval_accuracy": 0.728,
      "eval_loss": 1.340459942817688,
      "eval_runtime": 2.1419,
      "eval_samples_per_second": 116.717,
      "eval_steps_per_second": 3.268,
      "step": 106
    },
    {
      "epoch": 8.56,
      "grad_norm": 0.50390625,
      "learning_rate": 5.508474576271187e-06,
      "loss": 0.0031,
      "step": 107
    },
    {
      "epoch": 8.56,
      "eval_accuracy": 0.728,
      "eval_loss": 1.360876202583313,
      "eval_runtime": 2.143,
      "eval_samples_per_second": 116.659,
      "eval_steps_per_second": 3.266,
      "step": 107
    },
    {
      "epoch": 8.64,
      "grad_norm": 0.96875,
      "learning_rate": 5.084745762711865e-06,
      "loss": 0.0035,
      "step": 108
    },
    {
      "epoch": 8.64,
      "eval_accuracy": 0.724,
      "eval_loss": 1.3767642974853516,
      "eval_runtime": 2.1447,
      "eval_samples_per_second": 116.569,
      "eval_steps_per_second": 3.264,
      "step": 108
    },
    {
      "epoch": 8.72,
      "grad_norm": 0.41015625,
      "learning_rate": 4.6610169491525425e-06,
      "loss": 0.003,
      "step": 109
    },
    {
      "epoch": 8.72,
      "eval_accuracy": 0.724,
      "eval_loss": 1.3948369026184082,
      "eval_runtime": 2.1416,
      "eval_samples_per_second": 116.735,
      "eval_steps_per_second": 3.269,
      "step": 109
    },
    {
      "epoch": 8.8,
      "grad_norm": 0.36328125,
      "learning_rate": 4.23728813559322e-06,
      "loss": 0.0021,
      "step": 110
    },
    {
      "epoch": 8.8,
      "eval_accuracy": 0.724,
      "eval_loss": 1.4067965745925903,
      "eval_runtime": 2.1439,
      "eval_samples_per_second": 116.612,
      "eval_steps_per_second": 3.265,
      "step": 110
    },
    {
      "epoch": 8.88,
      "grad_norm": 0.59765625,
      "learning_rate": 3.813559322033899e-06,
      "loss": 0.0017,
      "step": 111
    },
    {
      "epoch": 8.88,
      "eval_accuracy": 0.724,
      "eval_loss": 1.4231809377670288,
      "eval_runtime": 2.1423,
      "eval_samples_per_second": 116.695,
      "eval_steps_per_second": 3.267,
      "step": 111
    },
    {
      "epoch": 8.96,
      "grad_norm": 0.53125,
      "learning_rate": 3.3898305084745763e-06,
      "loss": 0.0037,
      "step": 112
    },
    {
      "epoch": 8.96,
      "eval_accuracy": 0.724,
      "eval_loss": 1.4200129508972168,
      "eval_runtime": 2.1433,
      "eval_samples_per_second": 116.644,
      "eval_steps_per_second": 3.266,
      "step": 112
    },
    {
      "epoch": 9.04,
      "grad_norm": 1.375,
      "learning_rate": 2.9661016949152545e-06,
      "loss": 0.0027,
      "step": 113
    },
    {
      "epoch": 9.04,
      "eval_accuracy": 0.724,
      "eval_loss": 1.4301310777664185,
      "eval_runtime": 2.1447,
      "eval_samples_per_second": 116.568,
      "eval_steps_per_second": 3.264,
      "step": 113
    },
    {
      "epoch": 9.12,
      "grad_norm": 0.44140625,
      "learning_rate": 2.5423728813559323e-06,
      "loss": 0.0019,
      "step": 114
    },
    {
      "epoch": 9.12,
      "eval_accuracy": 0.728,
      "eval_loss": 1.4320718050003052,
      "eval_runtime": 2.094,
      "eval_samples_per_second": 119.389,
      "eval_steps_per_second": 3.343,
      "step": 114
    },
    {
      "epoch": 9.2,
      "grad_norm": 0.30078125,
      "learning_rate": 2.11864406779661e-06,
      "loss": 0.0016,
      "step": 115
    },
    {
      "epoch": 9.2,
      "eval_accuracy": 0.728,
      "eval_loss": 1.4348891973495483,
      "eval_runtime": 2.1448,
      "eval_samples_per_second": 116.559,
      "eval_steps_per_second": 3.264,
      "step": 115
    },
    {
      "epoch": 9.28,
      "grad_norm": 1.9375,
      "learning_rate": 1.6949152542372882e-06,
      "loss": 0.0103,
      "step": 116
    },
    {
      "epoch": 9.28,
      "eval_accuracy": 0.728,
      "eval_loss": 1.433516263961792,
      "eval_runtime": 2.1453,
      "eval_samples_per_second": 116.536,
      "eval_steps_per_second": 3.263,
      "step": 116
    },
    {
      "epoch": 9.36,
      "grad_norm": 0.55859375,
      "learning_rate": 1.2711864406779662e-06,
      "loss": 0.0024,
      "step": 117
    },
    {
      "epoch": 9.36,
      "eval_accuracy": 0.728,
      "eval_loss": 1.4311394691467285,
      "eval_runtime": 2.1427,
      "eval_samples_per_second": 116.676,
      "eval_steps_per_second": 3.267,
      "step": 117
    },
    {
      "epoch": 9.44,
      "grad_norm": 0.55859375,
      "learning_rate": 8.474576271186441e-07,
      "loss": 0.0023,
      "step": 118
    },
    {
      "epoch": 9.44,
      "eval_accuracy": 0.728,
      "eval_loss": 1.4323029518127441,
      "eval_runtime": 2.1437,
      "eval_samples_per_second": 116.618,
      "eval_steps_per_second": 3.265,
      "step": 118
    },
    {
      "epoch": 9.52,
      "grad_norm": 0.408203125,
      "learning_rate": 4.2372881355932204e-07,
      "loss": 0.0028,
      "step": 119
    },
    {
      "epoch": 9.52,
      "eval_accuracy": 0.728,
      "eval_loss": 1.4346427917480469,
      "eval_runtime": 2.1431,
      "eval_samples_per_second": 116.651,
      "eval_steps_per_second": 3.266,
      "step": 119
    },
    {
      "epoch": 9.6,
      "grad_norm": 0.06982421875,
      "learning_rate": 0.0,
      "loss": 0.0004,
      "step": 120
    },
    {
      "epoch": 9.6,
      "eval_accuracy": 0.728,
      "eval_loss": 1.4318400621414185,
      "eval_runtime": 2.1424,
      "eval_samples_per_second": 116.691,
      "eval_steps_per_second": 3.267,
      "step": 120
    },
    {
      "epoch": 9.6,
      "step": 120,
      "total_flos": 2.803887199223808e+16,
      "train_loss": 0.28943174814849043,
      "train_runtime": 477.0534,
      "train_samples_per_second": 20.962,
      "train_steps_per_second": 0.252
    }
  ],
  "logging_steps": 1,
  "max_steps": 120,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.803887199223808e+16,
  "train_batch_size": 10,
  "trial_name": null,
  "trial_params": null
}