|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 20.0, |
|
"eval_steps": 500, |
|
"global_step": 5000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.4, |
|
"learning_rate": 0.0005, |
|
"loss": 2.3712, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"learning_rate": 0.0005, |
|
"loss": 1.8732, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.593873417721519, |
|
"eval_loss": 2.011091947555542, |
|
"eval_runtime": 5.3852, |
|
"eval_samples_per_second": 92.847, |
|
"eval_steps_per_second": 11.699, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"learning_rate": 0.0005, |
|
"loss": 1.7329, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"learning_rate": 0.0005, |
|
"loss": 1.6233, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"learning_rate": 0.0005, |
|
"loss": 1.6142, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.6051139240506329, |
|
"eval_loss": 1.844284176826477, |
|
"eval_runtime": 5.3165, |
|
"eval_samples_per_second": 94.047, |
|
"eval_steps_per_second": 11.85, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"learning_rate": 0.0005, |
|
"loss": 1.1625, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"learning_rate": 0.0005, |
|
"loss": 1.206, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.6007341772151898, |
|
"eval_loss": 1.981817603111267, |
|
"eval_runtime": 5.3466, |
|
"eval_samples_per_second": 93.517, |
|
"eval_steps_per_second": 11.783, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"learning_rate": 0.0005, |
|
"loss": 1.0064, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"learning_rate": 0.0005, |
|
"loss": 0.8141, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"learning_rate": 0.0005, |
|
"loss": 0.8693, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.594126582278481, |
|
"eval_loss": 2.2100281715393066, |
|
"eval_runtime": 4.739, |
|
"eval_samples_per_second": 105.508, |
|
"eval_steps_per_second": 13.294, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"learning_rate": 0.0005, |
|
"loss": 0.5477, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 4.8, |
|
"learning_rate": 0.0005, |
|
"loss": 0.6023, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.5910126582278481, |
|
"eval_loss": 2.375615119934082, |
|
"eval_runtime": 4.6068, |
|
"eval_samples_per_second": 108.536, |
|
"eval_steps_per_second": 13.676, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 5.2, |
|
"learning_rate": 0.0005, |
|
"loss": 0.514, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 5.6, |
|
"learning_rate": 0.0005, |
|
"loss": 0.4479, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"learning_rate": 0.0005, |
|
"loss": 0.4717, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.5895696202531645, |
|
"eval_loss": 2.542104482650757, |
|
"eval_runtime": 4.662, |
|
"eval_samples_per_second": 107.251, |
|
"eval_steps_per_second": 13.514, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 6.4, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3723, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 6.8, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3938, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.5891139240506329, |
|
"eval_loss": 2.658656597137451, |
|
"eval_runtime": 4.9363, |
|
"eval_samples_per_second": 101.29, |
|
"eval_steps_per_second": 12.763, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 7.2, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3727, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 7.6, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3538, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3697, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.5872911392405064, |
|
"eval_loss": 2.7531518936157227, |
|
"eval_runtime": 4.7177, |
|
"eval_samples_per_second": 105.985, |
|
"eval_steps_per_second": 13.354, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 8.4, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3431, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 8.8, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3617, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.5870126582278481, |
|
"eval_loss": 2.7664403915405273, |
|
"eval_runtime": 4.7999, |
|
"eval_samples_per_second": 104.169, |
|
"eval_steps_per_second": 13.125, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 9.2, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3424, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 9.6, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3418, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3607, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.5866835443037974, |
|
"eval_loss": 2.8513779640197754, |
|
"eval_runtime": 4.6699, |
|
"eval_samples_per_second": 107.069, |
|
"eval_steps_per_second": 13.491, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 10.4, |
|
"learning_rate": 0.0005, |
|
"loss": 0.339, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 10.8, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3414, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_accuracy": 0.5860759493670886, |
|
"eval_loss": 2.8931641578674316, |
|
"eval_runtime": 4.8648, |
|
"eval_samples_per_second": 102.78, |
|
"eval_steps_per_second": 12.95, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 11.2, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3302, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 11.6, |
|
"learning_rate": 0.0005, |
|
"loss": 0.329, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3439, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.5854683544303797, |
|
"eval_loss": 2.9544754028320312, |
|
"eval_runtime": 5.1699, |
|
"eval_samples_per_second": 96.713, |
|
"eval_steps_per_second": 12.186, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 12.4, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3182, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 12.8, |
|
"learning_rate": 0.0005, |
|
"loss": 0.335, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_accuracy": 0.5842784810126582, |
|
"eval_loss": 2.899094343185425, |
|
"eval_runtime": 4.8433, |
|
"eval_samples_per_second": 103.236, |
|
"eval_steps_per_second": 13.008, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 13.2, |
|
"learning_rate": 0.0005, |
|
"loss": 0.327, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 13.6, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3223, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3391, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.5839746835443038, |
|
"eval_loss": 2.879316806793213, |
|
"eval_runtime": 4.9294, |
|
"eval_samples_per_second": 101.433, |
|
"eval_steps_per_second": 12.781, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 14.4, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3128, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 14.8, |
|
"learning_rate": 0.0005, |
|
"loss": 0.328, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_accuracy": 0.5851392405063292, |
|
"eval_loss": 2.8953680992126465, |
|
"eval_runtime": 4.6969, |
|
"eval_samples_per_second": 106.452, |
|
"eval_steps_per_second": 13.413, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 15.2, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3233, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 15.6, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3216, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3351, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.5838481012658228, |
|
"eval_loss": 2.913999080657959, |
|
"eval_runtime": 4.7522, |
|
"eval_samples_per_second": 105.214, |
|
"eval_steps_per_second": 13.257, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 16.4, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3087, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 16.8, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3252, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_accuracy": 0.5825063291139241, |
|
"eval_loss": 2.929701089859009, |
|
"eval_runtime": 4.8518, |
|
"eval_samples_per_second": 103.055, |
|
"eval_steps_per_second": 12.985, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 17.2, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3152, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 17.6, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3161, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"learning_rate": 0.0005, |
|
"loss": 0.332, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_accuracy": 0.5833924050632912, |
|
"eval_loss": 2.9811997413635254, |
|
"eval_runtime": 5.341, |
|
"eval_samples_per_second": 93.615, |
|
"eval_steps_per_second": 11.796, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 18.4, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3089, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 18.8, |
|
"learning_rate": 0.0005, |
|
"loss": 0.324, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_accuracy": 0.5807848101265822, |
|
"eval_loss": 2.982343912124634, |
|
"eval_runtime": 4.9018, |
|
"eval_samples_per_second": 102.002, |
|
"eval_steps_per_second": 12.852, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 19.2, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3169, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 19.6, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3116, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3329, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 0.5813164556962025, |
|
"eval_loss": 2.9419918060302734, |
|
"eval_runtime": 4.5961, |
|
"eval_samples_per_second": 108.787, |
|
"eval_steps_per_second": 13.707, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"step": 5000, |
|
"total_flos": 3.1967425075347456e+17, |
|
"train_loss": 0.5721206008911133, |
|
"train_runtime": 3939.3187, |
|
"train_samples_per_second": 40.616, |
|
"train_steps_per_second": 1.269 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 5000, |
|
"num_train_epochs": 20, |
|
"save_steps": 500, |
|
"total_flos": 3.1967425075347456e+17, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|