{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 12.41322988975092,
  "eval_steps": 50,
  "global_step": 1900,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.33, "learning_rate": 5.263157894736842e-06, "loss": 10.6083, "step": 50 },
    { "epoch": 0.33, "eval_loss": 10.407150268554688, "eval_runtime": 65.9544, "eval_samples_per_second": 0.561, "eval_steps_per_second": 0.561, "step": 50 },
    { "epoch": 0.65, "learning_rate": 1.0526315789473684e-05, "loss": 10.0615, "step": 100 },
    { "epoch": 0.65, "eval_loss": 9.026963233947754, "eval_runtime": 66.0596, "eval_samples_per_second": 0.56, "eval_steps_per_second": 0.56, "step": 100 },
    { "epoch": 0.98, "learning_rate": 1.578947368421053e-05, "loss": 8.0748, "step": 150 },
    { "epoch": 0.98, "eval_loss": 6.740030288696289, "eval_runtime": 66.4335, "eval_samples_per_second": 0.557, "eval_steps_per_second": 0.557, "step": 150 },
    { "epoch": 1.31, "learning_rate": 1.999831241633323e-05, "loss": 6.2551, "step": 200 },
    { "epoch": 1.31, "eval_loss": 5.468978404998779, "eval_runtime": 66.4722, "eval_samples_per_second": 0.557, "eval_steps_per_second": 0.557, "step": 200 },
    { "epoch": 1.63, "learning_rate": 1.9939306773179498e-05, "loss": 5.3533, "step": 250 },
    { "epoch": 1.63, "eval_loss": 4.9109086990356445, "eval_runtime": 66.1035, "eval_samples_per_second": 0.56, "eval_steps_per_second": 0.56, "step": 250 },
    { "epoch": 1.96, "learning_rate": 1.979649067087574e-05, "loss": 4.9846, "step": 300 },
    { "epoch": 1.96, "eval_loss": 4.694249629974365, "eval_runtime": 66.2471, "eval_samples_per_second": 0.559, "eval_steps_per_second": 0.559, "step": 300 },
    { "epoch": 2.29, "learning_rate": 1.9571068366759143e-05, "loss": 4.8176, "step": 350 },
    { "epoch": 2.29, "eval_loss": 4.578726768493652, "eval_runtime": 66.1659, "eval_samples_per_second": 0.559, "eval_steps_per_second": 0.559, "step": 350 },
    { "epoch": 2.61, "learning_rate": 1.9264940672148018e-05, "loss": 4.7153, "step": 400 },
    { "epoch": 2.61, "eval_loss": 4.5052080154418945, "eval_runtime": 66.4396, "eval_samples_per_second": 0.557, "eval_steps_per_second": 0.557, "step": 400 },
    { "epoch": 2.94, "learning_rate": 1.888068892427538e-05, "loss": 4.6504, "step": 450 },
    { "epoch": 2.94, "eval_loss": 4.450746536254883, "eval_runtime": 66.6083, "eval_samples_per_second": 0.555, "eval_steps_per_second": 0.555, "step": 450 },
    { "epoch": 3.27, "learning_rate": 1.842155321987566e-05, "loss": 4.5897, "step": 500 },
    { "epoch": 3.27, "eval_loss": 4.411241054534912, "eval_runtime": 66.3951, "eval_samples_per_second": 0.557, "eval_steps_per_second": 0.557, "step": 500 },
    { "epoch": 3.59, "learning_rate": 1.789140509396394e-05, "loss": 4.5623, "step": 550 },
    { "epoch": 3.59, "eval_loss": 4.383650302886963, "eval_runtime": 66.4603, "eval_samples_per_second": 0.557, "eval_steps_per_second": 0.557, "step": 550 },
    { "epoch": 3.92, "learning_rate": 1.729471487418621e-05, "loss": 4.5371, "step": 600 },
    { "epoch": 3.92, "eval_loss": 4.359971523284912, "eval_runtime": 66.5261, "eval_samples_per_second": 0.556, "eval_steps_per_second": 0.556, "step": 600 },
    { "epoch": 4.25, "learning_rate": 1.6636513986016215e-05, "loss": 4.5189, "step": 650 },
    { "epoch": 4.25, "eval_loss": 4.340854167938232, "eval_runtime": 66.0697, "eval_samples_per_second": 0.56, "eval_steps_per_second": 0.56, "step": 650 },
    { "epoch": 4.57, "learning_rate": 1.5922352526649803e-05, "loss": 4.4797, "step": 700 },
    { "epoch": 4.57, "eval_loss": 4.326164722442627, "eval_runtime": 66.0876, "eval_samples_per_second": 0.56, "eval_steps_per_second": 0.56, "step": 700 },
    { "epoch": 4.9, "learning_rate": 1.5158252465343242e-05, "loss": 4.4863, "step": 750 },
    { "epoch": 4.9, "eval_loss": 4.313417911529541, "eval_runtime": 66.3814, "eval_samples_per_second": 0.557, "eval_steps_per_second": 0.557, "step": 750 },
    { "epoch": 5.23, "learning_rate": 1.4350656864820733e-05, "loss": 4.4571, "step": 800 },
    { "epoch": 5.23, "eval_loss": 4.301650047302246, "eval_runtime": 66.0901, "eval_samples_per_second": 0.56, "eval_steps_per_second": 0.56, "step": 800 },
    { "epoch": 5.55, "learning_rate": 1.3506375551927546e-05, "loss": 4.4562, "step": 850 },
    { "epoch": 5.55, "eval_loss": 4.292283058166504, "eval_runtime": 66.2796, "eval_samples_per_second": 0.558, "eval_steps_per_second": 0.558, "step": 850 },
    { "epoch": 5.88, "learning_rate": 1.2632527695645993e-05, "loss": 4.4527, "step": 900 },
    { "epoch": 5.88, "eval_loss": 4.286748886108398, "eval_runtime": 66.2321, "eval_samples_per_second": 0.559, "eval_steps_per_second": 0.559, "step": 900 },
    { "epoch": 6.21, "learning_rate": 1.1736481776669307e-05, "loss": 4.4292, "step": 950 },
    { "epoch": 6.21, "eval_loss": 4.277628421783447, "eval_runtime": 66.6577, "eval_samples_per_second": 0.555, "eval_steps_per_second": 0.555, "step": 950 },
    { "epoch": 6.53, "learning_rate": 1.0825793454723325e-05, "loss": 4.423, "step": 1000 },
    { "epoch": 6.53, "eval_loss": 4.27248477935791, "eval_runtime": 66.2871, "eval_samples_per_second": 0.558, "eval_steps_per_second": 0.558, "step": 1000 },
    { "epoch": 6.86, "learning_rate": 9.908141857552737e-06, "loss": 4.423, "step": 1050 },
    { "epoch": 6.86, "eval_loss": 4.266038417816162, "eval_runtime": 66.5179, "eval_samples_per_second": 0.556, "eval_steps_per_second": 0.556, "step": 1050 },
    { "epoch": 7.19, "learning_rate": 8.991264828797319e-06, "loss": 4.4166, "step": 1100 },
    { "epoch": 7.19, "eval_loss": 4.2608160972595215, "eval_runtime": 66.1693, "eval_samples_per_second": 0.559, "eval_steps_per_second": 0.559, "step": 1100 },
    { "epoch": 7.51, "learning_rate": 8.082893680762619e-06, "loss": 4.4156, "step": 1150 },
    { "epoch": 7.51, "eval_loss": 4.257768630981445, "eval_runtime": 66.5456, "eval_samples_per_second": 0.556, "eval_steps_per_second": 0.556, "step": 1150 },
    { "epoch": 7.84, "learning_rate": 7.190688002264308e-06, "loss": 4.3988, "step": 1200 },
    { "epoch": 7.84, "eval_loss": 4.253511428833008, "eval_runtime": 66.2492, "eval_samples_per_second": 0.558, "eval_steps_per_second": 0.558, "step": 1200 },
    { "epoch": 8.17, "learning_rate": 6.322171071261071e-06, "loss": 4.4021, "step": 1250 },
    { "epoch": 8.17, "eval_loss": 4.251075744628906, "eval_runtime": 66.2478, "eval_samples_per_second": 0.559, "eval_steps_per_second": 0.559, "step": 1250 },
    { "epoch": 8.49, "learning_rate": 5.484666416891109e-06, "loss": 4.4038, "step": 1300 },
    { "epoch": 8.49, "eval_loss": 4.247603893280029, "eval_runtime": 66.0732, "eval_samples_per_second": 0.56, "eval_steps_per_second": 0.56, "step": 1300 },
    { "epoch": 8.82, "learning_rate": 4.685236065835443e-06, "loss": 4.3881, "step": 1350 },
    { "epoch": 8.82, "eval_loss": 4.24613094329834, "eval_runtime": 66.4204, "eval_samples_per_second": 0.557, "eval_steps_per_second": 0.557, "step": 1350 },
    { "epoch": 9.15, "learning_rate": 3.930620993728434e-06, "loss": 4.3943, "step": 1400 },
    { "epoch": 9.15, "eval_loss": 4.2452192306518555, "eval_runtime": 66.4796, "eval_samples_per_second": 0.557, "eval_steps_per_second": 0.557, "step": 1400 },
    { "epoch": 9.47, "learning_rate": 3.2271842837425917e-06, "loss": 4.3822, "step": 1450 },
    { "epoch": 9.47, "eval_loss": 4.24231481552124, "eval_runtime": 66.529, "eval_samples_per_second": 0.556, "eval_steps_per_second": 0.556, "step": 1450 },
    { "epoch": 9.8, "learning_rate": 2.580857471647186e-06, "loss": 4.4064, "step": 1500 },
    { "epoch": 9.8, "eval_loss": 4.241450309753418, "eval_runtime": 66.3417, "eval_samples_per_second": 0.558, "eval_steps_per_second": 0.558, "step": 1500 },
    { "epoch": 10.13, "learning_rate": 1.9970905297711606e-06, "loss": 4.3793, "step": 1550 },
    { "epoch": 10.13, "eval_loss": 4.239691257476807, "eval_runtime": 66.2256, "eval_samples_per_second": 0.559, "eval_steps_per_second": 0.559, "step": 1550 },
    { "epoch": 10.45, "learning_rate": 1.4808059116167306e-06, "loss": 4.385, "step": 1600 },
    { "epoch": 10.45, "eval_loss": 4.239114761352539, "eval_runtime": 66.4764, "eval_samples_per_second": 0.557, "eval_steps_per_second": 0.557, "step": 1600 },
    { "epoch": 10.78, "learning_rate": 1.0363570446297999e-06, "loss": 4.3919, "step": 1650 },
    { "epoch": 10.78, "eval_loss": 4.239101409912109, "eval_runtime": 66.7261, "eval_samples_per_second": 0.555, "eval_steps_per_second": 0.555, "step": 1650 },
    { "epoch": 11.11, "learning_rate": 6.67491621125429e-07, "loss": 4.3934, "step": 1700 },
    { "epoch": 11.11, "eval_loss": 4.238786220550537, "eval_runtime": 66.2354, "eval_samples_per_second": 0.559, "eval_steps_per_second": 0.559, "step": 1700 },
    { "epoch": 11.43, "learning_rate": 3.773199969074959e-07, "loss": 4.3825, "step": 1750 },
    { "epoch": 11.43, "eval_loss": 4.238570690155029, "eval_runtime": 66.3209, "eval_samples_per_second": 0.558, "eval_steps_per_second": 0.558, "step": 1750 },
    { "epoch": 11.76, "learning_rate": 1.6828896405244988e-07, "loss": 4.3835, "step": 1800 },
    { "epoch": 11.76, "eval_loss": 4.2383294105529785, "eval_runtime": 66.2218, "eval_samples_per_second": 0.559, "eval_steps_per_second": 0.559, "step": 1800 },
    { "epoch": 12.09, "learning_rate": 4.216111901092501e-08, "loss": 4.3851, "step": 1850 },
    { "epoch": 12.09, "eval_loss": 4.238271236419678, "eval_runtime": 66.3755, "eval_samples_per_second": 0.557, "eval_steps_per_second": 0.557, "step": 1850 },
    { "epoch": 12.41, "learning_rate": 0.0, "loss": 4.3848, "step": 1900 },
    { "epoch": 12.41, "eval_loss": 4.2382659912109375, "eval_runtime": 66.5156, "eval_samples_per_second": 0.556, "eval_steps_per_second": 0.556, "step": 1900 }
  ],
  "logging_steps": 50,
  "max_steps": 1900,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 13,
  "save_steps": 50,
  "total_flos": 7.73200764370944e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}