{ "best_metric": null, "best_model_checkpoint": null, "epoch": 12.41322988975092, "eval_steps": 50, "global_step": 1900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.33, "learning_rate": 5.263157894736842e-06, "loss": 10.6083, "step": 50 }, { "epoch": 0.33, "eval_loss": 10.407150268554688, "eval_runtime": 65.9544, "eval_samples_per_second": 0.561, "eval_steps_per_second": 0.561, "step": 50 }, { "epoch": 0.65, "learning_rate": 1.0526315789473684e-05, "loss": 10.0615, "step": 100 }, { "epoch": 0.65, "eval_loss": 9.026963233947754, "eval_runtime": 66.0596, "eval_samples_per_second": 0.56, "eval_steps_per_second": 0.56, "step": 100 }, { "epoch": 0.98, "learning_rate": 1.578947368421053e-05, "loss": 8.0748, "step": 150 }, { "epoch": 0.98, "eval_loss": 6.740030288696289, "eval_runtime": 66.4335, "eval_samples_per_second": 0.557, "eval_steps_per_second": 0.557, "step": 150 }, { "epoch": 1.31, "learning_rate": 1.999831241633323e-05, "loss": 6.2551, "step": 200 }, { "epoch": 1.31, "eval_loss": 5.468978404998779, "eval_runtime": 66.4722, "eval_samples_per_second": 0.557, "eval_steps_per_second": 0.557, "step": 200 }, { "epoch": 1.63, "learning_rate": 1.9939306773179498e-05, "loss": 5.3533, "step": 250 }, { "epoch": 1.63, "eval_loss": 4.9109086990356445, "eval_runtime": 66.1035, "eval_samples_per_second": 0.56, "eval_steps_per_second": 0.56, "step": 250 }, { "epoch": 1.96, "learning_rate": 1.979649067087574e-05, "loss": 4.9846, "step": 300 }, { "epoch": 1.96, "eval_loss": 4.694249629974365, "eval_runtime": 66.2471, "eval_samples_per_second": 0.559, "eval_steps_per_second": 0.559, "step": 300 }, { "epoch": 2.29, "learning_rate": 1.9571068366759143e-05, "loss": 4.8176, "step": 350 }, { "epoch": 2.29, "eval_loss": 4.578726768493652, "eval_runtime": 66.1659, "eval_samples_per_second": 0.559, "eval_steps_per_second": 0.559, "step": 350 }, { "epoch": 2.61, "learning_rate": 1.9264940672148018e-05, "loss": 4.7153, "step": 400 }, { "epoch": 2.61, "eval_loss": 4.5052080154418945, "eval_runtime": 66.4396, "eval_samples_per_second": 0.557, "eval_steps_per_second": 0.557, "step": 400 }, { "epoch": 2.94, "learning_rate": 1.888068892427538e-05, "loss": 4.6504, "step": 450 }, { "epoch": 2.94, "eval_loss": 4.450746536254883, "eval_runtime": 66.6083, "eval_samples_per_second": 0.555, "eval_steps_per_second": 0.555, "step": 450 }, { "epoch": 3.27, "learning_rate": 1.842155321987566e-05, "loss": 4.5897, "step": 500 }, { "epoch": 3.27, "eval_loss": 4.411241054534912, "eval_runtime": 66.3951, "eval_samples_per_second": 0.557, "eval_steps_per_second": 0.557, "step": 500 }, { "epoch": 3.59, "learning_rate": 1.789140509396394e-05, "loss": 4.5623, "step": 550 }, { "epoch": 3.59, "eval_loss": 4.383650302886963, "eval_runtime": 66.4603, "eval_samples_per_second": 0.557, "eval_steps_per_second": 0.557, "step": 550 }, { "epoch": 3.92, "learning_rate": 1.729471487418621e-05, "loss": 4.5371, "step": 600 }, { "epoch": 3.92, "eval_loss": 4.359971523284912, "eval_runtime": 66.5261, "eval_samples_per_second": 0.556, "eval_steps_per_second": 0.556, "step": 600 }, { "epoch": 4.25, "learning_rate": 1.6636513986016215e-05, "loss": 4.5189, "step": 650 }, { "epoch": 4.25, "eval_loss": 4.340854167938232, "eval_runtime": 66.0697, "eval_samples_per_second": 0.56, "eval_steps_per_second": 0.56, "step": 650 }, { "epoch": 4.57, "learning_rate": 1.5922352526649803e-05, "loss": 4.4797, "step": 700 }, { "epoch": 4.57, "eval_loss": 4.326164722442627, "eval_runtime": 66.0876, "eval_samples_per_second": 0.56, "eval_steps_per_second": 0.56, "step": 700 }, { "epoch": 4.9, "learning_rate": 1.5158252465343242e-05, "loss": 4.4863, "step": 750 }, { "epoch": 4.9, "eval_loss": 4.313417911529541, "eval_runtime": 66.3814, "eval_samples_per_second": 0.557, "eval_steps_per_second": 0.557, "step": 750 }, { "epoch": 5.23, "learning_rate": 1.4350656864820733e-05, "loss": 4.4571, "step": 800 }, { "epoch": 5.23, "eval_loss": 4.301650047302246, "eval_runtime": 66.0901, "eval_samples_per_second": 0.56, "eval_steps_per_second": 0.56, "step": 800 }, { "epoch": 5.55, "learning_rate": 1.3506375551927546e-05, "loss": 4.4562, "step": 850 }, { "epoch": 5.55, "eval_loss": 4.292283058166504, "eval_runtime": 66.2796, "eval_samples_per_second": 0.558, "eval_steps_per_second": 0.558, "step": 850 }, { "epoch": 5.88, "learning_rate": 1.2632527695645993e-05, "loss": 4.4527, "step": 900 }, { "epoch": 5.88, "eval_loss": 4.286748886108398, "eval_runtime": 66.2321, "eval_samples_per_second": 0.559, "eval_steps_per_second": 0.559, "step": 900 }, { "epoch": 6.21, "learning_rate": 1.1736481776669307e-05, "loss": 4.4292, "step": 950 }, { "epoch": 6.21, "eval_loss": 4.277628421783447, "eval_runtime": 66.6577, "eval_samples_per_second": 0.555, "eval_steps_per_second": 0.555, "step": 950 }, { "epoch": 6.53, "learning_rate": 1.0825793454723325e-05, "loss": 4.423, "step": 1000 }, { "epoch": 6.53, "eval_loss": 4.27248477935791, "eval_runtime": 66.2871, "eval_samples_per_second": 0.558, "eval_steps_per_second": 0.558, "step": 1000 }, { "epoch": 6.86, "learning_rate": 9.908141857552737e-06, "loss": 4.423, "step": 1050 }, { "epoch": 6.86, "eval_loss": 4.266038417816162, "eval_runtime": 66.5179, "eval_samples_per_second": 0.556, "eval_steps_per_second": 0.556, "step": 1050 }, { "epoch": 7.19, "learning_rate": 8.991264828797319e-06, "loss": 4.4166, "step": 1100 }, { "epoch": 7.19, "eval_loss": 4.2608160972595215, "eval_runtime": 66.1693, "eval_samples_per_second": 0.559, "eval_steps_per_second": 0.559, "step": 1100 }, { "epoch": 7.51, "learning_rate": 8.082893680762619e-06, "loss": 4.4156, "step": 1150 }, { "epoch": 7.51, "eval_loss": 4.257768630981445, "eval_runtime": 66.5456, "eval_samples_per_second": 0.556, "eval_steps_per_second": 0.556, "step": 1150 }, { "epoch": 7.84, "learning_rate": 7.190688002264308e-06, "loss": 4.3988, "step": 1200 }, { "epoch": 7.84, "eval_loss": 4.253511428833008, "eval_runtime": 66.2492, "eval_samples_per_second": 0.558, "eval_steps_per_second": 0.558, "step": 1200 }, { "epoch": 8.17, "learning_rate": 6.322171071261071e-06, "loss": 4.4021, "step": 1250 }, { "epoch": 8.17, "eval_loss": 4.251075744628906, "eval_runtime": 66.2478, "eval_samples_per_second": 0.559, "eval_steps_per_second": 0.559, "step": 1250 }, { "epoch": 8.49, "learning_rate": 5.484666416891109e-06, "loss": 4.4038, "step": 1300 }, { "epoch": 8.49, "eval_loss": 4.247603893280029, "eval_runtime": 66.0732, "eval_samples_per_second": 0.56, "eval_steps_per_second": 0.56, "step": 1300 }, { "epoch": 8.82, "learning_rate": 4.685236065835443e-06, "loss": 4.3881, "step": 1350 }, { "epoch": 8.82, "eval_loss": 4.24613094329834, "eval_runtime": 66.4204, "eval_samples_per_second": 0.557, "eval_steps_per_second": 0.557, "step": 1350 }, { "epoch": 9.15, "learning_rate": 3.930620993728434e-06, "loss": 4.3943, "step": 1400 }, { "epoch": 9.15, "eval_loss": 4.2452192306518555, "eval_runtime": 66.4796, "eval_samples_per_second": 0.557, "eval_steps_per_second": 0.557, "step": 1400 }, { "epoch": 9.47, "learning_rate": 3.2271842837425917e-06, "loss": 4.3822, "step": 1450 }, { "epoch": 9.47, "eval_loss": 4.24231481552124, "eval_runtime": 66.529, "eval_samples_per_second": 0.556, "eval_steps_per_second": 0.556, "step": 1450 }, { "epoch": 9.8, "learning_rate": 2.580857471647186e-06, "loss": 4.4064, "step": 1500 }, { "epoch": 9.8, "eval_loss": 4.241450309753418, "eval_runtime": 66.3417, "eval_samples_per_second": 0.558, "eval_steps_per_second": 0.558, "step": 1500 }, { "epoch": 10.13, "learning_rate": 1.9970905297711606e-06, "loss": 4.3793, "step": 1550 }, { "epoch": 10.13, "eval_loss": 4.239691257476807, "eval_runtime": 66.2256, "eval_samples_per_second": 0.559, "eval_steps_per_second": 0.559, "step": 1550 }, { "epoch": 10.45, "learning_rate": 1.4808059116167306e-06, "loss": 4.385, "step": 1600 }, { "epoch": 10.45, "eval_loss": 4.239114761352539, "eval_runtime": 66.4764, "eval_samples_per_second": 0.557, "eval_steps_per_second": 0.557, "step": 1600 }, { "epoch": 10.78, "learning_rate": 1.0363570446297999e-06, "loss": 4.3919, "step": 1650 }, { "epoch": 10.78, "eval_loss": 4.239101409912109, "eval_runtime": 66.7261, "eval_samples_per_second": 0.555, "eval_steps_per_second": 0.555, "step": 1650 }, { "epoch": 11.11, "learning_rate": 6.67491621125429e-07, "loss": 4.3934, "step": 1700 }, { "epoch": 11.11, "eval_loss": 4.238786220550537, "eval_runtime": 66.2354, "eval_samples_per_second": 0.559, "eval_steps_per_second": 0.559, "step": 1700 }, { "epoch": 11.43, "learning_rate": 3.773199969074959e-07, "loss": 4.3825, "step": 1750 }, { "epoch": 11.43, "eval_loss": 4.238570690155029, "eval_runtime": 66.3209, "eval_samples_per_second": 0.558, "eval_steps_per_second": 0.558, "step": 1750 }, { "epoch": 11.76, "learning_rate": 1.6828896405244988e-07, "loss": 4.3835, "step": 1800 }, { "epoch": 11.76, "eval_loss": 4.2383294105529785, "eval_runtime": 66.2218, "eval_samples_per_second": 0.559, "eval_steps_per_second": 0.559, "step": 1800 }, { "epoch": 12.09, "learning_rate": 4.216111901092501e-08, "loss": 4.3851, "step": 1850 }, { "epoch": 12.09, "eval_loss": 4.238271236419678, "eval_runtime": 66.3755, "eval_samples_per_second": 0.557, "eval_steps_per_second": 0.557, "step": 1850 }, { "epoch": 12.41, "learning_rate": 0.0, "loss": 4.3848, "step": 1900 }, { "epoch": 12.41, "eval_loss": 4.2382659912109375, "eval_runtime": 66.5156, "eval_samples_per_second": 0.556, "eval_steps_per_second": 0.556, "step": 1900 } ], "logging_steps": 50, "max_steps": 1900, "num_input_tokens_seen": 0, "num_train_epochs": 13, "save_steps": 50, "total_flos": 7.73200764370944e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }