{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 3000, "global_step": 88686, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 3.382721060821325e-05, "grad_norm": 1352.0, "learning_rate": 0.00029999661727893914, "loss": 22.625, "step": 1 }, { "epoch": 0.10148163182463973, "grad_norm": 1.4296875, "learning_rate": 0.000289851836817536, "loss": 3.5832, "step": 3000 }, { "epoch": 0.10148163182463973, "eval_loss": 2.45703125, "eval_runtime": 96.0596, "eval_samples_per_second": 982.775, "eval_steps_per_second": 7.683, "step": 3000 }, { "epoch": 0.20296326364927947, "grad_norm": 1.375, "learning_rate": 0.000279703673635072, "loss": 2.9906, "step": 6000 }, { "epoch": 0.20296326364927947, "eval_loss": 2.388899564743042, "eval_runtime": 95.3468, "eval_samples_per_second": 990.122, "eval_steps_per_second": 7.74, "step": 6000 }, { "epoch": 0.30444489547391923, "grad_norm": 1.3125, "learning_rate": 0.00026955551045260807, "loss": 2.9367, "step": 9000 }, { "epoch": 0.30444489547391923, "eval_loss": 2.3747141361236572, "eval_runtime": 95.1929, "eval_samples_per_second": 991.723, "eval_steps_per_second": 7.753, "step": 9000 }, { "epoch": 0.40592652729855894, "grad_norm": 1.421875, "learning_rate": 0.0002594073472701441, "loss": 2.9115, "step": 12000 }, { "epoch": 0.40592652729855894, "eval_loss": 2.3593432903289795, "eval_runtime": 95.2533, "eval_samples_per_second": 991.095, "eval_steps_per_second": 7.748, "step": 12000 }, { "epoch": 0.5074081591231987, "grad_norm": 1.3828125, "learning_rate": 0.0002492591840876801, "loss": 2.891, "step": 15000 }, { "epoch": 0.5074081591231987, "eval_loss": 2.3490748405456543, "eval_runtime": 95.1766, "eval_samples_per_second": 991.893, "eval_steps_per_second": 7.754, "step": 15000 }, { "epoch": 0.6088897909478385, "grad_norm": 1.3671875, "learning_rate": 0.0002391110209052161, "loss": 2.8835, "step": 18000 }, { "epoch": 0.6088897909478385, "eval_loss": 2.3357362747192383, "eval_runtime": 95.2104, "eval_samples_per_second": 991.541, "eval_steps_per_second": 7.751, "step": 18000 }, { "epoch": 0.7103714227724782, "grad_norm": 1.40625, "learning_rate": 0.00022896285772275215, "loss": 2.8777, "step": 21000 }, { "epoch": 0.7103714227724782, "eval_loss": 2.33984375, "eval_runtime": 95.1033, "eval_samples_per_second": 992.658, "eval_steps_per_second": 7.76, "step": 21000 }, { "epoch": 0.8118530545971179, "grad_norm": 1.4140625, "learning_rate": 0.0002188146945402882, "loss": 2.8722, "step": 24000 }, { "epoch": 0.8118530545971179, "eval_loss": 2.335789203643799, "eval_runtime": 95.1796, "eval_samples_per_second": 991.862, "eval_steps_per_second": 7.754, "step": 24000 }, { "epoch": 0.9133346864217576, "grad_norm": 1.40625, "learning_rate": 0.00020866653135782423, "loss": 2.8663, "step": 27000 }, { "epoch": 0.9133346864217576, "eval_loss": 2.3275110721588135, "eval_runtime": 95.2734, "eval_samples_per_second": 990.885, "eval_steps_per_second": 7.746, "step": 27000 }, { "epoch": 1.0148163182463974, "grad_norm": 1.359375, "learning_rate": 0.00019851836817536025, "loss": 2.8658, "step": 30000 }, { "epoch": 1.0148163182463974, "eval_loss": 2.330390453338623, "eval_runtime": 95.2824, "eval_samples_per_second": 990.792, "eval_steps_per_second": 7.745, "step": 30000 }, { "epoch": 1.116297950071037, "grad_norm": 1.375, "learning_rate": 0.0001883702049928963, "loss": 2.8623, "step": 33000 }, { "epoch": 1.116297950071037, "eval_loss": 2.3300411701202393, "eval_runtime": 95.3514, "eval_samples_per_second": 990.074, "eval_steps_per_second": 7.74, "step": 33000 }, { "epoch": 1.217779581895677, "grad_norm": 1.40625, "learning_rate": 0.0001782220418104323, "loss": 2.8579, "step": 36000 }, { "epoch": 1.217779581895677, "eval_loss": 2.3285059928894043, "eval_runtime": 95.288, "eval_samples_per_second": 990.734, "eval_steps_per_second": 7.745, "step": 36000 }, { "epoch": 1.3192612137203166, "grad_norm": 1.9140625, "learning_rate": 0.00016807387862796832, "loss": 2.857, "step": 39000 }, { "epoch": 1.3192612137203166, "eval_loss": 2.3231706619262695, "eval_runtime": 95.3264, "eval_samples_per_second": 990.334, "eval_steps_per_second": 7.742, "step": 39000 }, { "epoch": 1.4207428455449564, "grad_norm": 1.359375, "learning_rate": 0.00015792571544550436, "loss": 2.8552, "step": 42000 }, { "epoch": 1.4207428455449564, "eval_loss": 2.3225038051605225, "eval_runtime": 95.2774, "eval_samples_per_second": 990.843, "eval_steps_per_second": 7.746, "step": 42000 }, { "epoch": 1.522224477369596, "grad_norm": 1.40625, "learning_rate": 0.00014777755226304037, "loss": 2.8548, "step": 45000 }, { "epoch": 1.522224477369596, "eval_loss": 2.3205666542053223, "eval_runtime": 95.4014, "eval_samples_per_second": 989.556, "eval_steps_per_second": 7.736, "step": 45000 }, { "epoch": 1.6237061091942357, "grad_norm": 1.4375, "learning_rate": 0.0001376293890805764, "loss": 2.8518, "step": 48000 }, { "epoch": 1.6237061091942357, "eval_loss": 2.324282169342041, "eval_runtime": 95.4158, "eval_samples_per_second": 989.407, "eval_steps_per_second": 7.735, "step": 48000 }, { "epoch": 1.7251877410188756, "grad_norm": 1.7578125, "learning_rate": 0.00012748122589811243, "loss": 2.8539, "step": 51000 }, { "epoch": 1.7251877410188756, "eval_loss": 2.3227896690368652, "eval_runtime": 95.374, "eval_samples_per_second": 989.84, "eval_steps_per_second": 7.738, "step": 51000 }, { "epoch": 1.8266693728435153, "grad_norm": 1.34375, "learning_rate": 0.00011733306271564845, "loss": 2.8483, "step": 54000 }, { "epoch": 1.8266693728435153, "eval_loss": 2.3198044300079346, "eval_runtime": 95.1153, "eval_samples_per_second": 992.532, "eval_steps_per_second": 7.759, "step": 54000 }, { "epoch": 1.928151004668155, "grad_norm": 1.3359375, "learning_rate": 0.00010718489953318448, "loss": 2.8512, "step": 57000 }, { "epoch": 1.928151004668155, "eval_loss": 2.321180582046509, "eval_runtime": 95.1519, "eval_samples_per_second": 992.15, "eval_steps_per_second": 7.756, "step": 57000 }, { "epoch": 2.029632636492795, "grad_norm": 1.3671875, "learning_rate": 9.703673635072052e-05, "loss": 2.8515, "step": 60000 }, { "epoch": 2.029632636492795, "eval_loss": 2.3204078674316406, "eval_runtime": 95.1797, "eval_samples_per_second": 991.861, "eval_steps_per_second": 7.754, "step": 60000 }, { "epoch": 2.1311142683174347, "grad_norm": 1.390625, "learning_rate": 8.688857316825655e-05, "loss": 2.8512, "step": 63000 }, { "epoch": 2.1311142683174347, "eval_loss": 2.3204712867736816, "eval_runtime": 95.1755, "eval_samples_per_second": 991.905, "eval_steps_per_second": 7.754, "step": 63000 }, { "epoch": 2.232595900142074, "grad_norm": 1.484375, "learning_rate": 7.674040998579256e-05, "loss": 2.8492, "step": 66000 }, { "epoch": 2.232595900142074, "eval_loss": 2.3218369483947754, "eval_runtime": 95.2213, "eval_samples_per_second": 991.428, "eval_steps_per_second": 7.75, "step": 66000 }, { "epoch": 2.334077531966714, "grad_norm": 1.4296875, "learning_rate": 6.659224680332859e-05, "loss": 2.851, "step": 69000 }, { "epoch": 2.334077531966714, "eval_loss": 2.3220698833465576, "eval_runtime": 95.1826, "eval_samples_per_second": 991.831, "eval_steps_per_second": 7.754, "step": 69000 }, { "epoch": 2.435559163791354, "grad_norm": 1.46875, "learning_rate": 5.644408362086462e-05, "loss": 2.8497, "step": 72000 }, { "epoch": 2.435559163791354, "eval_loss": 2.320767641067505, "eval_runtime": 95.2504, "eval_samples_per_second": 991.125, "eval_steps_per_second": 7.748, "step": 72000 }, { "epoch": 2.5370407956159937, "grad_norm": 1.46875, "learning_rate": 4.629592043840065e-05, "loss": 2.848, "step": 75000 }, { "epoch": 2.5370407956159937, "eval_loss": 2.3203125, "eval_runtime": 95.295, "eval_samples_per_second": 990.661, "eval_steps_per_second": 7.744, "step": 75000 }, { "epoch": 2.638522427440633, "grad_norm": 1.2265625, "learning_rate": 3.614775725593667e-05, "loss": 2.852, "step": 78000 }, { "epoch": 2.638522427440633, "eval_loss": 2.319963216781616, "eval_runtime": 95.168, "eval_samples_per_second": 991.982, "eval_steps_per_second": 7.755, "step": 78000 }, { "epoch": 2.740004059265273, "grad_norm": 1.328125, "learning_rate": 2.59995940734727e-05, "loss": 2.8483, "step": 81000 }, { "epoch": 2.740004059265273, "eval_loss": 2.321169853210449, "eval_runtime": 95.17, "eval_samples_per_second": 991.962, "eval_steps_per_second": 7.755, "step": 81000 }, { "epoch": 2.841485691089913, "grad_norm": 1.4296875, "learning_rate": 1.5851430891008727e-05, "loss": 2.85, "step": 84000 }, { "epoch": 2.841485691089913, "eval_loss": 2.3206405639648438, "eval_runtime": 95.2006, "eval_samples_per_second": 991.642, "eval_steps_per_second": 7.752, "step": 84000 }, { "epoch": 2.9429673229145523, "grad_norm": 1.375, "learning_rate": 5.703267708544753e-06, "loss": 2.8503, "step": 87000 }, { "epoch": 2.9429673229145523, "eval_loss": 2.320661783218384, "eval_runtime": 95.1542, "eval_samples_per_second": 992.127, "eval_steps_per_second": 7.756, "step": 87000 }, { "epoch": 3.0, "step": 88686, "total_flos": 1.3129555583125094e+18, "train_loss": 2.891493231738944, "train_runtime": 37678.5959, "train_samples_per_second": 301.271, "train_steps_per_second": 2.354 } ], "logging_steps": 3000, "max_steps": 88686, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 3000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3129555583125094e+18, "train_batch_size": 128, "trial_name": null, "trial_params": null }