|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 3000, |
|
"global_step": 88686, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 3.382721060821325e-05, |
|
"grad_norm": 1352.0, |
|
"learning_rate": 0.00029999661727893914, |
|
"loss": 22.625, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.10148163182463973, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 0.000289851836817536, |
|
"loss": 3.5832, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.10148163182463973, |
|
"eval_loss": 2.45703125, |
|
"eval_runtime": 96.0596, |
|
"eval_samples_per_second": 982.775, |
|
"eval_steps_per_second": 7.683, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.20296326364927947, |
|
"grad_norm": 1.375, |
|
"learning_rate": 0.000279703673635072, |
|
"loss": 2.9906, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.20296326364927947, |
|
"eval_loss": 2.388899564743042, |
|
"eval_runtime": 95.3468, |
|
"eval_samples_per_second": 990.122, |
|
"eval_steps_per_second": 7.74, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.30444489547391923, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.00026955551045260807, |
|
"loss": 2.9367, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.30444489547391923, |
|
"eval_loss": 2.3747141361236572, |
|
"eval_runtime": 95.1929, |
|
"eval_samples_per_second": 991.723, |
|
"eval_steps_per_second": 7.753, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.40592652729855894, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 0.0002594073472701441, |
|
"loss": 2.9115, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.40592652729855894, |
|
"eval_loss": 2.3593432903289795, |
|
"eval_runtime": 95.2533, |
|
"eval_samples_per_second": 991.095, |
|
"eval_steps_per_second": 7.748, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.5074081591231987, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 0.0002492591840876801, |
|
"loss": 2.891, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.5074081591231987, |
|
"eval_loss": 2.3490748405456543, |
|
"eval_runtime": 95.1766, |
|
"eval_samples_per_second": 991.893, |
|
"eval_steps_per_second": 7.754, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.6088897909478385, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 0.0002391110209052161, |
|
"loss": 2.8835, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.6088897909478385, |
|
"eval_loss": 2.3357362747192383, |
|
"eval_runtime": 95.2104, |
|
"eval_samples_per_second": 991.541, |
|
"eval_steps_per_second": 7.751, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.7103714227724782, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 0.00022896285772275215, |
|
"loss": 2.8777, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.7103714227724782, |
|
"eval_loss": 2.33984375, |
|
"eval_runtime": 95.1033, |
|
"eval_samples_per_second": 992.658, |
|
"eval_steps_per_second": 7.76, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.8118530545971179, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 0.0002188146945402882, |
|
"loss": 2.8722, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.8118530545971179, |
|
"eval_loss": 2.335789203643799, |
|
"eval_runtime": 95.1796, |
|
"eval_samples_per_second": 991.862, |
|
"eval_steps_per_second": 7.754, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.9133346864217576, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 0.00020866653135782423, |
|
"loss": 2.8663, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.9133346864217576, |
|
"eval_loss": 2.3275110721588135, |
|
"eval_runtime": 95.2734, |
|
"eval_samples_per_second": 990.885, |
|
"eval_steps_per_second": 7.746, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 1.0148163182463974, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 0.00019851836817536025, |
|
"loss": 2.8658, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 1.0148163182463974, |
|
"eval_loss": 2.330390453338623, |
|
"eval_runtime": 95.2824, |
|
"eval_samples_per_second": 990.792, |
|
"eval_steps_per_second": 7.745, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 1.116297950071037, |
|
"grad_norm": 1.375, |
|
"learning_rate": 0.0001883702049928963, |
|
"loss": 2.8623, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 1.116297950071037, |
|
"eval_loss": 2.3300411701202393, |
|
"eval_runtime": 95.3514, |
|
"eval_samples_per_second": 990.074, |
|
"eval_steps_per_second": 7.74, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 1.217779581895677, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 0.0001782220418104323, |
|
"loss": 2.8579, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 1.217779581895677, |
|
"eval_loss": 2.3285059928894043, |
|
"eval_runtime": 95.288, |
|
"eval_samples_per_second": 990.734, |
|
"eval_steps_per_second": 7.745, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 1.3192612137203166, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 0.00016807387862796832, |
|
"loss": 2.857, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 1.3192612137203166, |
|
"eval_loss": 2.3231706619262695, |
|
"eval_runtime": 95.3264, |
|
"eval_samples_per_second": 990.334, |
|
"eval_steps_per_second": 7.742, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 1.4207428455449564, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 0.00015792571544550436, |
|
"loss": 2.8552, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 1.4207428455449564, |
|
"eval_loss": 2.3225038051605225, |
|
"eval_runtime": 95.2774, |
|
"eval_samples_per_second": 990.843, |
|
"eval_steps_per_second": 7.746, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 1.522224477369596, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 0.00014777755226304037, |
|
"loss": 2.8548, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 1.522224477369596, |
|
"eval_loss": 2.3205666542053223, |
|
"eval_runtime": 95.4014, |
|
"eval_samples_per_second": 989.556, |
|
"eval_steps_per_second": 7.736, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 1.6237061091942357, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 0.0001376293890805764, |
|
"loss": 2.8518, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 1.6237061091942357, |
|
"eval_loss": 2.324282169342041, |
|
"eval_runtime": 95.4158, |
|
"eval_samples_per_second": 989.407, |
|
"eval_steps_per_second": 7.735, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 1.7251877410188756, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 0.00012748122589811243, |
|
"loss": 2.8539, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 1.7251877410188756, |
|
"eval_loss": 2.3227896690368652, |
|
"eval_runtime": 95.374, |
|
"eval_samples_per_second": 989.84, |
|
"eval_steps_per_second": 7.738, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 1.8266693728435153, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 0.00011733306271564845, |
|
"loss": 2.8483, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 1.8266693728435153, |
|
"eval_loss": 2.3198044300079346, |
|
"eval_runtime": 95.1153, |
|
"eval_samples_per_second": 992.532, |
|
"eval_steps_per_second": 7.759, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 1.928151004668155, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 0.00010718489953318448, |
|
"loss": 2.8512, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 1.928151004668155, |
|
"eval_loss": 2.321180582046509, |
|
"eval_runtime": 95.1519, |
|
"eval_samples_per_second": 992.15, |
|
"eval_steps_per_second": 7.756, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 2.029632636492795, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 9.703673635072052e-05, |
|
"loss": 2.8515, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 2.029632636492795, |
|
"eval_loss": 2.3204078674316406, |
|
"eval_runtime": 95.1797, |
|
"eval_samples_per_second": 991.861, |
|
"eval_steps_per_second": 7.754, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 2.1311142683174347, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 8.688857316825655e-05, |
|
"loss": 2.8512, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 2.1311142683174347, |
|
"eval_loss": 2.3204712867736816, |
|
"eval_runtime": 95.1755, |
|
"eval_samples_per_second": 991.905, |
|
"eval_steps_per_second": 7.754, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 2.232595900142074, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 7.674040998579256e-05, |
|
"loss": 2.8492, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 2.232595900142074, |
|
"eval_loss": 2.3218369483947754, |
|
"eval_runtime": 95.2213, |
|
"eval_samples_per_second": 991.428, |
|
"eval_steps_per_second": 7.75, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 2.334077531966714, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 6.659224680332859e-05, |
|
"loss": 2.851, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 2.334077531966714, |
|
"eval_loss": 2.3220698833465576, |
|
"eval_runtime": 95.1826, |
|
"eval_samples_per_second": 991.831, |
|
"eval_steps_per_second": 7.754, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 2.435559163791354, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 5.644408362086462e-05, |
|
"loss": 2.8497, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 2.435559163791354, |
|
"eval_loss": 2.320767641067505, |
|
"eval_runtime": 95.2504, |
|
"eval_samples_per_second": 991.125, |
|
"eval_steps_per_second": 7.748, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 2.5370407956159937, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 4.629592043840065e-05, |
|
"loss": 2.848, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 2.5370407956159937, |
|
"eval_loss": 2.3203125, |
|
"eval_runtime": 95.295, |
|
"eval_samples_per_second": 990.661, |
|
"eval_steps_per_second": 7.744, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 2.638522427440633, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 3.614775725593667e-05, |
|
"loss": 2.852, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 2.638522427440633, |
|
"eval_loss": 2.319963216781616, |
|
"eval_runtime": 95.168, |
|
"eval_samples_per_second": 991.982, |
|
"eval_steps_per_second": 7.755, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 2.740004059265273, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 2.59995940734727e-05, |
|
"loss": 2.8483, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 2.740004059265273, |
|
"eval_loss": 2.321169853210449, |
|
"eval_runtime": 95.17, |
|
"eval_samples_per_second": 991.962, |
|
"eval_steps_per_second": 7.755, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 2.841485691089913, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 1.5851430891008727e-05, |
|
"loss": 2.85, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 2.841485691089913, |
|
"eval_loss": 2.3206405639648438, |
|
"eval_runtime": 95.2006, |
|
"eval_samples_per_second": 991.642, |
|
"eval_steps_per_second": 7.752, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 2.9429673229145523, |
|
"grad_norm": 1.375, |
|
"learning_rate": 5.703267708544753e-06, |
|
"loss": 2.8503, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 2.9429673229145523, |
|
"eval_loss": 2.320661783218384, |
|
"eval_runtime": 95.1542, |
|
"eval_samples_per_second": 992.127, |
|
"eval_steps_per_second": 7.756, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 88686, |
|
"total_flos": 1.3129555583125094e+18, |
|
"train_loss": 2.891493231738944, |
|
"train_runtime": 37678.5959, |
|
"train_samples_per_second": 301.271, |
|
"train_steps_per_second": 2.354 |
|
} |
|
], |
|
"logging_steps": 3000, |
|
"max_steps": 88686, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 3000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.3129555583125094e+18, |
|
"train_batch_size": 128, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|