|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.4562002275312855, |
|
"eval_steps": 200, |
|
"global_step": 4000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.4095656871795654, |
|
"learning_rate": 9.999e-06, |
|
"loss": 0.1224, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.1863898038864136, |
|
"learning_rate": 9.998000000000002e-06, |
|
"loss": 0.1054, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_validation_loss": 0.09953554719686508, |
|
"eval_validation_runtime": 712.7055, |
|
"eval_validation_samples_per_second": 1.625, |
|
"eval_validation_steps_per_second": 0.203, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_benchmark_loss": 0.32548987865448, |
|
"eval_benchmark_runtime": 17.5172, |
|
"eval_benchmark_samples_per_second": 1.484, |
|
"eval_benchmark_steps_per_second": 0.228, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.5735375881195068, |
|
"learning_rate": 9.997000000000001e-06, |
|
"loss": 0.101, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.6928790807724, |
|
"learning_rate": 9.996e-06, |
|
"loss": 0.0968, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"eval_validation_loss": 0.0932546928524971, |
|
"eval_validation_runtime": 709.1289, |
|
"eval_validation_samples_per_second": 1.633, |
|
"eval_validation_steps_per_second": 0.204, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"eval_benchmark_loss": 0.31784072518348694, |
|
"eval_benchmark_runtime": 17.5221, |
|
"eval_benchmark_samples_per_second": 1.484, |
|
"eval_benchmark_steps_per_second": 0.228, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 2.1487505435943604, |
|
"learning_rate": 9.995000000000002e-06, |
|
"loss": 0.0959, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.6419602036476135, |
|
"learning_rate": 9.994000000000001e-06, |
|
"loss": 0.0998, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"eval_validation_loss": 0.0911107286810875, |
|
"eval_validation_runtime": 708.9068, |
|
"eval_validation_samples_per_second": 1.634, |
|
"eval_validation_steps_per_second": 0.205, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"eval_benchmark_loss": 0.3297904133796692, |
|
"eval_benchmark_runtime": 17.5622, |
|
"eval_benchmark_samples_per_second": 1.48, |
|
"eval_benchmark_steps_per_second": 0.228, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.2389020919799805, |
|
"learning_rate": 9.993e-06, |
|
"loss": 0.097, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 2.500471353530884, |
|
"learning_rate": 9.992e-06, |
|
"loss": 0.0988, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"eval_validation_loss": 0.09039539098739624, |
|
"eval_validation_runtime": 709.2212, |
|
"eval_validation_samples_per_second": 1.633, |
|
"eval_validation_steps_per_second": 0.204, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"eval_benchmark_loss": 0.3489764630794525, |
|
"eval_benchmark_runtime": 17.6053, |
|
"eval_benchmark_samples_per_second": 1.477, |
|
"eval_benchmark_steps_per_second": 0.227, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.7160820960998535, |
|
"learning_rate": 9.991000000000001e-06, |
|
"loss": 0.0918, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.9196050763130188, |
|
"learning_rate": 9.990000000000001e-06, |
|
"loss": 0.097, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"eval_validation_loss": 0.08984846621751785, |
|
"eval_validation_runtime": 709.6339, |
|
"eval_validation_samples_per_second": 1.632, |
|
"eval_validation_steps_per_second": 0.204, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"eval_benchmark_loss": 0.35091081261634827, |
|
"eval_benchmark_runtime": 17.5252, |
|
"eval_benchmark_samples_per_second": 1.484, |
|
"eval_benchmark_steps_per_second": 0.228, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.5167059898376465, |
|
"learning_rate": 9.989e-06, |
|
"loss": 0.0932, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 1.0317384004592896, |
|
"learning_rate": 9.988000000000002e-06, |
|
"loss": 0.0919, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"eval_validation_loss": 0.08784810453653336, |
|
"eval_validation_runtime": 709.1851, |
|
"eval_validation_samples_per_second": 1.633, |
|
"eval_validation_steps_per_second": 0.204, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"eval_benchmark_loss": 0.3649544417858124, |
|
"eval_benchmark_runtime": 17.5537, |
|
"eval_benchmark_samples_per_second": 1.481, |
|
"eval_benchmark_steps_per_second": 0.228, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.9792326092720032, |
|
"learning_rate": 9.987000000000001e-06, |
|
"loss": 0.0978, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 1.5393081903457642, |
|
"learning_rate": 9.986e-06, |
|
"loss": 0.0967, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"eval_validation_loss": 0.08623290061950684, |
|
"eval_validation_runtime": 709.2989, |
|
"eval_validation_samples_per_second": 1.633, |
|
"eval_validation_steps_per_second": 0.204, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"eval_benchmark_loss": 0.35636618733406067, |
|
"eval_benchmark_runtime": 17.52, |
|
"eval_benchmark_samples_per_second": 1.484, |
|
"eval_benchmark_steps_per_second": 0.228, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.099972128868103, |
|
"learning_rate": 9.985000000000002e-06, |
|
"loss": 0.0945, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.371626377105713, |
|
"learning_rate": 9.984e-06, |
|
"loss": 0.0884, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"eval_validation_loss": 0.08641818910837173, |
|
"eval_validation_runtime": 709.0375, |
|
"eval_validation_samples_per_second": 1.633, |
|
"eval_validation_steps_per_second": 0.205, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"eval_benchmark_loss": 0.3572656214237213, |
|
"eval_benchmark_runtime": 17.5451, |
|
"eval_benchmark_samples_per_second": 1.482, |
|
"eval_benchmark_steps_per_second": 0.228, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.2601964473724365, |
|
"learning_rate": 9.983e-06, |
|
"loss": 0.0858, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.709611713886261, |
|
"learning_rate": 9.982e-06, |
|
"loss": 0.0909, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"eval_validation_loss": 0.08525099605321884, |
|
"eval_validation_runtime": 709.1501, |
|
"eval_validation_samples_per_second": 1.633, |
|
"eval_validation_steps_per_second": 0.204, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"eval_benchmark_loss": 0.36886686086654663, |
|
"eval_benchmark_runtime": 17.5384, |
|
"eval_benchmark_samples_per_second": 1.482, |
|
"eval_benchmark_steps_per_second": 0.228, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 1.529994249343872, |
|
"learning_rate": 9.981000000000002e-06, |
|
"loss": 0.0922, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 1.118903636932373, |
|
"learning_rate": 9.980000000000001e-06, |
|
"loss": 0.0865, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"eval_validation_loss": 0.08593959361314774, |
|
"eval_validation_runtime": 708.9421, |
|
"eval_validation_samples_per_second": 1.633, |
|
"eval_validation_steps_per_second": 0.205, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"eval_benchmark_loss": 0.373639851808548, |
|
"eval_benchmark_runtime": 17.5649, |
|
"eval_benchmark_samples_per_second": 1.48, |
|
"eval_benchmark_steps_per_second": 0.228, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.64658522605896, |
|
"learning_rate": 9.979e-06, |
|
"loss": 0.0853, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.2276527881622314, |
|
"learning_rate": 9.978000000000002e-06, |
|
"loss": 0.0842, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"eval_validation_loss": 0.08536743372678757, |
|
"eval_validation_runtime": 709.101, |
|
"eval_validation_samples_per_second": 1.633, |
|
"eval_validation_steps_per_second": 0.204, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"eval_benchmark_loss": 0.37157073616981506, |
|
"eval_benchmark_runtime": 17.527, |
|
"eval_benchmark_samples_per_second": 1.483, |
|
"eval_benchmark_steps_per_second": 0.228, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.052180528640747, |
|
"learning_rate": 9.977000000000001e-06, |
|
"loss": 0.0897, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 1.211391806602478, |
|
"learning_rate": 9.976e-06, |
|
"loss": 0.0943, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"eval_validation_loss": 0.08370199054479599, |
|
"eval_validation_runtime": 709.1222, |
|
"eval_validation_samples_per_second": 1.633, |
|
"eval_validation_steps_per_second": 0.204, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"eval_benchmark_loss": 0.392040878534317, |
|
"eval_benchmark_runtime": 17.5514, |
|
"eval_benchmark_samples_per_second": 1.481, |
|
"eval_benchmark_steps_per_second": 0.228, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 1.0776712894439697, |
|
"learning_rate": 9.975000000000002e-06, |
|
"loss": 0.0845, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.919306218624115, |
|
"learning_rate": 9.974e-06, |
|
"loss": 0.0849, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"eval_validation_loss": 0.08564597368240356, |
|
"eval_validation_runtime": 709.0841, |
|
"eval_validation_samples_per_second": 1.633, |
|
"eval_validation_steps_per_second": 0.204, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"eval_benchmark_loss": 0.3918878436088562, |
|
"eval_benchmark_runtime": 17.5531, |
|
"eval_benchmark_samples_per_second": 1.481, |
|
"eval_benchmark_steps_per_second": 0.228, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 1.2274926900863647, |
|
"learning_rate": 9.973000000000001e-06, |
|
"loss": 0.0827, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 1.0446745157241821, |
|
"learning_rate": 9.972e-06, |
|
"loss": 0.0648, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"eval_validation_loss": 0.08917231112718582, |
|
"eval_validation_runtime": 708.8767, |
|
"eval_validation_samples_per_second": 1.634, |
|
"eval_validation_steps_per_second": 0.205, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"eval_benchmark_loss": 0.4130290150642395, |
|
"eval_benchmark_runtime": 17.5261, |
|
"eval_benchmark_samples_per_second": 1.484, |
|
"eval_benchmark_steps_per_second": 0.228, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 0.5209759473800659, |
|
"learning_rate": 9.971e-06, |
|
"loss": 0.0389, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 0.7747032642364502, |
|
"learning_rate": 9.970000000000001e-06, |
|
"loss": 0.0425, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"eval_validation_loss": 0.0874892994761467, |
|
"eval_validation_runtime": 709.0226, |
|
"eval_validation_samples_per_second": 1.633, |
|
"eval_validation_steps_per_second": 0.205, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"eval_benchmark_loss": 0.41068679094314575, |
|
"eval_benchmark_runtime": 17.5438, |
|
"eval_benchmark_samples_per_second": 1.482, |
|
"eval_benchmark_steps_per_second": 0.228, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 0.7112635374069214, |
|
"learning_rate": 9.969e-06, |
|
"loss": 0.0391, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 0.6574044227600098, |
|
"learning_rate": 9.968000000000002e-06, |
|
"loss": 0.0407, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"eval_validation_loss": 0.08782221376895905, |
|
"eval_validation_runtime": 708.8262, |
|
"eval_validation_samples_per_second": 1.634, |
|
"eval_validation_steps_per_second": 0.205, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"eval_benchmark_loss": 0.40687239170074463, |
|
"eval_benchmark_runtime": 17.5555, |
|
"eval_benchmark_samples_per_second": 1.481, |
|
"eval_benchmark_steps_per_second": 0.228, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.3925846815109253, |
|
"learning_rate": 9.967000000000001e-06, |
|
"loss": 0.039, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 0.7560076713562012, |
|
"learning_rate": 9.966e-06, |
|
"loss": 0.0395, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"eval_validation_loss": 0.08419103175401688, |
|
"eval_validation_runtime": 708.8516, |
|
"eval_validation_samples_per_second": 1.634, |
|
"eval_validation_steps_per_second": 0.205, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"eval_benchmark_loss": 0.38862425088882446, |
|
"eval_benchmark_runtime": 17.5289, |
|
"eval_benchmark_samples_per_second": 1.483, |
|
"eval_benchmark_steps_per_second": 0.228, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 0.716073215007782, |
|
"learning_rate": 9.965000000000002e-06, |
|
"loss": 0.0386, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 1.1741386651992798, |
|
"learning_rate": 9.964e-06, |
|
"loss": 0.0405, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"eval_validation_loss": 0.08967277407646179, |
|
"eval_validation_runtime": 708.8983, |
|
"eval_validation_samples_per_second": 1.634, |
|
"eval_validation_steps_per_second": 0.205, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"eval_benchmark_loss": 0.3966710865497589, |
|
"eval_benchmark_runtime": 17.5289, |
|
"eval_benchmark_samples_per_second": 1.483, |
|
"eval_benchmark_steps_per_second": 0.228, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 1.2536277770996094, |
|
"learning_rate": 9.963000000000001e-06, |
|
"loss": 0.0396, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 1.1809356212615967, |
|
"learning_rate": 9.962e-06, |
|
"loss": 0.0444, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"eval_validation_loss": 0.08714079856872559, |
|
"eval_validation_runtime": 709.1882, |
|
"eval_validation_samples_per_second": 1.633, |
|
"eval_validation_steps_per_second": 0.204, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"eval_benchmark_loss": 0.3916112780570984, |
|
"eval_benchmark_runtime": 17.538, |
|
"eval_benchmark_samples_per_second": 1.482, |
|
"eval_benchmark_steps_per_second": 0.228, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 0.5824371576309204, |
|
"learning_rate": 9.961e-06, |
|
"loss": 0.0389, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 0.8065107464790344, |
|
"learning_rate": 9.960000000000001e-06, |
|
"loss": 0.0392, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"eval_validation_loss": 0.08596936613321304, |
|
"eval_validation_runtime": 708.8196, |
|
"eval_validation_samples_per_second": 1.634, |
|
"eval_validation_steps_per_second": 0.205, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"eval_benchmark_loss": 0.4028017818927765, |
|
"eval_benchmark_runtime": 17.5338, |
|
"eval_benchmark_samples_per_second": 1.483, |
|
"eval_benchmark_steps_per_second": 0.228, |
|
"step": 4000 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 1000000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 365, |
|
"save_steps": 200, |
|
"total_flos": 2.608121981079552e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|