|
{
  "best_metric": 0.8062803149223328,
  "best_model_checkpoint": "./output/training_results/C015_llama3-8b-base_instruct_20240504_123713/checkpoint-10",
  "epoch": 4.0,
  "eval_steps": 5,
  "global_step": 192,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.020833333333333332,
      "grad_norm": 0.0,
      "learning_rate": 0.0,
      "loss": 0.8766,
      "step": 1
    },
    {
      "epoch": 0.10416666666666667,
      "grad_norm": 10.34330229700748,
      "learning_rate": 2.25e-06,
      "loss": 0.8675,
      "step": 5
    },
    {
      "epoch": 0.10416666666666667,
      "eval_loss": 0.8584801554679871,
      "eval_runtime": 1.9951,
      "eval_samples_per_second": 170.414,
      "eval_steps_per_second": 1.504,
      "step": 5
    },
    {
      "epoch": 0.20833333333333334,
      "grad_norm": 4.512014612667122,
      "learning_rate": 5.25e-06,
      "loss": 0.8415,
      "step": 10
    },
    {
      "epoch": 0.20833333333333334,
      "eval_loss": 0.8062803149223328,
      "eval_runtime": 1.9589,
      "eval_samples_per_second": 173.564,
      "eval_steps_per_second": 1.531,
      "step": 10
    },
    {
      "epoch": 0.3125,
      "grad_norm": 5.466223920700161,
      "learning_rate": 8.25e-06,
      "loss": 0.8225,
      "step": 15
    },
    {
      "epoch": 0.3125,
      "eval_loss": 0.820951521396637,
      "eval_runtime": 1.9583,
      "eval_samples_per_second": 173.62,
      "eval_steps_per_second": 1.532,
      "step": 15
    },
    {
      "epoch": 0.4166666666666667,
      "grad_norm": 5.091985939472258,
      "learning_rate": 1.2e-05,
      "loss": 0.806,
      "step": 20
    },
    {
      "epoch": 0.4166666666666667,
      "eval_loss": 0.8412486910820007,
      "eval_runtime": 1.9516,
      "eval_samples_per_second": 174.217,
      "eval_steps_per_second": 1.537,
      "step": 20
    },
    {
      "epoch": 0.5208333333333334,
      "grad_norm": 4.323182492427286,
      "learning_rate": 1.4071209905461127e-05,
      "loss": 0.8139,
      "step": 25
    },
    {
      "epoch": 0.5208333333333334,
      "eval_loss": 0.8701534867286682,
      "eval_runtime": 1.956,
      "eval_samples_per_second": 173.828,
      "eval_steps_per_second": 1.534,
      "step": 25
    },
    {
      "epoch": 0.625,
      "grad_norm": 4.430828029367158,
      "learning_rate": 1.0166196232101288e-05,
      "loss": 0.8978,
      "step": 30
    },
    {
      "epoch": 0.625,
      "eval_loss": 0.8630704879760742,
      "eval_runtime": 1.9545,
      "eval_samples_per_second": 173.954,
      "eval_steps_per_second": 1.535,
      "step": 30
    },
    {
      "epoch": 0.7291666666666666,
      "grad_norm": 3.8459262571296122,
      "learning_rate": 7.276248845991498e-06,
      "loss": 0.814,
      "step": 35
    },
    {
      "epoch": 0.7291666666666666,
      "eval_loss": 0.8549697995185852,
      "eval_runtime": 1.9539,
      "eval_samples_per_second": 174.008,
      "eval_steps_per_second": 1.535,
      "step": 35
    },
    {
      "epoch": 0.8333333333333334,
      "grad_norm": 4.312137758874538,
      "learning_rate": 5.157388080190487e-06,
      "loss": 0.7989,
      "step": 40
    },
    {
      "epoch": 0.8333333333333334,
      "eval_loss": 0.8472943902015686,
      "eval_runtime": 1.9515,
      "eval_samples_per_second": 174.222,
      "eval_steps_per_second": 1.537,
      "step": 40
    },
    {
      "epoch": 0.9375,
      "grad_norm": 4.013634591695819,
      "learning_rate": 3.6192313334626905e-06,
      "loss": 0.8769,
      "step": 45
    },
    {
      "epoch": 0.9375,
      "eval_loss": 0.8382811546325684,
      "eval_runtime": 1.9527,
      "eval_samples_per_second": 174.116,
      "eval_steps_per_second": 1.536,
      "step": 45
    },
    {
      "epoch": 1.0416666666666667,
      "grad_norm": 3.825805552811641,
      "learning_rate": 2.514391432582838e-06,
      "loss": 0.7244,
      "step": 50
    },
    {
      "epoch": 1.0416666666666667,
      "eval_loss": 0.8277742266654968,
      "eval_runtime": 1.9549,
      "eval_samples_per_second": 173.925,
      "eval_steps_per_second": 1.535,
      "step": 50
    },
    {
      "epoch": 1.1458333333333333,
      "grad_norm": 3.0670883145044487,
      "learning_rate": 1.7297262757656213e-06,
      "loss": 0.4644,
      "step": 55
    },
    {
      "epoch": 1.1458333333333333,
      "eval_loss": 0.8387134671211243,
      "eval_runtime": 1.959,
      "eval_samples_per_second": 173.561,
      "eval_steps_per_second": 1.531,
      "step": 55
    },
    {
      "epoch": 1.25,
      "grad_norm": 3.904758197983173,
      "learning_rate": 1.1791620375982074e-06,
      "loss": 0.4488,
      "step": 60
    },
    {
      "epoch": 1.25,
      "eval_loss": 0.8680305480957031,
      "eval_runtime": 1.953,
      "eval_samples_per_second": 174.087,
      "eval_steps_per_second": 1.536,
      "step": 60
    },
    {
      "epoch": 1.3541666666666667,
      "grad_norm": 4.037522952699986,
      "learning_rate": 7.978466092394693e-07,
      "loss": 0.3973,
      "step": 65
    },
    {
      "epoch": 1.3541666666666667,
      "eval_loss": 0.8717625737190247,
      "eval_runtime": 1.9541,
      "eval_samples_per_second": 173.996,
      "eval_steps_per_second": 1.535,
      "step": 65
    },
    {
      "epoch": 1.4583333333333333,
      "grad_norm": 4.006395162021574,
      "learning_rate": 5.374210410959207e-07,
      "loss": 0.443,
      "step": 70
    },
    {
      "epoch": 1.4583333333333333,
      "eval_loss": 0.8596016764640808,
      "eval_runtime": 1.9513,
      "eval_samples_per_second": 174.244,
      "eval_steps_per_second": 1.537,
      "step": 70
    },
    {
      "epoch": 1.5625,
      "grad_norm": 4.477860740274337,
      "learning_rate": 3.6222476698215175e-07,
      "loss": 0.4346,
      "step": 75
    },
    {
      "epoch": 1.5625,
      "eval_loss": 0.8514222502708435,
      "eval_runtime": 1.9616,
      "eval_samples_per_second": 173.329,
      "eval_steps_per_second": 1.529,
      "step": 75
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 3.79003494996975,
      "learning_rate": 2.462755297384099e-07,
      "loss": 0.4701,
      "step": 80
    },
    {
      "epoch": 1.6666666666666665,
      "eval_loss": 0.8461114764213562,
      "eval_runtime": 1.9519,
      "eval_samples_per_second": 174.192,
      "eval_steps_per_second": 1.537,
      "step": 80
    },
    {
      "epoch": 1.7708333333333335,
      "grad_norm": 3.431009868136907,
      "learning_rate": 1.7088740175034947e-07,
      "loss": 0.4344,
      "step": 85
    },
    {
      "epoch": 1.7708333333333335,
      "eval_loss": 0.8437052369117737,
      "eval_runtime": 1.9548,
      "eval_samples_per_second": 173.928,
      "eval_steps_per_second": 1.535,
      "step": 85
    },
    {
      "epoch": 1.875,
      "grad_norm": 3.4612975522103846,
      "learning_rate": 1.228102956599465e-07,
      "loss": 0.4274,
      "step": 90
    },
    {
      "epoch": 1.875,
      "eval_loss": 0.8434357643127441,
      "eval_runtime": 1.9551,
      "eval_samples_per_second": 173.905,
      "eval_steps_per_second": 1.534,
      "step": 90
    },
    {
      "epoch": 1.9791666666666665,
      "grad_norm": 4.089060356958601,
      "learning_rate": 9.279207916081227e-08,
      "loss": 0.4771,
      "step": 95
    },
    {
      "epoch": 1.9791666666666665,
      "eval_loss": 0.8434197902679443,
      "eval_runtime": 1.9533,
      "eval_samples_per_second": 174.06,
      "eval_steps_per_second": 1.536,
      "step": 95
    },
    {
      "epoch": 2.0833333333333335,
      "grad_norm": 3.5663107359521624,
      "learning_rate": 7.448002404850094e-08,
      "loss": 0.3876,
      "step": 100
    },
    {
      "epoch": 2.0833333333333335,
      "eval_loss": 0.8438728451728821,
      "eval_runtime": 1.957,
      "eval_samples_per_second": 173.739,
      "eval_steps_per_second": 1.533,
      "step": 100
    },
    {
      "epoch": 2.1875,
      "grad_norm": 3.3025378175144806,
      "learning_rate": 6.35920070839697e-08,
      "loss": 0.3698,
      "step": 105
    },
    {
      "epoch": 2.1875,
      "eval_loss": 0.845079243183136,
      "eval_runtime": 1.9562,
      "eval_samples_per_second": 173.803,
      "eval_steps_per_second": 1.534,
      "step": 105
    },
    {
      "epoch": 2.2916666666666665,
      "grad_norm": 3.4985300165216615,
      "learning_rate": 5.7299804687499997e-08,
      "loss": 0.407,
      "step": 110
    },
    {
      "epoch": 2.2916666666666665,
      "eval_loss": 0.8465444445610046,
      "eval_runtime": 1.9573,
      "eval_samples_per_second": 173.708,
      "eval_steps_per_second": 1.533,
      "step": 110
    },
    {
      "epoch": 2.3958333333333335,
      "grad_norm": 3.394752587183079,
      "learning_rate": 5.37771434967624e-08,
      "loss": 0.374,
      "step": 115
    },
    {
      "epoch": 2.3958333333333335,
      "eval_loss": 0.8481599688529968,
      "eval_runtime": 1.9556,
      "eval_samples_per_second": 173.861,
      "eval_steps_per_second": 1.534,
      "step": 115
    },
    {
      "epoch": 2.5,
      "grad_norm": 4.399721414998381,
      "learning_rate": 5.187403540619925e-08,
      "loss": 0.3945,
      "step": 120
    },
    {
      "epoch": 2.5,
      "eval_loss": 0.8498236536979675,
      "eval_runtime": 1.9552,
      "eval_samples_per_second": 173.893,
      "eval_steps_per_second": 1.534,
      "step": 120
    },
    {
      "epoch": 2.6041666666666665,
      "grad_norm": 3.2768849901156845,
      "learning_rate": 5.088648238966908e-08,
      "loss": 0.3753,
      "step": 125
    },
    {
      "epoch": 2.6041666666666665,
      "eval_loss": 0.8512565493583679,
      "eval_runtime": 1.9594,
      "eval_samples_per_second": 173.526,
      "eval_steps_per_second": 1.531,
      "step": 125
    },
    {
      "epoch": 2.7083333333333335,
      "grad_norm": 3.666595330730063,
      "learning_rate": 5.039701925276604e-08,
      "loss": 0.3721,
      "step": 130
    },
    {
      "epoch": 2.7083333333333335,
      "eval_loss": 0.8527700304985046,
      "eval_runtime": 1.9575,
      "eval_samples_per_second": 173.689,
      "eval_steps_per_second": 1.533,
      "step": 130
    },
    {
      "epoch": 2.8125,
      "grad_norm": 3.4733320032072537,
      "learning_rate": 5.0166900048082497e-08,
      "loss": 0.3718,
      "step": 135
    },
    {
      "epoch": 2.8125,
      "eval_loss": 0.8541720509529114,
      "eval_runtime": 1.9599,
      "eval_samples_per_second": 173.479,
      "eval_steps_per_second": 1.531,
      "step": 135
    },
    {
      "epoch": 2.9166666666666665,
      "grad_norm": 3.447696757476531,
      "learning_rate": 5.0065147322870076e-08,
      "loss": 0.3773,
      "step": 140
    },
    {
      "epoch": 2.9166666666666665,
      "eval_loss": 0.8555252552032471,
      "eval_runtime": 1.9586,
      "eval_samples_per_second": 173.592,
      "eval_steps_per_second": 1.532,
      "step": 140
    },
    {
      "epoch": 3.0208333333333335,
      "grad_norm": 3.0052210603196237,
      "learning_rate": 5.002328628528332e-08,
      "loss": 0.3723,
      "step": 145
    },
    {
      "epoch": 3.0208333333333335,
      "eval_loss": 0.8565484881401062,
      "eval_runtime": 1.9586,
      "eval_samples_per_second": 173.589,
      "eval_steps_per_second": 1.532,
      "step": 145
    },
    {
      "epoch": 3.125,
      "grad_norm": 3.368197941438436,
      "learning_rate": 5.0007484528133236e-08,
      "loss": 0.374,
      "step": 150
    },
    {
      "epoch": 3.125,
      "eval_loss": 0.8576194643974304,
      "eval_runtime": 1.9541,
      "eval_samples_per_second": 173.993,
      "eval_steps_per_second": 1.535,
      "step": 150
    },
    {
      "epoch": 3.2291666666666665,
      "grad_norm": 3.3290743731904304,
      "learning_rate": 5.0002110817570477e-08,
      "loss": 0.3728,
      "step": 155
    },
    {
      "epoch": 3.2291666666666665,
      "eval_loss": 0.8588044047355652,
      "eval_runtime": 1.951,
      "eval_samples_per_second": 174.273,
      "eval_steps_per_second": 1.538,
      "step": 155
    },
    {
      "epoch": 3.3333333333333335,
      "grad_norm": 4.793937739567796,
      "learning_rate": 5.0000504842356326e-08,
      "loss": 0.3686,
      "step": 160
    },
    {
      "epoch": 3.3333333333333335,
      "eval_loss": 0.859791100025177,
      "eval_runtime": 1.9522,
      "eval_samples_per_second": 174.159,
      "eval_steps_per_second": 1.537,
      "step": 160
    },
    {
      "epoch": 3.4375,
      "grad_norm": 3.326342529192208,
      "learning_rate": 5.000009745562451e-08,
      "loss": 0.3617,
      "step": 165
    },
    {
      "epoch": 3.4375,
      "eval_loss": 0.8607122302055359,
      "eval_runtime": 1.958,
      "eval_samples_per_second": 173.647,
      "eval_steps_per_second": 1.532,
      "step": 165
    },
    {
      "epoch": 3.5416666666666665,
      "grad_norm": 3.6505713497705736,
      "learning_rate": 5.0000014077810156e-08,
      "loss": 0.3546,
      "step": 170
    },
    {
      "epoch": 3.5416666666666665,
      "eval_loss": 0.8613293170928955,
      "eval_runtime": 1.9527,
      "eval_samples_per_second": 174.122,
      "eval_steps_per_second": 1.536,
      "step": 170
    },
    {
      "epoch": 3.6458333333333335,
      "grad_norm": 3.496080458530573,
      "learning_rate": 5.0000001343508807e-08,
      "loss": 0.3707,
      "step": 175
    },
    {
      "epoch": 3.6458333333333335,
      "eval_loss": 0.8619220852851868,
      "eval_runtime": 1.9552,
      "eval_samples_per_second": 173.893,
      "eval_steps_per_second": 1.534,
      "step": 175
    },
    {
      "epoch": 3.75,
      "grad_norm": 3.50316414527161,
      "learning_rate": 5.000000006747581e-08,
      "loss": 0.3739,
      "step": 180
    },
    {
      "epoch": 3.75,
      "eval_loss": 0.862490177154541,
      "eval_runtime": 1.9547,
      "eval_samples_per_second": 173.936,
      "eval_steps_per_second": 1.535,
      "step": 180
    },
    {
      "epoch": 3.8541666666666665,
      "grad_norm": 3.7278057893863874,
      "learning_rate": 5.0000000001094325e-08,
      "loss": 0.3617,
      "step": 185
    },
    {
      "epoch": 3.8541666666666665,
      "eval_loss": 0.8631939888000488,
      "eval_runtime": 1.9574,
      "eval_samples_per_second": 173.703,
      "eval_steps_per_second": 1.533,
      "step": 185
    },
    {
      "epoch": 3.9583333333333335,
      "grad_norm": 3.160928200357982,
      "learning_rate": 5.000000000000139e-08,
      "loss": 0.3591,
      "step": 190
    },
    {
      "epoch": 3.9583333333333335,
      "eval_loss": 0.8637197613716125,
      "eval_runtime": 1.9552,
      "eval_samples_per_second": 173.893,
      "eval_steps_per_second": 1.534,
      "step": 190
    },
    {
      "epoch": 4.0,
      "step": 192,
      "total_flos": 5334785064960.0,
      "train_loss": 0.508326952966551,
      "train_runtime": 4164.2152,
      "train_samples_per_second": 2.934,
      "train_steps_per_second": 0.046
    }
  ],
  "logging_steps": 5,
  "max_steps": 192,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 5,
  "total_flos": 5334785064960.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}
|
|