{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0028436018957345,
  "eval_steps": 66,
  "global_step": 264,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0037914691943127963,
      "eval_loss": 2.0063271522521973,
      "eval_runtime": 14.7339,
      "eval_samples_per_second": 7.534,
      "eval_steps_per_second": 3.801,
      "step": 1
    },
    {
      "epoch": 0.018957345971563982,
      "grad_norm": 4.575433254241943,
      "learning_rate": 1.6666666666666667e-05,
      "loss": 1.316,
      "step": 5
    },
    {
      "epoch": 0.037914691943127965,
      "grad_norm": 1.6114368438720703,
      "learning_rate": 3.3333333333333335e-05,
      "loss": 2.9077,
      "step": 10
    },
    {
      "epoch": 0.05687203791469194,
      "grad_norm": 1.59214448928833,
      "learning_rate": 5e-05,
      "loss": 1.1933,
      "step": 15
    },
    {
      "epoch": 0.07582938388625593,
      "grad_norm": 1.5013682842254639,
      "learning_rate": 6.666666666666667e-05,
      "loss": 1.1184,
      "step": 20
    },
    {
      "epoch": 0.0947867298578199,
      "grad_norm": 2.266918897628784,
      "learning_rate": 8.333333333333334e-05,
      "loss": 1.2437,
      "step": 25
    },
    {
      "epoch": 0.11374407582938388,
      "grad_norm": 2.5026302337646484,
      "learning_rate": 0.0001,
      "loss": 1.3318,
      "step": 30
    },
    {
      "epoch": 0.13270142180094788,
      "grad_norm": 4.762207984924316,
      "learning_rate": 9.988738792578126e-05,
      "loss": 1.3961,
      "step": 35
    },
    {
      "epoch": 0.15165876777251186,
      "grad_norm": 5.990009784698486,
      "learning_rate": 9.955005896229543e-05,
      "loss": 1.3369,
      "step": 40
    },
    {
      "epoch": 0.17061611374407584,
      "grad_norm": 13.45676326751709,
      "learning_rate": 9.898953260211338e-05,
      "loss": 1.6573,
      "step": 45
    },
    {
      "epoch": 0.1895734597156398,
      "grad_norm": 20.142257690429688,
      "learning_rate": 9.820833372667812e-05,
      "loss": 2.4071,
      "step": 50
    },
    {
      "epoch": 0.20853080568720378,
      "grad_norm": 2.136573314666748,
      "learning_rate": 9.720998123301923e-05,
      "loss": 1.3441,
      "step": 55
    },
    {
      "epoch": 0.22748815165876776,
      "grad_norm": 1.304834008216858,
      "learning_rate": 9.599897218294122e-05,
      "loss": 3.473,
      "step": 60
    },
    {
      "epoch": 0.24644549763033174,
      "grad_norm": 1.3176236152648926,
      "learning_rate": 9.458076154608515e-05,
      "loss": 1.0091,
      "step": 65
    },
    {
      "epoch": 0.2502369668246445,
      "eval_loss": 1.3693103790283203,
      "eval_runtime": 15.1018,
      "eval_samples_per_second": 7.35,
      "eval_steps_per_second": 3.708,
      "step": 66
    },
    {
      "epoch": 0.26540284360189575,
      "grad_norm": 1.3287938833236694,
      "learning_rate": 9.296173762811085e-05,
      "loss": 0.9501,
      "step": 70
    },
    {
      "epoch": 0.2843601895734597,
      "grad_norm": 2.3178670406341553,
      "learning_rate": 9.114919329468282e-05,
      "loss": 1.1101,
      "step": 75
    },
    {
      "epoch": 0.3033175355450237,
      "grad_norm": 3.3593289852142334,
      "learning_rate": 8.915129312088112e-05,
      "loss": 1.1248,
      "step": 80
    },
    {
      "epoch": 0.3222748815165877,
      "grad_norm": 2.683689832687378,
      "learning_rate": 8.697703661401186e-05,
      "loss": 1.0555,
      "step": 85
    },
    {
      "epoch": 0.3412322274881517,
      "grad_norm": 2.545621871948242,
      "learning_rate": 8.463621767547998e-05,
      "loss": 0.9885,
      "step": 90
    },
    {
      "epoch": 0.36018957345971564,
      "grad_norm": 6.741793155670166,
      "learning_rate": 8.213938048432697e-05,
      "loss": 1.3054,
      "step": 95
    },
    {
      "epoch": 0.3791469194312796,
      "grad_norm": 30.347795486450195,
      "learning_rate": 7.949777200115616e-05,
      "loss": 2.2587,
      "step": 100
    },
    {
      "epoch": 0.3981042654028436,
      "grad_norm": 0.6259477138519287,
      "learning_rate": 7.672329130639005e-05,
      "loss": 0.9893,
      "step": 105
    },
    {
      "epoch": 0.41706161137440756,
      "grad_norm": 1.175485372543335,
      "learning_rate": 7.38284360010654e-05,
      "loss": 3.1866,
      "step": 110
    },
    {
      "epoch": 0.43601895734597157,
      "grad_norm": 1.1434948444366455,
      "learning_rate": 7.082624591160201e-05,
      "loss": 1.0689,
      "step": 115
    },
    {
      "epoch": 0.4549763033175355,
      "grad_norm": 2.3580820560455322,
      "learning_rate": 6.773024435212678e-05,
      "loss": 1.1477,
      "step": 120
    },
    {
      "epoch": 0.47393364928909953,
      "grad_norm": 1.5657998323440552,
      "learning_rate": 6.455437720893564e-05,
      "loss": 0.8839,
      "step": 125
    },
    {
      "epoch": 0.4928909952606635,
      "grad_norm": 2.185641288757324,
      "learning_rate": 6.131295012148612e-05,
      "loss": 1.0864,
      "step": 130
    },
    {
      "epoch": 0.500473933649289,
      "eval_loss": 1.240807056427002,
      "eval_runtime": 14.9134,
      "eval_samples_per_second": 7.443,
      "eval_steps_per_second": 3.755,
      "step": 132
    },
    {
      "epoch": 0.5118483412322274,
      "grad_norm": 2.4784717559814453,
      "learning_rate": 5.8020564042888015e-05,
      "loss": 0.9521,
      "step": 135
    },
    {
      "epoch": 0.5308056872037915,
      "grad_norm": 4.177745342254639,
      "learning_rate": 5.469204947015897e-05,
      "loss": 1.1792,
      "step": 140
    },
    {
      "epoch": 0.5497630331753555,
      "grad_norm": 9.582822799682617,
      "learning_rate": 5.134239964050307e-05,
      "loss": 1.4924,
      "step": 145
    },
    {
      "epoch": 0.5687203791469194,
      "grad_norm": 15.12871265411377,
      "learning_rate": 4.798670299452926e-05,
      "loss": 2.0682,
      "step": 150
    },
    {
      "epoch": 0.5876777251184834,
      "grad_norm": 1.2145541906356812,
      "learning_rate": 4.4640075210627615e-05,
      "loss": 0.9506,
      "step": 155
    },
    {
      "epoch": 0.6066350710900474,
      "grad_norm": 0.9340455532073975,
      "learning_rate": 4.131759111665349e-05,
      "loss": 3.1557,
      "step": 160
    },
    {
      "epoch": 0.6255924170616114,
      "grad_norm": 1.0512124300003052,
      "learning_rate": 3.803421678562213e-05,
      "loss": 1.3106,
      "step": 165
    },
    {
      "epoch": 0.6445497630331753,
      "grad_norm": 1.033964991569519,
      "learning_rate": 3.480474212128766e-05,
      "loss": 1.1408,
      "step": 170
    },
    {
      "epoch": 0.6635071090047393,
      "grad_norm": 1.7510710954666138,
      "learning_rate": 3.164371423727362e-05,
      "loss": 0.8601,
      "step": 175
    },
    {
      "epoch": 0.6824644549763034,
      "grad_norm": 1.9360414743423462,
      "learning_rate": 2.8565371929847284e-05,
      "loss": 1.1003,
      "step": 180
    },
    {
      "epoch": 0.7014218009478673,
      "grad_norm": 1.985480785369873,
      "learning_rate": 2.5583581539504464e-05,
      "loss": 0.9948,
      "step": 185
    },
    {
      "epoch": 0.7203791469194313,
      "grad_norm": 3.7504513263702393,
      "learning_rate": 2.2711774490274766e-05,
      "loss": 1.152,
      "step": 190
    },
    {
      "epoch": 0.7393364928909952,
      "grad_norm": 4.799010753631592,
      "learning_rate": 1.996288678810105e-05,
      "loss": 1.4613,
      "step": 195
    },
    {
      "epoch": 0.7507109004739336,
      "eval_loss": 1.1724706888198853,
      "eval_runtime": 15.1001,
      "eval_samples_per_second": 7.351,
      "eval_steps_per_second": 3.709,
      "step": 198
    },
    {
      "epoch": 0.7582938388625592,
      "grad_norm": 36.87416076660156,
      "learning_rate": 1.734930075082076e-05,
      "loss": 1.8898,
      "step": 200
    },
    {
      "epoch": 0.7772511848341233,
      "grad_norm": 0.7642176151275635,
      "learning_rate": 1.4882789232226125e-05,
      "loss": 1.1519,
      "step": 205
    },
    {
      "epoch": 0.7962085308056872,
      "grad_norm": 0.675323486328125,
      "learning_rate": 1.257446259144494e-05,
      "loss": 2.5007,
      "step": 210
    },
    {
      "epoch": 0.8151658767772512,
      "grad_norm": 1.0708622932434082,
      "learning_rate": 1.0434718646516917e-05,
      "loss": 0.8607,
      "step": 215
    },
    {
      "epoch": 0.8341232227488151,
      "grad_norm": 2.112217664718628,
      "learning_rate": 8.473195837599418e-06,
      "loss": 0.9331,
      "step": 220
    },
    {
      "epoch": 0.8530805687203792,
      "grad_norm": 1.9522455930709839,
      "learning_rate": 6.698729810778065e-06,
      "loss": 1.1168,
      "step": 225
    },
    {
      "epoch": 0.8720379146919431,
      "grad_norm": 2.002206563949585,
      "learning_rate": 5.1193136180493095e-06,
      "loss": 0.9973,
      "step": 230
    },
    {
      "epoch": 0.8909952606635071,
      "grad_norm": 1.8621110916137695,
      "learning_rate": 3.7420617127538248e-06,
      "loss": 0.9534,
      "step": 235
    },
    {
      "epoch": 0.909952606635071,
      "grad_norm": 3.7210357189178467,
      "learning_rate": 2.573177902642726e-06,
      "loss": 1.2029,
      "step": 240
    },
    {
      "epoch": 0.9289099526066351,
      "grad_norm": 3.859623670578003,
      "learning_rate": 1.6179274049310966e-06,
      "loss": 1.4043,
      "step": 245
    },
    {
      "epoch": 0.9478672985781991,
      "grad_norm": 15.304886817932129,
      "learning_rate": 8.806131292167618e-07,
      "loss": 2.086,
      "step": 250
    },
    {
      "epoch": 0.966824644549763,
      "grad_norm": 1.3227756023406982,
      "learning_rate": 3.6455629509730136e-07,
      "loss": 1.6543,
      "step": 255
    },
    {
      "epoch": 0.985781990521327,
      "grad_norm": 2.1567444801330566,
      "learning_rate": 7.208147179291192e-08,
      "loss": 1.0496,
      "step": 260
    },
    {
      "epoch": 1.0028436018957345,
      "eval_loss": 1.1479114294052124,
      "eval_runtime": 15.1375,
      "eval_samples_per_second": 7.333,
      "eval_steps_per_second": 3.699,
      "step": 264
    }
  ],
  "logging_steps": 5,
  "max_steps": 264,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.631756088414044e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}