{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 834,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03597122302158273,
      "grad_norm": 26.93948859971876,
      "learning_rate": 5e-06,
      "loss": 1.0175,
      "step": 10
    },
    {
      "epoch": 0.07194244604316546,
      "grad_norm": 2.6743423167480063,
      "learning_rate": 5e-06,
      "loss": 0.9337,
      "step": 20
    },
    {
      "epoch": 0.1079136690647482,
      "grad_norm": 1.100805597904231,
      "learning_rate": 5e-06,
      "loss": 0.8923,
      "step": 30
    },
    {
      "epoch": 0.14388489208633093,
      "grad_norm": 0.8355539701078896,
      "learning_rate": 5e-06,
      "loss": 0.8673,
      "step": 40
    },
    {
      "epoch": 0.17985611510791366,
      "grad_norm": 0.7157506047100403,
      "learning_rate": 5e-06,
      "loss": 0.8553,
      "step": 50
    },
    {
      "epoch": 0.2158273381294964,
      "grad_norm": 0.9806631521043339,
      "learning_rate": 5e-06,
      "loss": 0.8492,
      "step": 60
    },
    {
      "epoch": 0.2517985611510791,
      "grad_norm": 0.8360835611944488,
      "learning_rate": 5e-06,
      "loss": 0.8382,
      "step": 70
    },
    {
      "epoch": 0.28776978417266186,
      "grad_norm": 0.7078472519601653,
      "learning_rate": 5e-06,
      "loss": 0.8318,
      "step": 80
    },
    {
      "epoch": 0.3237410071942446,
      "grad_norm": 0.6255785562847258,
      "learning_rate": 5e-06,
      "loss": 0.825,
      "step": 90
    },
    {
      "epoch": 0.3597122302158273,
      "grad_norm": 0.6950072028339258,
      "learning_rate": 5e-06,
      "loss": 0.8225,
      "step": 100
    },
    {
      "epoch": 0.39568345323741005,
      "grad_norm": 0.622757689781733,
      "learning_rate": 5e-06,
      "loss": 0.8165,
      "step": 110
    },
    {
      "epoch": 0.4316546762589928,
      "grad_norm": 0.6855173384055511,
      "learning_rate": 5e-06,
      "loss": 0.8162,
      "step": 120
    },
    {
      "epoch": 0.4676258992805755,
      "grad_norm": 0.555459004966806,
      "learning_rate": 5e-06,
      "loss": 0.8141,
      "step": 130
    },
    {
      "epoch": 0.5035971223021583,
      "grad_norm": 0.7189252900166325,
      "learning_rate": 5e-06,
      "loss": 0.8113,
      "step": 140
    },
    {
      "epoch": 0.539568345323741,
      "grad_norm": 0.8411135438726722,
      "learning_rate": 5e-06,
      "loss": 0.8069,
      "step": 150
    },
    {
      "epoch": 0.5755395683453237,
      "grad_norm": 0.9141854769887011,
      "learning_rate": 5e-06,
      "loss": 0.8087,
      "step": 160
    },
    {
      "epoch": 0.6115107913669064,
      "grad_norm": 0.6527584548807389,
      "learning_rate": 5e-06,
      "loss": 0.8048,
      "step": 170
    },
    {
      "epoch": 0.6474820143884892,
      "grad_norm": 0.6986581112545092,
      "learning_rate": 5e-06,
      "loss": 0.8051,
      "step": 180
    },
    {
      "epoch": 0.6834532374100719,
      "grad_norm": 0.6094857952430536,
      "learning_rate": 5e-06,
      "loss": 0.8044,
      "step": 190
    },
    {
      "epoch": 0.7194244604316546,
      "grad_norm": 0.74096920276776,
      "learning_rate": 5e-06,
      "loss": 0.7989,
      "step": 200
    },
    {
      "epoch": 0.7553956834532374,
      "grad_norm": 0.6584952886572538,
      "learning_rate": 5e-06,
      "loss": 0.8025,
      "step": 210
    },
    {
      "epoch": 0.7913669064748201,
      "grad_norm": 0.5838446606699556,
      "learning_rate": 5e-06,
      "loss": 0.7988,
      "step": 220
    },
    {
      "epoch": 0.8273381294964028,
      "grad_norm": 0.5916175411049406,
      "learning_rate": 5e-06,
      "loss": 0.7985,
      "step": 230
    },
    {
      "epoch": 0.8633093525179856,
      "grad_norm": 0.626471567693148,
      "learning_rate": 5e-06,
      "loss": 0.7973,
      "step": 240
    },
    {
      "epoch": 0.8992805755395683,
      "grad_norm": 0.6338741269795162,
      "learning_rate": 5e-06,
      "loss": 0.7933,
      "step": 250
    },
    {
      "epoch": 0.935251798561151,
      "grad_norm": 0.8343555675066444,
      "learning_rate": 5e-06,
      "loss": 0.7969,
      "step": 260
    },
    {
      "epoch": 0.9712230215827338,
      "grad_norm": 0.6221641429373133,
      "learning_rate": 5e-06,
      "loss": 0.7933,
      "step": 270
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.7923575043678284,
      "eval_runtime": 27.9533,
      "eval_samples_per_second": 267.732,
      "eval_steps_per_second": 1.073,
      "step": 278
    },
    {
      "epoch": 1.0071942446043165,
      "grad_norm": 0.8944971285319924,
      "learning_rate": 5e-06,
      "loss": 0.7823,
      "step": 280
    },
    {
      "epoch": 1.0431654676258992,
      "grad_norm": 0.7668083853056575,
      "learning_rate": 5e-06,
      "loss": 0.7574,
      "step": 290
    },
    {
      "epoch": 1.079136690647482,
      "grad_norm": 0.6176816592509634,
      "learning_rate": 5e-06,
      "loss": 0.7529,
      "step": 300
    },
    {
      "epoch": 1.1151079136690647,
      "grad_norm": 0.6475301176330789,
      "learning_rate": 5e-06,
      "loss": 0.7558,
      "step": 310
    },
    {
      "epoch": 1.1510791366906474,
      "grad_norm": 0.5811910989874788,
      "learning_rate": 5e-06,
      "loss": 0.7623,
      "step": 320
    },
    {
      "epoch": 1.1870503597122302,
      "grad_norm": 0.6269454462814978,
      "learning_rate": 5e-06,
      "loss": 0.7601,
      "step": 330
    },
    {
      "epoch": 1.223021582733813,
      "grad_norm": 0.5423886247053047,
      "learning_rate": 5e-06,
      "loss": 0.7535,
      "step": 340
    },
    {
      "epoch": 1.2589928057553956,
      "grad_norm": 0.6670401432003603,
      "learning_rate": 5e-06,
      "loss": 0.757,
      "step": 350
    },
    {
      "epoch": 1.2949640287769784,
      "grad_norm": 0.7095322132659916,
      "learning_rate": 5e-06,
      "loss": 0.759,
      "step": 360
    },
    {
      "epoch": 1.330935251798561,
      "grad_norm": 0.6870367808903867,
      "learning_rate": 5e-06,
      "loss": 0.7567,
      "step": 370
    },
    {
      "epoch": 1.3669064748201438,
      "grad_norm": 0.6640094117573664,
      "learning_rate": 5e-06,
      "loss": 0.7592,
      "step": 380
    },
    {
      "epoch": 1.4028776978417266,
      "grad_norm": 0.5994950619117767,
      "learning_rate": 5e-06,
      "loss": 0.7529,
      "step": 390
    },
    {
      "epoch": 1.4388489208633093,
      "grad_norm": 0.7392872817621052,
      "learning_rate": 5e-06,
      "loss": 0.7554,
      "step": 400
    },
    {
      "epoch": 1.474820143884892,
      "grad_norm": 0.5656749568866071,
      "learning_rate": 5e-06,
      "loss": 0.7547,
      "step": 410
    },
    {
      "epoch": 1.5107913669064748,
      "grad_norm": 0.921484641426356,
      "learning_rate": 5e-06,
      "loss": 0.7532,
      "step": 420
    },
    {
      "epoch": 1.5467625899280577,
      "grad_norm": 0.540059029380678,
      "learning_rate": 5e-06,
      "loss": 0.7585,
      "step": 430
    },
    {
      "epoch": 1.5827338129496402,
      "grad_norm": 0.6558652758296812,
      "learning_rate": 5e-06,
      "loss": 0.7515,
      "step": 440
    },
    {
      "epoch": 1.6187050359712232,
      "grad_norm": 0.57268163367781,
      "learning_rate": 5e-06,
      "loss": 0.7562,
      "step": 450
    },
    {
      "epoch": 1.6546762589928057,
      "grad_norm": 0.5407189047091853,
      "learning_rate": 5e-06,
      "loss": 0.7559,
      "step": 460
    },
    {
      "epoch": 1.6906474820143886,
      "grad_norm": 0.6077940984618293,
      "learning_rate": 5e-06,
      "loss": 0.757,
      "step": 470
    },
    {
      "epoch": 1.7266187050359711,
      "grad_norm": 1.001124812241379,
      "learning_rate": 5e-06,
      "loss": 0.7552,
      "step": 480
    },
    {
      "epoch": 1.762589928057554,
      "grad_norm": 0.6254013722291123,
      "learning_rate": 5e-06,
      "loss": 0.753,
      "step": 490
    },
    {
      "epoch": 1.7985611510791366,
      "grad_norm": 0.5767617312575639,
      "learning_rate": 5e-06,
      "loss": 0.7594,
      "step": 500
    },
    {
      "epoch": 1.8345323741007196,
      "grad_norm": 0.665915353902276,
      "learning_rate": 5e-06,
      "loss": 0.7554,
      "step": 510
    },
    {
      "epoch": 1.870503597122302,
      "grad_norm": 0.5596777388150926,
      "learning_rate": 5e-06,
      "loss": 0.7537,
      "step": 520
    },
    {
      "epoch": 1.906474820143885,
      "grad_norm": 0.5547398560915929,
      "learning_rate": 5e-06,
      "loss": 0.7555,
      "step": 530
    },
    {
      "epoch": 1.9424460431654675,
      "grad_norm": 0.5874602156110944,
      "learning_rate": 5e-06,
      "loss": 0.7509,
      "step": 540
    },
    {
      "epoch": 1.9784172661870505,
      "grad_norm": 0.6369533697170318,
      "learning_rate": 5e-06,
      "loss": 0.7503,
      "step": 550
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.7788412570953369,
      "eval_runtime": 27.8988,
      "eval_samples_per_second": 268.255,
      "eval_steps_per_second": 1.075,
      "step": 556
    },
    {
      "epoch": 2.014388489208633,
      "grad_norm": 1.0929207520027995,
      "learning_rate": 5e-06,
      "loss": 0.735,
      "step": 560
    },
    {
      "epoch": 2.050359712230216,
      "grad_norm": 0.687310495052166,
      "learning_rate": 5e-06,
      "loss": 0.7131,
      "step": 570
    },
    {
      "epoch": 2.0863309352517985,
      "grad_norm": 0.6848749958758751,
      "learning_rate": 5e-06,
      "loss": 0.7129,
      "step": 580
    },
    {
      "epoch": 2.1223021582733814,
      "grad_norm": 0.9700661070159223,
      "learning_rate": 5e-06,
      "loss": 0.7154,
      "step": 590
    },
    {
      "epoch": 2.158273381294964,
      "grad_norm": 0.7429316335562708,
      "learning_rate": 5e-06,
      "loss": 0.7163,
      "step": 600
    },
    {
      "epoch": 2.194244604316547,
      "grad_norm": 0.5731198010767242,
      "learning_rate": 5e-06,
      "loss": 0.7197,
      "step": 610
    },
    {
      "epoch": 2.2302158273381294,
      "grad_norm": 0.6519774548706885,
      "learning_rate": 5e-06,
      "loss": 0.7192,
      "step": 620
    },
    {
      "epoch": 2.2661870503597124,
      "grad_norm": 0.7092939571259266,
      "learning_rate": 5e-06,
      "loss": 0.717,
      "step": 630
    },
    {
      "epoch": 2.302158273381295,
      "grad_norm": 0.8300683342338049,
      "learning_rate": 5e-06,
      "loss": 0.7171,
      "step": 640
    },
    {
      "epoch": 2.338129496402878,
      "grad_norm": 0.6364079517115279,
      "learning_rate": 5e-06,
      "loss": 0.7179,
      "step": 650
    },
    {
      "epoch": 2.3741007194244603,
      "grad_norm": 0.6830216482631195,
      "learning_rate": 5e-06,
      "loss": 0.7208,
      "step": 660
    },
    {
      "epoch": 2.4100719424460433,
      "grad_norm": 0.580810416113199,
      "learning_rate": 5e-06,
      "loss": 0.7201,
      "step": 670
    },
    {
      "epoch": 2.446043165467626,
      "grad_norm": 0.7709663647446697,
      "learning_rate": 5e-06,
      "loss": 0.7165,
      "step": 680
    },
    {
      "epoch": 2.4820143884892087,
      "grad_norm": 0.6587806242655105,
      "learning_rate": 5e-06,
      "loss": 0.7199,
      "step": 690
    },
    {
      "epoch": 2.5179856115107913,
      "grad_norm": 0.6679031168226195,
      "learning_rate": 5e-06,
      "loss": 0.7228,
      "step": 700
    },
    {
      "epoch": 2.553956834532374,
      "grad_norm": 0.5802019851320436,
      "learning_rate": 5e-06,
      "loss": 0.7211,
      "step": 710
    },
    {
      "epoch": 2.5899280575539567,
      "grad_norm": 0.633360775543426,
      "learning_rate": 5e-06,
      "loss": 0.7192,
      "step": 720
    },
    {
      "epoch": 2.6258992805755397,
      "grad_norm": 0.7014721250700231,
      "learning_rate": 5e-06,
      "loss": 0.7208,
      "step": 730
    },
    {
      "epoch": 2.661870503597122,
      "grad_norm": 0.5972726636881343,
      "learning_rate": 5e-06,
      "loss": 0.7184,
      "step": 740
    },
    {
      "epoch": 2.697841726618705,
      "grad_norm": 0.5454556975289979,
      "learning_rate": 5e-06,
      "loss": 0.7139,
      "step": 750
    },
    {
      "epoch": 2.7338129496402876,
      "grad_norm": 0.5626224999737693,
      "learning_rate": 5e-06,
      "loss": 0.7207,
      "step": 760
    },
    {
      "epoch": 2.7697841726618706,
      "grad_norm": 0.5106193565014756,
      "learning_rate": 5e-06,
      "loss": 0.7193,
      "step": 770
    },
    {
      "epoch": 2.805755395683453,
      "grad_norm": 0.6138738602878809,
      "learning_rate": 5e-06,
      "loss": 0.7185,
      "step": 780
    },
    {
      "epoch": 2.841726618705036,
      "grad_norm": 0.6093685279993987,
      "learning_rate": 5e-06,
      "loss": 0.7217,
      "step": 790
    },
    {
      "epoch": 2.8776978417266186,
      "grad_norm": 0.5564883285882788,
      "learning_rate": 5e-06,
      "loss": 0.7213,
      "step": 800
    },
    {
      "epoch": 2.9136690647482015,
      "grad_norm": 0.5906548449538034,
      "learning_rate": 5e-06,
      "loss": 0.7183,
      "step": 810
    },
    {
      "epoch": 2.949640287769784,
      "grad_norm": 0.5460219561244413,
      "learning_rate": 5e-06,
      "loss": 0.7216,
      "step": 820
    },
    {
      "epoch": 2.985611510791367,
      "grad_norm": 0.6453368774762195,
      "learning_rate": 5e-06,
      "loss": 0.7198,
      "step": 830
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.7752296328544617,
      "eval_runtime": 27.5746,
      "eval_samples_per_second": 271.409,
      "eval_steps_per_second": 1.088,
      "step": 834
    },
    {
      "epoch": 3.0,
      "step": 834,
      "total_flos": 1396981062696960.0,
      "train_loss": 0.7675551453368555,
      "train_runtime": 5571.5313,
      "train_samples_per_second": 76.563,
      "train_steps_per_second": 0.15
    }
  ],
  "logging_steps": 10,
  "max_steps": 834,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1396981062696960.0,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}