|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9996105008958479, |
|
"eval_steps": 25, |
|
"global_step": 401, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.009971177066292747, |
|
"grad_norm": 7.652912616729736, |
|
"learning_rate": 1.9512195121951222e-05, |
|
"loss": 2.2965, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.019942354132585494, |
|
"grad_norm": 25.198680877685547, |
|
"learning_rate": 3.9024390243902444e-05, |
|
"loss": 2.1832, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.029913531198878244, |
|
"grad_norm": 1.633164405822754, |
|
"learning_rate": 5.853658536585366e-05, |
|
"loss": 2.0435, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.03988470826517099, |
|
"grad_norm": 1.5140329599380493, |
|
"learning_rate": 7.804878048780489e-05, |
|
"loss": 1.7848, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.04985588533146374, |
|
"grad_norm": 2.636758804321289, |
|
"learning_rate": 9.75609756097561e-05, |
|
"loss": 1.5732, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.05982706239775649, |
|
"grad_norm": 1.3496147394180298, |
|
"learning_rate": 0.00011707317073170732, |
|
"loss": 1.4785, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.062319856664329674, |
|
"eval_loss": 1.4366233348846436, |
|
"eval_runtime": 140.1422, |
|
"eval_samples_per_second": 29.227, |
|
"eval_steps_per_second": 1.827, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.06979823946404923, |
|
"grad_norm": 0.6696961522102356, |
|
"learning_rate": 0.00013658536585365856, |
|
"loss": 1.4621, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.07976941653034197, |
|
"grad_norm": 0.5544710159301758, |
|
"learning_rate": 0.00015609756097560978, |
|
"loss": 1.4146, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.08974059359663472, |
|
"grad_norm": 0.5817800760269165, |
|
"learning_rate": 0.000175609756097561, |
|
"loss": 1.3902, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.09971177066292748, |
|
"grad_norm": 0.49702659249305725, |
|
"learning_rate": 0.0001951219512195122, |
|
"loss": 1.3624, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.10968294772922023, |
|
"grad_norm": 0.5360238552093506, |
|
"learning_rate": 0.00019833333333333335, |
|
"loss": 1.4103, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.11965412479551298, |
|
"grad_norm": 0.6442453265190125, |
|
"learning_rate": 0.00019611111111111112, |
|
"loss": 1.3618, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.12463971332865935, |
|
"eval_loss": 1.3544318675994873, |
|
"eval_runtime": 137.5844, |
|
"eval_samples_per_second": 29.771, |
|
"eval_steps_per_second": 1.861, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.1296253018618057, |
|
"grad_norm": 0.46826621890068054, |
|
"learning_rate": 0.0001938888888888889, |
|
"loss": 1.3712, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.13959647892809846, |
|
"grad_norm": 0.47624409198760986, |
|
"learning_rate": 0.00019166666666666667, |
|
"loss": 1.3469, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.1495676559943912, |
|
"grad_norm": 0.449796587228775, |
|
"learning_rate": 0.00018944444444444445, |
|
"loss": 1.3431, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.15953883306068395, |
|
"grad_norm": 0.5890815258026123, |
|
"learning_rate": 0.00018722222222222222, |
|
"loss": 1.3511, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.1695100101269767, |
|
"grad_norm": 0.429056316614151, |
|
"learning_rate": 0.00018500000000000002, |
|
"loss": 1.342, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.17948118719326944, |
|
"grad_norm": 0.4640163481235504, |
|
"learning_rate": 0.00018277777777777777, |
|
"loss": 1.3353, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.18695956999298902, |
|
"eval_loss": 1.3305575847625732, |
|
"eval_runtime": 137.0065, |
|
"eval_samples_per_second": 29.896, |
|
"eval_steps_per_second": 1.869, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.1894523642595622, |
|
"grad_norm": 0.4788176715373993, |
|
"learning_rate": 0.00018055555555555557, |
|
"loss": 1.3255, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.19942354132585496, |
|
"grad_norm": 0.43469157814979553, |
|
"learning_rate": 0.00017833333333333335, |
|
"loss": 1.324, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.2093947183921477, |
|
"grad_norm": 0.4492317736148834, |
|
"learning_rate": 0.00017611111111111112, |
|
"loss": 1.3325, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.21936589545844046, |
|
"grad_norm": 0.4481510519981384, |
|
"learning_rate": 0.0001738888888888889, |
|
"loss": 1.3575, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.2293370725247332, |
|
"grad_norm": 0.4316515624523163, |
|
"learning_rate": 0.00017166666666666667, |
|
"loss": 1.3247, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.23930824959102595, |
|
"grad_norm": 0.48252755403518677, |
|
"learning_rate": 0.00016944444444444445, |
|
"loss": 1.3314, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.2492794266573187, |
|
"grad_norm": 0.49299389123916626, |
|
"learning_rate": 0.00016722222222222222, |
|
"loss": 1.3422, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2492794266573187, |
|
"eval_loss": 1.319018840789795, |
|
"eval_runtime": 137.544, |
|
"eval_samples_per_second": 29.78, |
|
"eval_steps_per_second": 1.861, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2592506037236114, |
|
"grad_norm": 0.4329146444797516, |
|
"learning_rate": 0.000165, |
|
"loss": 1.3041, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.26922178078990416, |
|
"grad_norm": 0.42452341318130493, |
|
"learning_rate": 0.00016277777777777777, |
|
"loss": 1.3334, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.2791929578561969, |
|
"grad_norm": 0.482371985912323, |
|
"learning_rate": 0.00016055555555555558, |
|
"loss": 1.3352, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.28916413492248966, |
|
"grad_norm": 0.4965856075286865, |
|
"learning_rate": 0.00015833333333333332, |
|
"loss": 1.3361, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.2991353119887824, |
|
"grad_norm": 0.4086521863937378, |
|
"learning_rate": 0.00015611111111111113, |
|
"loss": 1.3238, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.30910648905507515, |
|
"grad_norm": 0.5104921460151672, |
|
"learning_rate": 0.0001538888888888889, |
|
"loss": 1.3064, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.3115992833216484, |
|
"eval_loss": 1.3116130828857422, |
|
"eval_runtime": 142.0878, |
|
"eval_samples_per_second": 28.827, |
|
"eval_steps_per_second": 1.802, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.3190776661213679, |
|
"grad_norm": 0.4384579658508301, |
|
"learning_rate": 0.00015166666666666668, |
|
"loss": 1.3192, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.32904884318766064, |
|
"grad_norm": 0.41889750957489014, |
|
"learning_rate": 0.00014944444444444445, |
|
"loss": 1.326, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.3390200202539534, |
|
"grad_norm": 0.44409337639808655, |
|
"learning_rate": 0.00014722222222222223, |
|
"loss": 1.3291, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.34899119732024614, |
|
"grad_norm": 0.4266604483127594, |
|
"learning_rate": 0.000145, |
|
"loss": 1.3114, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.3589623743865389, |
|
"grad_norm": 0.4299560785293579, |
|
"learning_rate": 0.00014277777777777778, |
|
"loss": 1.2829, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.36893355145283163, |
|
"grad_norm": 0.4503760039806366, |
|
"learning_rate": 0.00014055555555555555, |
|
"loss": 1.3061, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.37391913998597803, |
|
"eval_loss": 1.3071435689926147, |
|
"eval_runtime": 138.362, |
|
"eval_samples_per_second": 29.604, |
|
"eval_steps_per_second": 1.85, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.3789047285191244, |
|
"grad_norm": 0.42756497859954834, |
|
"learning_rate": 0.00013833333333333333, |
|
"loss": 1.3165, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.3888759055854172, |
|
"grad_norm": 0.4549823999404907, |
|
"learning_rate": 0.00013611111111111113, |
|
"loss": 1.3283, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.3988470826517099, |
|
"grad_norm": 0.43239399790763855, |
|
"learning_rate": 0.00013388888888888888, |
|
"loss": 1.3074, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.4088182597180027, |
|
"grad_norm": 0.4290738105773926, |
|
"learning_rate": 0.00013166666666666668, |
|
"loss": 1.2981, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.4187894367842954, |
|
"grad_norm": 0.6137893795967102, |
|
"learning_rate": 0.00012944444444444445, |
|
"loss": 1.3113, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.42876061385058817, |
|
"grad_norm": 0.4174145460128784, |
|
"learning_rate": 0.00012722222222222223, |
|
"loss": 1.2837, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.4362389966503077, |
|
"eval_loss": 1.3025227785110474, |
|
"eval_runtime": 138.018, |
|
"eval_samples_per_second": 29.677, |
|
"eval_steps_per_second": 1.855, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.4387317909168809, |
|
"grad_norm": 0.4010506272315979, |
|
"learning_rate": 0.000125, |
|
"loss": 1.31, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.44870296798317366, |
|
"grad_norm": 0.42004135251045227, |
|
"learning_rate": 0.0001227777777777778, |
|
"loss": 1.3001, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.4586741450494664, |
|
"grad_norm": 0.434986412525177, |
|
"learning_rate": 0.00012055555555555555, |
|
"loss": 1.3285, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.46864532211575916, |
|
"grad_norm": 0.4850348234176636, |
|
"learning_rate": 0.00011833333333333334, |
|
"loss": 1.3145, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.4786164991820519, |
|
"grad_norm": 0.42585763335227966, |
|
"learning_rate": 0.00011611111111111113, |
|
"loss": 1.288, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.48858767624834465, |
|
"grad_norm": 0.5526877045631409, |
|
"learning_rate": 0.00011388888888888889, |
|
"loss": 1.3069, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.4985588533146374, |
|
"grad_norm": 0.44904306530952454, |
|
"learning_rate": 0.00011166666666666668, |
|
"loss": 1.3299, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.4985588533146374, |
|
"eval_loss": 1.2983635663986206, |
|
"eval_runtime": 137.4814, |
|
"eval_samples_per_second": 29.793, |
|
"eval_steps_per_second": 1.862, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5085300303809301, |
|
"grad_norm": 0.43869510293006897, |
|
"learning_rate": 0.00010944444444444445, |
|
"loss": 1.3097, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.5185012074472228, |
|
"grad_norm": 0.4270603060722351, |
|
"learning_rate": 0.00010722222222222223, |
|
"loss": 1.3294, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.5284723845135156, |
|
"grad_norm": 0.43747490644454956, |
|
"learning_rate": 0.000105, |
|
"loss": 1.317, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.5384435615798083, |
|
"grad_norm": 0.4406147301197052, |
|
"learning_rate": 0.00010277777777777778, |
|
"loss": 1.2965, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.5484147386461011, |
|
"grad_norm": 0.4331868290901184, |
|
"learning_rate": 0.00010055555555555555, |
|
"loss": 1.2863, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.5583859157123938, |
|
"grad_norm": 0.42832815647125244, |
|
"learning_rate": 9.833333333333333e-05, |
|
"loss": 1.2968, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.560878709978967, |
|
"eval_loss": 1.2944072484970093, |
|
"eval_runtime": 137.7551, |
|
"eval_samples_per_second": 29.734, |
|
"eval_steps_per_second": 1.858, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.5683570927786866, |
|
"grad_norm": 0.43270131945610046, |
|
"learning_rate": 9.611111111111112e-05, |
|
"loss": 1.2904, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.5783282698449793, |
|
"grad_norm": 0.4404992461204529, |
|
"learning_rate": 9.388888888888889e-05, |
|
"loss": 1.31, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.5882994469112721, |
|
"grad_norm": 0.44887053966522217, |
|
"learning_rate": 9.166666666666667e-05, |
|
"loss": 1.3015, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.5982706239775648, |
|
"grad_norm": 0.42438870668411255, |
|
"learning_rate": 8.944444444444446e-05, |
|
"loss": 1.3053, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.6082418010438576, |
|
"grad_norm": 0.4337781071662903, |
|
"learning_rate": 8.722222222222223e-05, |
|
"loss": 1.278, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.6182129781101503, |
|
"grad_norm": 0.4185997247695923, |
|
"learning_rate": 8.5e-05, |
|
"loss": 1.2979, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.6231985666432968, |
|
"eval_loss": 1.290996789932251, |
|
"eval_runtime": 137.572, |
|
"eval_samples_per_second": 29.773, |
|
"eval_steps_per_second": 1.861, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.6281841551764431, |
|
"grad_norm": 0.4232589900493622, |
|
"learning_rate": 8.277777777777778e-05, |
|
"loss": 1.2937, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.6381553322427358, |
|
"grad_norm": 0.44364941120147705, |
|
"learning_rate": 8.055555555555556e-05, |
|
"loss": 1.3272, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.6481265093090286, |
|
"grad_norm": 0.4201723635196686, |
|
"learning_rate": 7.833333333333333e-05, |
|
"loss": 1.311, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.6580976863753213, |
|
"grad_norm": 0.4404850900173187, |
|
"learning_rate": 7.61111111111111e-05, |
|
"loss": 1.312, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.6680688634416141, |
|
"grad_norm": 0.42794689536094666, |
|
"learning_rate": 7.38888888888889e-05, |
|
"loss": 1.2697, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.6780400405079068, |
|
"grad_norm": 0.45072466135025024, |
|
"learning_rate": 7.166666666666667e-05, |
|
"loss": 1.2878, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.6855184233076264, |
|
"eval_loss": 1.2891337871551514, |
|
"eval_runtime": 137.4053, |
|
"eval_samples_per_second": 29.81, |
|
"eval_steps_per_second": 1.863, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.6880112175741996, |
|
"grad_norm": 0.42994165420532227, |
|
"learning_rate": 6.944444444444444e-05, |
|
"loss": 1.2823, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.6979823946404923, |
|
"grad_norm": 0.44423794746398926, |
|
"learning_rate": 6.722222222222223e-05, |
|
"loss": 1.2828, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.7079535717067851, |
|
"grad_norm": 0.4406779110431671, |
|
"learning_rate": 6.500000000000001e-05, |
|
"loss": 1.2836, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.7179247487730778, |
|
"grad_norm": 0.4179854094982147, |
|
"learning_rate": 6.277777777777778e-05, |
|
"loss": 1.3222, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.7278959258393706, |
|
"grad_norm": 0.42552730441093445, |
|
"learning_rate": 6.055555555555555e-05, |
|
"loss": 1.3059, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.7378671029056633, |
|
"grad_norm": 0.45751523971557617, |
|
"learning_rate": 5.833333333333334e-05, |
|
"loss": 1.2756, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.7478382799719561, |
|
"grad_norm": 0.4325704276561737, |
|
"learning_rate": 5.6111111111111114e-05, |
|
"loss": 1.2734, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7478382799719561, |
|
"eval_loss": 1.2868282794952393, |
|
"eval_runtime": 137.7011, |
|
"eval_samples_per_second": 29.746, |
|
"eval_steps_per_second": 1.859, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7578094570382488, |
|
"grad_norm": 0.4315655529499054, |
|
"learning_rate": 5.388888888888889e-05, |
|
"loss": 1.3283, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.7677806341045416, |
|
"grad_norm": 0.4370616376399994, |
|
"learning_rate": 5.166666666666667e-05, |
|
"loss": 1.2928, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.7777518111708344, |
|
"grad_norm": 0.46377331018447876, |
|
"learning_rate": 4.9444444444444446e-05, |
|
"loss": 1.2502, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.787722988237127, |
|
"grad_norm": 0.42414504289627075, |
|
"learning_rate": 4.722222222222222e-05, |
|
"loss": 1.2988, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.7976941653034199, |
|
"grad_norm": 0.42951062321662903, |
|
"learning_rate": 4.5e-05, |
|
"loss": 1.2756, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.8076653423697125, |
|
"grad_norm": 0.44111117720603943, |
|
"learning_rate": 4.277777777777778e-05, |
|
"loss": 1.2826, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.8101581366362858, |
|
"eval_loss": 1.2843713760375977, |
|
"eval_runtime": 136.8431, |
|
"eval_samples_per_second": 29.932, |
|
"eval_steps_per_second": 1.871, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.8176365194360053, |
|
"grad_norm": 0.453071266412735, |
|
"learning_rate": 4.055555555555556e-05, |
|
"loss": 1.3157, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.827607696502298, |
|
"grad_norm": 0.4466659426689148, |
|
"learning_rate": 3.8333333333333334e-05, |
|
"loss": 1.3015, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.8375788735685908, |
|
"grad_norm": 0.47975486516952515, |
|
"learning_rate": 3.611111111111111e-05, |
|
"loss": 1.2719, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.8475500506348835, |
|
"grad_norm": 0.4295012056827545, |
|
"learning_rate": 3.388888888888889e-05, |
|
"loss": 1.2761, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.8575212277011763, |
|
"grad_norm": 0.4600393772125244, |
|
"learning_rate": 3.1666666666666666e-05, |
|
"loss": 1.3026, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.867492404767469, |
|
"grad_norm": 0.4381970167160034, |
|
"learning_rate": 2.9444444444444448e-05, |
|
"loss": 1.2989, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.8724779933006154, |
|
"eval_loss": 1.283011794090271, |
|
"eval_runtime": 136.5997, |
|
"eval_samples_per_second": 29.985, |
|
"eval_steps_per_second": 1.874, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.8774635818337618, |
|
"grad_norm": 0.4486907720565796, |
|
"learning_rate": 2.7222222222222223e-05, |
|
"loss": 1.2821, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.8874347589000545, |
|
"grad_norm": 0.4515422582626343, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3049, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.8974059359663473, |
|
"grad_norm": 0.430318146944046, |
|
"learning_rate": 2.277777777777778e-05, |
|
"loss": 1.2848, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.90737711303264, |
|
"grad_norm": 0.4484976828098297, |
|
"learning_rate": 2.0555555555555555e-05, |
|
"loss": 1.3249, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.9173482900989328, |
|
"grad_norm": 0.43680936098098755, |
|
"learning_rate": 1.8333333333333333e-05, |
|
"loss": 1.2724, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.9273194671652255, |
|
"grad_norm": 0.45802849531173706, |
|
"learning_rate": 1.6111111111111115e-05, |
|
"loss": 1.3046, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.9347978499649451, |
|
"eval_loss": 1.2813191413879395, |
|
"eval_runtime": 137.1907, |
|
"eval_samples_per_second": 29.856, |
|
"eval_steps_per_second": 1.866, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.9372906442315183, |
|
"grad_norm": 0.4513862133026123, |
|
"learning_rate": 1.388888888888889e-05, |
|
"loss": 1.2824, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.947261821297811, |
|
"grad_norm": 0.43908172845840454, |
|
"learning_rate": 1.1666666666666668e-05, |
|
"loss": 1.2946, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.9572329983641038, |
|
"grad_norm": 0.4305504858493805, |
|
"learning_rate": 9.444444444444445e-06, |
|
"loss": 1.2842, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.9672041754303965, |
|
"grad_norm": 0.4479218125343323, |
|
"learning_rate": 7.222222222222222e-06, |
|
"loss": 1.3138, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.9771753524966893, |
|
"grad_norm": 0.43993687629699707, |
|
"learning_rate": 5e-06, |
|
"loss": 1.3092, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.987146529562982, |
|
"grad_norm": 0.438343346118927, |
|
"learning_rate": 2.777777777777778e-06, |
|
"loss": 1.2904, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.9971177066292748, |
|
"grad_norm": 0.4632357954978943, |
|
"learning_rate": 5.555555555555556e-07, |
|
"loss": 1.2978, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.9971177066292748, |
|
"eval_loss": 1.2806103229522705, |
|
"eval_runtime": 137.0794, |
|
"eval_samples_per_second": 29.88, |
|
"eval_steps_per_second": 1.868, |
|
"step": 400 |
|
} |
|
], |
|
"logging_steps": 4, |
|
"max_steps": 401, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 512, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.5348543338753556e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|