{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 15.0,
  "eval_steps": 500,
  "global_step": 4785,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.003134796238244514, "grad_norm": 1200.0, "learning_rate": 4.175365344467641e-07, "loss": 56.0196, "step": 1},
    {"epoch": 0.01567398119122257, "grad_norm": 1104.0, "learning_rate": 2.0876826722338207e-06, "loss": 57.1065, "step": 5},
    {"epoch": 0.03134796238244514, "grad_norm": 780.0, "learning_rate": 4.175365344467641e-06, "loss": 53.982, "step": 10},
    {"epoch": 0.047021943573667714, "grad_norm": 378.0, "learning_rate": 6.2630480167014616e-06, "loss": 39.7621, "step": 15},
    {"epoch": 0.06269592476489028, "grad_norm": 119.0, "learning_rate": 8.350730688935283e-06, "loss": 31.2881, "step": 20},
    {"epoch": 0.07836990595611286, "grad_norm": 59.0, "learning_rate": 1.0438413361169103e-05, "loss": 29.3676, "step": 25},
    {"epoch": 0.09404388714733543, "grad_norm": 27.75, "learning_rate": 1.2526096033402923e-05, "loss": 27.221, "step": 30},
    {"epoch": 0.109717868338558, "grad_norm": 17.0, "learning_rate": 1.4613778705636743e-05, "loss": 25.3228, "step": 35},
    {"epoch": 0.12539184952978055, "grad_norm": 11.375, "learning_rate": 1.6701461377870565e-05, "loss": 24.7964, "step": 40},
    {"epoch": 0.14106583072100312, "grad_norm": 14.6875, "learning_rate": 1.8789144050104384e-05, "loss": 23.6672, "step": 45},
    {"epoch": 0.15673981191222572, "grad_norm": 23.5, "learning_rate": 2.0876826722338206e-05, "loss": 22.938, "step": 50},
    {"epoch": 0.1724137931034483, "grad_norm": 46.75, "learning_rate": 2.2964509394572024e-05, "loss": 20.5447, "step": 55},
    {"epoch": 0.18808777429467086, "grad_norm": 102.5, "learning_rate": 2.5052192066805846e-05, "loss": 15.6089, "step": 60},
    {"epoch": 0.20376175548589343, "grad_norm": 21.875, "learning_rate": 2.7139874739039668e-05, "loss": 6.57, "step": 65},
    {"epoch": 0.219435736677116, "grad_norm": 7.15625, "learning_rate": 2.9227557411273487e-05, "loss": 2.9537, "step": 70},
    {"epoch": 0.23510971786833856, "grad_norm": 3.953125, "learning_rate": 3.131524008350731e-05, "loss": 2.4063, "step": 75},
    {"epoch": 0.2507836990595611, "grad_norm": 3.234375, "learning_rate": 3.340292275574113e-05, "loss": 2.1568, "step": 80},
    {"epoch": 0.2664576802507837, "grad_norm": 4.75, "learning_rate": 3.5490605427974946e-05, "loss": 1.9549, "step": 85},
    {"epoch": 0.28213166144200624, "grad_norm": 5.90625, "learning_rate": 3.757828810020877e-05, "loss": 1.8178, "step": 90},
    {"epoch": 0.29780564263322884, "grad_norm": 16.625, "learning_rate": 3.966597077244259e-05, "loss": 1.7357, "step": 95},
    {"epoch": 0.31347962382445144, "grad_norm": 10.625, "learning_rate": 4.175365344467641e-05, "loss": 1.6526, "step": 100},
    {"epoch": 0.329153605015674, "grad_norm": 16.875, "learning_rate": 4.3841336116910233e-05, "loss": 1.6119, "step": 105},
    {"epoch": 0.3448275862068966, "grad_norm": 15.9375, "learning_rate": 4.592901878914405e-05, "loss": 1.5619, "step": 110},
    {"epoch": 0.3605015673981191, "grad_norm": 6.25, "learning_rate": 4.801670146137787e-05, "loss": 1.553, "step": 115},
    {"epoch": 0.3761755485893417, "grad_norm": 16.875, "learning_rate": 5.010438413361169e-05, "loss": 1.5499, "step": 120},
    {"epoch": 0.39184952978056425, "grad_norm": 18.875, "learning_rate": 5.219206680584552e-05, "loss": 1.5358, "step": 125},
    {"epoch": 0.40752351097178685, "grad_norm": 10.5, "learning_rate": 5.4279749478079336e-05, "loss": 1.5022, "step": 130},
    {"epoch": 0.4231974921630094, "grad_norm": 8.6875, "learning_rate": 5.636743215031316e-05, "loss": 1.4674, "step": 135},
    {"epoch": 0.438871473354232, "grad_norm": 3.25, "learning_rate": 5.8455114822546973e-05, "loss": 1.4238, "step": 140},
    {"epoch": 0.45454545454545453, "grad_norm": 9.0, "learning_rate": 6.05427974947808e-05, "loss": 1.3893, "step": 145},
    {"epoch": 0.4702194357366771, "grad_norm": 25.875, "learning_rate": 6.263048016701462e-05, "loss": 1.3631, "step": 150},
    {"epoch": 0.48589341692789967, "grad_norm": 5.21875, "learning_rate": 6.471816283924845e-05, "loss": 1.3578, "step": 155},
    {"epoch": 0.5015673981191222, "grad_norm": 4.96875, "learning_rate": 6.680584551148226e-05, "loss": 1.2742, "step": 160},
    {"epoch": 0.5172413793103449, "grad_norm": 7.21875, "learning_rate": 6.889352818371608e-05, "loss": 1.2685, "step": 165},
    {"epoch": 0.5329153605015674, "grad_norm": 8.5, "learning_rate": 7.098121085594989e-05, "loss": 1.2819, "step": 170},
    {"epoch": 0.54858934169279, "grad_norm": 9.1875, "learning_rate": 7.306889352818372e-05, "loss": 1.284, "step": 175},
    {"epoch": 0.5642633228840125, "grad_norm": 6.40625, "learning_rate": 7.515657620041754e-05, "loss": 1.2541, "step": 180},
    {"epoch": 0.5799373040752351, "grad_norm": 11.5625, "learning_rate": 7.724425887265136e-05, "loss": 1.25, "step": 185},
    {"epoch": 0.5956112852664577, "grad_norm": 11.875, "learning_rate": 7.933194154488518e-05, "loss": 1.2324, "step": 190},
    {"epoch": 0.6112852664576802, "grad_norm": 4.6875, "learning_rate": 8.141962421711901e-05, "loss": 1.2093, "step": 195},
    {"epoch": 0.6269592476489029, "grad_norm": 10.0, "learning_rate": 8.350730688935282e-05, "loss": 1.2225, "step": 200},
    {"epoch": 0.6426332288401254, "grad_norm": 2.03125, "learning_rate": 8.559498956158665e-05, "loss": 1.1948, "step": 205},
    {"epoch": 0.658307210031348, "grad_norm": 6.125, "learning_rate": 8.768267223382047e-05, "loss": 1.1823, "step": 210},
    {"epoch": 0.6739811912225705, "grad_norm": 4.34375, "learning_rate": 8.977035490605428e-05, "loss": 1.1812, "step": 215},
    {"epoch": 0.6896551724137931, "grad_norm": 5.3125, "learning_rate": 9.18580375782881e-05, "loss": 1.1833, "step": 220},
    {"epoch": 0.7053291536050157, "grad_norm": 9.875, "learning_rate": 9.394572025052193e-05, "loss": 1.1606, "step": 225},
    {"epoch": 0.7210031347962382, "grad_norm": 2.609375, "learning_rate": 9.603340292275574e-05, "loss": 1.1662, "step": 230},
    {"epoch": 0.7366771159874608, "grad_norm": 2.96875, "learning_rate": 9.812108559498957e-05, "loss": 1.1573, "step": 235},
    {"epoch": 0.7523510971786834, "grad_norm": 2.0, "learning_rate": 0.00010020876826722338, "loss": 1.1386, "step": 240},
    {"epoch": 0.768025078369906, "grad_norm": 5.3125, "learning_rate": 0.00010229645093945721, "loss": 1.1822, "step": 245},
    {"epoch": 0.7836990595611285, "grad_norm": 8.125, "learning_rate": 0.00010438413361169104, "loss": 1.1415, "step": 250},
    {"epoch": 0.799373040752351, "grad_norm": 4.65625, "learning_rate": 0.00010647181628392484, "loss": 1.1534, "step": 255},
    {"epoch": 0.8150470219435737, "grad_norm": 1.0703125, "learning_rate": 0.00010855949895615867, "loss": 1.1338, "step": 260},
    {"epoch": 0.8307210031347962, "grad_norm": 3.03125, "learning_rate": 0.00011064718162839249, "loss": 1.1435, "step": 265},
    {"epoch": 0.8463949843260188, "grad_norm": 2.90625, "learning_rate": 0.00011273486430062632, "loss": 1.1213, "step": 270},
    {"epoch": 0.8620689655172413, "grad_norm": 3.875, "learning_rate": 0.00011482254697286012, "loss": 1.123, "step": 275},
    {"epoch": 0.877742946708464, "grad_norm": 9.8125, "learning_rate": 0.00011691022964509395, "loss": 1.154, "step": 280},
    {"epoch": 0.8934169278996865, "grad_norm": 3.15625, "learning_rate": 0.00011899791231732778, "loss": 1.1346, "step": 285},
    {"epoch": 0.9090909090909091, "grad_norm": 15.75, "learning_rate": 0.0001210855949895616, "loss": 1.0998, "step": 290},
    {"epoch": 0.9247648902821317, "grad_norm": 4.375, "learning_rate": 0.0001231732776617954, "loss": 1.0742, "step": 295},
    {"epoch": 0.9404388714733543, "grad_norm": 3.015625, "learning_rate": 0.00012526096033402923, "loss": 1.0956, "step": 300},
    {"epoch": 0.9561128526645768, "grad_norm": 3.796875, "learning_rate": 0.00012734864300626306, "loss": 1.0698, "step": 305},
    {"epoch": 0.9717868338557993, "grad_norm": 2.75, "learning_rate": 0.0001294363256784969, "loss": 1.0526, "step": 310},
    {"epoch": 0.987460815047022, "grad_norm": 2.078125, "learning_rate": 0.0001315240083507307, "loss": 1.0589, "step": 315},
    {"epoch": 1.0, "eval_loss": 1.7604742050170898, "eval_runtime": 0.8071, "eval_samples_per_second": 2.478, "eval_steps_per_second": 1.239, "step": 319},
    {"epoch": 1.0031347962382444, "grad_norm": 2.359375, "learning_rate": 0.00013361169102296452, "loss": 1.06, "step": 320},
    {"epoch": 1.0188087774294672, "grad_norm": 34.0, "learning_rate": 0.00013569937369519835, "loss": 1.0232, "step": 325},
    {"epoch": 1.0344827586206897, "grad_norm": 8.6875, "learning_rate": 0.00013778705636743215, "loss": 1.0624, "step": 330},
    {"epoch": 1.0501567398119123, "grad_norm": 10.8125, "learning_rate": 0.00013987473903966598, "loss": 1.0581, "step": 335},
    {"epoch": 1.0658307210031348, "grad_norm": 3.109375, "learning_rate": 0.00014196242171189978, "loss": 1.0274, "step": 340},
    {"epoch": 1.0815047021943573, "grad_norm": 3.828125, "learning_rate": 0.0001440501043841336, "loss": 1.0301, "step": 345},
    {"epoch": 1.09717868338558, "grad_norm": 10.25, "learning_rate": 0.00014613778705636744, "loss": 1.0221, "step": 350},
    {"epoch": 1.1128526645768024, "grad_norm": 4.0625, "learning_rate": 0.00014822546972860124, "loss": 1.0185, "step": 355},
    {"epoch": 1.1285266457680252, "grad_norm": 6.65625, "learning_rate": 0.00015031315240083507, "loss": 1.0146, "step": 360},
    {"epoch": 1.1442006269592477, "grad_norm": 4.21875, "learning_rate": 0.0001524008350730689, "loss": 1.0134, "step": 365},
    {"epoch": 1.1598746081504703, "grad_norm": 5.375, "learning_rate": 0.00015448851774530273, "loss": 1.0372, "step": 370},
    {"epoch": 1.1755485893416928, "grad_norm": 1.046875, "learning_rate": 0.00015657620041753653, "loss": 1.0096, "step": 375},
    {"epoch": 1.1912225705329154, "grad_norm": 3.734375, "learning_rate": 0.00015866388308977036, "loss": 1.0143, "step": 380},
    {"epoch": 1.206896551724138, "grad_norm": 3.265625, "learning_rate": 0.0001607515657620042, "loss": 1.0159, "step": 385},
    {"epoch": 1.2225705329153604, "grad_norm": 1.875, "learning_rate": 0.00016283924843423802, "loss": 1.0069, "step": 390},
    {"epoch": 1.238244514106583, "grad_norm": 1.9609375, "learning_rate": 0.00016492693110647182, "loss": 1.0019, "step": 395},
    {"epoch": 1.2539184952978055, "grad_norm": 5.0, "learning_rate": 0.00016701461377870565, "loss": 0.9979, "step": 400},
    {"epoch": 1.2695924764890283, "grad_norm": 0.85546875, "learning_rate": 0.00016910229645093947, "loss": 1.0066, "step": 405},
    {"epoch": 1.2852664576802508, "grad_norm": 3.25, "learning_rate": 0.0001711899791231733, "loss": 1.0002, "step": 410},
    {"epoch": 1.3009404388714734, "grad_norm": 0.92578125, "learning_rate": 0.0001732776617954071, "loss": 1.0036, "step": 415},
    {"epoch": 1.316614420062696, "grad_norm": 3.125, "learning_rate": 0.00017536534446764093, "loss": 1.003, "step": 420},
    {"epoch": 1.3322884012539185, "grad_norm": 3.390625, "learning_rate": 0.00017745302713987476, "loss": 1.0207, "step": 425},
    {"epoch": 1.347962382445141, "grad_norm": 9.3125, "learning_rate": 0.00017954070981210856, "loss": 1.0097, "step": 430},
    {"epoch": 1.3636363636363638, "grad_norm": 2.140625, "learning_rate": 0.0001816283924843424, "loss": 0.9984, "step": 435},
    {"epoch": 1.3793103448275863, "grad_norm": 0.93359375, "learning_rate": 0.0001837160751565762, "loss": 1.019, "step": 440},
    {"epoch": 1.3949843260188088, "grad_norm": 1.4140625, "learning_rate": 0.00018580375782881002, "loss": 1.0093, "step": 445},
    {"epoch": 1.4106583072100314, "grad_norm": 2.90625, "learning_rate": 0.00018789144050104385, "loss": 0.9906, "step": 450},
    {"epoch": 1.426332288401254, "grad_norm": 1.8203125, "learning_rate": 0.00018997912317327765, "loss": 0.9956, "step": 455},
    {"epoch": 1.4420062695924765, "grad_norm": 4.21875, "learning_rate": 0.00019206680584551148, "loss": 1.0026, "step": 460},
    {"epoch": 1.457680250783699, "grad_norm": 2.46875, "learning_rate": 0.0001941544885177453, "loss": 0.9779, "step": 465},
    {"epoch": 1.4733542319749215, "grad_norm": 1.0859375, "learning_rate": 0.00019624217118997914, "loss": 0.9981, "step": 470},
    {"epoch": 1.489028213166144, "grad_norm": 2.34375, "learning_rate": 0.00019832985386221294, "loss": 1.0087, "step": 475},
    {"epoch": 1.5047021943573666, "grad_norm": 5.71875, "learning_rate": 0.0001999999733852936, "loss": 1.0154, "step": 480},
    {"epoch": 1.5203761755485894, "grad_norm": 2.125, "learning_rate": 0.00019999904187205744, "loss": 1.0196, "step": 485},
    {"epoch": 1.536050156739812, "grad_norm": 5.1875, "learning_rate": 0.00019999677963766844, "loss": 1.0196, "step": 490},
    {"epoch": 1.5517241379310345, "grad_norm": 3.015625, "learning_rate": 0.00019999318671223102, "loss": 1.003, "step": 495},
    {"epoch": 1.567398119122257, "grad_norm": 2.453125, "learning_rate": 0.0001999882631435574, "loss": 1.0022, "step": 500},
    {"epoch": 1.5830721003134798, "grad_norm": 5.78125, "learning_rate": 0.00019998200899716724, "loss": 0.9932, "step": 505},
    {"epoch": 1.5987460815047023, "grad_norm": 3.625, "learning_rate": 0.00019997442435628653, "loss": 1.0083, "step": 510},
    {"epoch": 1.6144200626959249, "grad_norm": 6.375, "learning_rate": 0.00019996550932184666, "loss": 1.0153, "step": 515},
    {"epoch": 1.6300940438871474, "grad_norm": 1.59375, "learning_rate": 0.00019995526401248302, "loss": 1.0158, "step": 520},
    {"epoch": 1.64576802507837, "grad_norm": 4.84375, "learning_rate": 0.00019994368856453341, "loss": 1.0308, "step": 525},
    {"epoch": 1.6614420062695925, "grad_norm": 2.796875, "learning_rate": 0.00019993078313203632, "loss": 0.9834, "step": 530},
    {"epoch": 1.677115987460815, "grad_norm": 1.28125, "learning_rate": 0.0001999165478867286, "loss": 0.9937, "step": 535},
    {"epoch": 1.6927899686520376, "grad_norm": 1.9921875, "learning_rate": 0.00019990098301804357, "loss": 0.9884, "step": 540},
    {"epoch": 1.70846394984326, "grad_norm": 1.671875, "learning_rate": 0.00019988408873310815, "loss": 0.9846, "step": 545},
    {"epoch": 1.7241379310344827, "grad_norm": 0.9921875, "learning_rate": 0.00019986586525674036, "loss": 0.9711, "step": 550},
    {"epoch": 1.7398119122257052, "grad_norm": 1.921875, "learning_rate": 0.00019984631283144616, "loss": 0.9789, "step": 555},
    {"epoch": 1.7554858934169277, "grad_norm": 1.953125, "learning_rate": 0.0001998254317174163, "loss": 0.984, "step": 560},
    {"epoch": 1.7711598746081505, "grad_norm": 3.046875, "learning_rate": 0.00019980322219252284, "loss": 0.9609, "step": 565},
    {"epoch": 1.786833855799373, "grad_norm": 1.7578125, "learning_rate": 0.0001997796845523155, "loss": 0.9997, "step": 570},
    {"epoch": 1.8025078369905956, "grad_norm": 2.9375, "learning_rate": 0.00019975481911001762, "loss": 0.9772, "step": 575},
    {"epoch": 1.8181818181818183, "grad_norm": 1.5234375, "learning_rate": 0.00019972862619652203, "loss": 0.9939, "step": 580},
    {"epoch": 1.8338557993730409, "grad_norm": 1.6953125, "learning_rate": 0.00019970110616038673, "loss": 0.9794, "step": 585},
    {"epoch": 1.8495297805642634, "grad_norm": 3.109375, "learning_rate": 0.0001996722593678302, "loss": 0.9702, "step": 590},
    {"epoch": 1.865203761755486, "grad_norm": 1.2578125, "learning_rate": 0.00019964208620272647, "loss": 0.9667, "step": 595},
    {"epoch": 1.8808777429467085, "grad_norm": 0.8203125, "learning_rate": 0.00019961058706660005, "loss": 0.9632, "step": 600},
    {"epoch": 1.896551724137931, "grad_norm": 12.5, "learning_rate": 0.00019957776237862067, "loss": 0.9554, "step": 605},
    {"epoch": 1.9122257053291536, "grad_norm": 1.6328125, "learning_rate": 0.00019954361257559756, "loss": 0.9902, "step": 610},
    {"epoch": 1.9278996865203761, "grad_norm": 1.328125, "learning_rate": 0.0001995081381119737, "loss": 0.998, "step": 615},
    {"epoch": 1.9435736677115987, "grad_norm": 2.28125, "learning_rate": 0.00019947133945981987, "loss": 0.9781, "step": 620},
    {"epoch": 1.9592476489028212, "grad_norm": 2.796875, "learning_rate": 0.00019943321710882815, "loss": 0.9956, "step": 625},
    {"epoch": 1.9749216300940438, "grad_norm": 3.046875, "learning_rate": 0.0001993937715663056, "loss": 0.9776, "step": 630},
    {"epoch": 1.9905956112852663, "grad_norm": 1.9140625, "learning_rate": 0.00019935300335716748, "loss": 1.0, "step": 635},
    {"epoch": 2.0, "eval_loss": 1.7305909395217896, "eval_runtime": 0.8036, "eval_samples_per_second": 2.489, "eval_steps_per_second": 1.244, "step": 638},
    {"epoch": 2.006269592476489, "grad_norm": 3.921875, "learning_rate": 0.00019931091302393008, "loss": 0.9559, "step": 640},
    {"epoch": 2.0219435736677114, "grad_norm": 1.6015625, "learning_rate": 0.00019926750112670382, "loss": 0.8412, "step": 645},
    {"epoch": 2.0376175548589344, "grad_norm": 1.8203125, "learning_rate": 0.00019922276824318547, "loss": 0.8475, "step": 650},
    {"epoch": 2.053291536050157, "grad_norm": 1.3359375, "learning_rate": 0.0001991767149686507, "loss": 0.8512, "step": 655},
    {"epoch": 2.0689655172413794, "grad_norm": 1.0, "learning_rate": 0.0001991293419159461, "loss": 0.8301, "step": 660},
    {"epoch": 2.084639498432602, "grad_norm": 2.390625, "learning_rate": 0.00019908064971548085, "loss": 0.8622, "step": 665},
    {"epoch": 2.1003134796238245, "grad_norm": 1.1953125, "learning_rate": 0.0001990306390152186, "loss": 0.8375, "step": 670},
    {"epoch": 2.115987460815047, "grad_norm": 3.390625, "learning_rate": 0.00019897931048066877, "loss": 0.8571, "step": 675},
    {"epoch": 2.1316614420062696, "grad_norm": 1.8203125, "learning_rate": 0.00019892666479487744, "loss": 0.904, "step": 680},
    {"epoch": 2.147335423197492, "grad_norm": 2.859375, "learning_rate": 0.00019887270265841868, "loss": 0.8602, "step": 685},
    {"epoch": 2.1630094043887147, "grad_norm": 2.125, "learning_rate": 0.00019881742478938496, "loss": 0.8618, "step": 690},
    {"epoch": 2.1786833855799372, "grad_norm": 3.09375, "learning_rate": 0.00019876083192337757, "loss": 0.8737, "step": 695},
    {"epoch": 2.19435736677116, "grad_norm": 3.9375, "learning_rate": 0.00019870292481349698, "loss": 0.8688, "step": 700},
    {"epoch": 2.2100313479623823, "grad_norm": 1.7578125, "learning_rate": 0.00019864370423033274, "loss": 0.8821, "step": 705},
    {"epoch": 2.225705329153605, "grad_norm": 1.3203125, "learning_rate": 0.00019858317096195323, "loss": 0.8746, "step": 710},
    {"epoch": 2.2413793103448274, "grad_norm": 2.796875, "learning_rate": 0.00019852132581389513, "loss": 0.8742, "step": 715},
    {"epoch": 2.2570532915360504, "grad_norm": 2.421875, "learning_rate": 0.00019845816960915286, "loss": 0.8747, "step": 720},
    {"epoch": 2.2727272727272725, "grad_norm": 2.046875, "learning_rate": 0.0001983937031881674, "loss": 0.843, "step": 725},
    {"epoch": 2.2884012539184955, "grad_norm": 3.140625, "learning_rate": 0.0001983279274088153, "loss": 0.8682, "step": 730},
    {"epoch": 2.304075235109718, "grad_norm": 2.9375, "learning_rate": 0.00019826084314639714, "loss": 0.856, "step": 735},
    {"epoch": 2.3197492163009406, "grad_norm": 1.21875, "learning_rate": 0.00019819245129362595, "loss": 0.8663, "step": 740},
    {"epoch": 2.335423197492163, "grad_norm": 2.21875, "learning_rate": 0.00019812275276061533, "loss": 0.8483, "step": 745},
    {"epoch": 2.3510971786833856, "grad_norm": 9.875, "learning_rate": 0.00019805174847486721, "loss": 0.8416, "step": 750},
    {"epoch": 2.366771159874608, "grad_norm": 0.78515625, "learning_rate": 0.00019797943938125977, "loss": 0.8743, "step": 755},
    {"epoch": 2.3824451410658307, "grad_norm": 1.21875, "learning_rate": 0.00019790582644203458, "loss": 0.8529, "step": 760},
    {"epoch": 2.3981191222570533, "grad_norm": 6.15625, "learning_rate": 0.00019783091063678402, "loss": 0.8628, "step": 765},
    {"epoch": 2.413793103448276, "grad_norm": 2.65625, "learning_rate": 0.00019775469296243807, "loss": 0.8689, "step": 770},
    {"epoch": 2.4294670846394983, "grad_norm": 1.59375, "learning_rate": 0.0001976771744332512, "loss": 0.8671, "step": 775},
    {"epoch": 2.445141065830721, "grad_norm": 2.0, "learning_rate": 0.00019759835608078877, "loss": 0.8832, "step": 780},
    {"epoch": 2.4608150470219434, "grad_norm": 1.1796875, "learning_rate": 0.00019751823895391323, "loss": 0.878, "step": 785},
    {"epoch": 2.476489028213166, "grad_norm": 1.5, "learning_rate": 0.00019743682411877046, "loss": 0.8882, "step": 790},
    {"epoch": 2.492163009404389, "grad_norm": 1.7265625, "learning_rate": 0.00019735411265877522, "loss": 0.8934, "step": 795},
    {"epoch": 2.507836990595611, "grad_norm": 1.4140625, "learning_rate": 0.00019727010567459696, "loss": 0.8815, "step": 800},
    {"epoch": 2.523510971786834, "grad_norm": 1.1171875, "learning_rate": 0.00019718480428414505, "loss": 0.8925, "step": 805},
    {"epoch": 2.5391849529780566, "grad_norm": 1.4765625, "learning_rate": 0.00019709820962255409, "loss": 0.8956, "step": 810},
    {"epoch": 2.554858934169279, "grad_norm": 3.546875, "learning_rate": 0.00019701032284216857, "loss": 0.8828, "step": 815},
    {"epoch": 2.5705329153605017, "grad_norm": 1.125, "learning_rate": 0.00019692114511252767, "loss": 0.9194, "step": 820},
    {"epoch": 2.586206896551724, "grad_norm": 1.734375, "learning_rate": 0.00019683067762034967, "loss": 0.8825, "step": 825},
    {"epoch": 2.6018808777429467, "grad_norm": 4.1875, "learning_rate": 0.00019673892156951613, "loss": 0.8725, "step": 830},
    {"epoch": 2.6175548589341693, "grad_norm": 1.296875, "learning_rate": 0.00019664587818105596, "loss": 0.877, "step": 835},
    {"epoch": 2.633228840125392, "grad_norm": 0.89453125, "learning_rate": 0.0001965515486931291, "loss": 0.8764, "step": 840},
    {"epoch": 2.6489028213166144, "grad_norm": 0.8515625, "learning_rate": 0.00019645593436101, "loss": 0.8571, "step": 845},
    {"epoch": 2.664576802507837, "grad_norm": 2.40625, "learning_rate": 0.00019635903645707096, "loss": 0.887, "step": 850},
    {"epoch": 2.6802507836990594, "grad_norm": 0.8203125, "learning_rate": 0.00019626085627076528, "loss": 0.8755, "step": 855},
    {"epoch": 2.695924764890282, "grad_norm": 1.4140625, "learning_rate": 0.00019616139510861, "loss": 0.8664, "step": 860},
    {"epoch": 2.7115987460815045, "grad_norm": 0.9296875, "learning_rate": 0.00019606065429416848, "loss": 0.8888, "step": 865},
    {"epoch": 2.7272727272727275, "grad_norm": 1.4296875, "learning_rate": 0.00019595863516803293, "loss": 0.8772, "step": 870},
    {"epoch": 2.7429467084639496, "grad_norm": 1.5859375, "learning_rate": 0.0001958553390878064, "loss": 0.8919, "step": 875},
    {"epoch": 2.7586206896551726, "grad_norm": 1.609375, "learning_rate": 0.00019575076742808488, "loss": 0.8806, "step": 880},
    {"epoch": 2.774294670846395, "grad_norm": 0.81640625, "learning_rate": 0.00019564492158043891, "loss": 0.8722, "step": 885},
    {"epoch": 2.7899686520376177, "grad_norm": 1.359375, "learning_rate": 0.0001955378029533951, "loss": 0.8887, "step": 890},
    {"epoch": 2.80564263322884, "grad_norm": 1.1015625, "learning_rate": 0.00019542941297241722, "loss": 0.9079, "step": 895},
    {"epoch": 2.8213166144200628, "grad_norm": 2.0, "learning_rate": 0.00019531975307988763, "loss": 0.877, "step": 900},
    {"epoch": 2.8369905956112853, "grad_norm": 1.4140625, "learning_rate": 0.00019520882473508762, "loss": 0.8953, "step": 905},
    {"epoch": 2.852664576802508, "grad_norm": 5.03125, "learning_rate": 0.00019509662941417826, "loss": 0.8886, "step": 910},
    {"epoch": 2.8683385579937304, "grad_norm": 1.15625, "learning_rate": 0.00019498316861018086, "loss": 0.9104, "step": 915},
    {"epoch": 2.884012539184953, "grad_norm": 1.6015625, "learning_rate": 0.0001948684438329566, "loss": 0.8815, "step": 920},
    {"epoch": 2.8996865203761755, "grad_norm": 1.8125, "learning_rate": 0.00019475245660918717, "loss": 0.8718, "step": 925},
    {"epoch": 2.915360501567398, "grad_norm": 6.46875, "learning_rate": 0.00019463520848235377, "loss": 0.8774, "step": 930},
    {"epoch": 2.9310344827586206, "grad_norm": 1.015625, "learning_rate": 0.000194516701012717, "loss": 0.8853, "step": 935},
    {"epoch": 2.946708463949843, "grad_norm": 1.5546875, "learning_rate": 0.00019439693577729593, "loss": 0.8833, "step": 940},
    {"epoch": 2.962382445141066, "grad_norm": 1.796875, "learning_rate": 0.0001942759143698472, "loss": 0.8867, "step": 945},
    {"epoch": 2.978056426332288, "grad_norm": 1.0703125, "learning_rate": 0.0001941536384008437, "loss": 0.8923, "step": 950},
    {"epoch": 2.993730407523511, "grad_norm": 1.75, "learning_rate": 0.0001940301094974531, "loss": 0.8741, "step": 955},
    {"epoch": 3.0, "eval_loss": 1.8115239143371582, "eval_runtime": 0.806, "eval_samples_per_second": 2.481, "eval_steps_per_second": 1.241, "step": 957},
    {"epoch": 3.0094043887147337, "grad_norm": 0.7578125, "learning_rate": 0.00019390532930351652, "loss": 0.7936, "step": 960},
    {"epoch": 3.0250783699059562, "grad_norm": 1.7734375, "learning_rate": 0.00019377929947952626, "loss": 0.7324, "step": 965},
    {"epoch": 3.040752351097179, "grad_norm": 1.7109375, "learning_rate": 0.00019365202170260393, "loss": 0.726, "step": 970},
    {"epoch": 3.0564263322884013, "grad_norm": 1.0859375, "learning_rate": 0.000193523497666478, "loss": 0.7355, "step": 975},
    {"epoch": 3.072100313479624, "grad_norm": 2.578125, "learning_rate": 0.00019339372908146147, "loss": 0.7393, "step": 980},
    {"epoch": 3.0877742946708464, "grad_norm": 1.0, "learning_rate": 0.00019326271767442884, "loss": 0.736, "step": 985},
    {"epoch": 3.103448275862069, "grad_norm": 1.3203125, "learning_rate": 0.00019313046518879337, "loss": 0.7157, "step": 990},
    {"epoch": 3.1191222570532915, "grad_norm": 1.015625, "learning_rate": 0.00019299697338448369, "loss": 0.7231, "step": 995},
    {"epoch": 3.134796238244514, "grad_norm": 1.2734375, "learning_rate": 0.0001928622440379205, "loss": 0.7221, "step": 1000},
    {"epoch": 3.1504702194357366, "grad_norm": 0.9140625, "learning_rate": 0.0001927262789419929, "loss": 0.7352, "step": 1005},
    {"epoch": 3.166144200626959, "grad_norm": 0.9375, "learning_rate": 0.0001925890799060345, "loss": 0.7196, "step": 1010},
    {"epoch": 3.1818181818181817, "grad_norm": 0.90625, "learning_rate": 0.00019245064875579942, "loss": 0.7269, "step": 1015},
    {"epoch": 3.197492163009404, "grad_norm": 0.98828125, "learning_rate": 0.00019231098733343783, "loss": 0.7225, "step": 1020},
    {"epoch": 3.2131661442006267, "grad_norm": 1.0625, "learning_rate": 0.00019217009749747174, "loss": 0.734, "step": 1025},
    {"epoch": 3.2288401253918497, "grad_norm": 0.828125, "learning_rate": 0.0001920279811227699, "loss": 0.7387, "step": 1030},
    {"epoch": 3.2445141065830723, "grad_norm": 1.5703125, "learning_rate": 0.00019188464010052312, "loss": 0.7303, "step": 1035},
    {"epoch": 3.260188087774295, "grad_norm": 1.53125, "learning_rate": 0.00019174007633821893, "loss": 0.7565, "step": 1040},
    {"epoch": 3.2758620689655173, "grad_norm": 2.921875, "learning_rate": 0.00019159429175961634, "loss": 0.7588, "step": 1045},
    {"epoch": 3.29153605015674, "grad_norm": 1.0, "learning_rate": 0.0001914472883047202, "loss": 0.7452, "step": 1050},
    {"epoch": 3.3072100313479624, "grad_norm": 1.1953125, "learning_rate": 0.00019129906792975527, "loss": 0.7395, "step": 1055},
    {"epoch": 3.322884012539185, "grad_norm": 1.203125, "learning_rate": 0.0001911496326071404, "loss": 0.7429, "step": 1060},
    {"epoch": 3.3385579937304075, "grad_norm": 0.92578125, "learning_rate": 0.00019099898432546202, "loss": 0.7643, "step": 1065},
    {"epoch": 3.35423197492163, "grad_norm": 1.3203125, "learning_rate": 0.00019084712508944793, "loss": 0.755, "step": 1070},
    {"epoch": 3.3699059561128526, "grad_norm": 1.8671875, "learning_rate": 0.00019069405691994045, "loss": 0.7381, "step": 1075},
    {"epoch": 3.385579937304075, "grad_norm": 1.46875, "learning_rate": 0.00019053978185386964, "loss": 0.7546, "step": 1080},
    {"epoch": 3.4012539184952977, "grad_norm": 2.609375, "learning_rate": 0.00019038430194422606, "loss": 0.7624, "step": 1085},
    {"epoch": 3.41692789968652, "grad_norm": 1.3203125, "learning_rate": 0.00019022761926003359, "loss": 0.7657, "step": 1090},
    {"epoch": 3.4326018808777428, "grad_norm": 1.15625, "learning_rate": 0.00019006973588632184, "loss": 0.7433, "step": 1095},
    {"epoch": 3.4482758620689653, "grad_norm": 1.6015625, "learning_rate": 0.0001899106539240984, "loss": 0.7767, "step": 1100},
    {"epoch": 3.4639498432601883, "grad_norm": 33.75, "learning_rate": 0.00018975037549032086, "loss": 0.755, "step": 1105},
    {"epoch": 3.479623824451411, "grad_norm": 0.8515625, "learning_rate": 0.0001895889027178687, "loss": 0.7631, "step": 1110},
    {"epoch": 3.4952978056426334, "grad_norm": 1.1015625, "learning_rate": 0.0001894262377555148, "loss": 0.7545, "step": 1115},
    {"epoch": 3.510971786833856, "grad_norm": 0.9765625, "learning_rate": 0.00018926238276789704, "loss": 0.7491, "step": 1120},
    {"epoch": 3.5266457680250785, "grad_norm": 1.0234375, "learning_rate": 0.0001890973399354892, "loss": 0.7663, "step": 1125},
    {"epoch": 3.542319749216301, "grad_norm": 1.0703125, "learning_rate": 0.00018893111145457225, "loss": 0.755, "step": 1130},
    {"epoch": 3.5579937304075235, "grad_norm": 1.8046875, "learning_rate": 0.00018876369953720496, "loss": 0.7681, "step": 1135},
    {"epoch": 3.573667711598746, "grad_norm": 5.125, "learning_rate": 0.00018859510641119448, "loss": 0.766, "step": 1140},
    {"epoch": 3.5893416927899686, "grad_norm": 1.3828125, "learning_rate": 0.00018842533432006662, "loss": 0.7801, "step": 1145},
    {"epoch": 3.605015673981191, "grad_norm": 1.0625, "learning_rate": 0.00018825438552303621, "loss": 0.7647, "step": 1150},
    {"epoch": 3.6206896551724137, "grad_norm": 1.109375, "learning_rate": 0.00018808226229497684, "loss": 0.7768, "step": 1155},
    {"epoch": 3.6363636363636362, "grad_norm": 1.4921875, "learning_rate": 0.00018790896692639068, "loss": 0.7786, "step": 1160},
    {"epoch": 3.652037617554859, "grad_norm": 1.2890625, "learning_rate": 0.00018773450172337793, "loss": 0.762, "step": 1165},
    {"epoch": 3.6677115987460818, "grad_norm": 1.0078125, "learning_rate": 0.00018755886900760619, "loss": 0.7612, "step": 1170},
    {"epoch": 3.683385579937304, "grad_norm": 0.81640625, "learning_rate": 0.00018738207111627958, "loss": 0.7718, "step": 1175},
    {"epoch": 3.699059561128527, "grad_norm": 0.89453125, "learning_rate": 0.00018720411040210752, "loss": 0.7577, "step": 1180},
    {"epoch": 3.714733542319749, "grad_norm": 0.8828125, "learning_rate": 0.00018702498923327366, "loss": 0.7429, "step": 1185},
    {"epoch": 3.730407523510972, "grad_norm": 0.828125, "learning_rate": 0.00018684470999340405, "loss": 0.7552, "step": 1190},
    {"epoch": 3.7460815047021945, "grad_norm": 1.46875, "learning_rate": 0.00018666327508153567, "loss": 0.7606, "step": 1195},
    {"epoch": 3.761755485893417, "grad_norm": 0.89453125, "learning_rate": 0.0001864806869120844, "loss": 0.7678, "step": 1200},
    {"epoch": 3.7774294670846396, "grad_norm": 1.078125, "learning_rate": 0.00018629694791481296, "loss": 0.7985, "step": 1205},
    {"epoch": 3.793103448275862, "grad_norm": 5.125, "learning_rate": 0.00018611206053479842, "loss": 0.7712, "step": 1210},
    {"epoch": 3.8087774294670846, "grad_norm": 1.78125, "learning_rate": 0.00018592602723239984, "loss": 0.7745, "step": 1215},
    {"epoch": 3.824451410658307, "grad_norm": 0.89453125, "learning_rate": 0.00018573885048322547, "loss": 0.7684, "step": 1220},
    {"epoch": 3.8401253918495297, "grad_norm": 0.83984375, "learning_rate": 0.00018555053277809975, "loss": 0.7811, "step": 1225},
    {"epoch": 3.8557993730407523, "grad_norm": 1.1484375, "learning_rate": 0.00018536107662303026, "loss": 0.7732, "step": 1230},
    {"epoch": 3.871473354231975, "grad_norm": 1.015625, "learning_rate": 0.00018517048453917424, "loss": 0.7668, "step": 1235},
    {"epoch": 3.8871473354231973, "grad_norm": 0.921875, "learning_rate": 0.00018497875906280515, "loss": 0.759, "step": 1240},
    {"epoch": 3.9028213166144203, "grad_norm": 1.46875, "learning_rate": 0.00018478590274527898, "loss": 0.7763, "step": 1245},
    {"epoch": 3.9184952978056424, "grad_norm": 1.6171875, "learning_rate": 0.0001845919181530001, "loss": 0.7633, "step": 1250},
    {"epoch": 3.9341692789968654, "grad_norm": 0.7265625, "learning_rate": 0.00018439680786738722, "loss": 0.7853, "step": 1255},
    {"epoch": 3.9498432601880875, "grad_norm": 1.46875, "learning_rate": 0.00018420057448483905, "loss": 0.7856, "step": 1260},
    {"epoch": 3.9655172413793105, "grad_norm": 1.359375, "learning_rate": 0.00018400322061669982, "loss": 0.7831, "step": 1265},
    {"epoch": 3.981191222570533, "grad_norm": 0.75390625, "learning_rate": 0.00018380474888922426, "loss": 0.7952, "step": 1270},
    {"epoch": 3.9968652037617556, "grad_norm": 1.0625, "learning_rate": 0.000183605161943543, "loss": 0.7735, "step": 1275},
    {"epoch": 4.0, "eval_loss": 1.923946738243103, "eval_runtime": 0.8, "eval_samples_per_second": 2.5, "eval_steps_per_second": 1.25, "step": 1276},
    {"epoch": 4.012539184952978, "grad_norm": 0.84375, "learning_rate": 0.0001834044624356272, "loss": 0.6611, "step": 1280},
    {"epoch": 4.028213166144201, "grad_norm": 0.8984375, "learning_rate": 0.0001832026530362532, "loss": 0.5993, "step": 1285},
    {"epoch": 4.043887147335423, "grad_norm": 1.3125, "learning_rate": 0.00018299973643096714, "loss": 0.6197, "step": 1290},
    {"epoch": 4.059561128526646, "grad_norm": 1.1640625, "learning_rate": 0.00018279571532004907, "loss": 0.6147, "step": 1295},
    {"epoch": 4.075235109717869, "grad_norm": 1.1171875, "learning_rate": 0.00018259059241847707, "loss": 0.6295, "step": 1300},
    {"epoch": 4.090909090909091, "grad_norm": 0.97265625, "learning_rate": 0.00018238437045589115, "loss": 0.6219, "step": 1305},
    {"epoch": 4.106583072100314, "grad_norm": 0.98828125, "learning_rate": 0.00018217705217655689, "loss": 0.6033, "step": 1310},
    {"epoch": 4.122257053291536, "grad_norm": 0.875, "learning_rate": 0.0001819686403393289, "loss": 0.622, "step": 1315},
    {"epoch": 4.137931034482759, "grad_norm": 0.890625, "learning_rate": 0.00018175913771761417, "loss": 0.6166, "step": 1320},
    {"epoch": 4.153605015673981, "grad_norm": 1.234375, "learning_rate": 0.0001815485470993351, "loss": 0.6335, "step": 1325},
    {"epoch": 4.169278996865204, "grad_norm": 0.88671875, "learning_rate": 0.00018133687128689242, "loss": 0.6204, "step": 1330},
    {"epoch": 4.184952978056426, "grad_norm": 1.09375, "learning_rate": 0.000181124113097128, "loss": 0.635, "step": 1335},
    {"epoch": 4.200626959247649, "grad_norm": 0.83203125, "learning_rate": 0.00018091027536128716, "loss": 0.6224, "step": 1340},
    {"epoch": 4.216300940438871, "grad_norm": 0.890625, "learning_rate": 0.00018069536092498112, "loss": 0.6314, "step": 1345},
    {"epoch": 4.231974921630094, "grad_norm": 0.80859375, "learning_rate": 0.00018047937264814917, "loss": 0.6421, "step": 1350},
    {"epoch": 4.247648902821316, "grad_norm": 2.25, "learning_rate": 0.00018026231340502057, "loss": 0.6335, "step": 1355},
    {"epoch": 4.263322884012539, "grad_norm": 1.0, "learning_rate": 0.00018004418608407626, "loss": 0.6365, "step": 1360},
    {"epoch": 4.278996865203762, "grad_norm": 1.2421875, "learning_rate": 0.00017982499358801037, "loss": 0.6289, "step": 1365},
    {"epoch": 4.294670846394984, "grad_norm": 0.95703125, "learning_rate": 0.00017960473883369186, "loss": 0.6297, "step": 1370},
    {"epoch": 4.310344827586207, "grad_norm": 1.484375, "learning_rate": 0.00017938342475212532, "loss": 0.6496, "step": 1375},
    {"epoch": 4.326018808777429, "grad_norm": 0.9140625, "learning_rate": 0.00017916105428841234, "loss": 0.6454, "step": 1380},
    {"epoch": 4.341692789968652, "grad_norm": 1.21875, "learning_rate": 0.00017893763040171203, "loss": 0.6452, "step": 1385},
    {"epoch": 4.3573667711598745, "grad_norm": 2.25, "learning_rate": 0.00017871315606520183, "loss": 0.6542, "step": 1390},
    {"epoch": 4.3730407523510975, "grad_norm": 1.21875, "learning_rate": 0.0001784876342660378, "loss": 0.6448, "step": 1395},
    {"epoch": 4.38871473354232, "grad_norm": 0.95703125, "learning_rate": 0.00017826106800531498, "loss": 0.63, "step": 1400},
    {"epoch": 4.4043887147335425, "grad_norm": 1.828125, "learning_rate": 0.0001780334602980275, "loss": 0.6317, "step": 1405},
    {"epoch": 4.420062695924765, "grad_norm": 1.75, "learning_rate": 0.0001778048141730282, "loss": 0.6375, "step": 1410},
    {"epoch": 4.435736677115988, "grad_norm": 1.046875, "learning_rate": 0.00017757513267298856, "loss": 0.645, "step": 1415},
    {"epoch": 4.45141065830721, "grad_norm": 1.6328125, "learning_rate": 0.00017734441885435828, "loss": 0.6431, "step": 1420},
    {"epoch": 4.467084639498433, "grad_norm": 0.9296875, "learning_rate": 0.00017711267578732423, "loss": 0.6494, "step": 1425},
    {"epoch": 4.482758620689655, "grad_norm": 1.2421875, "learning_rate": 0.00017687990655577008, "loss": 0.6464, "step": 1430},
    {"epoch": 4.498432601880878, "grad_norm": 33.75, "learning_rate": 0.00017664611425723486, "loss": 0.6598, "step": 1435},
    {"epoch": 4.514106583072101, "grad_norm": 1.5234375, "learning_rate": 0.00017641130200287197, "loss": 0.6602, "step": 1440},
    {"epoch": 4.529780564263323, "grad_norm": 1.34375, "learning_rate": 0.00017617547291740767, "loss": 0.6443, "step": 1445},
    {"epoch": 4.545454545454545, "grad_norm": 1.2890625, "learning_rate": 0.00017593863013909956, "loss": 0.6441, "step": 1450},
    {"epoch": 4.561128526645768, "grad_norm": 1.375, "learning_rate": 0.00017570077681969474, "loss": 0.6405, "step": 1455},
    {"epoch": 4.576802507836991, "grad_norm": 2.15625, "learning_rate": 0.00017546191612438804, "loss": 0.6605, "step": 1460},
    {"epoch": 4.592476489028213, "grad_norm": 1.1796875, "learning_rate": 0.0001752220512317797, "loss": 0.6572, "step": 1465},
    {"epoch": 4.608150470219436, "grad_norm": 1.96875, "learning_rate": 0.00017498118533383316, "loss": 0.6396, "step": 1470},
    {"epoch": 4.623824451410658, "grad_norm": 1.2265625, "learning_rate": 0.0001747393216358326, "loss": 0.656, "step": 1475},
    {"epoch": 4.639498432601881, "grad_norm": 1.0390625, "learning_rate": 0.00017449646335634017, "loss": 0.6602, "step": 1480},
    {"epoch": 4.655172413793103, "grad_norm": 0.8203125, "learning_rate": 0.00017425261372715345, "loss": 0.6519, "step": 1485},
    {"epoch": 4.670846394984326, "grad_norm": 0.875, "learning_rate": 0.00017400777599326203, "loss": 0.6475, "step": 1490},
    {"epoch": 4.686520376175548, "grad_norm": 0.84765625, "learning_rate": 0.00017376195341280468, "loss": 0.6656, "step": 1495},
    {"epoch": 4.702194357366771, "grad_norm": 0.88671875, "learning_rate": 0.00017351514925702583, "loss": 0.6655, "step": 1500},
    {"epoch": 4.717868338557993, "grad_norm": 1.4296875, "learning_rate": 0.00017326736681023204, "loss": 0.672, "step": 1505},
    {"epoch": 4.733542319749216, "grad_norm": 0.98828125, "learning_rate": 0.00017301860936974834, "loss": 0.6635, "step": 1510},
    {"epoch": 4.749216300940439, "grad_norm": 1.15625, "learning_rate": 0.00017276888024587433, "loss": 0.6666, "step": 1515},
    {"epoch": 4.764890282131661, "grad_norm": 1.265625, "learning_rate": 0.00017251818276184012, "loss": 0.6692, "step": 1520},
    {"epoch": 4.7805642633228835, "grad_norm": 0.80859375, "learning_rate": 0.0001722665202537621, "loss": 0.6578, "step": 1525},
    {"epoch": 4.7962382445141065, "grad_norm": 0.92578125, "learning_rate": 0.00017201389607059863, "loss": 0.6607, "step": 1530},
    {"epoch": 4.8119122257053295, "grad_norm": 2.140625, "learning_rate": 0.00017176031357410537, "loss": 0.6538, "step": 1535},
    {"epoch": 4.827586206896552, "grad_norm": 1.828125, "learning_rate": 0.0001715057761387905, "loss": 0.6703, "step": 1540},
    {"epoch": 4.843260188087775, "grad_norm": 0.87109375, "learning_rate": 0.00017125028715187, "loss": 0.6761, "step": 1545},
    {"epoch": 4.858934169278997, "grad_norm": 0.83984375, "learning_rate": 0.0001709938500132225, "loss": 0.6616, "step": 1550},
    {"epoch": 4.87460815047022, "grad_norm": 1.015625, "learning_rate": 0.00017073646813534388, "loss": 0.6597, "step": 1555},
    {"epoch": 4.890282131661442, "grad_norm": 0.8984375, "learning_rate": 0.00017047814494330207, "loss": 0.6733, "step": 1560},
    {"epoch": 4.905956112852665, "grad_norm": 1.859375, "learning_rate": 0.00017021888387469135, "loss": 0.6737, "step": 1565},
    {"epoch": 4.921630094043887, "grad_norm": 0.890625, "learning_rate": 0.00016995868837958665, "loss": 0.6736, "step": 1570},
    {"epoch": 4.93730407523511, "grad_norm": 0.921875, "learning_rate": 0.0001696975619204977, "loss": 0.68, "step": 1575},
    {"epoch": 4.952978056426332, "grad_norm": 2.0, "learning_rate": 0.0001694355079723227, "loss": 0.6755, "step": 1580},
    {"epoch": 4.968652037617555, "grad_norm": 0.78125, "learning_rate": 0.0001691725300223025, "loss": 0.6827, "step": 1585},
    {"epoch": 4.984326018808778, "grad_norm": 0.859375, "learning_rate": 0.0001689086315699738, "loss": 0.6681, "step": 1590},
    {"epoch": 5.0, "grad_norm": 0.91015625, "learning_rate": 0.00016864381612712276, "loss": 0.6599, "step": 1595},
    {"epoch": 5.0, "eval_loss": 2.0616867542266846, "eval_runtime": 0.7937, "eval_samples_per_second": 2.52, "eval_steps_per_second": 1.26, "step": 1595},
    {"epoch": 5.015673981191223, "grad_norm": 1.9296875, "learning_rate": 0.00016837808721773827, "loss": 0.5239, "step": 1600},
    {"epoch": 5.031347962382445, "grad_norm": 1.125, "learning_rate": 0.000168111448377965, "loss": 0.509, "step": 1605},
    {"epoch": 5.047021943573668, "grad_norm": 2.5625, "learning_rate": 0.0001678439031560564, "loss": 0.5041, "step": 1610},
    {"epoch": 5.06269592476489, "grad_norm": 0.8203125, "learning_rate": 0.00016757545511232746, "loss": 0.5195, "step": 1615},
    {"epoch": 5.078369905956113, "grad_norm": 0.8984375, "learning_rate": 0.00016730610781910728, "loss": 0.4963, "step": 1620},
    {"epoch": 5.094043887147335, "grad_norm": 0.88671875, "learning_rate": 0.00016703586486069164, "loss": 0.5159, "step": 1625},
    {"epoch": 5.109717868338558, "grad_norm": 1.0859375, "learning_rate": 0.0001667647298332952, "loss": 0.5111, "step": 1630},
    {"epoch": 5.12539184952978, "grad_norm": 0.9765625, "learning_rate": 0.00016649270634500366, "loss": 0.5309, "step": 1635},
    {"epoch": 5.141065830721003, "grad_norm": 0.9296875, "learning_rate": 0.00016621979801572585, "loss": 0.5254, "step": 1640},
    {"epoch": 5.156739811912225, "grad_norm": 0.8359375, "learning_rate": 0.00016594600847714538, "loss": 0.5274, "step": 1645},
    {"epoch": 5.172413793103448, "grad_norm": 0.8984375, "learning_rate": 0.0001656713413726725, "loss": 0.5239, "step": 1650},
    {"epoch": 5.1880877742946705, "grad_norm": 0.984375, "learning_rate": 0.00016539580035739547, "loss": 0.5246, "step": 1655},
    {"epoch": 5.2037617554858935, "grad_norm": 1.6953125, "learning_rate": 0.00016511938909803204, "loss": 0.5367, "step": 1660},
    {"epoch": 5.219435736677116, "grad_norm": 0.9453125, "learning_rate": 0.00016484211127288048, "loss": 0.5356, "step": 1665},
    {"epoch": 5.235109717868339, "grad_norm": 0.953125, "learning_rate": 0.00016456397057177085, "loss": 0.5367, "step": 1670},
    {"epoch": 5.250783699059561, "grad_norm": 1.25, "learning_rate": 0.00016428497069601578, "loss": 0.534, "step": 1675},
    {"epoch": 5.266457680250784, "grad_norm": 0.9296875, "learning_rate": 0.00016400511535836118, "loss": 0.5476, "step": 1680},
    {"epoch": 5.282131661442007, "grad_norm": 1.1015625, "learning_rate": 0.0001637244082829369, "loss": 0.544, "step": 1685},
    {"epoch": 5.297805642633229, "grad_norm": 0.84375, "learning_rate": 0.00016344285320520717, "loss": 0.5414, "step": 1690},
    {"epoch": 5.313479623824452, "grad_norm": 1.0, "learning_rate": 0.00016316045387192087, "loss": 0.5435, "step": 1695},
    {"epoch": 5.329153605015674, "grad_norm": 0.85546875, "learning_rate": 0.00016287721404106167, "loss": 0.5412, "step": 1700},
    {"epoch": 5.344827586206897, "grad_norm": 1.0, "learning_rate": 0.00016259313748179802, "loss": 0.5448, "step": 1705},
    {"epoch": 5.360501567398119, "grad_norm": 0.76171875, "learning_rate": 0.000162308227974433, "loss": 0.5523, "step": 1710},
    {"epoch": 5.376175548589342, "grad_norm": 1.015625, "learning_rate": 0.00016202248931035404, "loss": 0.5382, "step": 1715},
    {"epoch": 5.391849529780564, "grad_norm": 1.0546875, "learning_rate": 0.0001617359252919824, "loss": 0.5427, "step": 1720},
    {"epoch": 5.407523510971787, "grad_norm": 1.203125, "learning_rate": 0.00016144853973272262, "loss": 0.5426, "step": 1725},
    {"epoch": 5.423197492163009, "grad_norm": 1.09375, "learning_rate": 0.00016116033645691174, "loss": 0.5514, "step": 1730},
    {"epoch": 5.438871473354232, "grad_norm": 1.1953125, "learning_rate": 0.00016087131929976852, "loss": 0.5471, "step": 1735},
    {"epoch": 5.454545454545454, "grad_norm": 0.8046875, "learning_rate": 0.00016058149210734223, "loss": 0.5617, "step": 1740},
    {"epoch": 5.470219435736677, "grad_norm": 0.90234375, "learning_rate": 0.0001602908587364616, "loss": 0.5598, "step": 1745},
    {"epoch": 5.485893416927899, "grad_norm": 0.875, "learning_rate": 0.00015999942305468338, "loss": 0.5457, "step": 1750},
    {"epoch": 5.501567398119122, "grad_norm": 0.99609375, "learning_rate": 0.000159707188940241, "loss": 0.5667, "step": 1755},
    {"epoch": 5.517241379310345, "grad_norm": 2.90625, "learning_rate": 0.00015941416028199298, "loss": 0.552, "step": 1760},
    {"epoch": 5.532915360501567, "grad_norm": 0.8046875, "learning_rate": 0.00015912034097937094, "loss": 0.5408, "step": 1765},
    {"epoch": 5.54858934169279, "grad_norm": 0.953125, "learning_rate": 0.00015882573494232797, "loss": 0.5516, "step": 1770},
    {"epoch": 5.564263322884012, "grad_norm": 0.94921875, "learning_rate": 0.00015853034609128648, "loss": 0.5649, "step": 1775},
    {"epoch": 5.579937304075235, "grad_norm": 1.25, "learning_rate": 0.00015823417835708606, "loss": 0.5666, "step": 1780},
    {"epoch": 5.5956112852664575, "grad_norm": 1.515625, "learning_rate": 0.00015793723568093118, "loss": 0.5653, "step": 1785},
    {"epoch": 5.61128526645768, "grad_norm": 1.4296875, "learning_rate": 0.00015763952201433866, "loss": 0.5663, "step": 1790},
    {"epoch": 5.6269592476489025, "grad_norm": 0.84765625, "learning_rate": 0.00015734104131908522, "loss": 0.5582, "step": 1795},
    {"epoch": 5.6426332288401255, "grad_norm": 0.94140625, "learning_rate": 0.00015704179756715467, "loss": 0.5569, "step": 1800},
    {"epoch": 5.658307210031348, "grad_norm": 0.83203125, "learning_rate": 0.00015674179474068508, "loss": 0.5618, "step": 1805},
    {"epoch": 5.673981191222571, "grad_norm": 0.890625, "learning_rate": 0.00015644103683191575, "loss": 0.5636, "step": 1810},
    {"epoch": 5.689655172413794, "grad_norm": 0.80859375, "learning_rate": 0.00015613952784313418, "loss": 0.5562, "step": 1815},
    {"epoch": 5.705329153605016, "grad_norm": 1.7421875, "learning_rate": 0.00015583727178662262, "loss": 0.5566, "step": 1820},
    {"epoch": 5.721003134796238, "grad_norm": 1.4375, "learning_rate": 0.00015553427268460496, "loss": 0.5591, "step": 1825},
    {"epoch": 5.736677115987461, "grad_norm": 0.98046875, "learning_rate": 0.00015523053456919294, "loss": 0.5666, "step": 1830},
    {"epoch": 5.752351097178684, "grad_norm": 0.98828125, "learning_rate": 0.00015492606148233265, "loss": 0.5664, "step": 1835},
    {"epoch": 5.768025078369906, "grad_norm": 0.84375, "learning_rate": 0.00015462085747575068, "loss": 0.5624, "step": 1840},
    {"epoch": 5.783699059561129, "grad_norm": 0.85546875, "learning_rate": 0.00015431492661090022, "loss": 0.5587, "step": 1845},
    {"epoch": 5.799373040752351, "grad_norm": 0.99609375, "learning_rate": 0.00015400827295890702, "loss": 0.5645, "step": 1850},
    {"epoch": 5.815047021943574, "grad_norm": 1.03125, "learning_rate": 0.0001537009006005152, "loss": 0.5631, "step": 1855},
    {"epoch": 5.830721003134796, "grad_norm": 1.046875, "learning_rate": 0.0001533928136260329, "loss": 0.5712, "step": 1860},
    {"epoch": 5.846394984326019, "grad_norm": 1.140625, "learning_rate": 0.00015308401613527796, "loss": 0.5676, "step": 1865},
    {"epoch": 5.862068965517241, "grad_norm": 0.9375, "learning_rate": 0.00015277451223752326, "loss": 0.5761, "step": 1870},
    {"epoch": 5.877742946708464, "grad_norm": 0.98046875, "learning_rate": 0.00015246430605144216, "loss": 0.5685, "step": 1875},
    {"epoch": 5.893416927899686, "grad_norm": 0.83203125, "learning_rate": 0.00015215340170505348, "loss": 0.5678, "step": 1880},
    {"epoch": 5.909090909090909, "grad_norm": 1.6484375, "learning_rate": 0.0001518418033356668, "loss": 0.569, "step": 1885},
    {"epoch": 5.924764890282132, "grad_norm": 0.9765625, "learning_rate": 0.00015152951508982726, "loss": 0.5669, "step": 1890},
    {"epoch": 5.940438871473354, "grad_norm": 0.91015625, "learning_rate": 0.0001512165411232604, "loss": 0.5759, "step": 1895},
    {"epoch": 5.956112852664576, "grad_norm": 0.87890625, "learning_rate": 0.00015090288560081692, "loss": 0.5641, "step": 1900},
    {"epoch": 5.971786833855799, "grad_norm": 1.1953125, "learning_rate": 0.0001505885526964172, "loss": 0.574, "step": 1905},
    {"epoch": 5.987460815047022, "grad_norm": 1.078125, "learning_rate": 0.00015027354659299578, "loss": 0.5764, "step": 1910},
    {"epoch": 6.0, "eval_loss": 2.323450803756714, "eval_runtime": 0.8121, "eval_samples_per_second": 2.463, "eval_steps_per_second": 1.231, "step": 1914},
    {"epoch": 6.003134796238244, "grad_norm": 0.96484375, "learning_rate": 0.00014995787148244563, "loss": 0.5514, "step": 1915},
    {"epoch": 6.018808777429467, "grad_norm": 1.1484375, "learning_rate": 0.00014964153156556245, "loss": 0.4392, "step": 1920},
    {"epoch": 6.0344827586206895, "grad_norm": 1.015625, "learning_rate": 0.00014932453105198884, "loss": 0.4499, "step": 1925},
    {"epoch": 6.0501567398119125,
|
"grad_norm": 1.0625, |
|
"learning_rate": 0.0001490068741601581, |
|
"loss": 0.4379, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 6.065830721003135, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 0.00014868856511723814, |
|
"loss": 0.431, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 6.081504702194358, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 0.00014836960815907532, |
|
"loss": 0.4428, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 6.09717868338558, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 0.000148050007530138, |
|
"loss": 0.4482, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 6.112852664576803, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 0.00014772976748346015, |
|
"loss": 0.4478, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 6.128526645768025, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 0.00014740889228058462, |
|
"loss": 0.4414, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 6.144200626959248, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 0.0001470873861915065, |
|
"loss": 0.4466, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 6.15987460815047, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.00014676525349461637, |
|
"loss": 0.4356, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 6.175548589341693, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 0.00014644249847664317, |
|
"loss": 0.4526, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 6.191222570532915, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 0.00014611912543259742, |
|
"loss": 0.442, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 6.206896551724138, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.00014579513866571378, |
|
"loss": 0.4463, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 6.222570532915361, |
|
"grad_norm": 0.875, |
|
"learning_rate": 0.00014547054248739404, |
|
"loss": 0.444, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 6.238244514106583, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 0.0001451453412171496, |
|
"loss": 0.4513, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 6.253918495297806, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 0.000144819539182544, |
|
"loss": 0.4494, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 6.269592476489028, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 0.00014449314071913533, |
|
"loss": 0.4493, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 6.285266457680251, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 0.00014416615017041868, |
|
"loss": 0.4605, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 6.300940438871473, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 0.00014383857188776807, |
|
"loss": 0.4642, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 6.316614420062696, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 0.00014351041023037884, |
|
"loss": 0.4526, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 6.332288401253918, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 0.00014318166956520936, |
|
"loss": 0.4634, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 6.347962382445141, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 0.00014285235426692315, |
|
"loss": 0.4608, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 6.363636363636363, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 0.00014252246871783051, |
|
"loss": 0.4588, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 6.379310344827586, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 0.00014219201730783024, |
|
"loss": 0.467, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 6.394984326018808, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 0.0001418610044343514, |
|
"loss": 0.4641, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 6.410658307210031, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 0.00014152943450229443, |
|
"loss": 0.4647, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 6.4263322884012535, |
|
"grad_norm": 0.875, |
|
"learning_rate": 0.00014119731192397284, |
|
"loss": 0.4627, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 6.4420062695924765, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 0.0001408646411190544, |
|
"loss": 0.4664, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 6.4576802507836994, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 0.0001405314265145023, |
|
"loss": 0.4697, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 6.4733542319749215, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 0.0001401976725445162, |
|
"loss": 0.4664, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 6.4890282131661445, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 0.0001398633836504734, |
|
"loss": 0.4692, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 6.504702194357367, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 0.00013952856428086952, |
|
"loss": 0.4658, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 6.52037617554859, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 0.00013919321889125941, |
|
"loss": 0.4711, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 6.536050156739812, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 0.0001388573519441979, |
|
"loss": 0.4675, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 6.551724137931035, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 0.00013852096790918026, |
|
"loss": 0.4677, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 6.567398119122257, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 0.00013818407126258293, |
|
"loss": 0.4728, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 6.58307210031348, |
|
"grad_norm": 2.75, |
|
"learning_rate": 0.0001378466664876038, |
|
"loss": 0.4769, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 6.598746081504702, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 0.00013750875807420259, |
|
"loss": 0.463, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 6.614420062695925, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.00013717035051904114, |
|
"loss": 0.4663, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 6.630094043887147, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 0.00013683144832542352, |
|
"loss": 0.4699, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 6.64576802507837, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 0.00013649205600323609, |
|
"loss": 0.4703, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 6.661442006269592, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 0.00013615217806888755, |
|
"loss": 0.4643, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 6.677115987460815, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 0.0001358118190452488, |
|
"loss": 0.4669, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 6.692789968652038, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 0.0001354709834615928, |
|
"loss": 0.4703, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 6.70846394984326, |
|
"grad_norm": 0.875, |
|
"learning_rate": 0.00013512967585353413, |
|
"loss": 0.4714, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 6.724137931034483, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 0.00013478790076296892, |
|
"loss": 0.4658, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 6.739811912225705, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 0.00013444566273801414, |
|
"loss": 0.4649, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 6.755485893416928, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 0.00013410296633294727, |
|
"loss": 0.4783, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 6.77115987460815, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.00013375981610814545, |
|
"loss": 0.4741, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 6.786833855799373, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 0.00013341621663002514, |
|
"loss": 0.4651, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 6.802507836990595, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 0.0001330721724709811, |
|
"loss": 0.4682, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 6.818181818181818, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 0.00013272768820932554, |
|
"loss": 0.4761, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 6.83385579937304, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 0.0001323827684292273, |
|
"loss": 0.467, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 6.849529780564263, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 0.0001320374177206509, |
|
"loss": 0.4719, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 6.8652037617554855, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 0.00013169164067929526, |
|
"loss": 0.4829, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 6.8808777429467085, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 0.00013134544190653274, |
|
"loss": 0.4743, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 6.896551724137931, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 0.00013099882600934773, |
|
"loss": 0.4701, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 6.912225705329154, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 0.00013065179760027556, |
|
"loss": 0.4703, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 6.927899686520377, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 0.00013030436129734082, |
|
"loss": 0.4802, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 6.943573667711599, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.00012995652172399623, |
|
"loss": 0.4781, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 6.959247648902822, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 0.00012960828350906095, |
|
"loss": 0.4838, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 6.974921630094044, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 0.00012925965128665897, |
|
"loss": 0.4751, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 6.990595611285267, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 0.0001289106296961574, |
|
"loss": 0.4817, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_loss": 2.6759016513824463, |
|
"eval_runtime": 0.7898, |
|
"eval_samples_per_second": 2.532, |
|
"eval_steps_per_second": 1.266, |
|
"step": 2233 |
|
}, |
|
{ |
|
"epoch": 7.006269592476489, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 0.00012856122338210493, |
|
"loss": 0.4256, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 7.021943573667712, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 0.00012821143699416984, |
|
"loss": 0.3775, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 7.037617554858934, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.00012786127518707818, |
|
"loss": 0.3705, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 7.053291536050157, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 0.00012751074262055178, |
|
"loss": 0.3732, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 7.068965517241379, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 0.00012715984395924643, |
|
"loss": 0.3737, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 7.084639498432602, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 0.00012680858387268952, |
|
"loss": 0.3788, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 7.100313479623824, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 0.00012645696703521818, |
|
"loss": 0.3711, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 7.115987460815047, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 0.00012610499812591673, |
|
"loss": 0.3725, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 7.131661442006269, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 0.0001257526818285549, |
|
"loss": 0.371, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 7.147335423197492, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 0.0001254000228315251, |
|
"loss": 0.3751, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 7.163009404388715, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 0.00012504702582778008, |
|
"loss": 0.3798, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 7.178683385579937, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 0.00012469369551477074, |
|
"loss": 0.3826, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 7.19435736677116, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 0.0001243400365943833, |
|
"loss": 0.3846, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 7.210031347962382, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 0.00012398605377287694, |
|
"loss": 0.383, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 7.225705329153605, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 0.000123631751760821, |
|
"loss": 0.387, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 7.241379310344827, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 0.00012327713527303255, |
|
"loss": 0.3752, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 7.25705329153605, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 0.0001229222090285134, |
|
"loss": 0.3832, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 7.2727272727272725, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 0.00012256697775038741, |
|
"loss": 0.39, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 7.2884012539184955, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 0.00012221144616583765, |
|
"loss": 0.3902, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 7.304075235109718, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 0.00012185561900604341, |
|
"loss": 0.376, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 7.3197492163009406, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 0.00012149950100611738, |
|
"loss": 0.3873, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 7.335423197492163, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 0.00012114309690504249, |
|
"loss": 0.388, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 7.351097178683386, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 0.00012078641144560898, |
|
"loss": 0.3889, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 7.366771159874608, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 0.00012042944937435116, |
|
"loss": 0.395, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 7.382445141065831, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 0.00012007221544148435, |
|
"loss": 0.3957, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 7.398119122257054, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 0.00011971471440084157, |
|
"loss": 0.393, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 7.413793103448276, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 0.00011935695100981041, |
|
"loss": 0.3884, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 7.429467084639499, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 0.00011899893002926958, |
|
"loss": 0.3907, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 7.445141065830721, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 0.00011864065622352568, |
|
"loss": 0.3865, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 7.460815047021944, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 0.00011828213436024968, |
|
"loss": 0.3866, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 7.476489028213166, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 0.00011792336921041359, |
|
"loss": 0.3878, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 7.492163009404389, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 0.00011756436554822685, |
|
"loss": 0.3933, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 7.507836990595611, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 0.00011720512815107292, |
|
"loss": 0.3825, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 7.523510971786834, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 0.00011684566179944567, |
|
"loss": 0.3892, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 7.539184952978056, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 0.00011648597127688567, |
|
"loss": 0.3918, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 7.554858934169279, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 0.00011612606136991665, |
|
"loss": 0.3952, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 7.570532915360501, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 0.00011576593686798181, |
|
"loss": 0.3941, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 7.586206896551724, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.00011540560256337995, |
|
"loss": 0.3996, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 7.601880877742946, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 0.00011504506325120184, |
|
"loss": 0.3973, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 7.617554858934169, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 0.0001146843237292663, |
|
"loss": 0.3944, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 7.633228840125392, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 0.0001143233887980565, |
|
"loss": 0.4008, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 7.648902821316614, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 0.00011396226326065593, |
|
"loss": 0.3967, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 7.664576802507837, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 0.00011360095192268454, |
|
"loss": 0.3979, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 7.6802507836990594, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 0.00011323945959223477, |
|
"loss": 0.3957, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 7.695924764890282, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 0.00011287779107980766, |
|
"loss": 0.398, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 7.7115987460815045, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 0.0001125159511982487, |
|
"loss": 0.3976, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 7.7272727272727275, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 0.00011215394476268387, |
|
"loss": 0.3941, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 7.74294670846395, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 0.00011179177659045554, |
|
"loss": 0.4061, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 7.758620689655173, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 0.00011142945150105839, |
|
"loss": 0.3991, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 7.774294670846395, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 0.00011106697431607518, |
|
"loss": 0.3863, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 7.789968652037618, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 0.00011070434985911271, |
|
"loss": 0.3923, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 7.80564263322884, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 0.0001103415829557376, |
|
"loss": 0.3988, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 7.821316614420063, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 0.00010997867843341198, |
|
"loss": 0.3868, |
|
"step": 2495 |
|
}, |
|
{ |
|
"epoch": 7.836990595611285, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 0.0001096156411214294, |
|
"loss": 0.3949, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 7.852664576802508, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 0.00010925247585085044, |
|
"loss": 0.4005, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 7.868338557993731, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 0.00010888918745443845, |
|
"loss": 0.3974, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 7.884012539184953, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 0.00010852578076659535, |
|
"loss": 0.4004, |
|
"step": 2515 |
|
}, |
|
{ |
|
"epoch": 7.899686520376176, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 0.00010816226062329706, |
|
"loss": 0.3997, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 7.915360501567398, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 0.00010779863186202943, |
|
"loss": 0.4002, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 7.931034482758621, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 0.00010743489932172366, |
|
"loss": 0.3973, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 7.946708463949843, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 0.00010707106784269196, |
|
"loss": 0.3968, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 7.962382445141066, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 0.00010670714226656315, |
|
"loss": 0.397, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 7.978056426332288, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 0.00010634312743621832, |
|
"loss": 0.4065, |
|
"step": 2545 |
|
}, |
|
{ |
|
"epoch": 7.993730407523511, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 0.00010597902819572619, |
|
"loss": 0.4016, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 3.0132577419281006, |
|
"eval_runtime": 0.7738, |
|
"eval_samples_per_second": 2.585, |
|
"eval_steps_per_second": 1.292, |
|
"step": 2552 |
|
}, |
|
{ |
|
"epoch": 8.009404388714733, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 0.00010561484939027877, |
|
"loss": 0.361, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 8.025078369905955, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 0.00010525059586612693, |
|
"loss": 0.3253, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 8.04075235109718, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 0.0001048862724705158, |
|
"loss": 0.3205, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 8.056426332288401, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.00010452188405162033, |
|
"loss": 0.3241, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 8.072100313479623, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 0.00010415743545848072, |
|
"loss": 0.3193, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 8.087774294670846, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 0.00010379293154093796, |
|
"loss": 0.3212, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 8.10344827586207, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 0.00010342837714956928, |
|
"loss": 0.3161, |
|
"step": 2585 |
|
}, |
|
{ |
|
"epoch": 8.119122257053291, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 0.00010306377713562354, |
|
"loss": 0.321, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 8.134796238244514, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 0.00010269913635095676, |
|
"loss": 0.3245, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 8.150470219435737, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 0.00010233445964796749, |
|
"loss": 0.3279, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 8.16614420062696, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 0.00010196975187953221, |
|
"loss": 0.3233, |
|
"step": 2605 |
|
}, |
|
{ |
|
"epoch": 8.181818181818182, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 0.00010160501789894086, |
|
"loss": 0.3207, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 8.197492163009404, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 0.0001012402625598322, |
|
"loss": 0.3217, |
|
"step": 2615 |
|
}, |
|
{ |
|
"epoch": 8.213166144200628, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 0.00010087549071612919, |
|
"loss": 0.3255, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 8.22884012539185, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 0.00010051070722197438, |
|
"loss": 0.3256, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 8.244514106583072, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 0.0001001459169316654, |
|
"loss": 0.3233, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 8.260188087774294, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 9.978112469959033e-05, |
|
"loss": 0.3232, |
|
"step": 2635 |
|
}, |
|
{ |
|
"epoch": 8.275862068965518, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 9.941633538016315e-05, |
|
"loss": 0.3237, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 8.29153605015674, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 9.90515538277589e-05, |
|
"loss": 0.3307, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 8.307210031347962, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 9.868678489664945e-05, |
|
"loss": 0.3228, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 8.322884012539184, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 9.832203344093855e-05, |
|
"loss": 0.3238, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 8.338557993730408, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 9.795730431449759e-05, |
|
"loss": 0.3302, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 8.35423197492163, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 9.759260237090058e-05, |
|
"loss": 0.3243, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 8.369905956112852, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 9.722793246336006e-05, |
|
"loss": 0.3255, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 8.385579937304076, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 9.686329944466203e-05, |
|
"loss": 0.323, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 8.401253918495298, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 9.649870816710172e-05, |
|
"loss": 0.3278, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 8.41692789968652, |
|
"grad_norm": 0.875, |
|
"learning_rate": 9.613416348241887e-05, |
|
"loss": 0.3282, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 8.432601880877742, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 9.576967024173323e-05, |
|
"loss": 0.3226, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 8.448275862068966, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 9.540523329547984e-05, |
|
"loss": 0.3361, |
|
"step": 2695 |
|
}, |
|
{ |
|
"epoch": 8.463949843260188, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 9.504085749334479e-05, |
|
"loss": 0.3309, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 8.47962382445141, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 9.467654768420032e-05, |
|
"loss": 0.325, |
|
"step": 2705 |
|
}, |
|
{ |
|
"epoch": 8.495297805642632, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 9.431230871604067e-05, |
|
"loss": 0.3265, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 8.510971786833856, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 9.394814543591719e-05, |
|
"loss": 0.3302, |
|
"step": 2715 |
|
}, |
|
{ |
|
"epoch": 8.526645768025078, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 9.358406268987417e-05, |
|
"loss": 0.3299, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 8.5423197492163, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 9.322006532288411e-05, |
|
"loss": 0.3303, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 8.557993730407524, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 9.285615817878342e-05, |
|
"loss": 0.3246, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 8.573667711598747, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 9.249234610020779e-05, |
|
"loss": 0.3256, |
|
"step": 2735 |
|
}, |
|
{ |
|
"epoch": 8.589341692789969, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 9.212863392852793e-05, |
|
"loss": 0.3286, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 8.60501567398119, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 9.176502650378499e-05, |
|
"loss": 0.3301, |
|
"step": 2745 |
|
}, |
|
{ |
|
"epoch": 8.620689655172415, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 9.140152866462629e-05, |
|
"loss": 0.3345, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 8.636363636363637, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 9.103814524824073e-05, |
|
"loss": 0.3335, |
|
"step": 2755 |
|
}, |
|
{ |
|
"epoch": 8.652037617554859, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 9.067488109029474e-05, |
|
"loss": 0.3287, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 8.66771159874608, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 9.031174102486752e-05, |
|
"loss": 0.3286, |
|
"step": 2765 |
|
}, |
|
{ |
|
"epoch": 8.683385579937305, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 8.994872988438711e-05, |
|
"loss": 0.3283, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 8.699059561128527, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 8.958585249956578e-05, |
|
"loss": 0.3308, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 8.714733542319749, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 8.922311369933598e-05, |
|
"loss": 0.3308, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 8.730407523510971, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 8.886051831078582e-05, |
|
"loss": 0.3314, |
|
"step": 2785 |
|
}, |
|
{ |
|
"epoch": 8.746081504702195, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 8.849807115909513e-05, |
|
"loss": 0.33, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 8.761755485893417, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 8.8135777067471e-05, |
|
"loss": 0.329, |
|
"step": 2795 |
|
}, |
|
{ |
|
"epoch": 8.77742946708464, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 8.777364085708378e-05, |
|
"loss": 0.3356, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 8.793103448275861, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 8.741166734700273e-05, |
|
"loss": 0.3384, |
|
"step": 2805 |
|
}, |
|
{ |
|
"epoch": 8.808777429467085, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 8.704986135413212e-05, |
|
"loss": 0.3324, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 8.824451410658307, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 8.668822769314691e-05, |
|
"loss": 0.3291, |
|
"step": 2815 |
|
}, |
|
{ |
|
"epoch": 8.84012539184953, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 8.632677117642892e-05, |
|
"loss": 0.3358, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 8.855799373040753, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 8.596549661400248e-05, |
|
"loss": 0.3331, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 8.871473354231975, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 8.560440881347071e-05, |
|
"loss": 0.3306, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 8.887147335423197, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 8.524351257995135e-05, |
|
"loss": 0.3322, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 8.90282131661442, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 8.488281271601302e-05, |
|
"loss": 0.3246, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 8.918495297805643, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 8.452231402161099e-05, |
|
"loss": 0.3339, |
|
"step": 2845 |
|
}, |
|
{ |
|
"epoch": 8.934169278996865, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 8.416202129402371e-05, |
|
"loss": 0.3291, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 8.949843260188088, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 8.380193932778857e-05, |
|
"loss": 0.3268, |
|
"step": 2855 |
|
}, |
|
{ |
|
"epoch": 8.96551724137931, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 8.344207291463843e-05, |
|
"loss": 0.3258, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 8.981191222570533, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 8.30824268434376e-05, |
|
"loss": 0.3345, |
|
"step": 2865 |
|
}, |
|
{ |
|
"epoch": 8.996865203761756, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 8.27230059001184e-05, |
|
"loss": 0.327, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_loss": 3.4537088871002197, |
|
"eval_runtime": 0.7983, |
|
"eval_samples_per_second": 2.505, |
|
"eval_steps_per_second": 1.253, |
|
"step": 2871 |
|
}, |
|
{ |
|
"epoch": 9.012539184952978, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 8.2363814867617e-05, |
|
"loss": 0.2932, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 9.0282131661442, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 8.200485852581036e-05, |
|
"loss": 0.2798, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 9.043887147335424, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 8.16461416514522e-05, |
|
"loss": 0.2751, |
|
"step": 2885 |
|
}, |
|
{ |
|
"epoch": 9.059561128526646, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 8.12876690181096e-05, |
|
"loss": 0.2771, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 9.075235109717868, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 8.092944539609937e-05, |
|
"loss": 0.281, |
|
"step": 2895 |
|
}, |
|
{ |
|
"epoch": 9.090909090909092, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 8.057147555242473e-05, |
|
"loss": 0.2709, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 9.106583072100314, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 8.021376425071175e-05, |
|
"loss": 0.2787, |
|
"step": 2905 |
|
}, |
|
{ |
|
"epoch": 9.122257053291536, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 7.985631625114603e-05, |
|
"loss": 0.2817, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 9.137931034482758, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 7.94991363104092e-05, |
|
"loss": 0.2798, |
|
"step": 2915 |
|
}, |
|
{ |
|
"epoch": 9.153605015673982, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 7.914222918161589e-05, |
|
"loss": 0.2804, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 9.169278996865204, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 7.878559961425025e-05, |
|
"loss": 0.2778, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 9.184952978056426, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 7.842925235410288e-05, |
|
"loss": 0.2832, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 9.200626959247648, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 7.807319214320747e-05, |
|
"loss": 0.2781, |
|
"step": 2935 |
|
}, |
|
{ |
|
"epoch": 9.216300940438872, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 7.771742371977811e-05, |
|
"loss": 0.277, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 9.231974921630094, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 7.73619518181457e-05, |
|
"loss": 0.2701, |
|
"step": 2945 |
|
}, |
|
{ |
|
"epoch": 9.247648902821316, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 7.700678116869543e-05, |
|
"loss": 0.2771, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 9.263322884012538, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 7.66519164978035e-05, |
|
"loss": 0.2738, |
|
"step": 2955 |
|
}, |
|
{ |
|
"epoch": 9.278996865203762, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 7.629736252777445e-05, |
|
"loss": 0.2735, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 9.294670846394984, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 7.594312397677809e-05, |
|
"loss": 0.2773, |
|
"step": 2965 |
|
}, |
|
{ |
|
"epoch": 9.310344827586206, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 7.558920555878696e-05, |
|
"loss": 0.2764, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 9.32601880877743, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 7.523561198351342e-05, |
|
"loss": 0.2828, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 9.341692789968652, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 7.48823479563471e-05, |
|
"loss": 0.2834, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 9.357366771159874, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 7.452941817829212e-05, |
|
"loss": 0.2848, |
|
"step": 2985 |
|
}, |
|
{ |
|
"epoch": 9.373040752351097, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 7.417682734590469e-05, |
|
"loss": 0.2801, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 9.38871473354232, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 7.382458015123057e-05, |
|
"loss": 0.2822, |
|
"step": 2995 |
|
}, |
|
{ |
|
"epoch": 9.404388714733543, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 7.347268128174265e-05, |
|
"loss": 0.2736, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 9.420062695924765, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 7.31211354202784e-05, |
|
"loss": 0.2801, |
|
"step": 3005 |
|
}, |
|
{ |
|
"epoch": 9.435736677115987, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 7.276994724497787e-05, |
|
"loss": 0.2799, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 9.45141065830721, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 7.241912142922109e-05, |
|
"loss": 0.2768, |
|
"step": 3015 |
|
}, |
|
{ |
|
"epoch": 9.467084639498433, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 7.206866264156623e-05, |
|
"loss": 0.2851, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 9.482758620689655, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 7.171857554568706e-05, |
|
"loss": 0.284, |
|
"step": 3025 |
|
}, |
|
{ |
|
"epoch": 9.498432601880877, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 7.136886480031138e-05, |
|
"loss": 0.2826, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 9.5141065830721, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 7.101953505915857e-05, |
|
"loss": 0.2823, |
|
"step": 3035 |
|
}, |
|
{ |
|
"epoch": 9.529780564263323, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 7.067059097087796e-05, |
|
"loss": 0.2767, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 9.545454545454545, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 7.03220371789868e-05, |
|
"loss": 0.2808, |
|
"step": 3045 |
|
}, |
|
{ |
|
"epoch": 9.561128526645769, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 6.997387832180864e-05, |
|
"loss": 0.2847, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 9.576802507836991, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 6.962611903241132e-05, |
|
"loss": 0.2829, |
|
"step": 3055 |
|
}, |
|
{ |
|
"epoch": 9.592476489028213, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 6.92787639385457e-05, |
|
"loss": 0.2815, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 9.608150470219435, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 6.893181766258373e-05, |
|
"loss": 0.281, |
|
"step": 3065 |
|
}, |
|
{ |
|
"epoch": 9.623824451410659, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 6.858528482145716e-05, |
|
"loss": 0.2807, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 9.639498432601881, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 6.823917002659596e-05, |
|
"loss": 0.2884, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 9.655172413793103, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 6.789347788386706e-05, |
|
"loss": 0.2789, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 9.670846394984325, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 6.754821299351299e-05, |
|
"loss": 0.2733, |
|
"step": 3085 |
|
}, |
|
{ |
|
"epoch": 9.68652037617555, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 6.720337995009076e-05, |
|
"loss": 0.2876, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 9.702194357366771, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 6.68589833424105e-05, |
|
"loss": 0.2856, |
|
"step": 3095 |
|
}, |
|
{ |
|
"epoch": 9.717868338557993, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 6.651502775347469e-05, |
|
"loss": 0.2829, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 9.733542319749215, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 6.617151776041692e-05, |
|
"loss": 0.2875, |
|
"step": 3105 |
|
}, |
|
{ |
|
"epoch": 9.74921630094044, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 6.582845793444119e-05, |
|
"loss": 0.2746, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 9.764890282131661, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 6.548585284076084e-05, |
|
"loss": 0.288, |
|
"step": 3115 |
|
}, |
|
{ |
|
"epoch": 9.780564263322884, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 6.514370703853806e-05, |
|
"loss": 0.2807, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 9.796238244514107, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 6.480202508082298e-05, |
|
"loss": 0.2858, |
|
"step": 3125 |
|
}, |
|
{ |
|
"epoch": 9.81191222570533, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 6.44608115144933e-05, |
|
"loss": 0.2793, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 9.827586206896552, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 6.412007088019364e-05, |
|
"loss": 0.2818, |
|
"step": 3135 |
|
}, |
|
{ |
|
"epoch": 9.843260188087774, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 6.377980771227509e-05, |
|
"loss": 0.2864, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 9.858934169278998, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 6.344002653873504e-05, |
|
"loss": 0.2819, |
|
"step": 3145 |
|
}, |
|
{ |
|
"epoch": 9.87460815047022, |
|
"grad_norm": 0.75, |
|
"learning_rate": 6.31007318811567e-05, |
|
"loss": 0.2833, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 9.890282131661442, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 6.276192825464918e-05, |
|
"loss": 0.2888, |
|
"step": 3155 |
|
}, |
|
{ |
|
"epoch": 9.905956112852664, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 6.242362016778713e-05, |
|
"loss": 0.2819, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 9.921630094043888, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 6.208581212255104e-05, |
|
"loss": 0.2858, |
|
"step": 3165 |
|
}, |
|
{ |
|
"epoch": 9.93730407523511, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 6.1748508614267e-05, |
|
"loss": 0.2793, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 9.952978056426332, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 6.141171413154722e-05, |
|
"loss": 0.2795, |
|
"step": 3175 |
|
}, |
|
{ |
|
"epoch": 9.968652037617554, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 6.107543315623001e-05, |
|
"loss": 0.2794, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 9.984326018808778, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 6.073967016332041e-05, |
|
"loss": 0.2833, |
|
"step": 3185 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 6.040442962093029e-05, |
|
"loss": 0.2814, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_loss": 3.8273370265960693, |
|
"eval_runtime": 0.8006, |
|
"eval_samples_per_second": 2.498, |
|
"eval_steps_per_second": 1.249, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 10.015673981191222, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 6.006971599021928e-05, |
|
"loss": 0.2526, |
|
"step": 3195 |
|
}, |
|
{ |
|
"epoch": 10.031347962382446, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 5.973553372533509e-05, |
|
"loss": 0.2478, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 10.047021943573668, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 5.9401887273354475e-05, |
|
"loss": 0.2487, |
|
"step": 3205 |
|
}, |
|
{ |
|
"epoch": 10.06269592476489, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 5.9068781074223824e-05, |
|
"loss": 0.2522, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 10.078369905956112, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 5.8736219560700324e-05, |
|
"loss": 0.2503, |
|
"step": 3215 |
|
}, |
|
{ |
|
"epoch": 10.094043887147336, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 5.840420715829272e-05, |
|
"loss": 0.247, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 10.109717868338558, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 5.807274828520266e-05, |
|
"loss": 0.2521, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 10.12539184952978, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 5.774184735226571e-05, |
|
"loss": 0.2484, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 10.141065830721002, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 5.741150876289283e-05, |
|
"loss": 0.2474, |
|
"step": 3235 |
|
}, |
|
{ |
|
"epoch": 10.156739811912226, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 5.708173691301153e-05, |
|
"loss": 0.2506, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 10.172413793103448, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 5.675253619100772e-05, |
|
"loss": 0.2544, |
|
"step": 3245 |
|
}, |
|
{ |
|
"epoch": 10.18808777429467, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 5.642391097766693e-05, |
|
"loss": 0.249, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 10.203761755485893, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 5.609586564611631e-05, |
|
"loss": 0.2535, |
|
"step": 3255 |
|
}, |
|
{ |
|
"epoch": 10.219435736677116, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 5.576840456176631e-05, |
|
"loss": 0.2504, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 10.235109717868339, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 5.544153208225265e-05, |
|
"loss": 0.2524, |
|
"step": 3265 |
|
}, |
|
{ |
|
"epoch": 10.25078369905956, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 5.511525255737815e-05, |
|
"loss": 0.2549, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 10.266457680250785, |
|
"grad_norm": 0.625, |
|
"learning_rate": 5.478957032905514e-05, |
|
"loss": 0.2521, |
|
"step": 3275 |
|
}, |
|
{ |
|
"epoch": 10.282131661442007, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 5.446448973124736e-05, |
|
"loss": 0.2531, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 10.297805642633229, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 5.414001508991264e-05, |
|
"loss": 0.2533, |
|
"step": 3285 |
|
}, |
|
{ |
|
"epoch": 10.31347962382445, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 5.3816150722944916e-05, |
|
"loss": 0.2544, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 10.329153605015675, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 5.3492900940117264e-05, |
|
"loss": 0.2491, |
|
"step": 3295 |
|
}, |
|
{ |
|
"epoch": 10.344827586206897, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 5.3170270043024015e-05, |
|
"loss": 0.2522, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 10.360501567398119, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 5.284826232502399e-05, |
|
"loss": 0.249, |
|
"step": 3305 |
|
}, |
|
{ |
|
"epoch": 10.376175548589341, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 5.252688207118297e-05, |
|
"loss": 0.2459, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 10.391849529780565, |
|
"grad_norm": 0.625, |
|
"learning_rate": 5.220613355821704e-05, |
|
"loss": 0.2503, |
|
"step": 3315 |
|
}, |
|
{ |
|
"epoch": 10.407523510971787, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 5.188602105443533e-05, |
|
"loss": 0.2543, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 10.423197492163009, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 5.156654881968348e-05, |
|
"loss": 0.2509, |
|
"step": 3325 |
|
}, |
|
{ |
|
"epoch": 10.438871473354231, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 5.124772110528684e-05, |
|
"loss": 0.2524, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 10.454545454545455, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 5.0929542153993926e-05, |
|
"loss": 0.2525, |
|
"step": 3335 |
|
}, |
|
{ |
|
"epoch": 10.470219435736677, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 5.061201619991984e-05, |
|
"loss": 0.248, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 10.4858934169279, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 5.029514746849018e-05, |
|
"loss": 0.2516, |
|
"step": 3345 |
|
}, |
|
{ |
|
"epoch": 10.501567398119121, |
|
"grad_norm": 0.75, |
|
"learning_rate": 4.9978940176384514e-05, |
|
"loss": 0.2509, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 10.517241379310345, |
|
"grad_norm": 0.625, |
|
"learning_rate": 4.966339853148055e-05, |
|
"loss": 0.2506, |
|
"step": 3355 |
|
}, |
|
{ |
|
"epoch": 10.532915360501567, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 4.934852673279787e-05, |
|
"loss": 0.254, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 10.54858934169279, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 4.9034328970442275e-05, |
|
"loss": 0.2519, |
|
"step": 3365 |
|
}, |
|
{ |
|
"epoch": 10.564263322884013, |
|
"grad_norm": 0.625, |
|
"learning_rate": 4.8720809425549916e-05, |
|
"loss": 0.2541, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 10.579937304075235, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 4.8407972270231704e-05, |
|
"loss": 0.2563, |
|
"step": 3375 |
|
}, |
|
{ |
|
"epoch": 10.595611285266457, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 4.809582166751765e-05, |
|
"loss": 0.2529, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 10.61128526645768, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 4.778436177130173e-05, |
|
"loss": 0.2546, |
|
"step": 3385 |
|
}, |
|
{ |
|
"epoch": 10.626959247648903, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 4.747359672628631e-05, |
|
"loss": 0.2545, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 10.642633228840126, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 4.7163530667927226e-05, |
|
"loss": 0.2564, |
|
"step": 3395 |
|
}, |
|
{ |
|
"epoch": 10.658307210031348, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 4.685416772237864e-05, |
|
"loss": 0.2506, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 10.67398119122257, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 4.65455120064382e-05, |
|
"loss": 0.2505, |
|
"step": 3405 |
|
}, |
|
{ |
|
"epoch": 10.689655172413794, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 4.623756762749207e-05, |
|
"loss": 0.2499, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 10.705329153605016, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 4.593033868346059e-05, |
|
"loss": 0.2529, |
|
"step": 3415 |
|
}, |
|
{ |
|
"epoch": 10.721003134796238, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 4.5623829262743414e-05, |
|
"loss": 0.2549, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 10.736677115987462, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 4.531804344416536e-05, |
|
"loss": 0.2524, |
|
"step": 3425 |
|
}, |
|
{ |
|
"epoch": 10.752351097178684, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 4.501298529692194e-05, |
|
"loss": 0.2556, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 10.768025078369906, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 4.470865888052537e-05, |
|
"loss": 0.2518, |
|
"step": 3435 |
|
}, |
|
{ |
|
"epoch": 10.783699059561128, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 4.4405068244750446e-05, |
|
"loss": 0.2544, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 10.799373040752352, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 4.410221742958073e-05, |
|
"loss": 0.2493, |
|
"step": 3445 |
|
}, |
|
{ |
|
"epoch": 10.815047021943574, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 4.380011046515461e-05, |
|
"loss": 0.2556, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 10.830721003134796, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 4.349875137171196e-05, |
|
"loss": 0.2525, |
|
"step": 3455 |
|
}, |
|
{ |
|
"epoch": 10.846394984326018, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 4.3198144159540346e-05, |
|
"loss": 0.2532, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 10.862068965517242, |
|
"grad_norm": 0.625, |
|
"learning_rate": 4.289829282892188e-05, |
|
"loss": 0.2494, |
|
"step": 3465 |
|
}, |
|
{ |
|
"epoch": 10.877742946708464, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 4.2599201370079875e-05, |
|
"loss": 0.2495, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 10.893416927899686, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 4.230087376312582e-05, |
|
"loss": 0.2483, |
|
"step": 3475 |
|
}, |
|
{ |
|
"epoch": 10.909090909090908, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 4.2003313978006244e-05, |
|
"loss": 0.2505, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 10.924764890282132, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 4.170652597445016e-05, |
|
"loss": 0.2499, |
|
"step": 3485 |
|
}, |
|
{ |
|
"epoch": 10.940438871473354, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 4.1410513701916086e-05, |
|
"loss": 0.2501, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 10.956112852664576, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 4.111528109953975e-05, |
|
"loss": 0.2547, |
|
"step": 3495 |
|
}, |
|
{ |
|
"epoch": 10.971786833855798, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 4.0820832096081415e-05, |
|
"loss": 0.252, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 10.987460815047022, |
|
"grad_norm": 0.625, |
|
"learning_rate": 4.052717060987386e-05, |
|
"loss": 0.2539, |
|
"step": 3505 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_loss": 4.135468006134033, |
|
"eval_runtime": 0.805, |
|
"eval_samples_per_second": 2.485, |
|
"eval_steps_per_second": 1.242, |
|
"step": 3509 |
|
}, |
|
{ |
|
"epoch": 11.003134796238244, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 4.023430054876999e-05, |
|
"loss": 0.2514, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 11.018808777429467, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 3.994222581009107e-05, |
|
"loss": 0.2418, |
|
"step": 3515 |
|
}, |
|
{ |
|
"epoch": 11.03448275862069, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 3.965095028057461e-05, |
|
"loss": 0.2369, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 11.050156739811912, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 3.936047783632286e-05, |
|
"loss": 0.2349, |
|
"step": 3525 |
|
}, |
|
{ |
|
"epoch": 11.065830721003135, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 3.907081234275109e-05, |
|
"loss": 0.2392, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 11.081504702194357, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 3.878195765453626e-05, |
|
"loss": 0.2316, |
|
"step": 3535 |
|
}, |
|
{ |
|
"epoch": 11.09717868338558, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 3.849391761556559e-05, |
|
"loss": 0.2321, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 11.112852664576803, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 3.820669605888556e-05, |
|
"loss": 0.235, |
|
"step": 3545 |
|
}, |
|
{ |
|
"epoch": 11.128526645768025, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 3.79202968066508e-05, |
|
"loss": 0.2333, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 11.144200626959247, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 3.7634723670073294e-05, |
|
"loss": 0.2376, |
|
"step": 3555 |
|
}, |
|
{ |
|
"epoch": 11.15987460815047, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 3.7349980449371516e-05, |
|
"loss": 0.2379, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 11.175548589341693, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 3.706607093372012e-05, |
|
"loss": 0.2344, |
|
"step": 3565 |
|
}, |
|
{ |
|
"epoch": 11.191222570532915, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 3.67829989011992e-05, |
|
"loss": 0.2355, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 11.206896551724139, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 3.65007681187443e-05, |
|
"loss": 0.2332, |
|
"step": 3575 |
|
}, |
|
{ |
|
"epoch": 11.22257053291536, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 3.621938234209613e-05, |
|
"loss": 0.2346, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 11.238244514106583, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 3.5938845315750666e-05, |
|
"loss": 0.2385, |
|
"step": 3585 |
|
}, |
|
{ |
|
"epoch": 11.253918495297805, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 3.565916077290914e-05, |
|
"loss": 0.2367, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 11.269592476489029, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 3.5380332435428655e-05, |
|
"loss": 0.2383, |
|
"step": 3595 |
|
}, |
|
{ |
|
"epoch": 11.285266457680251, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 3.510236401377236e-05, |
|
"loss": 0.2373, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 11.300940438871473, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 3.482525920696036e-05, |
|
"loss": 0.2352, |
|
"step": 3605 |
|
}, |
|
{ |
|
"epoch": 11.316614420062695, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 3.454902170252019e-05, |
|
"loss": 0.2343, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 11.33228840125392, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 3.4273655176438014e-05, |
|
"loss": 0.2317, |
|
"step": 3615 |
|
}, |
|
{ |
|
"epoch": 11.347962382445141, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 3.3999163293109534e-05, |
|
"loss": 0.2375, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 11.363636363636363, |
|
"grad_norm": 0.625, |
|
"learning_rate": 3.372554970529137e-05, |
|
"loss": 0.2384, |
|
"step": 3625 |
|
}, |
|
{ |
|
"epoch": 11.379310344827585, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 3.345281805405219e-05, |
|
"loss": 0.2385, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 11.39498432601881, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 3.318097196872464e-05, |
|
"loss": 0.2429, |
|
"step": 3635 |
|
}, |
|
{ |
|
"epoch": 11.410658307210031, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 3.291001506685666e-05, |
|
"loss": 0.2351, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 11.426332288401253, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 3.2639950954163644e-05, |
|
"loss": 0.2377, |
|
"step": 3645 |
|
}, |
|
{ |
|
"epoch": 11.442006269592476, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 3.23707832244803e-05, |
|
"loss": 0.2353, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 11.4576802507837, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 3.2102515459712876e-05, |
|
"loss": 0.2361, |
|
"step": 3655 |
|
}, |
|
{ |
|
"epoch": 11.473354231974922, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 3.1835151229791435e-05, |
|
"loss": 0.2357, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 11.489028213166144, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 3.1568694092622475e-05, |
|
"loss": 0.2359, |
|
"step": 3665 |
|
}, |
|
{ |
|
"epoch": 11.504702194357368, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 3.1303147594041394e-05, |
|
"loss": 0.2333, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 11.52037617554859, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 3.1038515267765545e-05, |
|
"loss": 0.236, |
|
"step": 3675 |
|
}, |
|
{ |
|
"epoch": 11.536050156739812, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 3.0774800635346934e-05, |
|
"loss": 0.2323, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 11.551724137931034, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 3.0512007206125638e-05, |
|
"loss": 0.2358, |
|
"step": 3685 |
|
}, |
|
{ |
|
"epoch": 11.567398119122258, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 3.0250138477182886e-05, |
|
"loss": 0.2379, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 11.58307210031348, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 2.9989197933294687e-05, |
|
"loss": 0.2353, |
|
"step": 3695 |
|
}, |
|
{ |
|
"epoch": 11.598746081504702, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 2.9729189046885266e-05, |
|
"loss": 0.2359, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 11.614420062695924, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 2.947011527798107e-05, |
|
"loss": 0.2375, |
|
"step": 3705 |
|
}, |
|
{ |
|
"epoch": 11.630094043887148, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 2.9211980074164514e-05, |
|
"loss": 0.2364, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 11.64576802507837, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 2.895478687052835e-05, |
|
"loss": 0.2393, |
|
"step": 3715 |
|
}, |
|
{ |
|
"epoch": 11.661442006269592, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 2.8698539089629662e-05, |
|
"loss": 0.235, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 11.677115987460816, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 2.844324014144457e-05, |
|
"loss": 0.2341, |
|
"step": 3725 |
|
}, |
|
{ |
|
"epoch": 11.692789968652038, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 2.818889342332275e-05, |
|
"loss": 0.2299, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 11.70846394984326, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 2.793550231994222e-05, |
|
"loss": 0.2388, |
|
"step": 3735 |
|
}, |
|
{ |
|
"epoch": 11.724137931034482, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 2.768307020326425e-05, |
|
"loss": 0.2341, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 11.739811912225706, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 2.7431600432488657e-05, |
|
"loss": 0.234, |
|
"step": 3745 |
|
}, |
|
{ |
|
"epoch": 11.755485893416928, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 2.7181096354008884e-05, |
|
"loss": 0.2364, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 11.77115987460815, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 2.6931561301367646e-05, |
|
"loss": 0.2331, |
|
"step": 3755 |
|
}, |
|
{ |
|
"epoch": 11.786833855799372, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 2.6682998595212505e-05, |
|
"loss": 0.2357, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 11.802507836990596, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 2.6435411543251677e-05, |
|
"loss": 0.2389, |
|
"step": 3765 |
|
}, |
|
{ |
|
"epoch": 11.818181818181818, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 2.6188803440209942e-05, |
|
"loss": 0.241, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 11.83385579937304, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 2.5943177567785015e-05, |
|
"loss": 0.2361, |
|
"step": 3775 |
|
}, |
|
{ |
|
"epoch": 11.849529780564263, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 2.5698537194603566e-05, |
|
"loss": 0.2352, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 11.865203761755486, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 2.5454885576178067e-05, |
|
"loss": 0.2389, |
|
"step": 3785 |
|
}, |
|
{ |
|
"epoch": 11.880877742946709, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 2.5212225954863132e-05, |
|
"loss": 0.2367, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 11.89655172413793, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 2.4970561559812645e-05, |
|
"loss": 0.2383, |
|
"step": 3795 |
|
}, |
|
{ |
|
"epoch": 11.912225705329153, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 2.472989560693665e-05, |
|
"loss": 0.2314, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 11.927899686520377, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 2.449023129885859e-05, |
|
"loss": 0.2388, |
|
"step": 3805 |
|
}, |
|
{ |
|
"epoch": 11.943573667711599, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 2.425157182487262e-05, |
|
"loss": 0.2383, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 11.95924764890282, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 2.401392036090132e-05, |
|
"loss": 0.2384, |
|
"step": 3815 |
|
}, |
|
{ |
|
"epoch": 11.974921630094045, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 2.3777280069453245e-05, |
|
"loss": 0.2358, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 11.990595611285267, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 2.3541654099581e-05, |
|
"loss": 0.233, |
|
"step": 3825 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_loss": 4.354940891265869, |
|
"eval_runtime": 0.795, |
|
"eval_samples_per_second": 2.516, |
|
"eval_steps_per_second": 1.258, |
|
"step": 3828 |
|
}, |
|
{ |
|
"epoch": 12.006269592476489, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 2.330704558683926e-05, |
|
"loss": 0.2349, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 12.021943573667711, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 2.307345765324306e-05, |
|
"loss": 0.2297, |
|
"step": 3835 |
|
}, |
|
{ |
|
"epoch": 12.037617554858935, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 2.284089340722618e-05, |
|
"loss": 0.2337, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 12.053291536050157, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 2.2609355943599942e-05, |
|
"loss": 0.2295, |
|
"step": 3845 |
|
}, |
|
{ |
|
"epoch": 12.068965517241379, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 2.2378848343511804e-05, |
|
"loss": 0.2287, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 12.084639498432601, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 2.214937367440463e-05, |
|
"loss": 0.2281, |
|
"step": 3855 |
|
}, |
|
{ |
|
"epoch": 12.100313479623825, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 2.192093498997555e-05, |
|
"loss": 0.2299, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 12.115987460815047, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 2.169353533013565e-05, |
|
"loss": 0.2287, |
|
"step": 3865 |
|
}, |
|
{ |
|
"epoch": 12.13166144200627, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 2.1467177720969268e-05, |
|
"loss": 0.2281, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 12.147335423197493, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 2.12418651746939e-05, |
|
"loss": 0.2238, |
|
"step": 3875 |
|
}, |
|
{ |
|
"epoch": 12.163009404388715, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 2.101760068961992e-05, |
|
"loss": 0.2331, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 12.178683385579937, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 2.0794387250110913e-05, |
|
"loss": 0.2286, |
|
"step": 3885 |
|
}, |
|
{ |
|
"epoch": 12.19435736677116, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 2.0572227826543755e-05, |
|
"loss": 0.2323, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 12.210031347962383, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 2.0351125375269264e-05, |
|
"loss": 0.2269, |
|
"step": 3895 |
|
}, |
|
{ |
|
"epoch": 12.225705329153605, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 2.0131082838572655e-05, |
|
"loss": 0.2288, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 12.241379310344827, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 1.991210314463461e-05, |
|
"loss": 0.236, |
|
"step": 3905 |
|
}, |
|
{ |
|
"epoch": 12.25705329153605, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 1.969418920749214e-05, |
|
"loss": 0.229, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 12.272727272727273, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 1.9477343926999913e-05, |
|
"loss": 0.2305, |
|
"step": 3915 |
|
}, |
|
{ |
|
"epoch": 12.288401253918495, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 1.9261570188791555e-05, |
|
"loss": 0.2287, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 12.304075235109718, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 1.9046870864241384e-05, |
|
"loss": 0.2274, |
|
"step": 3925 |
|
}, |
|
{ |
|
"epoch": 12.31974921630094, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 1.8833248810426073e-05, |
|
"loss": 0.2282, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 12.335423197492164, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 1.8620706870086723e-05, |
|
"loss": 0.2282, |
|
"step": 3935 |
|
}, |
|
{ |
|
"epoch": 12.351097178683386, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 1.8409247871591006e-05, |
|
"loss": 0.2323, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 12.366771159874608, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 1.8198874628895524e-05, |
|
"loss": 0.228, |
|
"step": 3945 |
|
}, |
|
{ |
|
"epoch": 12.38244514106583, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 1.798958994150829e-05, |
|
"loss": 0.2227, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 12.398119122257054, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 1.7781396594451637e-05, |
|
"loss": 0.235, |
|
"step": 3955 |
|
}, |
|
{ |
|
"epoch": 12.413793103448276, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 1.757429735822499e-05, |
|
"loss": 0.2303, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 12.429467084639498, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 1.7368294988768097e-05, |
|
"loss": 0.2318, |
|
"step": 3965 |
|
}, |
|
{ |
|
"epoch": 12.445141065830722, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 1.716339222742436e-05, |
|
"loss": 0.2305, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 12.460815047021944, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 1.695959180090425e-05, |
|
"loss": 0.2268, |
|
"step": 3975 |
|
}, |
|
{ |
|
"epoch": 12.476489028213166, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 1.6756896421249168e-05, |
|
"loss": 0.2279, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 12.492163009404388, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 1.6555308785795232e-05, |
|
"loss": 0.2267, |
|
"step": 3985 |
|
}, |
|
{ |
|
"epoch": 12.507836990595612, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 1.6354831577137485e-05, |
|
"loss": 0.2287, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 12.523510971786834, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 1.6155467463094066e-05, |
|
"loss": 0.2248, |
|
"step": 3995 |
|
}, |
|
{ |
|
"epoch": 12.539184952978056, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 1.5957219096670883e-05, |
|
"loss": 0.2299, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 12.554858934169278, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 1.576008911602609e-05, |
|
"loss": 0.2288, |
|
"step": 4005 |
|
}, |
|
{ |
|
"epoch": 12.570532915360502, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 1.5564080144435212e-05, |
|
"loss": 0.2318, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 12.586206896551724, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 1.536919479025609e-05, |
|
"loss": 0.233, |
|
"step": 4015 |
|
}, |
|
{ |
|
"epoch": 12.601880877742946, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 1.517543564689422e-05, |
|
"loss": 0.2253, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 12.61755485893417, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 1.4982805292768165e-05, |
|
"loss": 0.2266, |
|
"step": 4025 |
|
}, |
|
{ |
|
"epoch": 12.633228840125392, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 1.4791306291275398e-05, |
|
"loss": 0.2272, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 12.648902821316614, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 1.4600941190758022e-05, |
|
"loss": 0.2304, |
|
"step": 4035 |
|
}, |
|
{ |
|
"epoch": 12.664576802507836, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 1.4411712524469012e-05, |
|
"loss": 0.2314, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 12.68025078369906, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 1.4223622810538328e-05, |
|
"loss": 0.2303, |
|
"step": 4045 |
|
}, |
|
{ |
|
"epoch": 12.695924764890282, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 1.4036674551939599e-05, |
|
"loss": 0.2323, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 12.711598746081505, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 1.385087023645667e-05, |
|
"loss": 0.2307, |
|
"step": 4055 |
|
}, |
|
{ |
|
"epoch": 12.727272727272727, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 1.3666212336650586e-05, |
|
"loss": 0.2235, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 12.74294670846395, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 1.3482703309826584e-05, |
|
"loss": 0.2285, |
|
"step": 4065 |
|
}, |
|
{ |
|
"epoch": 12.758620689655173, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 1.330034559800154e-05, |
|
"loss": 0.2311, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 12.774294670846395, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 1.31191416278713e-05, |
|
"loss": 0.2286, |
|
"step": 4075 |
|
}, |
|
{ |
|
"epoch": 12.789968652037617, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 1.293909381077858e-05, |
|
"loss": 0.2294, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 12.80564263322884, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 1.2760204542680654e-05, |
|
"loss": 0.2309, |
|
"step": 4085 |
|
}, |
|
{ |
|
"epoch": 12.821316614420063, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 1.2582476204117755e-05, |
|
"loss": 0.2294, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 12.836990595611285, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 1.2405911160181072e-05, |
|
"loss": 0.2241, |
|
"step": 4095 |
|
}, |
|
{ |
|
"epoch": 12.852664576802507, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 1.2230511760481533e-05, |
|
"loss": 0.2253, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 12.86833855799373, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 1.2056280339118397e-05, |
|
"loss": 0.2358, |
|
"step": 4105 |
|
}, |
|
{ |
|
"epoch": 12.884012539184953, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 1.188321921464829e-05, |
|
"loss": 0.229, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 12.899686520376175, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 1.1711330690054211e-05, |
|
"loss": 0.2299, |
|
"step": 4115 |
|
}, |
|
{ |
|
"epoch": 12.915360501567399, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 1.1540617052715074e-05, |
|
"loss": 0.2283, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 12.931034482758621, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 1.1371080574375114e-05, |
|
"loss": 0.2297, |
|
"step": 4125 |
|
}, |
|
{ |
|
"epoch": 12.946708463949843, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 1.1202723511113766e-05, |
|
"loss": 0.2338, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 12.962382445141065, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 1.1035548103315484e-05, |
|
"loss": 0.2337, |
|
"step": 4135 |
|
}, |
|
{ |
|
"epoch": 12.978056426332289, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 1.086955657564015e-05, |
|
"loss": 0.2294, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 12.993730407523511, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 1.0704751136993251e-05, |
|
"loss": 0.2281, |
|
"step": 4145 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_loss": 4.455935478210449, |
|
"eval_runtime": 0.8034, |
|
"eval_samples_per_second": 2.489, |
|
"eval_steps_per_second": 1.245, |
|
"step": 4147 |
|
}, |
|
{ |
|
"epoch": 13.009404388714733, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 1.0541133980496686e-05, |
|
"loss": 0.2257, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 13.025078369905955, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 1.0378707283459376e-05, |
|
"loss": 0.2262, |
|
"step": 4155 |
|
}, |
|
{ |
|
"epoch": 13.04075235109718, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 1.0217473207348483e-05, |
|
"loss": 0.2247, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 13.056426332288401, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 1.0057433897760493e-05, |
|
"loss": 0.2277, |
|
"step": 4165 |
|
}, |
|
{ |
|
"epoch": 13.072100313479623, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 9.898591484392793e-06, |
|
"loss": 0.2256, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 13.087774294670846, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 9.74094808101519e-06, |
|
"loss": 0.23, |
|
"step": 4175 |
|
}, |
|
{ |
|
"epoch": 13.10344827586207, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 9.584505785441932e-06, |
|
"loss": 0.2266, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 13.119122257053291, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 9.429266679503657e-06, |
|
"loss": 0.2283, |
|
"step": 4185 |
|
}, |
|
{ |
|
"epoch": 13.134796238244514, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 9.275232829019787e-06, |
|
"loss": 0.2257, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 13.150470219435737, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 9.122406283771002e-06, |
|
"loss": 0.2307, |
|
"step": 4195 |
|
}, |
|
{ |
|
"epoch": 13.16614420062696, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 8.970789077471953e-06, |
|
"loss": 0.2259, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 13.181818181818182, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 8.82038322774419e-06, |
|
"loss": 0.2286, |
|
"step": 4205 |
|
}, |
|
{ |
|
"epoch": 13.197492163009404, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 8.671190736089373e-06, |
|
"loss": 0.2277, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 13.213166144200628, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 8.523213587862533e-06, |
|
"loss": 0.2287, |
|
"step": 4215 |
|
}, |
|
{ |
|
"epoch": 13.22884012539185, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 8.376453752245795e-06, |
|
"loss": 0.2266, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 13.244514106583072, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 8.230913182222e-06, |
|
"loss": 0.2264, |
|
"step": 4225 |
|
}, |
|
{ |
|
"epoch": 13.260188087774294, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 8.086593814548882e-06, |
|
"loss": 0.2258, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 13.275862068965518, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 7.943497569733183e-06, |
|
"loss": 0.2295, |
|
"step": 4235 |
|
}, |
|
{ |
|
"epoch": 13.29153605015674, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 7.801626352005186e-06, |
|
"loss": 0.2254, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 13.307210031347962, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 7.66098204929323e-06, |
|
"loss": 0.2284, |
|
"step": 4245 |
|
}, |
|
{ |
|
"epoch": 13.322884012539184, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 7.521566533198765e-06, |
|
"loss": 0.2263, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 13.338557993730408, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 7.383381658971311e-06, |
|
"loss": 0.2279, |
|
"step": 4255 |
|
}, |
|
{ |
|
"epoch": 13.35423197492163, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 7.246429265483856e-06, |
|
"loss": 0.2238, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 13.369905956112852, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 7.1107111752083175e-06, |
|
"loss": 0.2273, |
|
"step": 4265 |
|
}, |
|
{ |
|
"epoch": 13.385579937304076, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 6.976229194191352e-06, |
|
"loss": 0.2244, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 13.401253918495298, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 6.842985112030253e-06, |
|
"loss": 0.2256, |
|
"step": 4275 |
|
}, |
|
{ |
|
"epoch": 13.41692789968652, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 6.710980701849223e-06, |
|
"loss": 0.2246, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 13.432601880877742, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 6.580217720275661e-06, |
|
"loss": 0.2275, |
|
"step": 4285 |
|
}, |
|
{ |
|
"epoch": 13.448275862068966, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 6.450697907416936e-06, |
|
"loss": 0.2269, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 13.463949843260188, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 6.3224229868370845e-06, |
|
"loss": 0.2311, |
|
"step": 4295 |
|
}, |
|
{ |
|
"epoch": 13.47962382445141, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 6.19539466553396e-06, |
|
"loss": 0.2254, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 13.495297805642632, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 6.0696146339165095e-06, |
|
"loss": 0.2289, |
|
"step": 4305 |
|
}, |
|
{ |
|
"epoch": 13.510971786833856, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 5.945084565782277e-06, |
|
"loss": 0.2319, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 13.526645768025078, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 5.82180611829507e-06, |
|
"loss": 0.224, |
|
"step": 4315 |
|
}, |
|
{ |
|
"epoch": 13.5423197492163, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 5.699780931963006e-06, |
|
"loss": 0.2297, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 13.557993730407524, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 5.5790106306165766e-06, |
|
"loss": 0.228, |
|
"step": 4325 |
|
}, |
|
{ |
|
"epoch": 13.573667711598747, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 5.459496821387166e-06, |
|
"loss": 0.2315, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 13.589341692789969, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 5.341241094685523e-06, |
|
"loss": 0.2283, |
|
"step": 4335 |
|
}, |
|
{ |
|
"epoch": 13.60501567398119, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 5.2242450241806964e-06, |
|
"loss": 0.2277, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 13.620689655172415, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 5.108510166779068e-06, |
|
"loss": 0.2264, |
|
"step": 4345 |
|
}, |
|
{ |
|
"epoch": 13.636363636363637, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 4.994038062603645e-06, |
|
"loss": 0.232, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 13.652037617554859, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 4.880830234973499e-06, |
|
"loss": 0.2254, |
|
"step": 4355 |
|
}, |
|
{ |
|
"epoch": 13.66771159874608, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 4.7688881903835915e-06, |
|
"loss": 0.2277, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 13.683385579937305, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 4.658213418484636e-06, |
|
"loss": 0.2292, |
|
"step": 4365 |
|
}, |
|
{ |
|
"epoch": 13.699059561128527, |
|
"grad_norm": 0.5, |
|
"learning_rate": 4.548807392063359e-06, |
|
"loss": 0.2246, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 13.714733542319749, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 4.4406715670228474e-06, |
|
"loss": 0.226, |
|
"step": 4375 |
|
}, |
|
{ |
|
"epoch": 13.730407523510971, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 4.333807382363197e-06, |
|
"loss": 0.2292, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 13.746081504702195, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 4.22821626016231e-06, |
|
"loss": 0.2289, |
|
"step": 4385 |
|
}, |
|
{ |
|
"epoch": 13.761755485893417, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 4.123899605557091e-06, |
|
"loss": 0.2247, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 13.77742946708464, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 4.020858806724592e-06, |
|
"loss": 0.2221, |
|
"step": 4395 |
|
}, |
|
{ |
|
"epoch": 13.793103448275861, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 3.91909523486369e-06, |
|
"loss": 0.2277, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 13.808777429467085, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 3.818610244176702e-06, |
|
"loss": 0.2298, |
|
"step": 4405 |
|
}, |
|
{ |
|
"epoch": 13.824451410658307, |
|
"grad_norm": 0.625, |
|
"learning_rate": 3.719405171851487e-06, |
|
"loss": 0.2223, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 13.84012539184953, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 3.621481338043564e-06, |
|
"loss": 0.2269, |
|
"step": 4415 |
|
}, |
|
{ |
|
"epoch": 13.855799373040753, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 3.5248400458586127e-06, |
|
"loss": 0.2266, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 13.871473354231975, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 3.429482581335053e-06, |
|
"loss": 0.2296, |
|
"step": 4425 |
|
}, |
|
{ |
|
"epoch": 13.887147335423197, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 3.3354102134269927e-06, |
|
"loss": 0.2281, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 13.90282131661442, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 3.2426241939873313e-06, |
|
"loss": 0.2288, |
|
"step": 4435 |
|
}, |
|
{ |
|
"epoch": 13.918495297805643, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 3.151125757751083e-06, |
|
"loss": 0.2299, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 13.934169278996865, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 3.0609161223189575e-06, |
|
"loss": 0.2238, |
|
"step": 4445 |
|
}, |
|
{ |
|
"epoch": 13.949843260188088, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 2.9719964881411712e-06, |
|
"loss": 0.2243, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 13.96551724137931, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 2.8843680385014284e-06, |
|
"loss": 0.2257, |
|
"step": 4455 |
|
}, |
|
{ |
|
"epoch": 13.981191222570533, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 2.798031939501222e-06, |
|
"loss": 0.2286, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 13.996865203761756, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 2.7129893400442807e-06, |
|
"loss": 0.2274, |
|
"step": 4465 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_loss": 4.4672441482543945, |
|
"eval_runtime": 0.8024, |
|
"eval_samples_per_second": 2.492, |
|
"eval_steps_per_second": 1.246, |
|
"step": 4466 |
|
}, |
|
{ |
|
"epoch": 14.012539184952978, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 2.629241371821334e-06, |
|
"loss": 0.2259, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 14.0282131661442, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 2.546789149294959e-06, |
|
"loss": 0.2248, |
|
"step": 4475 |
|
}, |
|
{ |
|
"epoch": 14.043887147335424, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 2.4656337696848496e-06, |
|
"loss": 0.226, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 14.059561128526646, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 2.3857763129531473e-06, |
|
"loss": 0.2239, |
|
"step": 4485 |
|
}, |
|
{ |
|
"epoch": 14.075235109717868, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 2.3072178417901326e-06, |
|
"loss": 0.2277, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 14.090909090909092, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 2.229959401599968e-06, |
|
"loss": 0.2246, |
|
"step": 4495 |
|
}, |
|
{ |
|
"epoch": 14.106583072100314, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 2.154002020486945e-06, |
|
"loss": 0.2244, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 14.122257053291536, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 2.0793467092416696e-06, |
|
"loss": 0.2226, |
|
"step": 4505 |
|
}, |
|
{ |
|
"epoch": 14.137931034482758, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 2.005994461327698e-06, |
|
"loss": 0.2299, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 14.153605015673982, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 1.933946252868224e-06, |
|
"loss": 0.2287, |
|
"step": 4515 |
|
}, |
|
{ |
|
"epoch": 14.169278996865204, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 1.8632030426332215e-06, |
|
"loss": 0.2226, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 14.184952978056426, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 1.7937657720265454e-06, |
|
"loss": 0.2262, |
|
"step": 4525 |
|
}, |
|
{ |
|
"epoch": 14.200626959247648, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 1.7256353650735302e-06, |
|
"loss": 0.2298, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 14.216300940438872, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 1.6588127284085652e-06, |
|
"loss": 0.2286, |
|
"step": 4535 |
|
}, |
|
{ |
|
"epoch": 14.231974921630094, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 1.5932987512631614e-06, |
|
"loss": 0.2282, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 14.247648902821316, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 1.529094305453993e-06, |
|
"loss": 0.222, |
|
"step": 4545 |
|
}, |
|
{ |
|
"epoch": 14.263322884012538, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 1.4662002453714074e-06, |
|
"loss": 0.2256, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 14.278996865203762, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 1.4046174079679787e-06, |
|
"loss": 0.2266, |
|
"step": 4555 |
|
}, |
|
{ |
|
"epoch": 14.294670846394984, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 1.3443466127474046e-06, |
|
"loss": 0.2245, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 14.310344827586206, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 1.285388661753595e-06, |
|
"loss": 0.2332, |
|
"step": 4565 |
|
}, |
|
{ |
|
"epoch": 14.32601880877743, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 1.2277443395599886e-06, |
|
"loss": 0.2266, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 14.341692789968652, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 1.1714144132591199e-06, |
|
"loss": 0.2281, |
|
"step": 4575 |
|
}, |
|
{ |
|
"epoch": 14.357366771159874, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 1.116399632452414e-06, |
|
"loss": 0.2292, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 14.373040752351097, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 1.062700729240218e-06, |
|
"loss": 0.2246, |
|
"step": 4585 |
|
}, |
|
{ |
|
"epoch": 14.38871473354232, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 1.0103184182120418e-06, |
|
"loss": 0.228, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 14.404388714733543, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 9.592533964370542e-07, |
|
"loss": 0.2257, |
|
"step": 4595 |
|
}, |
|
{ |
|
"epoch": 14.420062695924765, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 9.095063434548135e-07, |
|
"loss": 0.229, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 14.435736677115987, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 8.61077921266229e-07, |
|
"loss": 0.2273, |
|
"step": 4605 |
|
}, |
|
{ |
|
"epoch": 14.45141065830721, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 8.139687743247138e-07, |
|
"loss": 0.2256, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 14.467084639498433, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 7.681795295276684e-07, |
|
"loss": 0.229, |
|
"step": 4615 |
|
}, |
|
{ |
|
"epoch": 14.482758620689655, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 7.237107962080991e-07, |
|
"loss": 0.2248, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 14.498432601880877, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 6.805631661265133e-07, |
|
"loss": 0.2292, |
|
"step": 4625 |
|
}, |
|
{ |
|
"epoch": 14.5141065830721, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 6.387372134630587e-07, |
|
"loss": 0.2225, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 14.529780564263323, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 5.982334948098522e-07, |
|
"loss": 0.2291, |
|
"step": 4635 |
|
}, |
|
{ |
|
"epoch": 14.545454545454545, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 5.5905254916363e-07, |
|
"loss": 0.2228, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 14.561128526645769, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 5.211948979184978e-07, |
|
"loss": 0.2282, |
|
"step": 4645 |
|
}, |
|
{ |
|
"epoch": 14.576802507836991, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 4.846610448590804e-07, |
|
"loss": 0.2291, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 14.592476489028213, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 4.4945147615372827e-07, |
|
"loss": 0.229, |
|
"step": 4655 |
|
}, |
|
{ |
|
"epoch": 14.608150470219435, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 4.1556666034811007e-07, |
|
"loss": 0.2294, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 14.623824451410659, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 3.8300704835896316e-07, |
|
"loss": 0.2268, |
|
"step": 4665 |
|
}, |
|
{ |
|
"epoch": 14.639498432601881, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 3.517730734680869e-07, |
|
"loss": 0.2263, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 14.655172413793103, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 3.2186515131655823e-07, |
|
"loss": 0.2267, |
|
"step": 4675 |
|
}, |
|
{ |
|
"epoch": 14.670846394984325, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 2.932836798992589e-07, |
|
"loss": 0.2281, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 14.68652037617555, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 2.660290395595011e-07, |
|
"loss": 0.2286, |
|
"step": 4685 |
|
}, |
|
{ |
|
"epoch": 14.702194357366771, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 2.401015929840322e-07, |
|
"loss": 0.2275, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 14.717868338557993, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 2.155016851981717e-07, |
|
"loss": 0.229, |
|
"step": 4695 |
|
}, |
|
{ |
|
"epoch": 14.733542319749215, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 1.9222964356123696e-07, |
|
"loss": 0.2284, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 14.74921630094044, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 1.7028577776216915e-07, |
|
"loss": 0.2328, |
|
"step": 4705 |
|
}, |
|
{ |
|
"epoch": 14.764890282131661, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 1.496703798154364e-07, |
|
"loss": 0.2272, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 14.780564263322884, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 1.3038372405711487e-07, |
|
"loss": 0.2307, |
|
"step": 4715 |
|
}, |
|
{ |
|
"epoch": 14.796238244514107, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 1.1242606714129134e-07, |
|
"loss": 0.2253, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 14.81191222570533, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 9.579764803658853e-08, |
|
"loss": 0.2238, |
|
"step": 4725 |
|
}, |
|
{ |
|
"epoch": 14.827586206896552, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 8.049868802301187e-08, |
|
"loss": 0.2273, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 14.843260188087774, |
|
"grad_norm": 0.5, |
|
"learning_rate": 6.652939068899633e-08, |
|
"loss": 0.2315, |
|
"step": 4735 |
|
}, |
|
{ |
|
"epoch": 14.858934169278998, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 5.388994192875307e-08, |
|
"loss": 0.2232, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 14.87460815047022, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 4.258050993967144e-08, |
|
"loss": 0.2258, |
|
"step": 4745 |
|
}, |
|
{ |
|
"epoch": 14.890282131661442, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 3.260124522023178e-08, |
|
"loss": 0.2287, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 14.905956112852664, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 2.3952280567873796e-08, |
|
"loss": 0.2208, |
|
"step": 4755 |
|
}, |
|
{ |
|
"epoch": 14.921630094043888, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 1.6633731077297933e-08, |
|
"loss": 0.2233, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 14.93730407523511, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 1.0645694138933237e-08, |
|
"loss": 0.2239, |
|
"step": 4765 |
|
}, |
|
{ |
|
"epoch": 14.952978056426332, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 5.988249437627325e-09, |
|
"loss": 0.2257, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 14.968652037617554, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 2.6614589515583377e-09, |
|
"loss": 0.2308, |
|
"step": 4775 |
|
}, |
|
{ |
|
"epoch": 14.984326018808778, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 6.65366951457802e-10, |
|
"loss": 0.2262, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.0, |
|
"loss": 0.2251, |
|
"step": 4785 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_loss": 4.466753005981445, |
|
"eval_runtime": 0.7937, |
|
"eval_samples_per_second": 2.52, |
|
"eval_steps_per_second": 1.26, |
|
"step": 4785 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"step": 4785, |
|
"total_flos": 5.539666200113447e+18, |
|
"train_loss": 0.8980661474674348, |
|
"train_runtime": 27834.6576, |
|
"train_samples_per_second": 4.124, |
|
"train_steps_per_second": 0.172 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 4785, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 15, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.539666200113447e+18, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|