|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.016516516516516516, |
|
"eval_steps": 1, |
|
"global_step": 341, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 4.8435532306500046e-05, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1e-05, |
|
"loss": 10.8984, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 4.8435532306500046e-05, |
|
"eval_accuracy": 0.0001320069300164392, |
|
"eval_loss": 10.90625, |
|
"eval_runtime": 276.4656, |
|
"eval_samples_per_second": 122.138, |
|
"eval_steps_per_second": 3.82, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 9.687106461300009e-05, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1e-05, |
|
"loss": 10.8984, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 9.687106461300009e-05, |
|
"eval_accuracy": 0.0001320069300164392, |
|
"eval_loss": 10.90625, |
|
"eval_runtime": 275.1935, |
|
"eval_samples_per_second": 122.703, |
|
"eval_steps_per_second": 3.837, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.00014530659691950015, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1e-05, |
|
"loss": 10.8984, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.00014530659691950015, |
|
"eval_accuracy": 0.0001320069300164392, |
|
"eval_loss": 10.90625, |
|
"eval_runtime": 275.13, |
|
"eval_samples_per_second": 122.731, |
|
"eval_steps_per_second": 3.838, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.00019374212922600018, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1e-05, |
|
"loss": 10.8984, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.00019374212922600018, |
|
"eval_accuracy": 0.0001320069300164392, |
|
"eval_loss": 10.90625, |
|
"eval_runtime": 276.2699, |
|
"eval_samples_per_second": 122.225, |
|
"eval_steps_per_second": 3.822, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.00024217766153250024, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1e-05, |
|
"loss": 10.9062, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.00024217766153250024, |
|
"eval_accuracy": 0.0001320069300164392, |
|
"eval_loss": 10.90625, |
|
"eval_runtime": 274.7331, |
|
"eval_samples_per_second": 122.908, |
|
"eval_steps_per_second": 3.844, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0002906131938390003, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1e-05, |
|
"loss": 10.8984, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.0002906131938390003, |
|
"eval_accuracy": 0.0001320069300164392, |
|
"eval_loss": 10.90625, |
|
"eval_runtime": 275.5447, |
|
"eval_samples_per_second": 122.546, |
|
"eval_steps_per_second": 3.832, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.00033904872614550033, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1e-05, |
|
"loss": 10.9062, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.00033904872614550033, |
|
"eval_accuracy": 0.0001320069300164392, |
|
"eval_loss": 10.90625, |
|
"eval_runtime": 275.1675, |
|
"eval_samples_per_second": 122.714, |
|
"eval_steps_per_second": 3.838, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.00038748425845200037, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1e-05, |
|
"loss": 10.9062, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.00038748425845200037, |
|
"eval_accuracy": 0.0001320069300164392, |
|
"eval_loss": 10.90625, |
|
"eval_runtime": 275.5655, |
|
"eval_samples_per_second": 122.537, |
|
"eval_steps_per_second": 3.832, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.00043591979075850045, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1e-05, |
|
"loss": 10.9062, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.00043591979075850045, |
|
"eval_accuracy": 0.0001320069300164392, |
|
"eval_loss": 10.90625, |
|
"eval_runtime": 273.3419, |
|
"eval_samples_per_second": 123.534, |
|
"eval_steps_per_second": 3.863, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.0004843553230650005, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1e-05, |
|
"loss": 10.8984, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0004843553230650005, |
|
"eval_accuracy": 0.0001320069300164392, |
|
"eval_loss": 10.90625, |
|
"eval_runtime": 275.1369, |
|
"eval_samples_per_second": 122.728, |
|
"eval_steps_per_second": 3.838, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0005327908553715005, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1e-05, |
|
"loss": 10.8984, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.0005327908553715005, |
|
"eval_accuracy": 0.0001320069300164392, |
|
"eval_loss": 10.90625, |
|
"eval_runtime": 274.4663, |
|
"eval_samples_per_second": 123.028, |
|
"eval_steps_per_second": 3.847, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.0005812263876780006, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1e-05, |
|
"loss": 10.8984, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.0005812263876780006, |
|
"eval_accuracy": 0.0001320069300164392, |
|
"eval_loss": 10.90625, |
|
"eval_runtime": 273.8795, |
|
"eval_samples_per_second": 123.291, |
|
"eval_steps_per_second": 3.856, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.0006296619199845006, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1e-05, |
|
"loss": 10.8984, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.0006296619199845006, |
|
"eval_accuracy": 0.0001320069300164392, |
|
"eval_loss": 10.90625, |
|
"eval_runtime": 273.17, |
|
"eval_samples_per_second": 123.612, |
|
"eval_steps_per_second": 3.866, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.0006780974522910007, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1e-05, |
|
"loss": 10.9062, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.0006780974522910007, |
|
"eval_accuracy": 0.0001320069300164392, |
|
"eval_loss": 10.90625, |
|
"eval_runtime": 274.4591, |
|
"eval_samples_per_second": 123.031, |
|
"eval_steps_per_second": 3.848, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.0007265329845975008, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1e-05, |
|
"loss": 10.8984, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.0007265329845975008, |
|
"eval_accuracy": 0.0001320069300164392, |
|
"eval_loss": 10.90625, |
|
"eval_runtime": 275.1413, |
|
"eval_samples_per_second": 122.726, |
|
"eval_steps_per_second": 3.838, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.0007749685169040007, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1e-05, |
|
"loss": 10.8984, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.0007749685169040007, |
|
"eval_accuracy": 0.0001320069300164392, |
|
"eval_loss": 10.90625, |
|
"eval_runtime": 276.8384, |
|
"eval_samples_per_second": 121.974, |
|
"eval_steps_per_second": 3.814, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.0008234040492105008, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1e-05, |
|
"loss": 10.9062, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.0008234040492105008, |
|
"eval_accuracy": 0.0001320069300164392, |
|
"eval_loss": 10.90625, |
|
"eval_runtime": 273.566, |
|
"eval_samples_per_second": 123.433, |
|
"eval_steps_per_second": 3.86, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.0008718395815170009, |
|
"grad_norm": 6.863816738128662, |
|
"learning_rate": 9.99999515644677e-06, |
|
"loss": 10.9062, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.0008718395815170009, |
|
"eval_accuracy": 0.010980023790775268, |
|
"eval_loss": 10.7578125, |
|
"eval_runtime": 273.6742, |
|
"eval_samples_per_second": 123.384, |
|
"eval_steps_per_second": 3.859, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.0009202751138235009, |
|
"grad_norm": 6.298513889312744, |
|
"learning_rate": 9.999990312893539e-06, |
|
"loss": 10.7734, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.0009202751138235009, |
|
"eval_accuracy": 0.028515349612393204, |
|
"eval_loss": 10.65625, |
|
"eval_runtime": 273.1376, |
|
"eval_samples_per_second": 123.626, |
|
"eval_steps_per_second": 3.866, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.000968710646130001, |
|
"grad_norm": 5.340964317321777, |
|
"learning_rate": 9.999985469340309e-06, |
|
"loss": 10.6797, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.000968710646130001, |
|
"eval_accuracy": 0.04691063110573666, |
|
"eval_loss": 10.578125, |
|
"eval_runtime": 273.0387, |
|
"eval_samples_per_second": 123.671, |
|
"eval_steps_per_second": 3.868, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.001017146178436501, |
|
"grad_norm": 4.327230930328369, |
|
"learning_rate": 9.999980625787079e-06, |
|
"loss": 10.6016, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.001017146178436501, |
|
"eval_accuracy": 0.04854858814680248, |
|
"eval_loss": 10.5234375, |
|
"eval_runtime": 272.9282, |
|
"eval_samples_per_second": 123.721, |
|
"eval_steps_per_second": 3.869, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.001065581710743001, |
|
"grad_norm": 3.803434133529663, |
|
"learning_rate": 9.999975782233847e-06, |
|
"loss": 10.5234, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.001065581710743001, |
|
"eval_accuracy": 0.04776647603534324, |
|
"eval_loss": 10.4765625, |
|
"eval_runtime": 272.6159, |
|
"eval_samples_per_second": 123.863, |
|
"eval_steps_per_second": 3.874, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.001114017243049501, |
|
"grad_norm": 3.2490711212158203, |
|
"learning_rate": 9.999970938680617e-06, |
|
"loss": 10.5, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.001114017243049501, |
|
"eval_accuracy": 0.04827195257153118, |
|
"eval_loss": 10.4375, |
|
"eval_runtime": 272.3658, |
|
"eval_samples_per_second": 123.977, |
|
"eval_steps_per_second": 3.877, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.0011624527753560012, |
|
"grad_norm": 2.9085004329681396, |
|
"learning_rate": 9.999966095127386e-06, |
|
"loss": 10.4531, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.0011624527753560012, |
|
"eval_accuracy": 0.05073292650302844, |
|
"eval_loss": 10.40625, |
|
"eval_runtime": 272.851, |
|
"eval_samples_per_second": 123.756, |
|
"eval_steps_per_second": 3.87, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.0012108883076625012, |
|
"grad_norm": 2.668471574783325, |
|
"learning_rate": 9.999961251574155e-06, |
|
"loss": 10.4141, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.0012108883076625012, |
|
"eval_accuracy": 0.05310853016333744, |
|
"eval_loss": 10.3828125, |
|
"eval_runtime": 272.4493, |
|
"eval_samples_per_second": 123.939, |
|
"eval_steps_per_second": 3.876, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.0012593238399690012, |
|
"grad_norm": 2.864935874938965, |
|
"learning_rate": 9.999956408020926e-06, |
|
"loss": 10.3672, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.0012593238399690012, |
|
"eval_accuracy": 0.0555671013371173, |
|
"eval_loss": 10.359375, |
|
"eval_runtime": 273.3963, |
|
"eval_samples_per_second": 123.509, |
|
"eval_steps_per_second": 3.863, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.0013077593722755014, |
|
"grad_norm": 2.2354369163513184, |
|
"learning_rate": 9.999951564467694e-06, |
|
"loss": 10.3828, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.0013077593722755014, |
|
"eval_accuracy": 0.05616663281094196, |
|
"eval_loss": 10.3359375, |
|
"eval_runtime": 272.6273, |
|
"eval_samples_per_second": 123.858, |
|
"eval_steps_per_second": 3.873, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.0013561949045820013, |
|
"grad_norm": 2.1477534770965576, |
|
"learning_rate": 9.999946720914464e-06, |
|
"loss": 10.3594, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.0013561949045820013, |
|
"eval_accuracy": 0.05624939768219569, |
|
"eval_loss": 10.3203125, |
|
"eval_runtime": 273.0918, |
|
"eval_samples_per_second": 123.647, |
|
"eval_steps_per_second": 3.867, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.0014046304368885013, |
|
"grad_norm": 2.097315549850464, |
|
"learning_rate": 9.999941877361234e-06, |
|
"loss": 10.3281, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.0014046304368885013, |
|
"eval_accuracy": 0.055895844911079295, |
|
"eval_loss": 10.3046875, |
|
"eval_runtime": 273.0626, |
|
"eval_samples_per_second": 123.66, |
|
"eval_steps_per_second": 3.867, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.0014530659691950015, |
|
"grad_norm": 1.8777693510055542, |
|
"learning_rate": 9.999937033808002e-06, |
|
"loss": 10.3203, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0014530659691950015, |
|
"eval_accuracy": 0.0563090902895847, |
|
"eval_loss": 10.296875, |
|
"eval_runtime": 272.1369, |
|
"eval_samples_per_second": 124.081, |
|
"eval_steps_per_second": 3.88, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0015015015015015015, |
|
"grad_norm": 1.8313064575195312, |
|
"learning_rate": 9.999932190254772e-06, |
|
"loss": 10.3281, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.0015015015015015015, |
|
"eval_accuracy": 0.05664084454791549, |
|
"eval_loss": 10.28125, |
|
"eval_runtime": 272.0928, |
|
"eval_samples_per_second": 124.101, |
|
"eval_steps_per_second": 3.881, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.0015499370338080015, |
|
"grad_norm": 1.7771973609924316, |
|
"learning_rate": 9.999927346701542e-06, |
|
"loss": 10.3359, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.0015499370338080015, |
|
"eval_accuracy": 0.05662431473277527, |
|
"eval_loss": 10.2734375, |
|
"eval_runtime": 272.172, |
|
"eval_samples_per_second": 124.065, |
|
"eval_steps_per_second": 3.88, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.0015983725661145017, |
|
"grad_norm": 1.8934669494628906, |
|
"learning_rate": 9.99992250314831e-06, |
|
"loss": 10.2656, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.0015983725661145017, |
|
"eval_accuracy": 0.057003458321026435, |
|
"eval_loss": 10.265625, |
|
"eval_runtime": 271.9989, |
|
"eval_samples_per_second": 124.144, |
|
"eval_steps_per_second": 3.882, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.0016468080984210016, |
|
"grad_norm": 1.789952039718628, |
|
"learning_rate": 9.99991765959508e-06, |
|
"loss": 10.2656, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.0016468080984210016, |
|
"eval_accuracy": 0.056085286435208145, |
|
"eval_loss": 10.2578125, |
|
"eval_runtime": 272.24, |
|
"eval_samples_per_second": 124.034, |
|
"eval_steps_per_second": 3.879, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.0016952436307275016, |
|
"grad_norm": 1.7584354877471924, |
|
"learning_rate": 9.99991281604185e-06, |
|
"loss": 10.2656, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.0016952436307275016, |
|
"eval_accuracy": 0.056159366640013426, |
|
"eval_loss": 10.2421875, |
|
"eval_runtime": 272.3595, |
|
"eval_samples_per_second": 123.98, |
|
"eval_steps_per_second": 3.877, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.0017436791630340018, |
|
"grad_norm": 1.7618820667266846, |
|
"learning_rate": 9.999907972488618e-06, |
|
"loss": 10.2656, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.0017436791630340018, |
|
"eval_accuracy": 0.057521267083571186, |
|
"eval_loss": 10.234375, |
|
"eval_runtime": 272.7034, |
|
"eval_samples_per_second": 123.823, |
|
"eval_steps_per_second": 3.872, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.0017921146953405018, |
|
"grad_norm": 1.6511751413345337, |
|
"learning_rate": 9.999903128935388e-06, |
|
"loss": 10.2656, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.0017921146953405018, |
|
"eval_accuracy": 0.05863849152438795, |
|
"eval_loss": 10.2265625, |
|
"eval_runtime": 273.7952, |
|
"eval_samples_per_second": 123.329, |
|
"eval_steps_per_second": 3.857, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.0018405502276470018, |
|
"grad_norm": 1.8650130033493042, |
|
"learning_rate": 9.999898285382156e-06, |
|
"loss": 10.2109, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.0018405502276470018, |
|
"eval_accuracy": 0.059270706292946944, |
|
"eval_loss": 10.21875, |
|
"eval_runtime": 274.2569, |
|
"eval_samples_per_second": 123.122, |
|
"eval_steps_per_second": 3.85, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.001888985759953502, |
|
"grad_norm": 1.7996951341629028, |
|
"learning_rate": 9.999893441828926e-06, |
|
"loss": 10.2656, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.001888985759953502, |
|
"eval_accuracy": 0.05958891247161815, |
|
"eval_loss": 10.2109375, |
|
"eval_runtime": 274.6294, |
|
"eval_samples_per_second": 122.955, |
|
"eval_steps_per_second": 3.845, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.001937421292260002, |
|
"grad_norm": 1.7773430347442627, |
|
"learning_rate": 9.999888598275696e-06, |
|
"loss": 10.2266, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.001937421292260002, |
|
"eval_accuracy": 0.05994316001605042, |
|
"eval_loss": 10.203125, |
|
"eval_runtime": 273.9324, |
|
"eval_samples_per_second": 123.268, |
|
"eval_steps_per_second": 3.855, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.001985856824566502, |
|
"grad_norm": 1.7419933080673218, |
|
"learning_rate": 9.999883754722464e-06, |
|
"loss": 10.2109, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.001985856824566502, |
|
"eval_accuracy": 0.06009832605659606, |
|
"eval_loss": 10.1953125, |
|
"eval_runtime": 274.4948, |
|
"eval_samples_per_second": 123.015, |
|
"eval_steps_per_second": 3.847, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.002034292356873002, |
|
"grad_norm": 1.7278474569320679, |
|
"learning_rate": 9.999878911169234e-06, |
|
"loss": 10.2109, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.002034292356873002, |
|
"eval_accuracy": 0.060353828943509456, |
|
"eval_loss": 10.1796875, |
|
"eval_runtime": 275.0795, |
|
"eval_samples_per_second": 122.754, |
|
"eval_steps_per_second": 3.839, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.002082727889179502, |
|
"grad_norm": 1.8463383913040161, |
|
"learning_rate": 9.999874067616004e-06, |
|
"loss": 10.2109, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.002082727889179502, |
|
"eval_accuracy": 0.060806444809914505, |
|
"eval_loss": 10.171875, |
|
"eval_runtime": 275.6513, |
|
"eval_samples_per_second": 122.499, |
|
"eval_steps_per_second": 3.831, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.002131163421486002, |
|
"grad_norm": 1.8434734344482422, |
|
"learning_rate": 9.999869224062774e-06, |
|
"loss": 10.1484, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.002131163421486002, |
|
"eval_accuracy": 0.061002660373873155, |
|
"eval_loss": 10.1640625, |
|
"eval_runtime": 275.2495, |
|
"eval_samples_per_second": 122.678, |
|
"eval_steps_per_second": 3.837, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.002179598953792502, |
|
"grad_norm": 1.8196474313735962, |
|
"learning_rate": 9.999864380509543e-06, |
|
"loss": 10.1875, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.002179598953792502, |
|
"eval_accuracy": 0.061108642253432405, |
|
"eval_loss": 10.1484375, |
|
"eval_runtime": 275.2499, |
|
"eval_samples_per_second": 122.678, |
|
"eval_steps_per_second": 3.837, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.002228034486099002, |
|
"grad_norm": 1.7358877658843994, |
|
"learning_rate": 9.999859536956312e-06, |
|
"loss": 10.1719, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.002228034486099002, |
|
"eval_accuracy": 0.061226840563795806, |
|
"eval_loss": 10.140625, |
|
"eval_runtime": 273.2604, |
|
"eval_samples_per_second": 123.571, |
|
"eval_steps_per_second": 3.864, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.0022764700184055024, |
|
"grad_norm": 1.7613184452056885, |
|
"learning_rate": 9.999854693403081e-06, |
|
"loss": 10.1484, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.0022764700184055024, |
|
"eval_accuracy": 0.06154533623134863, |
|
"eval_loss": 10.1328125, |
|
"eval_runtime": 274.8774, |
|
"eval_samples_per_second": 122.844, |
|
"eval_steps_per_second": 3.842, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.0023249055507120024, |
|
"grad_norm": 1.926283597946167, |
|
"learning_rate": 9.999849849849851e-06, |
|
"loss": 10.1172, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.0023249055507120024, |
|
"eval_accuracy": 0.062220076916616865, |
|
"eval_loss": 10.1171875, |
|
"eval_runtime": 275.5341, |
|
"eval_samples_per_second": 122.551, |
|
"eval_steps_per_second": 3.833, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.0023733410830185024, |
|
"grad_norm": 1.7182645797729492, |
|
"learning_rate": 9.99984500629662e-06, |
|
"loss": 10.1797, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.0023733410830185024, |
|
"eval_accuracy": 0.06321505020272762, |
|
"eval_loss": 10.109375, |
|
"eval_runtime": 275.3575, |
|
"eval_samples_per_second": 122.63, |
|
"eval_steps_per_second": 3.835, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.0024217766153250024, |
|
"grad_norm": 1.756512999534607, |
|
"learning_rate": 9.99984016274339e-06, |
|
"loss": 10.1016, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0024217766153250024, |
|
"eval_accuracy": 0.06421378684429936, |
|
"eval_loss": 10.1015625, |
|
"eval_runtime": 274.9451, |
|
"eval_samples_per_second": 122.814, |
|
"eval_steps_per_second": 3.841, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0024702121476315024, |
|
"grad_norm": 1.8228658437728882, |
|
"learning_rate": 9.99983531919016e-06, |
|
"loss": 10.1406, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.0024702121476315024, |
|
"eval_accuracy": 0.06511247612838496, |
|
"eval_loss": 10.09375, |
|
"eval_runtime": 274.7644, |
|
"eval_samples_per_second": 122.894, |
|
"eval_steps_per_second": 3.843, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.0025186476799380023, |
|
"grad_norm": 1.6864567995071411, |
|
"learning_rate": 9.999830475636927e-06, |
|
"loss": 10.1406, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.0025186476799380023, |
|
"eval_accuracy": 0.0658020965421682, |
|
"eval_loss": 10.0859375, |
|
"eval_runtime": 275.2863, |
|
"eval_samples_per_second": 122.661, |
|
"eval_steps_per_second": 3.836, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.0025670832122445027, |
|
"grad_norm": 1.7754981517791748, |
|
"learning_rate": 9.999825632083697e-06, |
|
"loss": 10.1094, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.0025670832122445027, |
|
"eval_accuracy": 0.06627448449918756, |
|
"eval_loss": 10.078125, |
|
"eval_runtime": 274.8943, |
|
"eval_samples_per_second": 122.836, |
|
"eval_steps_per_second": 3.841, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.0026155187445510027, |
|
"grad_norm": 1.7636278867721558, |
|
"learning_rate": 9.999820788530467e-06, |
|
"loss": 10.1016, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.0026155187445510027, |
|
"eval_accuracy": 0.06685522814459541, |
|
"eval_loss": 10.0703125, |
|
"eval_runtime": 275.7861, |
|
"eval_samples_per_second": 122.439, |
|
"eval_steps_per_second": 3.829, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.0026639542768575027, |
|
"grad_norm": 1.7524579763412476, |
|
"learning_rate": 9.999815944977235e-06, |
|
"loss": 10.0781, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.0026639542768575027, |
|
"eval_accuracy": 0.06716721031231189, |
|
"eval_loss": 10.0625, |
|
"eval_runtime": 274.8867, |
|
"eval_samples_per_second": 122.84, |
|
"eval_steps_per_second": 3.842, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.0027123898091640027, |
|
"grad_norm": 1.8897311687469482, |
|
"learning_rate": 9.999811101424005e-06, |
|
"loss": 10.0703, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.0027123898091640027, |
|
"eval_accuracy": 0.06777936350137496, |
|
"eval_loss": 10.0546875, |
|
"eval_runtime": 276.2638, |
|
"eval_samples_per_second": 122.227, |
|
"eval_steps_per_second": 3.822, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.0027608253414705027, |
|
"grad_norm": 1.7320737838745117, |
|
"learning_rate": 9.999806257870775e-06, |
|
"loss": 10.0703, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.0027608253414705027, |
|
"eval_accuracy": 0.06813957451676851, |
|
"eval_loss": 10.046875, |
|
"eval_runtime": 275.4873, |
|
"eval_samples_per_second": 122.572, |
|
"eval_steps_per_second": 3.833, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.0028092608737770026, |
|
"grad_norm": 1.685152530670166, |
|
"learning_rate": 9.999801414317543e-06, |
|
"loss": 10.0469, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.0028092608737770026, |
|
"eval_accuracy": 0.06857357624808572, |
|
"eval_loss": 10.0390625, |
|
"eval_runtime": 275.0861, |
|
"eval_samples_per_second": 122.751, |
|
"eval_steps_per_second": 3.839, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.002857696406083503, |
|
"grad_norm": 1.6026166677474976, |
|
"learning_rate": 9.999796570764313e-06, |
|
"loss": 10.1016, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.002857696406083503, |
|
"eval_accuracy": 0.06889201401786221, |
|
"eval_loss": 10.03125, |
|
"eval_runtime": 273.8074, |
|
"eval_samples_per_second": 123.324, |
|
"eval_steps_per_second": 3.857, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.002906131938390003, |
|
"grad_norm": 1.7406948804855347, |
|
"learning_rate": 9.999791727211083e-06, |
|
"loss": 10.0547, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.002906131938390003, |
|
"eval_accuracy": 0.06942965276879759, |
|
"eval_loss": 10.03125, |
|
"eval_runtime": 274.1162, |
|
"eval_samples_per_second": 123.185, |
|
"eval_steps_per_second": 3.852, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.002954567470696503, |
|
"grad_norm": 2.25240421295166, |
|
"learning_rate": 9.999786883657853e-06, |
|
"loss": 10.0391, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.002954567470696503, |
|
"eval_accuracy": 0.06947052859888163, |
|
"eval_loss": 10.0234375, |
|
"eval_runtime": 274.759, |
|
"eval_samples_per_second": 122.897, |
|
"eval_steps_per_second": 3.843, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.003003003003003003, |
|
"grad_norm": 1.6132714748382568, |
|
"learning_rate": 9.999782040104623e-06, |
|
"loss": 10.0547, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.003003003003003003, |
|
"eval_accuracy": 0.06921656000304079, |
|
"eval_loss": 10.015625, |
|
"eval_runtime": 275.2004, |
|
"eval_samples_per_second": 122.7, |
|
"eval_steps_per_second": 3.837, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.003051438535309503, |
|
"grad_norm": 1.6277832984924316, |
|
"learning_rate": 9.99977719655139e-06, |
|
"loss": 10.0312, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.003051438535309503, |
|
"eval_accuracy": 0.06882297091959703, |
|
"eval_loss": 10.0078125, |
|
"eval_runtime": 274.9078, |
|
"eval_samples_per_second": 122.83, |
|
"eval_steps_per_second": 3.841, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.003099874067616003, |
|
"grad_norm": 1.6769694089889526, |
|
"learning_rate": 9.99977235299816e-06, |
|
"loss": 10.0547, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.003099874067616003, |
|
"eval_accuracy": 0.06873357675295433, |
|
"eval_loss": 10.0, |
|
"eval_runtime": 273.9236, |
|
"eval_samples_per_second": 123.272, |
|
"eval_steps_per_second": 3.855, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.0031483095999225033, |
|
"grad_norm": 1.6080327033996582, |
|
"learning_rate": 9.99976750944493e-06, |
|
"loss": 10.0547, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.0031483095999225033, |
|
"eval_accuracy": 0.06925561205317066, |
|
"eval_loss": 9.9921875, |
|
"eval_runtime": 275.0761, |
|
"eval_samples_per_second": 122.755, |
|
"eval_steps_per_second": 3.839, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.0031967451322290033, |
|
"grad_norm": 1.6163508892059326, |
|
"learning_rate": 9.999762665891699e-06, |
|
"loss": 9.9922, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.0031967451322290033, |
|
"eval_accuracy": 0.0697469615319358, |
|
"eval_loss": 9.984375, |
|
"eval_runtime": 273.8628, |
|
"eval_samples_per_second": 123.299, |
|
"eval_steps_per_second": 3.856, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.0032451806645355033, |
|
"grad_norm": 1.625279426574707, |
|
"learning_rate": 9.999757822338468e-06, |
|
"loss": 10.0234, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.0032451806645355033, |
|
"eval_accuracy": 0.0704554855696885, |
|
"eval_loss": 9.9765625, |
|
"eval_runtime": 274.3158, |
|
"eval_samples_per_second": 123.095, |
|
"eval_steps_per_second": 3.85, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.0032936161968420033, |
|
"grad_norm": 1.6738680601119995, |
|
"learning_rate": 9.999752978785238e-06, |
|
"loss": 10.0, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.0032936161968420033, |
|
"eval_accuracy": 0.07112654974616023, |
|
"eval_loss": 9.96875, |
|
"eval_runtime": 275.9505, |
|
"eval_samples_per_second": 122.366, |
|
"eval_steps_per_second": 3.827, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.0033420517291485033, |
|
"grad_norm": 1.5247821807861328, |
|
"learning_rate": 9.999748135232007e-06, |
|
"loss": 10.0, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.0033420517291485033, |
|
"eval_accuracy": 0.0715033484744703, |
|
"eval_loss": 9.9609375, |
|
"eval_runtime": 274.4182, |
|
"eval_samples_per_second": 123.049, |
|
"eval_steps_per_second": 3.848, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.0033904872614550032, |
|
"grad_norm": 1.8255083560943604, |
|
"learning_rate": 9.999743291678776e-06, |
|
"loss": 9.9688, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.0033904872614550032, |
|
"eval_accuracy": 0.07161468589833944, |
|
"eval_loss": 9.9609375, |
|
"eval_runtime": 274.1089, |
|
"eval_samples_per_second": 123.188, |
|
"eval_steps_per_second": 3.852, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.0034389227937615036, |
|
"grad_norm": 1.845422387123108, |
|
"learning_rate": 9.999738448125546e-06, |
|
"loss": 9.9922, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.0034389227937615036, |
|
"eval_accuracy": 0.07169394795412562, |
|
"eval_loss": 9.953125, |
|
"eval_runtime": 275.2085, |
|
"eval_samples_per_second": 122.696, |
|
"eval_steps_per_second": 3.837, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.0034873583260680036, |
|
"grad_norm": 1.663128137588501, |
|
"learning_rate": 9.999733604572314e-06, |
|
"loss": 9.9844, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.0034873583260680036, |
|
"eval_accuracy": 0.07159517434771859, |
|
"eval_loss": 9.9453125, |
|
"eval_runtime": 274.6498, |
|
"eval_samples_per_second": 122.946, |
|
"eval_steps_per_second": 3.845, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.0035357938583745036, |
|
"grad_norm": 1.6756772994995117, |
|
"learning_rate": 9.999728761019084e-06, |
|
"loss": 9.9688, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.0035357938583745036, |
|
"eval_accuracy": 0.07181347791334446, |
|
"eval_loss": 9.9375, |
|
"eval_runtime": 274.5733, |
|
"eval_samples_per_second": 122.98, |
|
"eval_steps_per_second": 3.846, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.0035842293906810036, |
|
"grad_norm": 1.746936559677124, |
|
"learning_rate": 9.999723917465854e-06, |
|
"loss": 9.9453, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.0035842293906810036, |
|
"eval_accuracy": 0.07256360150338524, |
|
"eval_loss": 9.9296875, |
|
"eval_runtime": 275.0915, |
|
"eval_samples_per_second": 122.748, |
|
"eval_steps_per_second": 3.839, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.0036326649229875036, |
|
"grad_norm": 1.6043540239334106, |
|
"learning_rate": 9.999719073912622e-06, |
|
"loss": 9.9375, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.0036326649229875036, |
|
"eval_accuracy": 0.07335680103901034, |
|
"eval_loss": 9.921875, |
|
"eval_runtime": 274.3328, |
|
"eval_samples_per_second": 123.088, |
|
"eval_steps_per_second": 3.849, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.0036811004552940035, |
|
"grad_norm": 1.6499953269958496, |
|
"learning_rate": 9.999714230359392e-06, |
|
"loss": 9.9141, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.0036811004552940035, |
|
"eval_accuracy": 0.0744010163838838, |
|
"eval_loss": 9.9140625, |
|
"eval_runtime": 274.4219, |
|
"eval_samples_per_second": 123.048, |
|
"eval_steps_per_second": 3.848, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.0037295359876005035, |
|
"grad_norm": 1.6161168813705444, |
|
"learning_rate": 9.999709386806162e-06, |
|
"loss": 9.9062, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.0037295359876005035, |
|
"eval_accuracy": 0.07513791033203478, |
|
"eval_loss": 9.90625, |
|
"eval_runtime": 274.8621, |
|
"eval_samples_per_second": 122.851, |
|
"eval_steps_per_second": 3.842, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.003777971519907004, |
|
"grad_norm": 1.760338544845581, |
|
"learning_rate": 9.999704543252932e-06, |
|
"loss": 9.9219, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.003777971519907004, |
|
"eval_accuracy": 0.07549183943869726, |
|
"eval_loss": 9.90625, |
|
"eval_runtime": 273.942, |
|
"eval_samples_per_second": 123.263, |
|
"eval_steps_per_second": 3.855, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.003826407052213504, |
|
"grad_norm": 2.1402640342712402, |
|
"learning_rate": 9.9996996996997e-06, |
|
"loss": 9.9219, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.003826407052213504, |
|
"eval_accuracy": 0.07561446692894938, |
|
"eval_loss": 9.8984375, |
|
"eval_runtime": 273.1637, |
|
"eval_samples_per_second": 123.615, |
|
"eval_steps_per_second": 3.866, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.003874842584520004, |
|
"grad_norm": 1.5549274682998657, |
|
"learning_rate": 9.99969485614647e-06, |
|
"loss": 9.9219, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.003874842584520004, |
|
"eval_accuracy": 0.07565650071455989, |
|
"eval_loss": 9.890625, |
|
"eval_runtime": 274.1597, |
|
"eval_samples_per_second": 123.165, |
|
"eval_steps_per_second": 3.852, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.003923278116826504, |
|
"grad_norm": 1.619598388671875, |
|
"learning_rate": 9.99969001259324e-06, |
|
"loss": 9.875, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.003923278116826504, |
|
"eval_accuracy": 0.07585523483178858, |
|
"eval_loss": 9.8828125, |
|
"eval_runtime": 273.6756, |
|
"eval_samples_per_second": 123.383, |
|
"eval_steps_per_second": 3.859, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.003971713649133004, |
|
"grad_norm": 1.4982187747955322, |
|
"learning_rate": 9.999685169040008e-06, |
|
"loss": 9.9219, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.003971713649133004, |
|
"eval_accuracy": 0.07604369209371994, |
|
"eval_loss": 9.875, |
|
"eval_runtime": 274.2745, |
|
"eval_samples_per_second": 123.114, |
|
"eval_steps_per_second": 3.85, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.004020149181439504, |
|
"grad_norm": 1.8369065523147583, |
|
"learning_rate": 9.999680325486778e-06, |
|
"loss": 9.875, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.004020149181439504, |
|
"eval_accuracy": 0.07629766068956077, |
|
"eval_loss": 9.875, |
|
"eval_runtime": 273.8951, |
|
"eval_samples_per_second": 123.284, |
|
"eval_steps_per_second": 3.855, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.004068584713746004, |
|
"grad_norm": 1.5859246253967285, |
|
"learning_rate": 9.999675481933548e-06, |
|
"loss": 9.8672, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.004068584713746004, |
|
"eval_accuracy": 0.07654106294122268, |
|
"eval_loss": 9.8671875, |
|
"eval_runtime": 273.7145, |
|
"eval_samples_per_second": 123.366, |
|
"eval_steps_per_second": 3.858, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.004117020246052504, |
|
"grad_norm": 1.527214765548706, |
|
"learning_rate": 9.999670638380316e-06, |
|
"loss": 9.9062, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.004117020246052504, |
|
"eval_accuracy": 0.07687032759517157, |
|
"eval_loss": 9.859375, |
|
"eval_runtime": 273.3281, |
|
"eval_samples_per_second": 123.54, |
|
"eval_steps_per_second": 3.863, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.004165455778359004, |
|
"grad_norm": 1.5885719060897827, |
|
"learning_rate": 9.999665794827086e-06, |
|
"loss": 9.8828, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.004165455778359004, |
|
"eval_accuracy": 0.07730965592191048, |
|
"eval_loss": 9.8515625, |
|
"eval_runtime": 273.6316, |
|
"eval_samples_per_second": 123.403, |
|
"eval_steps_per_second": 3.859, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.004213891310665505, |
|
"grad_norm": 1.7169041633605957, |
|
"learning_rate": 9.999660951273856e-06, |
|
"loss": 9.8594, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.004213891310665505, |
|
"eval_accuracy": 0.07752955167638524, |
|
"eval_loss": 9.8515625, |
|
"eval_runtime": 273.4517, |
|
"eval_samples_per_second": 123.484, |
|
"eval_steps_per_second": 3.862, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.004262326842972004, |
|
"grad_norm": 1.5023819208145142, |
|
"learning_rate": 9.999656107720624e-06, |
|
"loss": 9.8906, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.004262326842972004, |
|
"eval_accuracy": 0.07768043328148298, |
|
"eval_loss": 9.84375, |
|
"eval_runtime": 275.7256, |
|
"eval_samples_per_second": 122.466, |
|
"eval_steps_per_second": 3.83, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.0043107623752785046, |
|
"grad_norm": 1.6757872104644775, |
|
"learning_rate": 9.999651264167394e-06, |
|
"loss": 9.8047, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.0043107623752785046, |
|
"eval_accuracy": 0.07773103593798929, |
|
"eval_loss": 9.8359375, |
|
"eval_runtime": 275.8678, |
|
"eval_samples_per_second": 122.403, |
|
"eval_steps_per_second": 3.828, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.004359197907585004, |
|
"grad_norm": 2.2149763107299805, |
|
"learning_rate": 9.999646420614163e-06, |
|
"loss": 9.8203, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.004359197907585004, |
|
"eval_accuracy": 0.07783635199312082, |
|
"eval_loss": 9.8359375, |
|
"eval_runtime": 275.5895, |
|
"eval_samples_per_second": 122.526, |
|
"eval_steps_per_second": 3.832, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.0044076334398915045, |
|
"grad_norm": 1.6437429189682007, |
|
"learning_rate": 9.999641577060932e-06, |
|
"loss": 9.8594, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.0044076334398915045, |
|
"eval_accuracy": 0.07813093587905225, |
|
"eval_loss": 9.828125, |
|
"eval_runtime": 274.7605, |
|
"eval_samples_per_second": 122.896, |
|
"eval_steps_per_second": 3.843, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.004456068972198004, |
|
"grad_norm": 1.6756585836410522, |
|
"learning_rate": 9.999636733507701e-06, |
|
"loss": 9.8438, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.004456068972198004, |
|
"eval_accuracy": 0.07858427546766132, |
|
"eval_loss": 9.8203125, |
|
"eval_runtime": 275.56, |
|
"eval_samples_per_second": 122.54, |
|
"eval_steps_per_second": 3.832, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.0045045045045045045, |
|
"grad_norm": 1.6290555000305176, |
|
"learning_rate": 9.999631889954471e-06, |
|
"loss": 9.8438, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.0045045045045045045, |
|
"eval_accuracy": 0.07898075943992122, |
|
"eval_loss": 9.8203125, |
|
"eval_runtime": 275.4203, |
|
"eval_samples_per_second": 122.602, |
|
"eval_steps_per_second": 3.834, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.004552940036811005, |
|
"grad_norm": 1.552886724472046, |
|
"learning_rate": 9.999627046401241e-06, |
|
"loss": 9.8438, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.004552940036811005, |
|
"eval_accuracy": 0.0792507946686917, |
|
"eval_loss": 9.8125, |
|
"eval_runtime": 276.2603, |
|
"eval_samples_per_second": 122.229, |
|
"eval_steps_per_second": 3.822, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.004601375569117504, |
|
"grad_norm": 1.6093745231628418, |
|
"learning_rate": 9.999622202848011e-06, |
|
"loss": 9.8359, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.004601375569117504, |
|
"eval_accuracy": 0.07942483538431863, |
|
"eval_loss": 9.8046875, |
|
"eval_runtime": 277.5976, |
|
"eval_samples_per_second": 121.64, |
|
"eval_steps_per_second": 3.804, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.004649811101424005, |
|
"grad_norm": 1.6716474294662476, |
|
"learning_rate": 9.99961735929478e-06, |
|
"loss": 9.8281, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.004649811101424005, |
|
"eval_accuracy": 0.07951587963758655, |
|
"eval_loss": 9.8046875, |
|
"eval_runtime": 277.2247, |
|
"eval_samples_per_second": 121.804, |
|
"eval_steps_per_second": 3.809, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.004698246633730504, |
|
"grad_norm": 1.5188281536102295, |
|
"learning_rate": 9.999612515741549e-06, |
|
"loss": 9.8516, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.004698246633730504, |
|
"eval_accuracy": 0.07964241522774047, |
|
"eval_loss": 9.796875, |
|
"eval_runtime": 277.219, |
|
"eval_samples_per_second": 121.806, |
|
"eval_steps_per_second": 3.809, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.004746682166037005, |
|
"grad_norm": 1.5686155557632446, |
|
"learning_rate": 9.999607672188319e-06, |
|
"loss": 9.8281, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.004746682166037005, |
|
"eval_accuracy": 0.07971328210595982, |
|
"eval_loss": 9.7890625, |
|
"eval_runtime": 276.6601, |
|
"eval_samples_per_second": 122.052, |
|
"eval_steps_per_second": 3.817, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.004795117698343505, |
|
"grad_norm": 1.6188207864761353, |
|
"learning_rate": 9.999602828635087e-06, |
|
"loss": 9.7734, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.004795117698343505, |
|
"eval_accuracy": 0.0798379649672714, |
|
"eval_loss": 9.7890625, |
|
"eval_runtime": 276.9636, |
|
"eval_samples_per_second": 121.919, |
|
"eval_steps_per_second": 3.813, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.004843553230650005, |
|
"grad_norm": 1.6795498132705688, |
|
"learning_rate": 9.999597985081857e-06, |
|
"loss": 9.8125, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.004843553230650005, |
|
"eval_accuracy": 0.08018245673639325, |
|
"eval_loss": 9.78125, |
|
"eval_runtime": 277.3903, |
|
"eval_samples_per_second": 121.731, |
|
"eval_steps_per_second": 3.807, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.004891988762956505, |
|
"grad_norm": 1.516228199005127, |
|
"learning_rate": 9.999593141528627e-06, |
|
"loss": 9.8203, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.004891988762956505, |
|
"eval_accuracy": 0.08056970601332963, |
|
"eval_loss": 9.7734375, |
|
"eval_runtime": 277.0892, |
|
"eval_samples_per_second": 121.863, |
|
"eval_steps_per_second": 3.811, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.004940424295263005, |
|
"grad_norm": 1.485206961631775, |
|
"learning_rate": 9.999588297975395e-06, |
|
"loss": 9.8281, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.004940424295263005, |
|
"eval_accuracy": 0.0809254588999463, |
|
"eval_loss": 9.7734375, |
|
"eval_runtime": 277.0597, |
|
"eval_samples_per_second": 121.876, |
|
"eval_steps_per_second": 3.811, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.004988859827569505, |
|
"grad_norm": 1.6925771236419678, |
|
"learning_rate": 9.999583454422165e-06, |
|
"loss": 9.7734, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.004988859827569505, |
|
"eval_accuracy": 0.081113974059654, |
|
"eval_loss": 9.765625, |
|
"eval_runtime": 276.5455, |
|
"eval_samples_per_second": 122.103, |
|
"eval_steps_per_second": 3.819, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.005037295359876005, |
|
"grad_norm": 1.6215219497680664, |
|
"learning_rate": 9.999578610868935e-06, |
|
"loss": 9.7891, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.005037295359876005, |
|
"eval_accuracy": 0.08127527726448987, |
|
"eval_loss": 9.7578125, |
|
"eval_runtime": 275.7199, |
|
"eval_samples_per_second": 122.469, |
|
"eval_steps_per_second": 3.83, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.005085730892182505, |
|
"grad_norm": 1.5104496479034424, |
|
"learning_rate": 9.999573767315703e-06, |
|
"loss": 9.8047, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.005085730892182505, |
|
"eval_accuracy": 0.08140079964355813, |
|
"eval_loss": 9.7578125, |
|
"eval_runtime": 275.8702, |
|
"eval_samples_per_second": 122.402, |
|
"eval_steps_per_second": 3.828, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.0051341664244890055, |
|
"grad_norm": 1.5603739023208618, |
|
"learning_rate": 9.999568923762473e-06, |
|
"loss": 9.7578, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.0051341664244890055, |
|
"eval_accuracy": 0.0814951151211883, |
|
"eval_loss": 9.75, |
|
"eval_runtime": 276.6948, |
|
"eval_samples_per_second": 122.037, |
|
"eval_steps_per_second": 3.816, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.005182601956795505, |
|
"grad_norm": 1.6554555892944336, |
|
"learning_rate": 9.999564080209243e-06, |
|
"loss": 9.7734, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.005182601956795505, |
|
"eval_accuracy": 0.08162946691114582, |
|
"eval_loss": 9.75, |
|
"eval_runtime": 277.5011, |
|
"eval_samples_per_second": 121.682, |
|
"eval_steps_per_second": 3.805, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.0052310374891020054, |
|
"grad_norm": 1.4874709844589233, |
|
"learning_rate": 9.99955923665601e-06, |
|
"loss": 9.7891, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.0052310374891020054, |
|
"eval_accuracy": 0.08175817366791184, |
|
"eval_loss": 9.7421875, |
|
"eval_runtime": 277.2974, |
|
"eval_samples_per_second": 121.772, |
|
"eval_steps_per_second": 3.808, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.005279473021408505, |
|
"grad_norm": 1.5930671691894531, |
|
"learning_rate": 9.99955439310278e-06, |
|
"loss": 9.75, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.005279473021408505, |
|
"eval_accuracy": 0.081921676988248, |
|
"eval_loss": 9.734375, |
|
"eval_runtime": 278.2891, |
|
"eval_samples_per_second": 121.338, |
|
"eval_steps_per_second": 3.795, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.005327908553715005, |
|
"grad_norm": 1.7005099058151245, |
|
"learning_rate": 9.99954954954955e-06, |
|
"loss": 9.75, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.005327908553715005, |
|
"eval_accuracy": 0.08213621719841287, |
|
"eval_loss": 9.734375, |
|
"eval_runtime": 276.9822, |
|
"eval_samples_per_second": 121.91, |
|
"eval_steps_per_second": 3.813, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.005376344086021506, |
|
"grad_norm": 1.5735907554626465, |
|
"learning_rate": 9.99954470599632e-06, |
|
"loss": 9.7266, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.005376344086021506, |
|
"eval_accuracy": 0.08227051109059406, |
|
"eval_loss": 9.7265625, |
|
"eval_runtime": 277.2678, |
|
"eval_samples_per_second": 121.785, |
|
"eval_steps_per_second": 3.809, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.005424779618328005, |
|
"grad_norm": 1.473027229309082, |
|
"learning_rate": 9.99953986244309e-06, |
|
"loss": 9.7656, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.005424779618328005, |
|
"eval_accuracy": 0.08236607137041518, |
|
"eval_loss": 9.71875, |
|
"eval_runtime": 276.699, |
|
"eval_samples_per_second": 122.035, |
|
"eval_steps_per_second": 3.816, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.005473215150634506, |
|
"grad_norm": 1.4636644124984741, |
|
"learning_rate": 9.999535018889858e-06, |
|
"loss": 9.7812, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.005473215150634506, |
|
"eval_accuracy": 0.0823751613212979, |
|
"eval_loss": 9.71875, |
|
"eval_runtime": 276.4525, |
|
"eval_samples_per_second": 122.144, |
|
"eval_steps_per_second": 3.82, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.005521650682941005, |
|
"grad_norm": 1.4979418516159058, |
|
"learning_rate": 9.999530175336628e-06, |
|
"loss": 9.7734, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.005521650682941005, |
|
"eval_accuracy": 0.08236062897944081, |
|
"eval_loss": 9.7109375, |
|
"eval_runtime": 277.8616, |
|
"eval_samples_per_second": 121.525, |
|
"eval_steps_per_second": 3.8, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.005570086215247506, |
|
"grad_norm": 1.8021794557571411, |
|
"learning_rate": 9.999525331783398e-06, |
|
"loss": 9.7266, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.005570086215247506, |
|
"eval_accuracy": 0.08244796777502407, |
|
"eval_loss": 9.7109375, |
|
"eval_runtime": 277.0309, |
|
"eval_samples_per_second": 121.889, |
|
"eval_steps_per_second": 3.812, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.005618521747554005, |
|
"grad_norm": 1.8129605054855347, |
|
"learning_rate": 9.999520488230166e-06, |
|
"loss": 9.7266, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.005618521747554005, |
|
"eval_accuracy": 0.08262273221285504, |
|
"eval_loss": 9.703125, |
|
"eval_runtime": 278.4245, |
|
"eval_samples_per_second": 121.279, |
|
"eval_steps_per_second": 3.793, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.005666957279860506, |
|
"grad_norm": 1.5428948402404785, |
|
"learning_rate": 9.999515644676936e-06, |
|
"loss": 9.7109, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.005666957279860506, |
|
"eval_accuracy": 0.08277873777115737, |
|
"eval_loss": 9.6953125, |
|
"eval_runtime": 276.3715, |
|
"eval_samples_per_second": 122.18, |
|
"eval_steps_per_second": 3.821, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.005715392812167006, |
|
"grad_norm": 1.7619973421096802, |
|
"learning_rate": 9.999510801123706e-06, |
|
"loss": 9.6719, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.005715392812167006, |
|
"eval_accuracy": 0.08289928094146184, |
|
"eval_loss": 9.6953125, |
|
"eval_runtime": 277.665, |
|
"eval_samples_per_second": 121.611, |
|
"eval_steps_per_second": 3.803, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.005763828344473506, |
|
"grad_norm": 1.5316611528396606, |
|
"learning_rate": 9.999505957570474e-06, |
|
"loss": 9.6953, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.005763828344473506, |
|
"eval_accuracy": 0.08300393117216567, |
|
"eval_loss": 9.6875, |
|
"eval_runtime": 276.1306, |
|
"eval_samples_per_second": 122.286, |
|
"eval_steps_per_second": 3.824, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.005812263876780006, |
|
"grad_norm": 1.7051466703414917, |
|
"learning_rate": 9.999501114017244e-06, |
|
"loss": 9.6719, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.005812263876780006, |
|
"eval_accuracy": 0.08307213475267416, |
|
"eval_loss": 9.6875, |
|
"eval_runtime": 276.7696, |
|
"eval_samples_per_second": 122.004, |
|
"eval_steps_per_second": 3.815, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.005860699409086506, |
|
"grad_norm": 1.6584818363189697, |
|
"learning_rate": 9.999496270464012e-06, |
|
"loss": 9.6953, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.005860699409086506, |
|
"eval_accuracy": 0.08307086100159505, |
|
"eval_loss": 9.6796875, |
|
"eval_runtime": 277.4403, |
|
"eval_samples_per_second": 121.709, |
|
"eval_steps_per_second": 3.806, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.005909134941393006, |
|
"grad_norm": 1.7079665660858154, |
|
"learning_rate": 9.999491426910782e-06, |
|
"loss": 9.6875, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.005909134941393006, |
|
"eval_accuracy": 0.08310041781640795, |
|
"eval_loss": 9.6796875, |
|
"eval_runtime": 275.7807, |
|
"eval_samples_per_second": 122.442, |
|
"eval_steps_per_second": 3.829, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.0059575704736995055, |
|
"grad_norm": 1.6613987684249878, |
|
"learning_rate": 9.999486583357552e-06, |
|
"loss": 9.6719, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.0059575704736995055, |
|
"eval_accuracy": 0.08318772766310303, |
|
"eval_loss": 9.671875, |
|
"eval_runtime": 277.1478, |
|
"eval_samples_per_second": 121.838, |
|
"eval_steps_per_second": 3.81, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.006006006006006006, |
|
"grad_norm": 1.5512877702713013, |
|
"learning_rate": 9.99948173980432e-06, |
|
"loss": 9.6719, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.006006006006006006, |
|
"eval_accuracy": 0.08327202682542932, |
|
"eval_loss": 9.6640625, |
|
"eval_runtime": 276.8801, |
|
"eval_samples_per_second": 121.955, |
|
"eval_steps_per_second": 3.814, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.006054441538312506, |
|
"grad_norm": 1.6818300485610962, |
|
"learning_rate": 9.99947689625109e-06, |
|
"loss": 9.625, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.006054441538312506, |
|
"eval_accuracy": 0.08333837767709547, |
|
"eval_loss": 9.6640625, |
|
"eval_runtime": 275.4944, |
|
"eval_samples_per_second": 122.569, |
|
"eval_steps_per_second": 3.833, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.006102877070619006, |
|
"grad_norm": 1.497159719467163, |
|
"learning_rate": 9.99947205269786e-06, |
|
"loss": 9.6719, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.006102877070619006, |
|
"eval_accuracy": 0.08344485168775347, |
|
"eval_loss": 9.65625, |
|
"eval_runtime": 276.6512, |
|
"eval_samples_per_second": 122.056, |
|
"eval_steps_per_second": 3.817, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.006151312602925506, |
|
"grad_norm": 1.4452403783798218, |
|
"learning_rate": 9.99946720914463e-06, |
|
"loss": 9.6953, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.006151312602925506, |
|
"eval_accuracy": 0.08355928664265588, |
|
"eval_loss": 9.65625, |
|
"eval_runtime": 277.0928, |
|
"eval_samples_per_second": 121.862, |
|
"eval_steps_per_second": 3.811, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.006199748135232006, |
|
"grad_norm": 1.4734400510787964, |
|
"learning_rate": 9.9994623655914e-06, |
|
"loss": 9.6719, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.006199748135232006, |
|
"eval_accuracy": 0.08367464796197946, |
|
"eval_loss": 9.6484375, |
|
"eval_runtime": 278.2759, |
|
"eval_samples_per_second": 121.344, |
|
"eval_steps_per_second": 3.795, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.006248183667538506, |
|
"grad_norm": 1.4783730506896973, |
|
"learning_rate": 9.999457522038168e-06, |
|
"loss": 9.6797, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.006248183667538506, |
|
"eval_accuracy": 0.08380396264539687, |
|
"eval_loss": 9.640625, |
|
"eval_runtime": 276.6233, |
|
"eval_samples_per_second": 122.069, |
|
"eval_steps_per_second": 3.817, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.006296619199845007, |
|
"grad_norm": 1.7012325525283813, |
|
"learning_rate": 9.999452678484938e-06, |
|
"loss": 9.6484, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.006296619199845007, |
|
"eval_accuracy": 0.08385152566864622, |
|
"eval_loss": 9.640625, |
|
"eval_runtime": 276.6061, |
|
"eval_samples_per_second": 122.076, |
|
"eval_steps_per_second": 3.818, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.006345054732151506, |
|
"grad_norm": 1.5358777046203613, |
|
"learning_rate": 9.999447834931707e-06, |
|
"loss": 9.6719, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.006345054732151506, |
|
"eval_accuracy": 0.0839065864539294, |
|
"eval_loss": 9.6328125, |
|
"eval_runtime": 276.1031, |
|
"eval_samples_per_second": 122.299, |
|
"eval_steps_per_second": 3.825, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.006393490264458007, |
|
"grad_norm": 1.5622602701187134, |
|
"learning_rate": 9.999442991378476e-06, |
|
"loss": 9.6328, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.006393490264458007, |
|
"eval_accuracy": 0.08391194199823927, |
|
"eval_loss": 9.6328125, |
|
"eval_runtime": 276.4355, |
|
"eval_samples_per_second": 122.151, |
|
"eval_steps_per_second": 3.82, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.006441925796764506, |
|
"grad_norm": 1.5135513544082642, |
|
"learning_rate": 9.999438147825245e-06, |
|
"loss": 9.6719, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.006441925796764506, |
|
"eval_accuracy": 0.08392389788904997, |
|
"eval_loss": 9.625, |
|
"eval_runtime": 277.1805, |
|
"eval_samples_per_second": 121.823, |
|
"eval_steps_per_second": 3.81, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.006490361329071007, |
|
"grad_norm": 1.4829246997833252, |
|
"learning_rate": 9.999433304272015e-06, |
|
"loss": 9.6484, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.006490361329071007, |
|
"eval_accuracy": 0.08400426000258629, |
|
"eval_loss": 9.6171875, |
|
"eval_runtime": 276.9095, |
|
"eval_samples_per_second": 121.942, |
|
"eval_steps_per_second": 3.814, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.006538796861377506, |
|
"grad_norm": 1.506585955619812, |
|
"learning_rate": 9.999428460718784e-06, |
|
"loss": 9.6406, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.006538796861377506, |
|
"eval_accuracy": 0.084119823964127, |
|
"eval_loss": 9.6171875, |
|
"eval_runtime": 275.7802, |
|
"eval_samples_per_second": 122.442, |
|
"eval_steps_per_second": 3.829, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.0065872323936840066, |
|
"grad_norm": 1.597743272781372, |
|
"learning_rate": 9.999423617165553e-06, |
|
"loss": 9.6094, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.0065872323936840066, |
|
"eval_accuracy": 0.08430503895058428, |
|
"eval_loss": 9.609375, |
|
"eval_runtime": 278.6385, |
|
"eval_samples_per_second": 121.186, |
|
"eval_steps_per_second": 3.79, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.006635667925990507, |
|
"grad_norm": 1.5326935052871704, |
|
"learning_rate": 9.999418773612323e-06, |
|
"loss": 9.625, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.006635667925990507, |
|
"eval_accuracy": 0.08447033710198644, |
|
"eval_loss": 9.609375, |
|
"eval_runtime": 276.8784, |
|
"eval_samples_per_second": 121.956, |
|
"eval_steps_per_second": 3.814, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.0066841034582970065, |
|
"grad_norm": 1.5170117616653442, |
|
"learning_rate": 9.999413930059091e-06, |
|
"loss": 9.6562, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.0066841034582970065, |
|
"eval_accuracy": 0.0845831509191518, |
|
"eval_loss": 9.6015625, |
|
"eval_runtime": 276.8242, |
|
"eval_samples_per_second": 121.98, |
|
"eval_steps_per_second": 3.815, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.006732538990603507, |
|
"grad_norm": 1.5148200988769531, |
|
"learning_rate": 9.999409086505861e-06, |
|
"loss": 9.6172, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.006732538990603507, |
|
"eval_accuracy": 0.0846733846035512, |
|
"eval_loss": 9.6015625, |
|
"eval_runtime": 276.0924, |
|
"eval_samples_per_second": 122.303, |
|
"eval_steps_per_second": 3.825, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.0067809745229100065, |
|
"grad_norm": 1.584030032157898, |
|
"learning_rate": 9.999404242952631e-06, |
|
"loss": 9.6094, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.0067809745229100065, |
|
"eval_accuracy": 0.08471049707817424, |
|
"eval_loss": 9.59375, |
|
"eval_runtime": 276.8263, |
|
"eval_samples_per_second": 121.979, |
|
"eval_steps_per_second": 3.815, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.006829410055216507, |
|
"grad_norm": 1.5023019313812256, |
|
"learning_rate": 9.9993993993994e-06, |
|
"loss": 9.6562, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.006829410055216507, |
|
"eval_accuracy": 0.08469269351195492, |
|
"eval_loss": 9.5859375, |
|
"eval_runtime": 276.3818, |
|
"eval_samples_per_second": 122.175, |
|
"eval_steps_per_second": 3.821, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.006877845587523007, |
|
"grad_norm": 1.5090259313583374, |
|
"learning_rate": 9.99939455584617e-06, |
|
"loss": 9.6562, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.006877845587523007, |
|
"eval_accuracy": 0.08472097657568871, |
|
"eval_loss": 9.5859375, |
|
"eval_runtime": 277.9886, |
|
"eval_samples_per_second": 121.469, |
|
"eval_steps_per_second": 3.799, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.006926281119829507, |
|
"grad_norm": 1.4967498779296875, |
|
"learning_rate": 9.999389712292939e-06, |
|
"loss": 9.6562, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.006926281119829507, |
|
"eval_accuracy": 0.0847566126570155, |
|
"eval_loss": 9.578125, |
|
"eval_runtime": 276.5132, |
|
"eval_samples_per_second": 122.117, |
|
"eval_steps_per_second": 3.819, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.006974716652136007, |
|
"grad_norm": 1.8095794916152954, |
|
"learning_rate": 9.999384868739709e-06, |
|
"loss": 9.6016, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.006974716652136007, |
|
"eval_accuracy": 0.08490364405998777, |
|
"eval_loss": 9.578125, |
|
"eval_runtime": 276.1595, |
|
"eval_samples_per_second": 122.274, |
|
"eval_steps_per_second": 3.824, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.007023152184442507, |
|
"grad_norm": 1.7810986042022705, |
|
"learning_rate": 9.999380025186479e-06, |
|
"loss": 9.6094, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.007023152184442507, |
|
"eval_accuracy": 0.08503437723892511, |
|
"eval_loss": 9.5703125, |
|
"eval_runtime": 276.2917, |
|
"eval_samples_per_second": 122.215, |
|
"eval_steps_per_second": 3.822, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.007071587716749007, |
|
"grad_norm": 1.5788795948028564, |
|
"learning_rate": 9.999375181633247e-06, |
|
"loss": 9.5938, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.007071587716749007, |
|
"eval_accuracy": 0.08506937644471235, |
|
"eval_loss": 9.5703125, |
|
"eval_runtime": 276.3857, |
|
"eval_samples_per_second": 122.173, |
|
"eval_steps_per_second": 3.821, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.007120023249055507, |
|
"grad_norm": 1.8074451684951782, |
|
"learning_rate": 9.999370338080017e-06, |
|
"loss": 9.5703, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.007120023249055507, |
|
"eval_accuracy": 0.08507481883568672, |
|
"eval_loss": 9.5625, |
|
"eval_runtime": 274.2982, |
|
"eval_samples_per_second": 123.103, |
|
"eval_steps_per_second": 3.85, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.007168458781362007, |
|
"grad_norm": 1.7187494039535522, |
|
"learning_rate": 9.999365494526787e-06, |
|
"loss": 9.5859, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.007168458781362007, |
|
"eval_accuracy": 0.08513216658313465, |
|
"eval_loss": 9.5625, |
|
"eval_runtime": 275.5423, |
|
"eval_samples_per_second": 122.547, |
|
"eval_steps_per_second": 3.832, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.007216894313668508, |
|
"grad_norm": 1.6044690608978271, |
|
"learning_rate": 9.999360650973555e-06, |
|
"loss": 9.625, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.007216894313668508, |
|
"eval_accuracy": 0.08522213972754059, |
|
"eval_loss": 9.5546875, |
|
"eval_runtime": 275.5733, |
|
"eval_samples_per_second": 122.534, |
|
"eval_steps_per_second": 3.832, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.007265329845975007, |
|
"grad_norm": 1.7572296857833862, |
|
"learning_rate": 9.999355807420325e-06, |
|
"loss": 9.5859, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.007265329845975007, |
|
"eval_accuracy": 0.0853520623376094, |
|
"eval_loss": 9.546875, |
|
"eval_runtime": 275.4002, |
|
"eval_samples_per_second": 122.611, |
|
"eval_steps_per_second": 3.834, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.0073137653782815075, |
|
"grad_norm": 1.5954887866973877, |
|
"learning_rate": 9.999350963867095e-06, |
|
"loss": 9.5625, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.0073137653782815075, |
|
"eval_accuracy": 0.0855014675494109, |
|
"eval_loss": 9.546875, |
|
"eval_runtime": 275.8854, |
|
"eval_samples_per_second": 122.395, |
|
"eval_steps_per_second": 3.828, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.007362200910588007, |
|
"grad_norm": 1.6131614446640015, |
|
"learning_rate": 9.999346120313863e-06, |
|
"loss": 9.5547, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.007362200910588007, |
|
"eval_accuracy": 0.0856389168704017, |
|
"eval_loss": 9.5390625, |
|
"eval_runtime": 276.1363, |
|
"eval_samples_per_second": 122.284, |
|
"eval_steps_per_second": 3.824, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.0074106364428945075, |
|
"grad_norm": 1.4832433462142944, |
|
"learning_rate": 9.999341276760633e-06, |
|
"loss": 9.5703, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.0074106364428945075, |
|
"eval_accuracy": 0.08576559720499642, |
|
"eval_loss": 9.5390625, |
|
"eval_runtime": 274.8867, |
|
"eval_samples_per_second": 122.84, |
|
"eval_steps_per_second": 3.842, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.007459071975201007, |
|
"grad_norm": 1.7311336994171143, |
|
"learning_rate": 9.999336433207402e-06, |
|
"loss": 9.5391, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.007459071975201007, |
|
"eval_accuracy": 0.08582592668792499, |
|
"eval_loss": 9.53125, |
|
"eval_runtime": 275.7658, |
|
"eval_samples_per_second": 122.448, |
|
"eval_steps_per_second": 3.829, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.0075075075075075074, |
|
"grad_norm": 1.9239146709442139, |
|
"learning_rate": 9.99933158965417e-06, |
|
"loss": 9.5391, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.0075075075075075074, |
|
"eval_accuracy": 0.08592603194318746, |
|
"eval_loss": 9.53125, |
|
"eval_runtime": 274.2731, |
|
"eval_samples_per_second": 123.115, |
|
"eval_steps_per_second": 3.85, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.007555943039814008, |
|
"grad_norm": 1.8369977474212646, |
|
"learning_rate": 9.99932674610094e-06, |
|
"loss": 9.5, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.007555943039814008, |
|
"eval_accuracy": 0.08610542820312428, |
|
"eval_loss": 9.5234375, |
|
"eval_runtime": 274.5904, |
|
"eval_samples_per_second": 122.972, |
|
"eval_steps_per_second": 3.846, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.007604378572120507, |
|
"grad_norm": 1.5703845024108887, |
|
"learning_rate": 9.99932190254771e-06, |
|
"loss": 9.5547, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.007604378572120507, |
|
"eval_accuracy": 0.08628861676741025, |
|
"eval_loss": 9.515625, |
|
"eval_runtime": 275.108, |
|
"eval_samples_per_second": 122.741, |
|
"eval_steps_per_second": 3.838, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.007652814104427008, |
|
"grad_norm": 1.5686722993850708, |
|
"learning_rate": 9.999317058994478e-06, |
|
"loss": 9.5391, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.007652814104427008, |
|
"eval_accuracy": 0.08633840885504802, |
|
"eval_loss": 9.515625, |
|
"eval_runtime": 274.8868, |
|
"eval_samples_per_second": 122.84, |
|
"eval_steps_per_second": 3.842, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.007701249636733507, |
|
"grad_norm": 1.6259181499481201, |
|
"learning_rate": 9.999312215441248e-06, |
|
"loss": 9.5312, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.007701249636733507, |
|
"eval_accuracy": 0.08635719668346484, |
|
"eval_loss": 9.515625, |
|
"eval_runtime": 274.9284, |
|
"eval_samples_per_second": 122.821, |
|
"eval_steps_per_second": 3.841, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.007749685169040008, |
|
"grad_norm": 1.6887496709823608, |
|
"learning_rate": 9.999307371888018e-06, |
|
"loss": 9.5391, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.007749685169040008, |
|
"eval_accuracy": 0.08644670664566019, |
|
"eval_loss": 9.5078125, |
|
"eval_runtime": 274.7492, |
|
"eval_samples_per_second": 122.901, |
|
"eval_steps_per_second": 3.844, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.007798120701346508, |
|
"grad_norm": 1.6951507329940796, |
|
"learning_rate": 9.999302528334788e-06, |
|
"loss": 9.4688, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.007798120701346508, |
|
"eval_accuracy": 0.08658233218669682, |
|
"eval_loss": 9.5, |
|
"eval_runtime": 275.2855, |
|
"eval_samples_per_second": 122.662, |
|
"eval_steps_per_second": 3.836, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.007846556233653008, |
|
"grad_norm": 1.4970242977142334, |
|
"learning_rate": 9.999297684781556e-06, |
|
"loss": 9.5547, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.007846556233653008, |
|
"eval_accuracy": 0.0867301741585376, |
|
"eval_loss": 9.5, |
|
"eval_runtime": 272.7321, |
|
"eval_samples_per_second": 123.81, |
|
"eval_steps_per_second": 3.872, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.007894991765959508, |
|
"grad_norm": 1.5665501356124878, |
|
"learning_rate": 9.999292841228326e-06, |
|
"loss": 9.5078, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.007894991765959508, |
|
"eval_accuracy": 0.08686166000856714, |
|
"eval_loss": 9.4921875, |
|
"eval_runtime": 274.5735, |
|
"eval_samples_per_second": 122.98, |
|
"eval_steps_per_second": 3.846, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.007943427298266009, |
|
"grad_norm": 1.5631929636001587, |
|
"learning_rate": 9.999287997675096e-06, |
|
"loss": 9.5078, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.007943427298266009, |
|
"eval_accuracy": 0.08703300847759506, |
|
"eval_loss": 9.4921875, |
|
"eval_runtime": 275.4664, |
|
"eval_samples_per_second": 122.581, |
|
"eval_steps_per_second": 3.833, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.007991862830572507, |
|
"grad_norm": 1.5439754724502563, |
|
"learning_rate": 9.999283154121864e-06, |
|
"loss": 9.5, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.007991862830572507, |
|
"eval_accuracy": 0.08722841347268517, |
|
"eval_loss": 9.484375, |
|
"eval_runtime": 275.1068, |
|
"eval_samples_per_second": 122.741, |
|
"eval_steps_per_second": 3.839, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.008040298362879008, |
|
"grad_norm": 1.5011335611343384, |
|
"learning_rate": 9.999278310568634e-06, |
|
"loss": 9.5312, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.008040298362879008, |
|
"eval_accuracy": 0.08746153886904974, |
|
"eval_loss": 9.484375, |
|
"eval_runtime": 274.8226, |
|
"eval_samples_per_second": 122.868, |
|
"eval_steps_per_second": 3.842, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.008088733895185508, |
|
"grad_norm": 1.5114489793777466, |
|
"learning_rate": 9.999273467015404e-06, |
|
"loss": 9.5156, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.008088733895185508, |
|
"eval_accuracy": 0.08767709229030025, |
|
"eval_loss": 9.4765625, |
|
"eval_runtime": 275.1958, |
|
"eval_samples_per_second": 122.702, |
|
"eval_steps_per_second": 3.837, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.008137169427492008, |
|
"grad_norm": 1.6843675374984741, |
|
"learning_rate": 9.999268623462172e-06, |
|
"loss": 9.4844, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.008137169427492008, |
|
"eval_accuracy": 0.08782279204441709, |
|
"eval_loss": 9.4765625, |
|
"eval_runtime": 275.685, |
|
"eval_samples_per_second": 122.484, |
|
"eval_steps_per_second": 3.83, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.008185604959798509, |
|
"grad_norm": 1.6421033143997192, |
|
"learning_rate": 9.999263779908942e-06, |
|
"loss": 9.4688, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.008185604959798509, |
|
"eval_accuracy": 0.08784629854160422, |
|
"eval_loss": 9.46875, |
|
"eval_runtime": 276.249, |
|
"eval_samples_per_second": 122.234, |
|
"eval_steps_per_second": 3.823, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.008234040492105008, |
|
"grad_norm": 1.6387994289398193, |
|
"learning_rate": 9.999258936355712e-06, |
|
"loss": 9.5156, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.008234040492105008, |
|
"eval_accuracy": 0.08786224937898121, |
|
"eval_loss": 9.4609375, |
|
"eval_runtime": 276.6173, |
|
"eval_samples_per_second": 122.071, |
|
"eval_steps_per_second": 3.818, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.008282476024411508, |
|
"grad_norm": 1.5107547044754028, |
|
"learning_rate": 9.99925409280248e-06, |
|
"loss": 9.4922, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.008282476024411508, |
|
"eval_accuracy": 0.08785258045033527, |
|
"eval_loss": 9.4609375, |
|
"eval_runtime": 276.9505, |
|
"eval_samples_per_second": 121.924, |
|
"eval_steps_per_second": 3.813, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.008330911556718008, |
|
"grad_norm": 1.5190666913986206, |
|
"learning_rate": 9.99924924924925e-06, |
|
"loss": 9.4844, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.008330911556718008, |
|
"eval_accuracy": 0.08782476056881207, |
|
"eval_loss": 9.453125, |
|
"eval_runtime": 275.3735, |
|
"eval_samples_per_second": 122.623, |
|
"eval_steps_per_second": 3.835, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.008379347089024509, |
|
"grad_norm": 1.560573935508728, |
|
"learning_rate": 9.99924440569602e-06, |
|
"loss": 9.5234, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.008379347089024509, |
|
"eval_accuracy": 0.08786801020772535, |
|
"eval_loss": 9.453125, |
|
"eval_runtime": 276.2814, |
|
"eval_samples_per_second": 122.22, |
|
"eval_steps_per_second": 3.822, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.00842778262133101, |
|
"grad_norm": 1.7032357454299927, |
|
"learning_rate": 9.999239562142788e-06, |
|
"loss": 9.4844, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.00842778262133101, |
|
"eval_accuracy": 0.08790746754228948, |
|
"eval_loss": 9.4453125, |
|
"eval_runtime": 275.8479, |
|
"eval_samples_per_second": 122.412, |
|
"eval_steps_per_second": 3.828, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.008476218153637508, |
|
"grad_norm": 1.550713300704956, |
|
"learning_rate": 9.999234718589558e-06, |
|
"loss": 9.4219, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.008476218153637508, |
|
"eval_accuracy": 0.08798829283803639, |
|
"eval_loss": 9.4453125, |
|
"eval_runtime": 275.7584, |
|
"eval_samples_per_second": 122.451, |
|
"eval_steps_per_second": 3.829, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.008524653685944008, |
|
"grad_norm": 1.6866670846939087, |
|
"learning_rate": 9.999229875036328e-06, |
|
"loss": 9.4062, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.008524653685944008, |
|
"eval_accuracy": 0.08809621429310245, |
|
"eval_loss": 9.4375, |
|
"eval_runtime": 276.5351, |
|
"eval_samples_per_second": 122.107, |
|
"eval_steps_per_second": 3.819, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.008573089218250509, |
|
"grad_norm": 1.622749924659729, |
|
"learning_rate": 9.999225031483096e-06, |
|
"loss": 9.4375, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.008573089218250509, |
|
"eval_accuracy": 0.08827711589522366, |
|
"eval_loss": 9.4375, |
|
"eval_runtime": 278.1964, |
|
"eval_samples_per_second": 121.378, |
|
"eval_steps_per_second": 3.796, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.008621524750557009, |
|
"grad_norm": 1.5966665744781494, |
|
"learning_rate": 9.999220187929867e-06, |
|
"loss": 9.4375, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.008621524750557009, |
|
"eval_accuracy": 0.08849081658763186, |
|
"eval_loss": 9.4296875, |
|
"eval_runtime": 277.208, |
|
"eval_samples_per_second": 121.811, |
|
"eval_steps_per_second": 3.809, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.00866996028286351, |
|
"grad_norm": 1.499353289604187, |
|
"learning_rate": 9.999215344376635e-06, |
|
"loss": 9.4688, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.00866996028286351, |
|
"eval_accuracy": 0.08868196609616225, |
|
"eval_loss": 9.4296875, |
|
"eval_runtime": 276.9344, |
|
"eval_samples_per_second": 121.931, |
|
"eval_steps_per_second": 3.813, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.008718395815170008, |
|
"grad_norm": 1.5957542657852173, |
|
"learning_rate": 9.999210500823405e-06, |
|
"loss": 9.4453, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.008718395815170008, |
|
"eval_accuracy": 0.08884607734314978, |
|
"eval_loss": 9.421875, |
|
"eval_runtime": 277.5515, |
|
"eval_samples_per_second": 121.66, |
|
"eval_steps_per_second": 3.805, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.008766831347476509, |
|
"grad_norm": 1.519926905632019, |
|
"learning_rate": 9.999205657270175e-06, |
|
"loss": 9.4219, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.008766831347476509, |
|
"eval_accuracy": 0.0889903586017467, |
|
"eval_loss": 9.421875, |
|
"eval_runtime": 277.0317, |
|
"eval_samples_per_second": 121.889, |
|
"eval_steps_per_second": 3.812, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.008815266879783009, |
|
"grad_norm": 1.5913316011428833, |
|
"learning_rate": 9.999200813716943e-06, |
|
"loss": 9.4141, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.008815266879783009, |
|
"eval_accuracy": 0.08903491094062725, |
|
"eval_loss": 9.4140625, |
|
"eval_runtime": 277.095, |
|
"eval_samples_per_second": 121.861, |
|
"eval_steps_per_second": 3.811, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.00886370241208951, |
|
"grad_norm": 1.5328583717346191, |
|
"learning_rate": 9.999195970163713e-06, |
|
"loss": 9.4375, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.00886370241208951, |
|
"eval_accuracy": 0.08903395562731792, |
|
"eval_loss": 9.40625, |
|
"eval_runtime": 276.2305, |
|
"eval_samples_per_second": 122.242, |
|
"eval_steps_per_second": 3.823, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.008912137944396008, |
|
"grad_norm": 1.5967031717300415, |
|
"learning_rate": 9.999191126610483e-06, |
|
"loss": 9.3984, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.008912137944396008, |
|
"eval_accuracy": 0.08904220606044394, |
|
"eval_loss": 9.40625, |
|
"eval_runtime": 278.9395, |
|
"eval_samples_per_second": 121.055, |
|
"eval_steps_per_second": 3.786, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.008960573476702509, |
|
"grad_norm": 1.596799612045288, |
|
"learning_rate": 9.999186283057251e-06, |
|
"loss": 9.4297, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.008960573476702509, |
|
"eval_accuracy": 0.08908192393500153, |
|
"eval_loss": 9.3984375, |
|
"eval_runtime": 277.1703, |
|
"eval_samples_per_second": 121.828, |
|
"eval_steps_per_second": 3.81, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.009009009009009009, |
|
"grad_norm": 1.5406758785247803, |
|
"learning_rate": 9.999181439504021e-06, |
|
"loss": 9.3984, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.009009009009009009, |
|
"eval_accuracy": 0.089139011142456, |
|
"eval_loss": 9.3984375, |
|
"eval_runtime": 276.7972, |
|
"eval_samples_per_second": 121.992, |
|
"eval_steps_per_second": 3.815, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.00905744454131551, |
|
"grad_norm": 1.6137006282806396, |
|
"learning_rate": 9.999176595950791e-06, |
|
"loss": 9.3906, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.00905744454131551, |
|
"eval_accuracy": 0.08919705366321981, |
|
"eval_loss": 9.390625, |
|
"eval_runtime": 277.048, |
|
"eval_samples_per_second": 121.881, |
|
"eval_steps_per_second": 3.812, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.00910588007362201, |
|
"grad_norm": 1.5155887603759766, |
|
"learning_rate": 9.999171752397559e-06, |
|
"loss": 9.4219, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.00910588007362201, |
|
"eval_accuracy": 0.08929362715412657, |
|
"eval_loss": 9.390625, |
|
"eval_runtime": 277.0594, |
|
"eval_samples_per_second": 121.876, |
|
"eval_steps_per_second": 3.811, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.009154315605928508, |
|
"grad_norm": 1.7281869649887085, |
|
"learning_rate": 9.999166908844329e-06, |
|
"loss": 9.4062, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.009154315605928508, |
|
"eval_accuracy": 0.08947652622953092, |
|
"eval_loss": 9.3828125, |
|
"eval_runtime": 277.1329, |
|
"eval_samples_per_second": 121.844, |
|
"eval_steps_per_second": 3.81, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.009202751138235009, |
|
"grad_norm": 1.5536915063858032, |
|
"learning_rate": 9.999162065291099e-06, |
|
"loss": 9.375, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.009202751138235009, |
|
"eval_accuracy": 0.08965256441844101, |
|
"eval_loss": 9.3828125, |
|
"eval_runtime": 277.1552, |
|
"eval_samples_per_second": 121.834, |
|
"eval_steps_per_second": 3.81, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.00925118667054151, |
|
"grad_norm": 1.6295173168182373, |
|
"learning_rate": 9.999157221737867e-06, |
|
"loss": 9.3828, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.00925118667054151, |
|
"eval_accuracy": 0.08979913263920268, |
|
"eval_loss": 9.375, |
|
"eval_runtime": 276.596, |
|
"eval_samples_per_second": 122.081, |
|
"eval_steps_per_second": 3.818, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.00929962220284801, |
|
"grad_norm": 1.5873547792434692, |
|
"learning_rate": 9.999152378184637e-06, |
|
"loss": 9.3906, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.00929962220284801, |
|
"eval_accuracy": 0.08980952529005266, |
|
"eval_loss": 9.375, |
|
"eval_runtime": 275.7787, |
|
"eval_samples_per_second": 122.442, |
|
"eval_steps_per_second": 3.829, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.00934805773515451, |
|
"grad_norm": 1.4720993041992188, |
|
"learning_rate": 9.999147534631407e-06, |
|
"loss": 9.3906, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.00934805773515451, |
|
"eval_accuracy": 0.08985268808230146, |
|
"eval_loss": 9.3671875, |
|
"eval_runtime": 276.9238, |
|
"eval_samples_per_second": 121.936, |
|
"eval_steps_per_second": 3.813, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.009396493267461009, |
|
"grad_norm": 1.603896975517273, |
|
"learning_rate": 9.999142691078175e-06, |
|
"loss": 9.4141, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.009396493267461009, |
|
"eval_accuracy": 0.08978121327743072, |
|
"eval_loss": 9.3671875, |
|
"eval_runtime": 275.8739, |
|
"eval_samples_per_second": 122.4, |
|
"eval_steps_per_second": 3.828, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.00944492879976751, |
|
"grad_norm": 1.6265010833740234, |
|
"learning_rate": 9.999137847524946e-06, |
|
"loss": 9.3203, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.00944492879976751, |
|
"eval_accuracy": 0.08979632459705102, |
|
"eval_loss": 9.359375, |
|
"eval_runtime": 276.051, |
|
"eval_samples_per_second": 122.322, |
|
"eval_steps_per_second": 3.825, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.00949336433207401, |
|
"grad_norm": 1.609118103981018, |
|
"learning_rate": 9.999133003971715e-06, |
|
"loss": 9.3906, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.00949336433207401, |
|
"eval_accuracy": 0.08980888841451311, |
|
"eval_loss": 9.359375, |
|
"eval_runtime": 274.9932, |
|
"eval_samples_per_second": 122.792, |
|
"eval_steps_per_second": 3.84, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.00954179986438051, |
|
"grad_norm": 1.6511759757995605, |
|
"learning_rate": 9.999128160418484e-06, |
|
"loss": 9.3594, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.00954179986438051, |
|
"eval_accuracy": 0.08997595244809313, |
|
"eval_loss": 9.3515625, |
|
"eval_runtime": 275.7963, |
|
"eval_samples_per_second": 122.435, |
|
"eval_steps_per_second": 3.829, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.00959023539668701, |
|
"grad_norm": 1.5398412942886353, |
|
"learning_rate": 9.999123316865254e-06, |
|
"loss": 9.3516, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.00959023539668701, |
|
"eval_accuracy": 0.09011600716901846, |
|
"eval_loss": 9.3515625, |
|
"eval_runtime": 275.8953, |
|
"eval_samples_per_second": 122.391, |
|
"eval_steps_per_second": 3.828, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.009638670928993509, |
|
"grad_norm": 1.5655171871185303, |
|
"learning_rate": 9.999118473312022e-06, |
|
"loss": 9.3438, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.009638670928993509, |
|
"eval_accuracy": 0.0902209468886039, |
|
"eval_loss": 9.34375, |
|
"eval_runtime": 276.4152, |
|
"eval_samples_per_second": 122.16, |
|
"eval_steps_per_second": 3.82, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.00968710646130001, |
|
"grad_norm": 1.5900487899780273, |
|
"learning_rate": 9.999113629758792e-06, |
|
"loss": 9.3516, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.00968710646130001, |
|
"eval_accuracy": 0.09037614187803769, |
|
"eval_loss": 9.34375, |
|
"eval_runtime": 274.9294, |
|
"eval_samples_per_second": 122.821, |
|
"eval_steps_per_second": 3.841, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.00973554199360651, |
|
"grad_norm": 1.549442172050476, |
|
"learning_rate": 9.999108786205562e-06, |
|
"loss": 9.3125, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.00973554199360651, |
|
"eval_accuracy": 0.09055328012469792, |
|
"eval_loss": 9.3359375, |
|
"eval_runtime": 275.1689, |
|
"eval_samples_per_second": 122.714, |
|
"eval_steps_per_second": 3.838, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.00978397752591301, |
|
"grad_norm": 1.5649633407592773, |
|
"learning_rate": 9.99910394265233e-06, |
|
"loss": 9.3516, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.00978397752591301, |
|
"eval_accuracy": 0.0907463113109588, |
|
"eval_loss": 9.3359375, |
|
"eval_runtime": 276.2363, |
|
"eval_samples_per_second": 122.24, |
|
"eval_steps_per_second": 3.823, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.009832413058219509, |
|
"grad_norm": 1.6223474740982056, |
|
"learning_rate": 9.9990990990991e-06, |
|
"loss": 9.3359, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.009832413058219509, |
|
"eval_accuracy": 0.09079908513407721, |
|
"eval_loss": 9.328125, |
|
"eval_runtime": 276.2485, |
|
"eval_samples_per_second": 122.234, |
|
"eval_steps_per_second": 3.823, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.00988084859052601, |
|
"grad_norm": 1.5935430526733398, |
|
"learning_rate": 9.999094255545868e-06, |
|
"loss": 9.3516, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.00988084859052601, |
|
"eval_accuracy": 0.09073201056020702, |
|
"eval_loss": 9.328125, |
|
"eval_runtime": 277.2721, |
|
"eval_samples_per_second": 121.783, |
|
"eval_steps_per_second": 3.809, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.00992928412283251, |
|
"grad_norm": 1.6288846731185913, |
|
"learning_rate": 9.999089411992638e-06, |
|
"loss": 9.3281, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.00992928412283251, |
|
"eval_accuracy": 0.09064690082901221, |
|
"eval_loss": 9.3203125, |
|
"eval_runtime": 276.8611, |
|
"eval_samples_per_second": 121.964, |
|
"eval_steps_per_second": 3.814, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.00997771965513901, |
|
"grad_norm": 1.4847911596298218, |
|
"learning_rate": 9.999084568439408e-06, |
|
"loss": 9.375, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.00997771965513901, |
|
"eval_accuracy": 0.0904633648780683, |
|
"eval_loss": 9.3125, |
|
"eval_runtime": 275.7135, |
|
"eval_samples_per_second": 122.471, |
|
"eval_steps_per_second": 3.83, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.01002615518744551, |
|
"grad_norm": 1.6263271570205688, |
|
"learning_rate": 9.999079724886176e-06, |
|
"loss": 9.2812, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.01002615518744551, |
|
"eval_accuracy": 0.09040885412166019, |
|
"eval_loss": 9.3125, |
|
"eval_runtime": 275.4549, |
|
"eval_samples_per_second": 122.586, |
|
"eval_steps_per_second": 3.834, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.01007459071975201, |
|
"grad_norm": 1.5669511556625366, |
|
"learning_rate": 9.999074881332946e-06, |
|
"loss": 9.3281, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.01007459071975201, |
|
"eval_accuracy": 0.09057241533977267, |
|
"eval_loss": 9.3046875, |
|
"eval_runtime": 276.0055, |
|
"eval_samples_per_second": 122.342, |
|
"eval_steps_per_second": 3.826, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.01012302625205851, |
|
"grad_norm": 1.5233213901519775, |
|
"learning_rate": 9.999070037779716e-06, |
|
"loss": 9.3281, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.01012302625205851, |
|
"eval_accuracy": 0.09081425435147383, |
|
"eval_loss": 9.3046875, |
|
"eval_runtime": 275.4632, |
|
"eval_samples_per_second": 122.583, |
|
"eval_steps_per_second": 3.834, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.01017146178436501, |
|
"grad_norm": 1.6155483722686768, |
|
"learning_rate": 9.999065194226484e-06, |
|
"loss": 9.3594, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.01017146178436501, |
|
"eval_accuracy": 0.0911761733512689, |
|
"eval_loss": 9.296875, |
|
"eval_runtime": 276.0337, |
|
"eval_samples_per_second": 122.329, |
|
"eval_steps_per_second": 3.826, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.01021989731667151, |
|
"grad_norm": 1.5271143913269043, |
|
"learning_rate": 9.999060350673254e-06, |
|
"loss": 9.3438, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.01021989731667151, |
|
"eval_accuracy": 0.09151863869821945, |
|
"eval_loss": 9.296875, |
|
"eval_runtime": 274.7088, |
|
"eval_samples_per_second": 122.919, |
|
"eval_steps_per_second": 3.844, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.010268332848978011, |
|
"grad_norm": 1.6638132333755493, |
|
"learning_rate": 9.999055507120024e-06, |
|
"loss": 9.2891, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.010268332848978011, |
|
"eval_accuracy": 0.09163796601522115, |
|
"eval_loss": 9.2890625, |
|
"eval_runtime": 275.693, |
|
"eval_samples_per_second": 122.48, |
|
"eval_steps_per_second": 3.83, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.01031676838128451, |
|
"grad_norm": 1.5015349388122559, |
|
"learning_rate": 9.999050663566794e-06, |
|
"loss": 9.3438, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.01031676838128451, |
|
"eval_accuracy": 0.09161217255586926, |
|
"eval_loss": 9.2890625, |
|
"eval_runtime": 274.9875, |
|
"eval_samples_per_second": 122.795, |
|
"eval_steps_per_second": 3.84, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.01036520391359101, |
|
"grad_norm": 1.5039061307907104, |
|
"learning_rate": 9.999045820013564e-06, |
|
"loss": 9.3047, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.01036520391359101, |
|
"eval_accuracy": 0.09152989981571427, |
|
"eval_loss": 9.28125, |
|
"eval_runtime": 274.8846, |
|
"eval_samples_per_second": 122.841, |
|
"eval_steps_per_second": 3.842, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.01041363944589751, |
|
"grad_norm": 1.6265090703964233, |
|
"learning_rate": 9.999040976460332e-06, |
|
"loss": 9.2656, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.01041363944589751, |
|
"eval_accuracy": 0.09139931138121775, |
|
"eval_loss": 9.28125, |
|
"eval_runtime": 274.6634, |
|
"eval_samples_per_second": 122.94, |
|
"eval_steps_per_second": 3.845, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.010462074978204011, |
|
"grad_norm": 1.5140306949615479, |
|
"learning_rate": 9.999036132907102e-06, |
|
"loss": 9.2734, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.010462074978204011, |
|
"eval_accuracy": 0.09134954824246813, |
|
"eval_loss": 9.2734375, |
|
"eval_runtime": 274.7537, |
|
"eval_samples_per_second": 122.899, |
|
"eval_steps_per_second": 3.843, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.010510510510510511, |
|
"grad_norm": 1.5547981262207031, |
|
"learning_rate": 9.999031289353872e-06, |
|
"loss": 9.2891, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.010510510510510511, |
|
"eval_accuracy": 0.09132992089629463, |
|
"eval_loss": 9.2734375, |
|
"eval_runtime": 275.3328, |
|
"eval_samples_per_second": 122.641, |
|
"eval_steps_per_second": 3.835, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.01055894604281701, |
|
"grad_norm": 1.5140680074691772, |
|
"learning_rate": 9.99902644580064e-06, |
|
"loss": 9.2969, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.01055894604281701, |
|
"eval_accuracy": 0.091310090907904, |
|
"eval_loss": 9.265625, |
|
"eval_runtime": 275.0207, |
|
"eval_samples_per_second": 122.78, |
|
"eval_steps_per_second": 3.84, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.01060738157512351, |
|
"grad_norm": 1.5878396034240723, |
|
"learning_rate": 9.99902160224741e-06, |
|
"loss": 9.25, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.01060738157512351, |
|
"eval_accuracy": 0.09137461797961599, |
|
"eval_loss": 9.265625, |
|
"eval_runtime": 275.3479, |
|
"eval_samples_per_second": 122.634, |
|
"eval_steps_per_second": 3.835, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.01065581710743001, |
|
"grad_norm": 1.5309175252914429, |
|
"learning_rate": 9.99901675869418e-06, |
|
"loss": 9.2578, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.01065581710743001, |
|
"eval_accuracy": 0.09149657964544039, |
|
"eval_loss": 9.2578125, |
|
"eval_runtime": 274.9496, |
|
"eval_samples_per_second": 122.812, |
|
"eval_steps_per_second": 3.841, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.010704252639736511, |
|
"grad_norm": 1.5207297801971436, |
|
"learning_rate": 9.999011915140948e-06, |
|
"loss": 9.25, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.010704252639736511, |
|
"eval_accuracy": 0.09163194464648355, |
|
"eval_loss": 9.2578125, |
|
"eval_runtime": 274.7916, |
|
"eval_samples_per_second": 122.882, |
|
"eval_steps_per_second": 3.843, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.010752688172043012, |
|
"grad_norm": 1.5458952188491821, |
|
"learning_rate": 9.999007071587717e-06, |
|
"loss": 9.2656, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.010752688172043012, |
|
"eval_accuracy": 0.09197788386001349, |
|
"eval_loss": 9.25, |
|
"eval_runtime": 274.1902, |
|
"eval_samples_per_second": 123.152, |
|
"eval_steps_per_second": 3.851, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.01080112370434951, |
|
"grad_norm": 1.468177080154419, |
|
"learning_rate": 9.999002228034487e-06, |
|
"loss": 9.2578, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.01080112370434951, |
|
"eval_accuracy": 0.09228760801445338, |
|
"eval_loss": 9.25, |
|
"eval_runtime": 274.801, |
|
"eval_samples_per_second": 122.878, |
|
"eval_steps_per_second": 3.843, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.01084955923665601, |
|
"grad_norm": 1.466130018234253, |
|
"learning_rate": 9.998997384481255e-06, |
|
"loss": 9.2734, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.01084955923665601, |
|
"eval_accuracy": 0.09260112447324241, |
|
"eval_loss": 9.2421875, |
|
"eval_runtime": 274.0532, |
|
"eval_samples_per_second": 123.213, |
|
"eval_steps_per_second": 3.853, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.010897994768962511, |
|
"grad_norm": 1.4513353109359741, |
|
"learning_rate": 9.998992540928025e-06, |
|
"loss": 9.2891, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.010897994768962511, |
|
"eval_accuracy": 0.09285208238471446, |
|
"eval_loss": 9.2421875, |
|
"eval_runtime": 274.2911, |
|
"eval_samples_per_second": 123.106, |
|
"eval_steps_per_second": 3.85, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.010946430301269012, |
|
"grad_norm": 1.6049507856369019, |
|
"learning_rate": 9.998987697374795e-06, |
|
"loss": 9.25, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.010946430301269012, |
|
"eval_accuracy": 0.09283341035185029, |
|
"eval_loss": 9.234375, |
|
"eval_runtime": 275.8109, |
|
"eval_samples_per_second": 122.428, |
|
"eval_steps_per_second": 3.829, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.01099486583357551, |
|
"grad_norm": 1.6145049333572388, |
|
"learning_rate": 9.998982853821563e-06, |
|
"loss": 9.2344, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.01099486583357551, |
|
"eval_accuracy": 0.09279450304616123, |
|
"eval_loss": 9.234375, |
|
"eval_runtime": 275.8634, |
|
"eval_samples_per_second": 122.405, |
|
"eval_steps_per_second": 3.828, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.01104330136588201, |
|
"grad_norm": 1.5092509984970093, |
|
"learning_rate": 9.998978010268333e-06, |
|
"loss": 9.2656, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.01104330136588201, |
|
"eval_accuracy": 0.09270768533056489, |
|
"eval_loss": 9.2265625, |
|
"eval_runtime": 275.1066, |
|
"eval_samples_per_second": 122.742, |
|
"eval_steps_per_second": 3.839, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.011091736898188511, |
|
"grad_norm": 1.6245758533477783, |
|
"learning_rate": 9.998973166715103e-06, |
|
"loss": 9.2656, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.011091736898188511, |
|
"eval_accuracy": 0.09277232819782952, |
|
"eval_loss": 9.2265625, |
|
"eval_runtime": 274.1156, |
|
"eval_samples_per_second": 123.185, |
|
"eval_steps_per_second": 3.852, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.011140172430495011, |
|
"grad_norm": 1.5349066257476807, |
|
"learning_rate": 9.998968323161873e-06, |
|
"loss": 9.2656, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.011140172430495011, |
|
"eval_accuracy": 0.09297572308605222, |
|
"eval_loss": 9.21875, |
|
"eval_runtime": 275.053, |
|
"eval_samples_per_second": 122.765, |
|
"eval_steps_per_second": 3.839, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.011188607962801512, |
|
"grad_norm": 1.5491435527801514, |
|
"learning_rate": 9.998963479608643e-06, |
|
"loss": 9.25, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.011188607962801512, |
|
"eval_accuracy": 0.09332889952162252, |
|
"eval_loss": 9.21875, |
|
"eval_runtime": 275.8534, |
|
"eval_samples_per_second": 122.409, |
|
"eval_steps_per_second": 3.828, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.01123704349510801, |
|
"grad_norm": 1.5584843158721924, |
|
"learning_rate": 9.998958636055411e-06, |
|
"loss": 9.2891, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.01123704349510801, |
|
"eval_accuracy": 0.0936537349956827, |
|
"eval_loss": 9.2109375, |
|
"eval_runtime": 275.2078, |
|
"eval_samples_per_second": 122.696, |
|
"eval_steps_per_second": 3.837, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.011285479027414511, |
|
"grad_norm": 1.6923131942749023, |
|
"learning_rate": 9.99895379250218e-06, |
|
"loss": 9.2188, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.011285479027414511, |
|
"eval_accuracy": 0.09384227910427856, |
|
"eval_loss": 9.203125, |
|
"eval_runtime": 275.578, |
|
"eval_samples_per_second": 122.532, |
|
"eval_steps_per_second": 3.832, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.011333914559721011, |
|
"grad_norm": 1.636615514755249, |
|
"learning_rate": 9.99894894894895e-06, |
|
"loss": 9.2578, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.011333914559721011, |
|
"eval_accuracy": 0.09388075217664518, |
|
"eval_loss": 9.203125, |
|
"eval_runtime": 275.1327, |
|
"eval_samples_per_second": 122.73, |
|
"eval_steps_per_second": 3.838, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.011382350092027512, |
|
"grad_norm": 1.5573487281799316, |
|
"learning_rate": 9.998944105395719e-06, |
|
"loss": 9.2422, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.011382350092027512, |
|
"eval_accuracy": 0.09375699567975478, |
|
"eval_loss": 9.1953125, |
|
"eval_runtime": 274.1524, |
|
"eval_samples_per_second": 123.169, |
|
"eval_steps_per_second": 3.852, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.011430785624334012, |
|
"grad_norm": 1.758978009223938, |
|
"learning_rate": 9.998939261842489e-06, |
|
"loss": 9.2109, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.011430785624334012, |
|
"eval_accuracy": 0.09350146384395322, |
|
"eval_loss": 9.1953125, |
|
"eval_runtime": 276.5184, |
|
"eval_samples_per_second": 122.115, |
|
"eval_steps_per_second": 3.819, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.01147922115664051, |
|
"grad_norm": 1.6766207218170166, |
|
"learning_rate": 9.998934418289259e-06, |
|
"loss": 9.1797, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.01147922115664051, |
|
"eval_accuracy": 0.09353747626082612, |
|
"eval_loss": 9.1953125, |
|
"eval_runtime": 275.7856, |
|
"eval_samples_per_second": 122.439, |
|
"eval_steps_per_second": 3.829, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.011527656688947011, |
|
"grad_norm": 1.7581781148910522, |
|
"learning_rate": 9.998929574736027e-06, |
|
"loss": 9.1953, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.011527656688947011, |
|
"eval_accuracy": 0.09377196225493427, |
|
"eval_loss": 9.1875, |
|
"eval_runtime": 276.3511, |
|
"eval_samples_per_second": 122.189, |
|
"eval_steps_per_second": 3.821, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.011576092221253512, |
|
"grad_norm": 2.0294253826141357, |
|
"learning_rate": 9.998924731182797e-06, |
|
"loss": 9.1797, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.011576092221253512, |
|
"eval_accuracy": 0.09428256274432681, |
|
"eval_loss": 9.1875, |
|
"eval_runtime": 275.8013, |
|
"eval_samples_per_second": 122.432, |
|
"eval_steps_per_second": 3.829, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.011624527753560012, |
|
"grad_norm": 1.4771103858947754, |
|
"learning_rate": 9.998919887629566e-06, |
|
"loss": 9.2266, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.011624527753560012, |
|
"eval_accuracy": 0.09478583916501448, |
|
"eval_loss": 9.1796875, |
|
"eval_runtime": 273.9293, |
|
"eval_samples_per_second": 123.269, |
|
"eval_steps_per_second": 3.855, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.011672963285866512, |
|
"grad_norm": 1.494795322418213, |
|
"learning_rate": 9.998915044076335e-06, |
|
"loss": 9.2109, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.011672963285866512, |
|
"eval_accuracy": 0.09512494644093829, |
|
"eval_loss": 9.171875, |
|
"eval_runtime": 274.6237, |
|
"eval_samples_per_second": 122.957, |
|
"eval_steps_per_second": 3.845, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.011721398818173011, |
|
"grad_norm": 1.4708678722381592, |
|
"learning_rate": 9.998910200523105e-06, |
|
"loss": 9.1719, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.011721398818173011, |
|
"eval_accuracy": 0.0953564506995658, |
|
"eval_loss": 9.171875, |
|
"eval_runtime": 274.319, |
|
"eval_samples_per_second": 123.094, |
|
"eval_steps_per_second": 3.85, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.011769834350479512, |
|
"grad_norm": 1.5596672296524048, |
|
"learning_rate": 9.998905356969873e-06, |
|
"loss": 9.2031, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.011769834350479512, |
|
"eval_accuracy": 0.09549230783170773, |
|
"eval_loss": 9.171875, |
|
"eval_runtime": 276.3644, |
|
"eval_samples_per_second": 122.183, |
|
"eval_steps_per_second": 3.821, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.011818269882786012, |
|
"grad_norm": 1.6623671054840088, |
|
"learning_rate": 9.998900513416643e-06, |
|
"loss": 9.1953, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.011818269882786012, |
|
"eval_accuracy": 0.09535106620636777, |
|
"eval_loss": 9.1640625, |
|
"eval_runtime": 276.0732, |
|
"eval_samples_per_second": 122.312, |
|
"eval_steps_per_second": 3.825, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.011866705415092512, |
|
"grad_norm": 1.5597991943359375, |
|
"learning_rate": 9.998895669863412e-06, |
|
"loss": 9.1875, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.011866705415092512, |
|
"eval_accuracy": 0.09501265370375983, |
|
"eval_loss": 9.1640625, |
|
"eval_runtime": 276.6977, |
|
"eval_samples_per_second": 122.036, |
|
"eval_steps_per_second": 3.816, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.011915140947399011, |
|
"grad_norm": 1.540256381034851, |
|
"learning_rate": 9.998890826310182e-06, |
|
"loss": 9.2031, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.011915140947399011, |
|
"eval_accuracy": 0.09489992673325895, |
|
"eval_loss": 9.15625, |
|
"eval_runtime": 275.9711, |
|
"eval_samples_per_second": 122.357, |
|
"eval_steps_per_second": 3.826, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.011963576479705511, |
|
"grad_norm": 1.7622281312942505, |
|
"learning_rate": 9.998885982756952e-06, |
|
"loss": 9.1797, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.011963576479705511, |
|
"eval_accuracy": 0.09502625968119574, |
|
"eval_loss": 9.1484375, |
|
"eval_runtime": 276.4265, |
|
"eval_samples_per_second": 122.155, |
|
"eval_steps_per_second": 3.82, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.012012012012012012, |
|
"grad_norm": 1.5139068365097046, |
|
"learning_rate": 9.99888113920372e-06, |
|
"loss": 9.1484, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.012012012012012012, |
|
"eval_accuracy": 0.09517540435300378, |
|
"eval_loss": 9.1484375, |
|
"eval_runtime": 276.0853, |
|
"eval_samples_per_second": 122.306, |
|
"eval_steps_per_second": 3.825, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.012060447544318512, |
|
"grad_norm": 1.8858153820037842, |
|
"learning_rate": 9.99887629565049e-06, |
|
"loss": 9.1406, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.012060447544318512, |
|
"eval_accuracy": 0.0953735305435811, |
|
"eval_loss": 9.1484375, |
|
"eval_runtime": 277.0667, |
|
"eval_samples_per_second": 121.873, |
|
"eval_steps_per_second": 3.811, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.012108883076625013, |
|
"grad_norm": 1.5456604957580566, |
|
"learning_rate": 9.99887145209726e-06, |
|
"loss": 9.1641, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.012108883076625013, |
|
"eval_accuracy": 0.09559869499570124, |
|
"eval_loss": 9.140625, |
|
"eval_runtime": 277.6666, |
|
"eval_samples_per_second": 121.61, |
|
"eval_steps_per_second": 3.803, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.012157318608931511, |
|
"grad_norm": 1.594663143157959, |
|
"learning_rate": 9.998866608544028e-06, |
|
"loss": 9.1406, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.012157318608931511, |
|
"eval_accuracy": 0.09564637381450322, |
|
"eval_loss": 9.140625, |
|
"eval_runtime": 276.8219, |
|
"eval_samples_per_second": 121.981, |
|
"eval_steps_per_second": 3.815, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.012205754141238012, |
|
"grad_norm": 1.6868451833724976, |
|
"learning_rate": 9.998861764990798e-06, |
|
"loss": 9.1719, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.012205754141238012, |
|
"eval_accuracy": 0.09536970929034377, |
|
"eval_loss": 9.1328125, |
|
"eval_runtime": 274.9951, |
|
"eval_samples_per_second": 122.791, |
|
"eval_steps_per_second": 3.84, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.012254189673544512, |
|
"grad_norm": 1.5256409645080566, |
|
"learning_rate": 9.998856921437568e-06, |
|
"loss": 9.125, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.012254189673544512, |
|
"eval_accuracy": 0.09525680862651392, |
|
"eval_loss": 9.1328125, |
|
"eval_runtime": 275.3586, |
|
"eval_samples_per_second": 122.629, |
|
"eval_steps_per_second": 3.835, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.012302625205851013, |
|
"grad_norm": 1.565302848815918, |
|
"learning_rate": 9.998852077884336e-06, |
|
"loss": 9.1719, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.012302625205851013, |
|
"eval_accuracy": 0.09499291056203368, |
|
"eval_loss": 9.125, |
|
"eval_runtime": 275.0624, |
|
"eval_samples_per_second": 122.761, |
|
"eval_steps_per_second": 3.839, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.012351060738157513, |
|
"grad_norm": 1.4815526008605957, |
|
"learning_rate": 9.998847234331106e-06, |
|
"loss": 9.1797, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.012351060738157513, |
|
"eval_accuracy": 0.09496150101837846, |
|
"eval_loss": 9.125, |
|
"eval_runtime": 275.5286, |
|
"eval_samples_per_second": 122.554, |
|
"eval_steps_per_second": 3.833, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.012399496270464012, |
|
"grad_norm": 1.6366430521011353, |
|
"learning_rate": 9.998842390777876e-06, |
|
"loss": 9.0859, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.012399496270464012, |
|
"eval_accuracy": 0.09506522488466111, |
|
"eval_loss": 9.1171875, |
|
"eval_runtime": 276.1079, |
|
"eval_samples_per_second": 122.296, |
|
"eval_steps_per_second": 3.825, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.012447931802770512, |
|
"grad_norm": 1.6034120321273804, |
|
"learning_rate": 9.998837547224644e-06, |
|
"loss": 9.1875, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.012447931802770512, |
|
"eval_accuracy": 0.09566794073618354, |
|
"eval_loss": 9.1171875, |
|
"eval_runtime": 275.8176, |
|
"eval_samples_per_second": 122.425, |
|
"eval_steps_per_second": 3.829, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.012496367335077013, |
|
"grad_norm": 1.6382652521133423, |
|
"learning_rate": 9.998832703671414e-06, |
|
"loss": 9.1094, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.012496367335077013, |
|
"eval_accuracy": 0.09628628898731317, |
|
"eval_loss": 9.109375, |
|
"eval_runtime": 275.7044, |
|
"eval_samples_per_second": 122.475, |
|
"eval_steps_per_second": 3.83, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.012544802867383513, |
|
"grad_norm": 1.4967926740646362, |
|
"learning_rate": 9.998827860118184e-06, |
|
"loss": 9.0938, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.012544802867383513, |
|
"eval_accuracy": 0.09678612049030963, |
|
"eval_loss": 9.109375, |
|
"eval_runtime": 276.0511, |
|
"eval_samples_per_second": 122.322, |
|
"eval_steps_per_second": 3.825, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.012593238399690013, |
|
"grad_norm": 2.137125015258789, |
|
"learning_rate": 9.998823016564952e-06, |
|
"loss": 9.1016, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.012593238399690013, |
|
"eval_accuracy": 0.09689392614982306, |
|
"eval_loss": 9.1015625, |
|
"eval_runtime": 276.485, |
|
"eval_samples_per_second": 122.13, |
|
"eval_steps_per_second": 3.819, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.012641673931996512, |
|
"grad_norm": 1.655360460281372, |
|
"learning_rate": 9.998818173011722e-06, |
|
"loss": 9.1406, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.012641673931996512, |
|
"eval_accuracy": 0.09685108179534405, |
|
"eval_loss": 9.1015625, |
|
"eval_runtime": 276.2683, |
|
"eval_samples_per_second": 122.225, |
|
"eval_steps_per_second": 3.822, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.012690109464303012, |
|
"grad_norm": 1.615159273147583, |
|
"learning_rate": 9.998813329458492e-06, |
|
"loss": 9.0781, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.012690109464303012, |
|
"eval_accuracy": 0.09661187713246557, |
|
"eval_loss": 9.09375, |
|
"eval_runtime": 277.0541, |
|
"eval_samples_per_second": 121.879, |
|
"eval_steps_per_second": 3.812, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.012738544996609513, |
|
"grad_norm": 1.56972074508667, |
|
"learning_rate": 9.998808485905261e-06, |
|
"loss": 9.1094, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.012738544996609513, |
|
"eval_accuracy": 0.09626272459234972, |
|
"eval_loss": 9.09375, |
|
"eval_runtime": 276.2994, |
|
"eval_samples_per_second": 122.212, |
|
"eval_steps_per_second": 3.822, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.012786980528916013, |
|
"grad_norm": 1.5011804103851318, |
|
"learning_rate": 9.998803642352031e-06, |
|
"loss": 9.1172, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.012786980528916013, |
|
"eval_accuracy": 0.09589736067486343, |
|
"eval_loss": 9.0859375, |
|
"eval_runtime": 276.7096, |
|
"eval_samples_per_second": 122.03, |
|
"eval_steps_per_second": 3.816, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.012835416061222512, |
|
"grad_norm": 1.6870362758636475, |
|
"learning_rate": 9.9987987987988e-06, |
|
"loss": 9.1172, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.012835416061222512, |
|
"eval_accuracy": 0.09563450477035701, |
|
"eval_loss": 9.0859375, |
|
"eval_runtime": 275.3219, |
|
"eval_samples_per_second": 122.646, |
|
"eval_steps_per_second": 3.836, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.012883851593529012, |
|
"grad_norm": 1.5479800701141357, |
|
"learning_rate": 9.99879395524557e-06, |
|
"loss": 9.125, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.012883851593529012, |
|
"eval_accuracy": 0.09551361421339459, |
|
"eval_loss": 9.0859375, |
|
"eval_runtime": 275.3988, |
|
"eval_samples_per_second": 122.611, |
|
"eval_steps_per_second": 3.834, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.012932287125835513, |
|
"grad_norm": 1.5906175374984741, |
|
"learning_rate": 9.99878911169234e-06, |
|
"loss": 9.1094, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.012932287125835513, |
|
"eval_accuracy": 0.09573892340995555, |
|
"eval_loss": 9.078125, |
|
"eval_runtime": 273.7737, |
|
"eval_samples_per_second": 123.339, |
|
"eval_steps_per_second": 3.857, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.012980722658142013, |
|
"grad_norm": 1.5682505369186401, |
|
"learning_rate": 9.998784268139107e-06, |
|
"loss": 9.0781, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.012980722658142013, |
|
"eval_accuracy": 0.09638306512043707, |
|
"eval_loss": 9.078125, |
|
"eval_runtime": 272.9963, |
|
"eval_samples_per_second": 123.69, |
|
"eval_steps_per_second": 3.868, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.013029158190448514, |
|
"grad_norm": 1.5259824991226196, |
|
"learning_rate": 9.998779424585877e-06, |
|
"loss": 9.125, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.013029158190448514, |
|
"eval_accuracy": 0.09727434348915333, |
|
"eval_loss": 9.0703125, |
|
"eval_runtime": 274.1813, |
|
"eval_samples_per_second": 123.156, |
|
"eval_steps_per_second": 3.851, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.013077593722755012, |
|
"grad_norm": 1.5006844997406006, |
|
"learning_rate": 9.998774581032647e-06, |
|
"loss": 9.0547, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.013077593722755012, |
|
"eval_accuracy": 0.09799540239547996, |
|
"eval_loss": 9.0703125, |
|
"eval_runtime": 272.3774, |
|
"eval_samples_per_second": 123.971, |
|
"eval_steps_per_second": 3.877, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.013126029255061513, |
|
"grad_norm": 1.4817960262298584, |
|
"learning_rate": 9.998769737479415e-06, |
|
"loss": 9.0781, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.013126029255061513, |
|
"eval_accuracy": 0.09830503970325537, |
|
"eval_loss": 9.0625, |
|
"eval_runtime": 271.8893, |
|
"eval_samples_per_second": 124.194, |
|
"eval_steps_per_second": 3.884, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.013174464787368013, |
|
"grad_norm": 1.6597894430160522, |
|
"learning_rate": 9.998764893926185e-06, |
|
"loss": 9.1016, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.013174464787368013, |
|
"eval_accuracy": 0.09809840253955858, |
|
"eval_loss": 9.0625, |
|
"eval_runtime": 273.6177, |
|
"eval_samples_per_second": 123.409, |
|
"eval_steps_per_second": 3.859, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.013222900319674514, |
|
"grad_norm": 1.5939491987228394, |
|
"learning_rate": 9.998760050372955e-06, |
|
"loss": 9.0703, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.013222900319674514, |
|
"eval_accuracy": 0.09753586774480431, |
|
"eval_loss": 9.0546875, |
|
"eval_runtime": 273.2408, |
|
"eval_samples_per_second": 123.58, |
|
"eval_steps_per_second": 3.865, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.013271335851981014, |
|
"grad_norm": 1.5878655910491943, |
|
"learning_rate": 9.998755206819723e-06, |
|
"loss": 9.0547, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.013271335851981014, |
|
"eval_accuracy": 0.09690096072964631, |
|
"eval_loss": 9.0546875, |
|
"eval_runtime": 272.9584, |
|
"eval_samples_per_second": 123.707, |
|
"eval_steps_per_second": 3.869, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.013319771384287513, |
|
"grad_norm": 1.6010398864746094, |
|
"learning_rate": 9.998750363266493e-06, |
|
"loss": 9.0312, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.013319771384287513, |
|
"eval_accuracy": 0.09638720481144417, |
|
"eval_loss": 9.046875, |
|
"eval_runtime": 272.7096, |
|
"eval_samples_per_second": 123.82, |
|
"eval_steps_per_second": 3.872, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.013368206916594013, |
|
"grad_norm": 1.7441232204437256, |
|
"learning_rate": 9.998745519713263e-06, |
|
"loss": 9.0938, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.013368206916594013, |
|
"eval_accuracy": 0.09639140240022759, |
|
"eval_loss": 9.046875, |
|
"eval_runtime": 273.1864, |
|
"eval_samples_per_second": 123.604, |
|
"eval_steps_per_second": 3.865, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.013416642448900513, |
|
"grad_norm": 1.586517095565796, |
|
"learning_rate": 9.998740676160031e-06, |
|
"loss": 9.0156, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.013416642448900513, |
|
"eval_accuracy": 0.09671230082549781, |
|
"eval_loss": 9.0390625, |
|
"eval_runtime": 272.732, |
|
"eval_samples_per_second": 123.81, |
|
"eval_steps_per_second": 3.872, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.013465077981207014, |
|
"grad_norm": 1.5039782524108887, |
|
"learning_rate": 9.998735832606801e-06, |
|
"loss": 9.1094, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.013465077981207014, |
|
"eval_accuracy": 0.0972694800759422, |
|
"eval_loss": 9.0390625, |
|
"eval_runtime": 272.288, |
|
"eval_samples_per_second": 124.012, |
|
"eval_steps_per_second": 3.878, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.013513513513513514, |
|
"grad_norm": 1.534090518951416, |
|
"learning_rate": 9.99873098905357e-06, |
|
"loss": 9.0859, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.013513513513513514, |
|
"eval_accuracy": 0.09795999790525846, |
|
"eval_loss": 9.03125, |
|
"eval_runtime": 273.6685, |
|
"eval_samples_per_second": 123.387, |
|
"eval_steps_per_second": 3.859, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.013561949045820013, |
|
"grad_norm": 1.5849289894104004, |
|
"learning_rate": 9.99872614550034e-06, |
|
"loss": 9.0234, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.013561949045820013, |
|
"eval_accuracy": 0.09837451703484297, |
|
"eval_loss": 9.03125, |
|
"eval_runtime": 273.5891, |
|
"eval_samples_per_second": 123.422, |
|
"eval_steps_per_second": 3.86, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.013610384578126513, |
|
"grad_norm": 1.523674488067627, |
|
"learning_rate": 9.99872130194711e-06, |
|
"loss": 9.0781, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.013610384578126513, |
|
"eval_accuracy": 0.09843887041322598, |
|
"eval_loss": 9.0234375, |
|
"eval_runtime": 271.771, |
|
"eval_samples_per_second": 124.248, |
|
"eval_steps_per_second": 3.886, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.013658820110433014, |
|
"grad_norm": 1.646908164024353, |
|
"learning_rate": 9.998716458393879e-06, |
|
"loss": 9.0547, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.013658820110433014, |
|
"eval_accuracy": 0.09825828724887455, |
|
"eval_loss": 9.0234375, |
|
"eval_runtime": 272.8785, |
|
"eval_samples_per_second": 123.744, |
|
"eval_steps_per_second": 3.87, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.013707255642739514, |
|
"grad_norm": 1.6313369274139404, |
|
"learning_rate": 9.998711614840649e-06, |
|
"loss": 9.0234, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.013707255642739514, |
|
"eval_accuracy": 0.09794141271905878, |
|
"eval_loss": 9.015625, |
|
"eval_runtime": 273.6116, |
|
"eval_samples_per_second": 123.412, |
|
"eval_steps_per_second": 3.859, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.013755691175046015, |
|
"grad_norm": 1.6014082431793213, |
|
"learning_rate": 9.998706771287417e-06, |
|
"loss": 9.0312, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.013755691175046015, |
|
"eval_accuracy": 0.09782020372432657, |
|
"eval_loss": 9.015625, |
|
"eval_runtime": 273.8768, |
|
"eval_samples_per_second": 123.293, |
|
"eval_steps_per_second": 3.856, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.013804126707352513, |
|
"grad_norm": 1.5171185731887817, |
|
"learning_rate": 9.998701927734187e-06, |
|
"loss": 9.0391, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.013804126707352513, |
|
"eval_accuracy": 0.09783482291284813, |
|
"eval_loss": 9.0078125, |
|
"eval_runtime": 273.0741, |
|
"eval_samples_per_second": 123.655, |
|
"eval_steps_per_second": 3.867, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.013852562239659014, |
|
"grad_norm": 1.5492215156555176, |
|
"learning_rate": 9.998697084180956e-06, |
|
"loss": 9.0312, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.013852562239659014, |
|
"eval_accuracy": 0.09799062582893332, |
|
"eval_loss": 9.0078125, |
|
"eval_runtime": 273.3737, |
|
"eval_samples_per_second": 123.52, |
|
"eval_steps_per_second": 3.863, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.013900997771965514, |
|
"grad_norm": 1.6462546586990356, |
|
"learning_rate": 9.998692240627725e-06, |
|
"loss": 9.0625, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.013900997771965514, |
|
"eval_accuracy": 0.09823570711610857, |
|
"eval_loss": 9.0078125, |
|
"eval_runtime": 272.8604, |
|
"eval_samples_per_second": 123.752, |
|
"eval_steps_per_second": 3.87, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.013949433304272014, |
|
"grad_norm": 1.6392829418182373, |
|
"learning_rate": 9.998687397074494e-06, |
|
"loss": 9.0234, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.013949433304272014, |
|
"eval_accuracy": 0.09855127894595708, |
|
"eval_loss": 9.0, |
|
"eval_runtime": 272.8831, |
|
"eval_samples_per_second": 123.742, |
|
"eval_steps_per_second": 3.87, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.013997868836578513, |
|
"grad_norm": 1.5253773927688599, |
|
"learning_rate": 9.998682553521264e-06, |
|
"loss": 9.0078, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.013997868836578513, |
|
"eval_accuracy": 0.09903811239816902, |
|
"eval_loss": 9.0, |
|
"eval_runtime": 272.9158, |
|
"eval_samples_per_second": 123.727, |
|
"eval_steps_per_second": 3.869, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.014046304368885014, |
|
"grad_norm": 1.496385931968689, |
|
"learning_rate": 9.998677709968032e-06, |
|
"loss": 9.0, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.014046304368885014, |
|
"eval_accuracy": 0.09958177251784199, |
|
"eval_loss": 8.9921875, |
|
"eval_runtime": 273.8166, |
|
"eval_samples_per_second": 123.32, |
|
"eval_steps_per_second": 3.857, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.014094739901191514, |
|
"grad_norm": 1.5430630445480347, |
|
"learning_rate": 9.998672866414802e-06, |
|
"loss": 9.0078, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.014094739901191514, |
|
"eval_accuracy": 0.09971363470341763, |
|
"eval_loss": 8.9921875, |
|
"eval_runtime": 272.085, |
|
"eval_samples_per_second": 124.105, |
|
"eval_steps_per_second": 3.881, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.014143175433498014, |
|
"grad_norm": 1.7000993490219116, |
|
"learning_rate": 9.998668022861572e-06, |
|
"loss": 9.0, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.014143175433498014, |
|
"eval_accuracy": 0.09992762488470744, |
|
"eval_loss": 8.984375, |
|
"eval_runtime": 273.2642, |
|
"eval_samples_per_second": 123.569, |
|
"eval_steps_per_second": 3.864, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.014191610965804515, |
|
"grad_norm": 1.5401760339736938, |
|
"learning_rate": 9.99866317930834e-06, |
|
"loss": 9.0078, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.014191610965804515, |
|
"eval_accuracy": 0.09988570689464958, |
|
"eval_loss": 8.984375, |
|
"eval_runtime": 271.53, |
|
"eval_samples_per_second": 124.358, |
|
"eval_steps_per_second": 3.889, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.014240046498111014, |
|
"grad_norm": 1.5899308919906616, |
|
"learning_rate": 9.99865833575511e-06, |
|
"loss": 8.9922, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.014240046498111014, |
|
"eval_accuracy": 0.09954338629213985, |
|
"eval_loss": 8.9765625, |
|
"eval_runtime": 271.4912, |
|
"eval_samples_per_second": 124.376, |
|
"eval_steps_per_second": 3.89, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.014288482030417514, |
|
"grad_norm": 1.5780622959136963, |
|
"learning_rate": 9.99865349220188e-06, |
|
"loss": 9.0078, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.014288482030417514, |
|
"eval_accuracy": 0.09903443588937252, |
|
"eval_loss": 8.9765625, |
|
"eval_runtime": 272.7386, |
|
"eval_samples_per_second": 123.807, |
|
"eval_steps_per_second": 3.872, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.014336917562724014, |
|
"grad_norm": 1.6593127250671387, |
|
"learning_rate": 9.998648648648648e-06, |
|
"loss": 8.9844, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.014336917562724014, |
|
"eval_accuracy": 0.09852673028879613, |
|
"eval_loss": 8.96875, |
|
"eval_runtime": 273.5255, |
|
"eval_samples_per_second": 123.451, |
|
"eval_steps_per_second": 3.861, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.014385353095030515, |
|
"grad_norm": 1.5654476881027222, |
|
"learning_rate": 9.99864380509542e-06, |
|
"loss": 8.9766, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.014385353095030515, |
|
"eval_accuracy": 0.09832848830266619, |
|
"eval_loss": 8.96875, |
|
"eval_runtime": 272.9229, |
|
"eval_samples_per_second": 123.724, |
|
"eval_steps_per_second": 3.869, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.014433788627337015, |
|
"grad_norm": 1.604347586631775, |
|
"learning_rate": 9.998638961542188e-06, |
|
"loss": 8.9531, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.014433788627337015, |
|
"eval_accuracy": 0.09845606605279392, |
|
"eval_loss": 8.9609375, |
|
"eval_runtime": 274.3511, |
|
"eval_samples_per_second": 123.08, |
|
"eval_steps_per_second": 3.849, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.014482224159643514, |
|
"grad_norm": 1.6617177724838257, |
|
"learning_rate": 9.998634117988958e-06, |
|
"loss": 8.9688, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.014482224159643514, |
|
"eval_accuracy": 0.09881535070376629, |
|
"eval_loss": 8.9609375, |
|
"eval_runtime": 275.2558, |
|
"eval_samples_per_second": 122.675, |
|
"eval_steps_per_second": 3.836, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.014530659691950014, |
|
"grad_norm": 1.5470112562179565, |
|
"learning_rate": 9.998629274435728e-06, |
|
"loss": 9.0312, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.014530659691950014, |
|
"eval_accuracy": 0.09944139935914688, |
|
"eval_loss": 8.953125, |
|
"eval_runtime": 276.2061, |
|
"eval_samples_per_second": 122.253, |
|
"eval_steps_per_second": 3.823, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.014579095224256515, |
|
"grad_norm": 1.5366243124008179, |
|
"learning_rate": 9.998624430882496e-06, |
|
"loss": 9.0156, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.014579095224256515, |
|
"eval_accuracy": 0.0997650189799043, |
|
"eval_loss": 8.953125, |
|
"eval_runtime": 273.6153, |
|
"eval_samples_per_second": 123.41, |
|
"eval_steps_per_second": 3.859, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.014627530756563015, |
|
"grad_norm": 1.8393828868865967, |
|
"learning_rate": 9.998619587329266e-06, |
|
"loss": 8.9688, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.014627530756563015, |
|
"eval_accuracy": 0.09985768437090925, |
|
"eval_loss": 8.9453125, |
|
"eval_runtime": 272.5368, |
|
"eval_samples_per_second": 123.899, |
|
"eval_steps_per_second": 3.875, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.014675966288869515, |
|
"grad_norm": 1.5231480598449707, |
|
"learning_rate": 9.998614743776036e-06, |
|
"loss": 9.0, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.014675966288869515, |
|
"eval_accuracy": 0.09973019346744601, |
|
"eval_loss": 8.9453125, |
|
"eval_runtime": 272.9977, |
|
"eval_samples_per_second": 123.69, |
|
"eval_steps_per_second": 3.868, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.014724401821176014, |
|
"grad_norm": 1.4661198854446411, |
|
"learning_rate": 9.998609900222804e-06, |
|
"loss": 8.9375, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.014724401821176014, |
|
"eval_accuracy": 0.09956171093834608, |
|
"eval_loss": 8.9375, |
|
"eval_runtime": 273.4012, |
|
"eval_samples_per_second": 123.507, |
|
"eval_steps_per_second": 3.862, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.014772837353482515, |
|
"grad_norm": 1.561277985572815, |
|
"learning_rate": 9.998605056669574e-06, |
|
"loss": 8.9766, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.014772837353482515, |
|
"eval_accuracy": 0.09941792181084791, |
|
"eval_loss": 8.9375, |
|
"eval_runtime": 273.3274, |
|
"eval_samples_per_second": 123.54, |
|
"eval_steps_per_second": 3.863, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.014821272885789015, |
|
"grad_norm": 1.5084242820739746, |
|
"learning_rate": 9.998600213116343e-06, |
|
"loss": 8.9375, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.014821272885789015, |
|
"eval_accuracy": 0.09944964979227292, |
|
"eval_loss": 8.9375, |
|
"eval_runtime": 272.8323, |
|
"eval_samples_per_second": 123.765, |
|
"eval_steps_per_second": 3.871, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.014869708418095515, |
|
"grad_norm": 1.533602237701416, |
|
"learning_rate": 9.998595369563112e-06, |
|
"loss": 8.9688, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.014869708418095515, |
|
"eval_accuracy": 0.09968031453314374, |
|
"eval_loss": 8.9296875, |
|
"eval_runtime": 273.2909, |
|
"eval_samples_per_second": 123.557, |
|
"eval_steps_per_second": 3.864, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.014918143950402014, |
|
"grad_norm": 1.527116060256958, |
|
"learning_rate": 9.998590526009882e-06, |
|
"loss": 8.9531, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.014918143950402014, |
|
"eval_accuracy": 0.099938480717768, |
|
"eval_loss": 8.9296875, |
|
"eval_runtime": 273.5392, |
|
"eval_samples_per_second": 123.445, |
|
"eval_steps_per_second": 3.861, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.014966579482708514, |
|
"grad_norm": 1.5343533754348755, |
|
"learning_rate": 9.998585682456651e-06, |
|
"loss": 8.9531, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.014966579482708514, |
|
"eval_accuracy": 0.10016824804310583, |
|
"eval_loss": 8.921875, |
|
"eval_runtime": 273.4771, |
|
"eval_samples_per_second": 123.473, |
|
"eval_steps_per_second": 3.861, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.015015015015015015, |
|
"grad_norm": 2.02919602394104, |
|
"learning_rate": 9.99858083890342e-06, |
|
"loss": 8.9062, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.015015015015015015, |
|
"eval_accuracy": 0.10033235929009336, |
|
"eval_loss": 8.921875, |
|
"eval_runtime": 272.9511, |
|
"eval_samples_per_second": 123.711, |
|
"eval_steps_per_second": 3.869, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.015063450547321515, |
|
"grad_norm": 1.50547456741333, |
|
"learning_rate": 9.99857599535019e-06, |
|
"loss": 8.9375, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.015063450547321515, |
|
"eval_accuracy": 0.10040134449058222, |
|
"eval_loss": 8.9140625, |
|
"eval_runtime": 274.4167, |
|
"eval_samples_per_second": 123.05, |
|
"eval_steps_per_second": 3.848, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.015111886079628016, |
|
"grad_norm": 1.5935693979263306, |
|
"learning_rate": 9.99857115179696e-06, |
|
"loss": 8.8828, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.015111886079628016, |
|
"eval_accuracy": 0.1003402333876733, |
|
"eval_loss": 8.9140625, |
|
"eval_runtime": 272.7238, |
|
"eval_samples_per_second": 123.814, |
|
"eval_steps_per_second": 3.872, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.015160321611934514, |
|
"grad_norm": 1.4832584857940674, |
|
"learning_rate": 9.998566308243727e-06, |
|
"loss": 8.9219, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.015160321611934514, |
|
"eval_accuracy": 0.10030963441288658, |
|
"eval_loss": 8.90625, |
|
"eval_runtime": 272.1097, |
|
"eval_samples_per_second": 124.093, |
|
"eval_steps_per_second": 3.881, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.015208757144241015, |
|
"grad_norm": 1.4832618236541748, |
|
"learning_rate": 9.998561464690499e-06, |
|
"loss": 8.9219, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.015208757144241015, |
|
"eval_accuracy": 0.10044369671396249, |
|
"eval_loss": 8.90625, |
|
"eval_runtime": 273.1456, |
|
"eval_samples_per_second": 123.623, |
|
"eval_steps_per_second": 3.866, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.015257192676547515, |
|
"grad_norm": 1.5148617029190063, |
|
"learning_rate": 9.998556621137267e-06, |
|
"loss": 8.9297, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.015257192676547515, |
|
"eval_accuracy": 0.10085141285482906, |
|
"eval_loss": 8.90625, |
|
"eval_runtime": 273.2988, |
|
"eval_samples_per_second": 123.553, |
|
"eval_steps_per_second": 3.864, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.015305628208854016, |
|
"grad_norm": 1.433423638343811, |
|
"learning_rate": 9.998551777584037e-06, |
|
"loss": 8.9922, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.015305628208854016, |
|
"eval_accuracy": 0.10107848793356786, |
|
"eval_loss": 8.8984375, |
|
"eval_runtime": 273.7559, |
|
"eval_samples_per_second": 123.347, |
|
"eval_steps_per_second": 3.857, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.015354063741160516, |
|
"grad_norm": 1.5502877235412598, |
|
"learning_rate": 9.998546934030807e-06, |
|
"loss": 8.9062, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.015354063741160516, |
|
"eval_accuracy": 0.10110254445963007, |
|
"eval_loss": 8.8984375, |
|
"eval_runtime": 273.1804, |
|
"eval_samples_per_second": 123.607, |
|
"eval_steps_per_second": 3.866, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.015402499273467015, |
|
"grad_norm": 1.4339704513549805, |
|
"learning_rate": 9.998542090477575e-06, |
|
"loss": 8.9297, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.015402499273467015, |
|
"eval_accuracy": 0.10105709470521651, |
|
"eval_loss": 8.890625, |
|
"eval_runtime": 272.4751, |
|
"eval_samples_per_second": 123.927, |
|
"eval_steps_per_second": 3.876, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.015450934805773515, |
|
"grad_norm": 1.5828499794006348, |
|
"learning_rate": 9.998537246924345e-06, |
|
"loss": 8.9531, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.015450934805773515, |
|
"eval_accuracy": 0.1008436256039136, |
|
"eval_loss": 8.890625, |
|
"eval_runtime": 273.6696, |
|
"eval_samples_per_second": 123.386, |
|
"eval_steps_per_second": 3.859, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.015499370338080016, |
|
"grad_norm": 1.4665073156356812, |
|
"learning_rate": 9.998532403371115e-06, |
|
"loss": 8.9531, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.015499370338080016, |
|
"eval_accuracy": 0.10056403724204985, |
|
"eval_loss": 8.8828125, |
|
"eval_runtime": 273.7537, |
|
"eval_samples_per_second": 123.348, |
|
"eval_steps_per_second": 3.857, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.015547805870386516, |
|
"grad_norm": 1.5187170505523682, |
|
"learning_rate": 9.998527559817883e-06, |
|
"loss": 8.9375, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.015547805870386516, |
|
"eval_accuracy": 0.10036628738701864, |
|
"eval_loss": 8.8828125, |
|
"eval_runtime": 274.5883, |
|
"eval_samples_per_second": 122.973, |
|
"eval_steps_per_second": 3.846, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.015596241402693016, |
|
"grad_norm": 1.6168104410171509, |
|
"learning_rate": 9.998522716264653e-06, |
|
"loss": 8.9219, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.015596241402693016, |
|
"eval_accuracy": 0.10023019866377143, |
|
"eval_loss": 8.875, |
|
"eval_runtime": 274.4874, |
|
"eval_samples_per_second": 123.018, |
|
"eval_steps_per_second": 3.847, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.015644676934999515, |
|
"grad_norm": 1.5896227359771729, |
|
"learning_rate": 9.998517872711423e-06, |
|
"loss": 8.9062, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.015644676934999515, |
|
"eval_accuracy": 0.10039622053737764, |
|
"eval_loss": 8.875, |
|
"eval_runtime": 273.3822, |
|
"eval_samples_per_second": 123.516, |
|
"eval_steps_per_second": 3.863, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.015693112467306015, |
|
"grad_norm": 1.476304531097412, |
|
"learning_rate": 9.99851302915819e-06, |
|
"loss": 8.8906, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.015693112467306015, |
|
"eval_accuracy": 0.10063834903796041, |
|
"eval_loss": 8.875, |
|
"eval_runtime": 273.0848, |
|
"eval_samples_per_second": 123.65, |
|
"eval_steps_per_second": 3.867, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.015741547999612516, |
|
"grad_norm": 1.493653655052185, |
|
"learning_rate": 9.99850818560496e-06, |
|
"loss": 8.8906, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.015741547999612516, |
|
"eval_accuracy": 0.10110543934844622, |
|
"eval_loss": 8.8671875, |
|
"eval_runtime": 273.5247, |
|
"eval_samples_per_second": 123.451, |
|
"eval_steps_per_second": 3.861, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.015789983531919016, |
|
"grad_norm": 1.5021024942398071, |
|
"learning_rate": 9.998503342051729e-06, |
|
"loss": 8.8672, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.015789983531919016, |
|
"eval_accuracy": 0.10160095746710661, |
|
"eval_loss": 8.8671875, |
|
"eval_runtime": 273.3554, |
|
"eval_samples_per_second": 123.528, |
|
"eval_steps_per_second": 3.863, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.015838419064225517, |
|
"grad_norm": 1.4941586256027222, |
|
"learning_rate": 9.998498498498499e-06, |
|
"loss": 8.875, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.015838419064225517, |
|
"eval_accuracy": 0.1018975967240975, |
|
"eval_loss": 8.859375, |
|
"eval_runtime": 272.1708, |
|
"eval_samples_per_second": 124.065, |
|
"eval_steps_per_second": 3.88, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.015886854596532017, |
|
"grad_norm": 1.484066128730774, |
|
"learning_rate": 9.998493654945269e-06, |
|
"loss": 8.8516, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.015886854596532017, |
|
"eval_accuracy": 0.10215561816428094, |
|
"eval_loss": 8.859375, |
|
"eval_runtime": 272.903, |
|
"eval_samples_per_second": 123.733, |
|
"eval_steps_per_second": 3.87, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.015935290128838518, |
|
"grad_norm": 1.6349196434020996, |
|
"learning_rate": 9.998488811392037e-06, |
|
"loss": 8.8672, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.015935290128838518, |
|
"eval_accuracy": 0.10202540606533052, |
|
"eval_loss": 8.8515625, |
|
"eval_runtime": 273.5376, |
|
"eval_samples_per_second": 123.446, |
|
"eval_steps_per_second": 3.861, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.015983725661145014, |
|
"grad_norm": 1.4366816282272339, |
|
"learning_rate": 9.998483967838807e-06, |
|
"loss": 8.8984, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.015983725661145014, |
|
"eval_accuracy": 0.10181083690627749, |
|
"eval_loss": 8.8515625, |
|
"eval_runtime": 273.8934, |
|
"eval_samples_per_second": 123.285, |
|
"eval_steps_per_second": 3.856, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.016032161193451515, |
|
"grad_norm": 1.4731358289718628, |
|
"learning_rate": 9.998479124285576e-06, |
|
"loss": 8.875, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.016032161193451515, |
|
"eval_accuracy": 0.10155426291050211, |
|
"eval_loss": 8.84375, |
|
"eval_runtime": 273.3318, |
|
"eval_samples_per_second": 123.538, |
|
"eval_steps_per_second": 3.863, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.016080596725758015, |
|
"grad_norm": 1.4572798013687134, |
|
"learning_rate": 9.998474280732346e-06, |
|
"loss": 8.8828, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.016080596725758015, |
|
"eval_accuracy": 0.10136490823303775, |
|
"eval_loss": 8.84375, |
|
"eval_runtime": 273.42, |
|
"eval_samples_per_second": 123.499, |
|
"eval_steps_per_second": 3.862, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.016129032258064516, |
|
"grad_norm": 1.5042625665664673, |
|
"learning_rate": 9.998469437179116e-06, |
|
"loss": 8.8438, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.016129032258064516, |
|
"eval_accuracy": 0.10138867527021833, |
|
"eval_loss": 8.8359375, |
|
"eval_runtime": 273.8134, |
|
"eval_samples_per_second": 123.321, |
|
"eval_steps_per_second": 3.857, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.016177467790371016, |
|
"grad_norm": 1.9862890243530273, |
|
"learning_rate": 9.998464593625884e-06, |
|
"loss": 8.7969, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.016177467790371016, |
|
"eval_accuracy": 0.10165384708577767, |
|
"eval_loss": 8.8359375, |
|
"eval_runtime": 273.5642, |
|
"eval_samples_per_second": 123.434, |
|
"eval_steps_per_second": 3.86, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.016225903322677517, |
|
"grad_norm": 1.5006351470947266, |
|
"learning_rate": 9.998459750072654e-06, |
|
"loss": 8.8828, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.016225903322677517, |
|
"eval_accuracy": 0.10197017158671838, |
|
"eval_loss": 8.828125, |
|
"eval_runtime": 273.8604, |
|
"eval_samples_per_second": 123.3, |
|
"eval_steps_per_second": 3.856, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.016274338854984017, |
|
"grad_norm": 1.5454577207565308, |
|
"learning_rate": 9.998454906519424e-06, |
|
"loss": 8.8281, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.016274338854984017, |
|
"eval_accuracy": 0.10246210004324674, |
|
"eval_loss": 8.828125, |
|
"eval_runtime": 273.8457, |
|
"eval_samples_per_second": 123.307, |
|
"eval_steps_per_second": 3.856, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.016322774387290517, |
|
"grad_norm": 1.6074914932250977, |
|
"learning_rate": 9.998450062966192e-06, |
|
"loss": 8.8203, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.016322774387290517, |
|
"eval_accuracy": 0.10273896720962333, |
|
"eval_loss": 8.828125, |
|
"eval_runtime": 273.8399, |
|
"eval_samples_per_second": 123.309, |
|
"eval_steps_per_second": 3.856, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.016371209919597018, |
|
"grad_norm": 1.5352425575256348, |
|
"learning_rate": 9.998445219412962e-06, |
|
"loss": 8.8594, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.016371209919597018, |
|
"eval_accuracy": 0.10277469013761462, |
|
"eval_loss": 8.8203125, |
|
"eval_runtime": 273.8619, |
|
"eval_samples_per_second": 123.299, |
|
"eval_steps_per_second": 3.856, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.016419645451903518, |
|
"grad_norm": 1.5608229637145996, |
|
"learning_rate": 9.998440375859732e-06, |
|
"loss": 8.8594, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.016419645451903518, |
|
"eval_accuracy": 0.10268248792882025, |
|
"eval_loss": 8.8203125, |
|
"eval_runtime": 273.0524, |
|
"eval_samples_per_second": 123.665, |
|
"eval_steps_per_second": 3.867, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.016468080984210015, |
|
"grad_norm": 1.4532408714294434, |
|
"learning_rate": 9.9984355323065e-06, |
|
"loss": 8.8203, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.016468080984210015, |
|
"eval_accuracy": 0.10246901882751734, |
|
"eval_loss": 8.8125, |
|
"eval_runtime": 273.0044, |
|
"eval_samples_per_second": 123.687, |
|
"eval_steps_per_second": 3.868, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.016516516516516516, |
|
"grad_norm": 1.5474005937576294, |
|
"learning_rate": 9.99843068875327e-06, |
|
"loss": 8.8359, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.016516516516516516, |
|
"eval_accuracy": 0.10244681503029747, |
|
"eval_loss": 8.8125, |
|
"eval_runtime": 272.933, |
|
"eval_samples_per_second": 123.719, |
|
"eval_steps_per_second": 3.869, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.016516516516516516, |
|
"step": 341, |
|
"total_flos": 1427643152990208.0, |
|
"train_loss": 9.567586143695015, |
|
"train_runtime": 93928.2806, |
|
"train_samples_per_second": 703.348, |
|
"train_steps_per_second": 21.981 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 2064600, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 100, |
|
"save_steps": 1000000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1427643152990208.0, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|