{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.016516516516516516, "eval_steps": 1, "global_step": 341, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 4.8435532306500046e-05, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8984, "step": 1 }, { "epoch": 4.8435532306500046e-05, "eval_accuracy": 0.0001320069300164392, "eval_loss": 10.90625, "eval_runtime": 276.4656, "eval_samples_per_second": 122.138, "eval_steps_per_second": 3.82, "step": 1 }, { "epoch": 9.687106461300009e-05, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8984, "step": 2 }, { "epoch": 9.687106461300009e-05, "eval_accuracy": 0.0001320069300164392, "eval_loss": 10.90625, "eval_runtime": 275.1935, "eval_samples_per_second": 122.703, "eval_steps_per_second": 3.837, "step": 2 }, { "epoch": 0.00014530659691950015, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8984, "step": 3 }, { "epoch": 0.00014530659691950015, "eval_accuracy": 0.0001320069300164392, "eval_loss": 10.90625, "eval_runtime": 275.13, "eval_samples_per_second": 122.731, "eval_steps_per_second": 3.838, "step": 3 }, { "epoch": 0.00019374212922600018, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8984, "step": 4 }, { "epoch": 0.00019374212922600018, "eval_accuracy": 0.0001320069300164392, "eval_loss": 10.90625, "eval_runtime": 276.2699, "eval_samples_per_second": 122.225, "eval_steps_per_second": 3.822, "step": 4 }, { "epoch": 0.00024217766153250024, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.9062, "step": 5 }, { "epoch": 0.00024217766153250024, "eval_accuracy": 0.0001320069300164392, "eval_loss": 10.90625, "eval_runtime": 274.7331, "eval_samples_per_second": 122.908, "eval_steps_per_second": 3.844, "step": 5 }, { "epoch": 0.0002906131938390003, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8984, "step": 6 }, { "epoch": 0.0002906131938390003, "eval_accuracy": 0.0001320069300164392, "eval_loss": 10.90625, "eval_runtime": 275.5447, "eval_samples_per_second": 122.546, "eval_steps_per_second": 3.832, "step": 6 }, { "epoch": 0.00033904872614550033, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.9062, "step": 7 }, { "epoch": 0.00033904872614550033, "eval_accuracy": 0.0001320069300164392, "eval_loss": 10.90625, "eval_runtime": 275.1675, "eval_samples_per_second": 122.714, "eval_steps_per_second": 3.838, "step": 7 }, { "epoch": 0.00038748425845200037, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.9062, "step": 8 }, { "epoch": 0.00038748425845200037, "eval_accuracy": 0.0001320069300164392, "eval_loss": 10.90625, "eval_runtime": 275.5655, "eval_samples_per_second": 122.537, "eval_steps_per_second": 3.832, "step": 8 }, { "epoch": 0.00043591979075850045, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.9062, "step": 9 }, { "epoch": 0.00043591979075850045, "eval_accuracy": 0.0001320069300164392, "eval_loss": 10.90625, "eval_runtime": 273.3419, "eval_samples_per_second": 123.534, "eval_steps_per_second": 3.863, "step": 9 }, { "epoch": 0.0004843553230650005, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8984, "step": 10 }, { "epoch": 0.0004843553230650005, "eval_accuracy": 0.0001320069300164392, "eval_loss": 10.90625, "eval_runtime": 275.1369, "eval_samples_per_second": 122.728, "eval_steps_per_second": 3.838, "step": 10 }, { "epoch": 0.0005327908553715005, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8984, "step": 11 }, { "epoch": 0.0005327908553715005, "eval_accuracy": 0.0001320069300164392, "eval_loss": 10.90625, "eval_runtime": 274.4663, "eval_samples_per_second": 123.028, "eval_steps_per_second": 3.847, "step": 11 }, { "epoch": 0.0005812263876780006, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8984, "step": 12 }, { "epoch": 0.0005812263876780006, "eval_accuracy": 0.0001320069300164392, "eval_loss": 10.90625, "eval_runtime": 273.8795, "eval_samples_per_second": 123.291, "eval_steps_per_second": 3.856, "step": 12 }, { "epoch": 0.0006296619199845006, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8984, "step": 13 }, { "epoch": 0.0006296619199845006, "eval_accuracy": 0.0001320069300164392, "eval_loss": 10.90625, "eval_runtime": 273.17, "eval_samples_per_second": 123.612, "eval_steps_per_second": 3.866, "step": 13 }, { "epoch": 0.0006780974522910007, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.9062, "step": 14 }, { "epoch": 0.0006780974522910007, "eval_accuracy": 0.0001320069300164392, "eval_loss": 10.90625, "eval_runtime": 274.4591, "eval_samples_per_second": 123.031, "eval_steps_per_second": 3.848, "step": 14 }, { "epoch": 0.0007265329845975008, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8984, "step": 15 }, { "epoch": 0.0007265329845975008, "eval_accuracy": 0.0001320069300164392, "eval_loss": 10.90625, "eval_runtime": 275.1413, "eval_samples_per_second": 122.726, "eval_steps_per_second": 3.838, "step": 15 }, { "epoch": 0.0007749685169040007, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8984, "step": 16 }, { "epoch": 0.0007749685169040007, "eval_accuracy": 0.0001320069300164392, "eval_loss": 10.90625, "eval_runtime": 276.8384, "eval_samples_per_second": 121.974, "eval_steps_per_second": 3.814, "step": 16 }, { "epoch": 0.0008234040492105008, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.9062, "step": 17 }, { "epoch": 0.0008234040492105008, "eval_accuracy": 0.0001320069300164392, "eval_loss": 10.90625, "eval_runtime": 273.566, "eval_samples_per_second": 123.433, "eval_steps_per_second": 3.86, "step": 17 }, { "epoch": 0.0008718395815170009, "grad_norm": 6.863816738128662, "learning_rate": 9.99999515644677e-06, "loss": 10.9062, "step": 18 }, { "epoch": 0.0008718395815170009, "eval_accuracy": 0.010980023790775268, "eval_loss": 10.7578125, "eval_runtime": 273.6742, "eval_samples_per_second": 123.384, "eval_steps_per_second": 3.859, "step": 18 }, { "epoch": 0.0009202751138235009, "grad_norm": 6.298513889312744, "learning_rate": 9.999990312893539e-06, "loss": 10.7734, "step": 19 }, { "epoch": 0.0009202751138235009, "eval_accuracy": 0.028515349612393204, "eval_loss": 10.65625, "eval_runtime": 273.1376, "eval_samples_per_second": 123.626, "eval_steps_per_second": 3.866, "step": 19 }, { "epoch": 0.000968710646130001, "grad_norm": 5.340964317321777, "learning_rate": 9.999985469340309e-06, "loss": 10.6797, "step": 20 }, { "epoch": 0.000968710646130001, "eval_accuracy": 0.04691063110573666, "eval_loss": 10.578125, "eval_runtime": 273.0387, "eval_samples_per_second": 123.671, "eval_steps_per_second": 3.868, "step": 20 }, { "epoch": 0.001017146178436501, "grad_norm": 4.327230930328369, "learning_rate": 9.999980625787079e-06, "loss": 10.6016, "step": 21 }, { "epoch": 0.001017146178436501, "eval_accuracy": 0.04854858814680248, "eval_loss": 10.5234375, "eval_runtime": 272.9282, "eval_samples_per_second": 123.721, "eval_steps_per_second": 3.869, "step": 21 }, { "epoch": 0.001065581710743001, "grad_norm": 3.803434133529663, "learning_rate": 9.999975782233847e-06, "loss": 10.5234, "step": 22 }, { "epoch": 0.001065581710743001, "eval_accuracy": 0.04776647603534324, "eval_loss": 10.4765625, "eval_runtime": 272.6159, "eval_samples_per_second": 123.863, "eval_steps_per_second": 3.874, "step": 22 }, { "epoch": 0.001114017243049501, "grad_norm": 3.2490711212158203, "learning_rate": 9.999970938680617e-06, "loss": 10.5, "step": 23 }, { "epoch": 0.001114017243049501, "eval_accuracy": 0.04827195257153118, "eval_loss": 10.4375, "eval_runtime": 272.3658, "eval_samples_per_second": 123.977, "eval_steps_per_second": 3.877, "step": 23 }, { "epoch": 0.0011624527753560012, "grad_norm": 2.9085004329681396, "learning_rate": 9.999966095127386e-06, "loss": 10.4531, "step": 24 }, { "epoch": 0.0011624527753560012, "eval_accuracy": 0.05073292650302844, "eval_loss": 10.40625, "eval_runtime": 272.851, "eval_samples_per_second": 123.756, "eval_steps_per_second": 3.87, "step": 24 }, { "epoch": 0.0012108883076625012, "grad_norm": 2.668471574783325, "learning_rate": 9.999961251574155e-06, "loss": 10.4141, "step": 25 }, { "epoch": 0.0012108883076625012, "eval_accuracy": 0.05310853016333744, "eval_loss": 10.3828125, "eval_runtime": 272.4493, "eval_samples_per_second": 123.939, "eval_steps_per_second": 3.876, "step": 25 }, { "epoch": 0.0012593238399690012, "grad_norm": 2.864935874938965, "learning_rate": 9.999956408020926e-06, "loss": 10.3672, "step": 26 }, { "epoch": 0.0012593238399690012, "eval_accuracy": 0.0555671013371173, "eval_loss": 10.359375, "eval_runtime": 273.3963, "eval_samples_per_second": 123.509, "eval_steps_per_second": 3.863, "step": 26 }, { "epoch": 0.0013077593722755014, "grad_norm": 2.2354369163513184, "learning_rate": 9.999951564467694e-06, "loss": 10.3828, "step": 27 }, { "epoch": 0.0013077593722755014, "eval_accuracy": 0.05616663281094196, "eval_loss": 10.3359375, "eval_runtime": 272.6273, "eval_samples_per_second": 123.858, "eval_steps_per_second": 3.873, "step": 27 }, { "epoch": 0.0013561949045820013, "grad_norm": 2.1477534770965576, "learning_rate": 9.999946720914464e-06, "loss": 10.3594, "step": 28 }, { "epoch": 0.0013561949045820013, "eval_accuracy": 0.05624939768219569, "eval_loss": 10.3203125, "eval_runtime": 273.0918, "eval_samples_per_second": 123.647, "eval_steps_per_second": 3.867, "step": 28 }, { "epoch": 0.0014046304368885013, "grad_norm": 2.097315549850464, "learning_rate": 9.999941877361234e-06, "loss": 10.3281, "step": 29 }, { "epoch": 0.0014046304368885013, "eval_accuracy": 0.055895844911079295, "eval_loss": 10.3046875, "eval_runtime": 273.0626, "eval_samples_per_second": 123.66, "eval_steps_per_second": 3.867, "step": 29 }, { "epoch": 0.0014530659691950015, "grad_norm": 1.8777693510055542, "learning_rate": 9.999937033808002e-06, "loss": 10.3203, "step": 30 }, { "epoch": 0.0014530659691950015, "eval_accuracy": 0.0563090902895847, "eval_loss": 10.296875, "eval_runtime": 272.1369, "eval_samples_per_second": 124.081, "eval_steps_per_second": 3.88, "step": 30 }, { "epoch": 0.0015015015015015015, "grad_norm": 1.8313064575195312, "learning_rate": 9.999932190254772e-06, "loss": 10.3281, "step": 31 }, { "epoch": 0.0015015015015015015, "eval_accuracy": 0.05664084454791549, "eval_loss": 10.28125, "eval_runtime": 272.0928, "eval_samples_per_second": 124.101, "eval_steps_per_second": 3.881, "step": 31 }, { "epoch": 0.0015499370338080015, "grad_norm": 1.7771973609924316, "learning_rate": 9.999927346701542e-06, "loss": 10.3359, "step": 32 }, { "epoch": 0.0015499370338080015, "eval_accuracy": 0.05662431473277527, "eval_loss": 10.2734375, "eval_runtime": 272.172, "eval_samples_per_second": 124.065, "eval_steps_per_second": 3.88, "step": 32 }, { "epoch": 0.0015983725661145017, "grad_norm": 1.8934669494628906, "learning_rate": 9.99992250314831e-06, "loss": 10.2656, "step": 33 }, { "epoch": 0.0015983725661145017, "eval_accuracy": 0.057003458321026435, "eval_loss": 10.265625, "eval_runtime": 271.9989, "eval_samples_per_second": 124.144, "eval_steps_per_second": 3.882, "step": 33 }, { "epoch": 0.0016468080984210016, "grad_norm": 1.789952039718628, "learning_rate": 9.99991765959508e-06, "loss": 10.2656, "step": 34 }, { "epoch": 0.0016468080984210016, "eval_accuracy": 0.056085286435208145, "eval_loss": 10.2578125, "eval_runtime": 272.24, "eval_samples_per_second": 124.034, "eval_steps_per_second": 3.879, "step": 34 }, { "epoch": 0.0016952436307275016, "grad_norm": 1.7584354877471924, "learning_rate": 9.99991281604185e-06, "loss": 10.2656, "step": 35 }, { "epoch": 0.0016952436307275016, "eval_accuracy": 0.056159366640013426, "eval_loss": 10.2421875, "eval_runtime": 272.3595, "eval_samples_per_second": 123.98, "eval_steps_per_second": 3.877, "step": 35 }, { "epoch": 0.0017436791630340018, "grad_norm": 1.7618820667266846, "learning_rate": 9.999907972488618e-06, "loss": 10.2656, "step": 36 }, { "epoch": 0.0017436791630340018, "eval_accuracy": 0.057521267083571186, "eval_loss": 10.234375, "eval_runtime": 272.7034, "eval_samples_per_second": 123.823, "eval_steps_per_second": 3.872, "step": 36 }, { "epoch": 0.0017921146953405018, "grad_norm": 1.6511751413345337, "learning_rate": 9.999903128935388e-06, "loss": 10.2656, "step": 37 }, { "epoch": 0.0017921146953405018, "eval_accuracy": 0.05863849152438795, "eval_loss": 10.2265625, "eval_runtime": 273.7952, "eval_samples_per_second": 123.329, "eval_steps_per_second": 3.857, "step": 37 }, { "epoch": 0.0018405502276470018, "grad_norm": 1.8650130033493042, "learning_rate": 9.999898285382156e-06, "loss": 10.2109, "step": 38 }, { "epoch": 0.0018405502276470018, "eval_accuracy": 0.059270706292946944, "eval_loss": 10.21875, "eval_runtime": 274.2569, "eval_samples_per_second": 123.122, "eval_steps_per_second": 3.85, "step": 38 }, { "epoch": 0.001888985759953502, "grad_norm": 1.7996951341629028, "learning_rate": 9.999893441828926e-06, "loss": 10.2656, "step": 39 }, { "epoch": 0.001888985759953502, "eval_accuracy": 0.05958891247161815, "eval_loss": 10.2109375, "eval_runtime": 274.6294, "eval_samples_per_second": 122.955, "eval_steps_per_second": 3.845, "step": 39 }, { "epoch": 0.001937421292260002, "grad_norm": 1.7773430347442627, "learning_rate": 9.999888598275696e-06, "loss": 10.2266, "step": 40 }, { "epoch": 0.001937421292260002, "eval_accuracy": 0.05994316001605042, "eval_loss": 10.203125, "eval_runtime": 273.9324, "eval_samples_per_second": 123.268, "eval_steps_per_second": 3.855, "step": 40 }, { "epoch": 0.001985856824566502, "grad_norm": 1.7419933080673218, "learning_rate": 9.999883754722464e-06, "loss": 10.2109, "step": 41 }, { "epoch": 0.001985856824566502, "eval_accuracy": 0.06009832605659606, "eval_loss": 10.1953125, "eval_runtime": 274.4948, "eval_samples_per_second": 123.015, "eval_steps_per_second": 3.847, "step": 41 }, { "epoch": 0.002034292356873002, "grad_norm": 1.7278474569320679, "learning_rate": 9.999878911169234e-06, "loss": 10.2109, "step": 42 }, { "epoch": 0.002034292356873002, "eval_accuracy": 0.060353828943509456, "eval_loss": 10.1796875, "eval_runtime": 275.0795, "eval_samples_per_second": 122.754, "eval_steps_per_second": 3.839, "step": 42 }, { "epoch": 0.002082727889179502, "grad_norm": 1.8463383913040161, "learning_rate": 9.999874067616004e-06, "loss": 10.2109, "step": 43 }, { "epoch": 0.002082727889179502, "eval_accuracy": 0.060806444809914505, "eval_loss": 10.171875, "eval_runtime": 275.6513, "eval_samples_per_second": 122.499, "eval_steps_per_second": 3.831, "step": 43 }, { "epoch": 0.002131163421486002, "grad_norm": 1.8434734344482422, "learning_rate": 9.999869224062774e-06, "loss": 10.1484, "step": 44 }, { "epoch": 0.002131163421486002, "eval_accuracy": 0.061002660373873155, "eval_loss": 10.1640625, "eval_runtime": 275.2495, "eval_samples_per_second": 122.678, "eval_steps_per_second": 3.837, "step": 44 }, { "epoch": 0.002179598953792502, "grad_norm": 1.8196474313735962, "learning_rate": 9.999864380509543e-06, "loss": 10.1875, "step": 45 }, { "epoch": 0.002179598953792502, "eval_accuracy": 0.061108642253432405, "eval_loss": 10.1484375, "eval_runtime": 275.2499, "eval_samples_per_second": 122.678, "eval_steps_per_second": 3.837, "step": 45 }, { "epoch": 0.002228034486099002, "grad_norm": 1.7358877658843994, "learning_rate": 9.999859536956312e-06, "loss": 10.1719, "step": 46 }, { "epoch": 0.002228034486099002, "eval_accuracy": 0.061226840563795806, "eval_loss": 10.140625, "eval_runtime": 273.2604, "eval_samples_per_second": 123.571, "eval_steps_per_second": 3.864, "step": 46 }, { "epoch": 0.0022764700184055024, "grad_norm": 1.7613184452056885, "learning_rate": 9.999854693403081e-06, "loss": 10.1484, "step": 47 }, { "epoch": 0.0022764700184055024, "eval_accuracy": 0.06154533623134863, "eval_loss": 10.1328125, "eval_runtime": 274.8774, "eval_samples_per_second": 122.844, "eval_steps_per_second": 3.842, "step": 47 }, { "epoch": 0.0023249055507120024, "grad_norm": 1.926283597946167, "learning_rate": 9.999849849849851e-06, "loss": 10.1172, "step": 48 }, { "epoch": 0.0023249055507120024, "eval_accuracy": 0.062220076916616865, "eval_loss": 10.1171875, "eval_runtime": 275.5341, "eval_samples_per_second": 122.551, "eval_steps_per_second": 3.833, "step": 48 }, { "epoch": 0.0023733410830185024, "grad_norm": 1.7182645797729492, "learning_rate": 9.99984500629662e-06, "loss": 10.1797, "step": 49 }, { "epoch": 0.0023733410830185024, "eval_accuracy": 0.06321505020272762, "eval_loss": 10.109375, "eval_runtime": 275.3575, "eval_samples_per_second": 122.63, "eval_steps_per_second": 3.835, "step": 49 }, { "epoch": 0.0024217766153250024, "grad_norm": 1.756512999534607, "learning_rate": 9.99984016274339e-06, "loss": 10.1016, "step": 50 }, { "epoch": 0.0024217766153250024, "eval_accuracy": 0.06421378684429936, "eval_loss": 10.1015625, "eval_runtime": 274.9451, "eval_samples_per_second": 122.814, "eval_steps_per_second": 3.841, "step": 50 }, { "epoch": 0.0024702121476315024, "grad_norm": 1.8228658437728882, "learning_rate": 9.99983531919016e-06, "loss": 10.1406, "step": 51 }, { "epoch": 0.0024702121476315024, "eval_accuracy": 0.06511247612838496, "eval_loss": 10.09375, "eval_runtime": 274.7644, "eval_samples_per_second": 122.894, "eval_steps_per_second": 3.843, "step": 51 }, { "epoch": 0.0025186476799380023, "grad_norm": 1.6864567995071411, "learning_rate": 9.999830475636927e-06, "loss": 10.1406, "step": 52 }, { "epoch": 0.0025186476799380023, "eval_accuracy": 0.0658020965421682, "eval_loss": 10.0859375, "eval_runtime": 275.2863, "eval_samples_per_second": 122.661, "eval_steps_per_second": 3.836, "step": 52 }, { "epoch": 0.0025670832122445027, "grad_norm": 1.7754981517791748, "learning_rate": 9.999825632083697e-06, "loss": 10.1094, "step": 53 }, { "epoch": 0.0025670832122445027, "eval_accuracy": 0.06627448449918756, "eval_loss": 10.078125, "eval_runtime": 274.8943, "eval_samples_per_second": 122.836, "eval_steps_per_second": 3.841, "step": 53 }, { "epoch": 0.0026155187445510027, "grad_norm": 1.7636278867721558, "learning_rate": 9.999820788530467e-06, "loss": 10.1016, "step": 54 }, { "epoch": 0.0026155187445510027, "eval_accuracy": 0.06685522814459541, "eval_loss": 10.0703125, "eval_runtime": 275.7861, "eval_samples_per_second": 122.439, "eval_steps_per_second": 3.829, "step": 54 }, { "epoch": 0.0026639542768575027, "grad_norm": 1.7524579763412476, "learning_rate": 9.999815944977235e-06, "loss": 10.0781, "step": 55 }, { "epoch": 0.0026639542768575027, "eval_accuracy": 0.06716721031231189, "eval_loss": 10.0625, "eval_runtime": 274.8867, "eval_samples_per_second": 122.84, "eval_steps_per_second": 3.842, "step": 55 }, { "epoch": 0.0027123898091640027, "grad_norm": 1.8897311687469482, "learning_rate": 9.999811101424005e-06, "loss": 10.0703, "step": 56 }, { "epoch": 0.0027123898091640027, "eval_accuracy": 0.06777936350137496, "eval_loss": 10.0546875, "eval_runtime": 276.2638, "eval_samples_per_second": 122.227, "eval_steps_per_second": 3.822, "step": 56 }, { "epoch": 0.0027608253414705027, "grad_norm": 1.7320737838745117, "learning_rate": 9.999806257870775e-06, "loss": 10.0703, "step": 57 }, { "epoch": 0.0027608253414705027, "eval_accuracy": 0.06813957451676851, "eval_loss": 10.046875, "eval_runtime": 275.4873, "eval_samples_per_second": 122.572, "eval_steps_per_second": 3.833, "step": 57 }, { "epoch": 0.0028092608737770026, "grad_norm": 1.685152530670166, "learning_rate": 9.999801414317543e-06, "loss": 10.0469, "step": 58 }, { "epoch": 0.0028092608737770026, "eval_accuracy": 0.06857357624808572, "eval_loss": 10.0390625, "eval_runtime": 275.0861, "eval_samples_per_second": 122.751, "eval_steps_per_second": 3.839, "step": 58 }, { "epoch": 0.002857696406083503, "grad_norm": 1.6026166677474976, "learning_rate": 9.999796570764313e-06, "loss": 10.1016, "step": 59 }, { "epoch": 0.002857696406083503, "eval_accuracy": 0.06889201401786221, "eval_loss": 10.03125, "eval_runtime": 273.8074, "eval_samples_per_second": 123.324, "eval_steps_per_second": 3.857, "step": 59 }, { "epoch": 0.002906131938390003, "grad_norm": 1.7406948804855347, "learning_rate": 9.999791727211083e-06, "loss": 10.0547, "step": 60 }, { "epoch": 0.002906131938390003, "eval_accuracy": 0.06942965276879759, "eval_loss": 10.03125, "eval_runtime": 274.1162, "eval_samples_per_second": 123.185, "eval_steps_per_second": 3.852, "step": 60 }, { "epoch": 0.002954567470696503, "grad_norm": 2.25240421295166, "learning_rate": 9.999786883657853e-06, "loss": 10.0391, "step": 61 }, { "epoch": 0.002954567470696503, "eval_accuracy": 0.06947052859888163, "eval_loss": 10.0234375, "eval_runtime": 274.759, "eval_samples_per_second": 122.897, "eval_steps_per_second": 3.843, "step": 61 }, { "epoch": 0.003003003003003003, "grad_norm": 1.6132714748382568, "learning_rate": 9.999782040104623e-06, "loss": 10.0547, "step": 62 }, { "epoch": 0.003003003003003003, "eval_accuracy": 0.06921656000304079, "eval_loss": 10.015625, "eval_runtime": 275.2004, "eval_samples_per_second": 122.7, "eval_steps_per_second": 3.837, "step": 62 }, { "epoch": 0.003051438535309503, "grad_norm": 1.6277832984924316, "learning_rate": 9.99977719655139e-06, "loss": 10.0312, "step": 63 }, { "epoch": 0.003051438535309503, "eval_accuracy": 0.06882297091959703, "eval_loss": 10.0078125, "eval_runtime": 274.9078, "eval_samples_per_second": 122.83, "eval_steps_per_second": 3.841, "step": 63 }, { "epoch": 0.003099874067616003, "grad_norm": 1.6769694089889526, "learning_rate": 9.99977235299816e-06, "loss": 10.0547, "step": 64 }, { "epoch": 0.003099874067616003, "eval_accuracy": 0.06873357675295433, "eval_loss": 10.0, "eval_runtime": 273.9236, "eval_samples_per_second": 123.272, "eval_steps_per_second": 3.855, "step": 64 }, { "epoch": 0.0031483095999225033, "grad_norm": 1.6080327033996582, "learning_rate": 9.99976750944493e-06, "loss": 10.0547, "step": 65 }, { "epoch": 0.0031483095999225033, "eval_accuracy": 0.06925561205317066, "eval_loss": 9.9921875, "eval_runtime": 275.0761, "eval_samples_per_second": 122.755, "eval_steps_per_second": 3.839, "step": 65 }, { "epoch": 0.0031967451322290033, "grad_norm": 1.6163508892059326, "learning_rate": 9.999762665891699e-06, "loss": 9.9922, "step": 66 }, { "epoch": 0.0031967451322290033, "eval_accuracy": 0.0697469615319358, "eval_loss": 9.984375, "eval_runtime": 273.8628, "eval_samples_per_second": 123.299, "eval_steps_per_second": 3.856, "step": 66 }, { "epoch": 0.0032451806645355033, "grad_norm": 1.625279426574707, "learning_rate": 9.999757822338468e-06, "loss": 10.0234, "step": 67 }, { "epoch": 0.0032451806645355033, "eval_accuracy": 0.0704554855696885, "eval_loss": 9.9765625, "eval_runtime": 274.3158, "eval_samples_per_second": 123.095, "eval_steps_per_second": 3.85, "step": 67 }, { "epoch": 0.0032936161968420033, "grad_norm": 1.6738680601119995, "learning_rate": 9.999752978785238e-06, "loss": 10.0, "step": 68 }, { "epoch": 0.0032936161968420033, "eval_accuracy": 0.07112654974616023, "eval_loss": 9.96875, "eval_runtime": 275.9505, "eval_samples_per_second": 122.366, "eval_steps_per_second": 3.827, "step": 68 }, { "epoch": 0.0033420517291485033, "grad_norm": 1.5247821807861328, "learning_rate": 9.999748135232007e-06, "loss": 10.0, "step": 69 }, { "epoch": 0.0033420517291485033, "eval_accuracy": 0.0715033484744703, "eval_loss": 9.9609375, "eval_runtime": 274.4182, "eval_samples_per_second": 123.049, "eval_steps_per_second": 3.848, "step": 69 }, { "epoch": 0.0033904872614550032, "grad_norm": 1.8255083560943604, "learning_rate": 9.999743291678776e-06, "loss": 9.9688, "step": 70 }, { "epoch": 0.0033904872614550032, "eval_accuracy": 0.07161468589833944, "eval_loss": 9.9609375, "eval_runtime": 274.1089, "eval_samples_per_second": 123.188, "eval_steps_per_second": 3.852, "step": 70 }, { "epoch": 0.0034389227937615036, "grad_norm": 1.845422387123108, "learning_rate": 9.999738448125546e-06, "loss": 9.9922, "step": 71 }, { "epoch": 0.0034389227937615036, "eval_accuracy": 0.07169394795412562, "eval_loss": 9.953125, "eval_runtime": 275.2085, "eval_samples_per_second": 122.696, "eval_steps_per_second": 3.837, "step": 71 }, { "epoch": 0.0034873583260680036, "grad_norm": 1.663128137588501, "learning_rate": 9.999733604572314e-06, "loss": 9.9844, "step": 72 }, { "epoch": 0.0034873583260680036, "eval_accuracy": 0.07159517434771859, "eval_loss": 9.9453125, "eval_runtime": 274.6498, "eval_samples_per_second": 122.946, "eval_steps_per_second": 3.845, "step": 72 }, { "epoch": 0.0035357938583745036, "grad_norm": 1.6756772994995117, "learning_rate": 9.999728761019084e-06, "loss": 9.9688, "step": 73 }, { "epoch": 0.0035357938583745036, "eval_accuracy": 0.07181347791334446, "eval_loss": 9.9375, "eval_runtime": 274.5733, "eval_samples_per_second": 122.98, "eval_steps_per_second": 3.846, "step": 73 }, { "epoch": 0.0035842293906810036, "grad_norm": 1.746936559677124, "learning_rate": 9.999723917465854e-06, "loss": 9.9453, "step": 74 }, { "epoch": 0.0035842293906810036, "eval_accuracy": 0.07256360150338524, "eval_loss": 9.9296875, "eval_runtime": 275.0915, "eval_samples_per_second": 122.748, "eval_steps_per_second": 3.839, "step": 74 }, { "epoch": 0.0036326649229875036, "grad_norm": 1.6043540239334106, "learning_rate": 9.999719073912622e-06, "loss": 9.9375, "step": 75 }, { "epoch": 0.0036326649229875036, "eval_accuracy": 0.07335680103901034, "eval_loss": 9.921875, "eval_runtime": 274.3328, "eval_samples_per_second": 123.088, "eval_steps_per_second": 3.849, "step": 75 }, { "epoch": 0.0036811004552940035, "grad_norm": 1.6499953269958496, "learning_rate": 9.999714230359392e-06, "loss": 9.9141, "step": 76 }, { "epoch": 0.0036811004552940035, "eval_accuracy": 0.0744010163838838, "eval_loss": 9.9140625, "eval_runtime": 274.4219, "eval_samples_per_second": 123.048, "eval_steps_per_second": 3.848, "step": 76 }, { "epoch": 0.0037295359876005035, "grad_norm": 1.6161168813705444, "learning_rate": 9.999709386806162e-06, "loss": 9.9062, "step": 77 }, { "epoch": 0.0037295359876005035, "eval_accuracy": 0.07513791033203478, "eval_loss": 9.90625, "eval_runtime": 274.8621, "eval_samples_per_second": 122.851, "eval_steps_per_second": 3.842, "step": 77 }, { "epoch": 0.003777971519907004, "grad_norm": 1.760338544845581, "learning_rate": 9.999704543252932e-06, "loss": 9.9219, "step": 78 }, { "epoch": 0.003777971519907004, "eval_accuracy": 0.07549183943869726, "eval_loss": 9.90625, "eval_runtime": 273.942, "eval_samples_per_second": 123.263, "eval_steps_per_second": 3.855, "step": 78 }, { "epoch": 0.003826407052213504, "grad_norm": 2.1402640342712402, "learning_rate": 9.9996996996997e-06, "loss": 9.9219, "step": 79 }, { "epoch": 0.003826407052213504, "eval_accuracy": 0.07561446692894938, "eval_loss": 9.8984375, "eval_runtime": 273.1637, "eval_samples_per_second": 123.615, "eval_steps_per_second": 3.866, "step": 79 }, { "epoch": 0.003874842584520004, "grad_norm": 1.5549274682998657, "learning_rate": 9.99969485614647e-06, "loss": 9.9219, "step": 80 }, { "epoch": 0.003874842584520004, "eval_accuracy": 0.07565650071455989, "eval_loss": 9.890625, "eval_runtime": 274.1597, "eval_samples_per_second": 123.165, "eval_steps_per_second": 3.852, "step": 80 }, { "epoch": 0.003923278116826504, "grad_norm": 1.619598388671875, "learning_rate": 9.99969001259324e-06, "loss": 9.875, "step": 81 }, { "epoch": 0.003923278116826504, "eval_accuracy": 0.07585523483178858, "eval_loss": 9.8828125, "eval_runtime": 273.6756, "eval_samples_per_second": 123.383, "eval_steps_per_second": 3.859, "step": 81 }, { "epoch": 0.003971713649133004, "grad_norm": 1.4982187747955322, "learning_rate": 9.999685169040008e-06, "loss": 9.9219, "step": 82 }, { "epoch": 0.003971713649133004, "eval_accuracy": 0.07604369209371994, "eval_loss": 9.875, "eval_runtime": 274.2745, "eval_samples_per_second": 123.114, "eval_steps_per_second": 3.85, "step": 82 }, { "epoch": 0.004020149181439504, "grad_norm": 1.8369065523147583, "learning_rate": 9.999680325486778e-06, "loss": 9.875, "step": 83 }, { "epoch": 0.004020149181439504, "eval_accuracy": 0.07629766068956077, "eval_loss": 9.875, "eval_runtime": 273.8951, "eval_samples_per_second": 123.284, "eval_steps_per_second": 3.855, "step": 83 }, { "epoch": 0.004068584713746004, "grad_norm": 1.5859246253967285, "learning_rate": 9.999675481933548e-06, "loss": 9.8672, "step": 84 }, { "epoch": 0.004068584713746004, "eval_accuracy": 0.07654106294122268, "eval_loss": 9.8671875, "eval_runtime": 273.7145, "eval_samples_per_second": 123.366, "eval_steps_per_second": 3.858, "step": 84 }, { "epoch": 0.004117020246052504, "grad_norm": 1.527214765548706, "learning_rate": 9.999670638380316e-06, "loss": 9.9062, "step": 85 }, { "epoch": 0.004117020246052504, "eval_accuracy": 0.07687032759517157, "eval_loss": 9.859375, "eval_runtime": 273.3281, "eval_samples_per_second": 123.54, "eval_steps_per_second": 3.863, "step": 85 }, { "epoch": 0.004165455778359004, "grad_norm": 1.5885719060897827, "learning_rate": 9.999665794827086e-06, "loss": 9.8828, "step": 86 }, { "epoch": 0.004165455778359004, "eval_accuracy": 0.07730965592191048, "eval_loss": 9.8515625, "eval_runtime": 273.6316, "eval_samples_per_second": 123.403, "eval_steps_per_second": 3.859, "step": 86 }, { "epoch": 0.004213891310665505, "grad_norm": 1.7169041633605957, "learning_rate": 9.999660951273856e-06, "loss": 9.8594, "step": 87 }, { "epoch": 0.004213891310665505, "eval_accuracy": 0.07752955167638524, "eval_loss": 9.8515625, "eval_runtime": 273.4517, "eval_samples_per_second": 123.484, "eval_steps_per_second": 3.862, "step": 87 }, { "epoch": 0.004262326842972004, "grad_norm": 1.5023819208145142, "learning_rate": 9.999656107720624e-06, "loss": 9.8906, "step": 88 }, { "epoch": 0.004262326842972004, "eval_accuracy": 0.07768043328148298, "eval_loss": 9.84375, "eval_runtime": 275.7256, "eval_samples_per_second": 122.466, "eval_steps_per_second": 3.83, "step": 88 }, { "epoch": 0.0043107623752785046, "grad_norm": 1.6757872104644775, "learning_rate": 9.999651264167394e-06, "loss": 9.8047, "step": 89 }, { "epoch": 0.0043107623752785046, "eval_accuracy": 0.07773103593798929, "eval_loss": 9.8359375, "eval_runtime": 275.8678, "eval_samples_per_second": 122.403, "eval_steps_per_second": 3.828, "step": 89 }, { "epoch": 0.004359197907585004, "grad_norm": 2.2149763107299805, "learning_rate": 9.999646420614163e-06, "loss": 9.8203, "step": 90 }, { "epoch": 0.004359197907585004, "eval_accuracy": 0.07783635199312082, "eval_loss": 9.8359375, "eval_runtime": 275.5895, "eval_samples_per_second": 122.526, "eval_steps_per_second": 3.832, "step": 90 }, { "epoch": 0.0044076334398915045, "grad_norm": 1.6437429189682007, "learning_rate": 9.999641577060932e-06, "loss": 9.8594, "step": 91 }, { "epoch": 0.0044076334398915045, "eval_accuracy": 0.07813093587905225, "eval_loss": 9.828125, "eval_runtime": 274.7605, "eval_samples_per_second": 122.896, "eval_steps_per_second": 3.843, "step": 91 }, { "epoch": 0.004456068972198004, "grad_norm": 1.6756585836410522, "learning_rate": 9.999636733507701e-06, "loss": 9.8438, "step": 92 }, { "epoch": 0.004456068972198004, "eval_accuracy": 0.07858427546766132, "eval_loss": 9.8203125, "eval_runtime": 275.56, "eval_samples_per_second": 122.54, "eval_steps_per_second": 3.832, "step": 92 }, { "epoch": 0.0045045045045045045, "grad_norm": 1.6290555000305176, "learning_rate": 9.999631889954471e-06, "loss": 9.8438, "step": 93 }, { "epoch": 0.0045045045045045045, "eval_accuracy": 0.07898075943992122, "eval_loss": 9.8203125, "eval_runtime": 275.4203, "eval_samples_per_second": 122.602, "eval_steps_per_second": 3.834, "step": 93 }, { "epoch": 0.004552940036811005, "grad_norm": 1.552886724472046, "learning_rate": 9.999627046401241e-06, "loss": 9.8438, "step": 94 }, { "epoch": 0.004552940036811005, "eval_accuracy": 0.0792507946686917, "eval_loss": 9.8125, "eval_runtime": 276.2603, "eval_samples_per_second": 122.229, "eval_steps_per_second": 3.822, "step": 94 }, { "epoch": 0.004601375569117504, "grad_norm": 1.6093745231628418, "learning_rate": 9.999622202848011e-06, "loss": 9.8359, "step": 95 }, { "epoch": 0.004601375569117504, "eval_accuracy": 0.07942483538431863, "eval_loss": 9.8046875, "eval_runtime": 277.5976, "eval_samples_per_second": 121.64, "eval_steps_per_second": 3.804, "step": 95 }, { "epoch": 0.004649811101424005, "grad_norm": 1.6716474294662476, "learning_rate": 9.99961735929478e-06, "loss": 9.8281, "step": 96 }, { "epoch": 0.004649811101424005, "eval_accuracy": 0.07951587963758655, "eval_loss": 9.8046875, "eval_runtime": 277.2247, "eval_samples_per_second": 121.804, "eval_steps_per_second": 3.809, "step": 96 }, { "epoch": 0.004698246633730504, "grad_norm": 1.5188281536102295, "learning_rate": 9.999612515741549e-06, "loss": 9.8516, "step": 97 }, { "epoch": 0.004698246633730504, "eval_accuracy": 0.07964241522774047, "eval_loss": 9.796875, "eval_runtime": 277.219, "eval_samples_per_second": 121.806, "eval_steps_per_second": 3.809, "step": 97 }, { "epoch": 0.004746682166037005, "grad_norm": 1.5686155557632446, "learning_rate": 9.999607672188319e-06, "loss": 9.8281, "step": 98 }, { "epoch": 0.004746682166037005, "eval_accuracy": 0.07971328210595982, "eval_loss": 9.7890625, "eval_runtime": 276.6601, "eval_samples_per_second": 122.052, "eval_steps_per_second": 3.817, "step": 98 }, { "epoch": 0.004795117698343505, "grad_norm": 1.6188207864761353, "learning_rate": 9.999602828635087e-06, "loss": 9.7734, "step": 99 }, { "epoch": 0.004795117698343505, "eval_accuracy": 0.0798379649672714, "eval_loss": 9.7890625, "eval_runtime": 276.9636, "eval_samples_per_second": 121.919, "eval_steps_per_second": 3.813, "step": 99 }, { "epoch": 0.004843553230650005, "grad_norm": 1.6795498132705688, "learning_rate": 9.999597985081857e-06, "loss": 9.8125, "step": 100 }, { "epoch": 0.004843553230650005, "eval_accuracy": 0.08018245673639325, "eval_loss": 9.78125, "eval_runtime": 277.3903, "eval_samples_per_second": 121.731, "eval_steps_per_second": 3.807, "step": 100 }, { "epoch": 0.004891988762956505, "grad_norm": 1.516228199005127, "learning_rate": 9.999593141528627e-06, "loss": 9.8203, "step": 101 }, { "epoch": 0.004891988762956505, "eval_accuracy": 0.08056970601332963, "eval_loss": 9.7734375, "eval_runtime": 277.0892, "eval_samples_per_second": 121.863, "eval_steps_per_second": 3.811, "step": 101 }, { "epoch": 0.004940424295263005, "grad_norm": 1.485206961631775, "learning_rate": 9.999588297975395e-06, "loss": 9.8281, "step": 102 }, { "epoch": 0.004940424295263005, "eval_accuracy": 0.0809254588999463, "eval_loss": 9.7734375, "eval_runtime": 277.0597, "eval_samples_per_second": 121.876, "eval_steps_per_second": 3.811, "step": 102 }, { "epoch": 0.004988859827569505, "grad_norm": 1.6925771236419678, "learning_rate": 9.999583454422165e-06, "loss": 9.7734, "step": 103 }, { "epoch": 0.004988859827569505, "eval_accuracy": 0.081113974059654, "eval_loss": 9.765625, "eval_runtime": 276.5455, "eval_samples_per_second": 122.103, "eval_steps_per_second": 3.819, "step": 103 }, { "epoch": 0.005037295359876005, "grad_norm": 1.6215219497680664, "learning_rate": 9.999578610868935e-06, "loss": 9.7891, "step": 104 }, { "epoch": 0.005037295359876005, "eval_accuracy": 0.08127527726448987, "eval_loss": 9.7578125, "eval_runtime": 275.7199, "eval_samples_per_second": 122.469, "eval_steps_per_second": 3.83, "step": 104 }, { "epoch": 0.005085730892182505, "grad_norm": 1.5104496479034424, "learning_rate": 9.999573767315703e-06, "loss": 9.8047, "step": 105 }, { "epoch": 0.005085730892182505, "eval_accuracy": 0.08140079964355813, "eval_loss": 9.7578125, "eval_runtime": 275.8702, "eval_samples_per_second": 122.402, "eval_steps_per_second": 3.828, "step": 105 }, { "epoch": 0.0051341664244890055, "grad_norm": 1.5603739023208618, "learning_rate": 9.999568923762473e-06, "loss": 9.7578, "step": 106 }, { "epoch": 0.0051341664244890055, "eval_accuracy": 0.0814951151211883, "eval_loss": 9.75, "eval_runtime": 276.6948, "eval_samples_per_second": 122.037, "eval_steps_per_second": 3.816, "step": 106 }, { "epoch": 0.005182601956795505, "grad_norm": 1.6554555892944336, "learning_rate": 9.999564080209243e-06, "loss": 9.7734, "step": 107 }, { "epoch": 0.005182601956795505, "eval_accuracy": 0.08162946691114582, "eval_loss": 9.75, "eval_runtime": 277.5011, "eval_samples_per_second": 121.682, "eval_steps_per_second": 3.805, "step": 107 }, { "epoch": 0.0052310374891020054, "grad_norm": 1.4874709844589233, "learning_rate": 9.99955923665601e-06, "loss": 9.7891, "step": 108 }, { "epoch": 0.0052310374891020054, "eval_accuracy": 0.08175817366791184, "eval_loss": 9.7421875, "eval_runtime": 277.2974, "eval_samples_per_second": 121.772, "eval_steps_per_second": 3.808, "step": 108 }, { "epoch": 0.005279473021408505, "grad_norm": 1.5930671691894531, "learning_rate": 9.99955439310278e-06, "loss": 9.75, "step": 109 }, { "epoch": 0.005279473021408505, "eval_accuracy": 0.081921676988248, "eval_loss": 9.734375, "eval_runtime": 278.2891, "eval_samples_per_second": 121.338, "eval_steps_per_second": 3.795, "step": 109 }, { "epoch": 0.005327908553715005, "grad_norm": 1.7005099058151245, "learning_rate": 9.99954954954955e-06, "loss": 9.75, "step": 110 }, { "epoch": 0.005327908553715005, "eval_accuracy": 0.08213621719841287, "eval_loss": 9.734375, "eval_runtime": 276.9822, "eval_samples_per_second": 121.91, "eval_steps_per_second": 3.813, "step": 110 }, { "epoch": 0.005376344086021506, "grad_norm": 1.5735907554626465, "learning_rate": 9.99954470599632e-06, "loss": 9.7266, "step": 111 }, { "epoch": 0.005376344086021506, "eval_accuracy": 0.08227051109059406, "eval_loss": 9.7265625, "eval_runtime": 277.2678, "eval_samples_per_second": 121.785, "eval_steps_per_second": 3.809, "step": 111 }, { "epoch": 0.005424779618328005, "grad_norm": 1.473027229309082, "learning_rate": 9.99953986244309e-06, "loss": 9.7656, "step": 112 }, { "epoch": 0.005424779618328005, "eval_accuracy": 0.08236607137041518, "eval_loss": 9.71875, "eval_runtime": 276.699, "eval_samples_per_second": 122.035, "eval_steps_per_second": 3.816, "step": 112 }, { "epoch": 0.005473215150634506, "grad_norm": 1.4636644124984741, "learning_rate": 9.999535018889858e-06, "loss": 9.7812, "step": 113 }, { "epoch": 0.005473215150634506, "eval_accuracy": 0.0823751613212979, "eval_loss": 9.71875, "eval_runtime": 276.4525, "eval_samples_per_second": 122.144, "eval_steps_per_second": 3.82, "step": 113 }, { "epoch": 0.005521650682941005, "grad_norm": 1.4979418516159058, "learning_rate": 9.999530175336628e-06, "loss": 9.7734, "step": 114 }, { "epoch": 0.005521650682941005, "eval_accuracy": 0.08236062897944081, "eval_loss": 9.7109375, "eval_runtime": 277.8616, "eval_samples_per_second": 121.525, "eval_steps_per_second": 3.8, "step": 114 }, { "epoch": 0.005570086215247506, "grad_norm": 1.8021794557571411, "learning_rate": 9.999525331783398e-06, "loss": 9.7266, "step": 115 }, { "epoch": 0.005570086215247506, "eval_accuracy": 0.08244796777502407, "eval_loss": 9.7109375, "eval_runtime": 277.0309, "eval_samples_per_second": 121.889, "eval_steps_per_second": 3.812, "step": 115 }, { "epoch": 0.005618521747554005, "grad_norm": 1.8129605054855347, "learning_rate": 9.999520488230166e-06, "loss": 9.7266, "step": 116 }, { "epoch": 0.005618521747554005, "eval_accuracy": 0.08262273221285504, "eval_loss": 9.703125, "eval_runtime": 278.4245, "eval_samples_per_second": 121.279, "eval_steps_per_second": 3.793, "step": 116 }, { "epoch": 0.005666957279860506, "grad_norm": 1.5428948402404785, "learning_rate": 9.999515644676936e-06, "loss": 9.7109, "step": 117 }, { "epoch": 0.005666957279860506, "eval_accuracy": 0.08277873777115737, "eval_loss": 9.6953125, "eval_runtime": 276.3715, "eval_samples_per_second": 122.18, "eval_steps_per_second": 3.821, "step": 117 }, { "epoch": 0.005715392812167006, "grad_norm": 1.7619973421096802, "learning_rate": 9.999510801123706e-06, "loss": 9.6719, "step": 118 }, { "epoch": 0.005715392812167006, "eval_accuracy": 0.08289928094146184, "eval_loss": 9.6953125, "eval_runtime": 277.665, "eval_samples_per_second": 121.611, "eval_steps_per_second": 3.803, "step": 118 }, { "epoch": 0.005763828344473506, "grad_norm": 1.5316611528396606, "learning_rate": 9.999505957570474e-06, "loss": 9.6953, "step": 119 }, { "epoch": 0.005763828344473506, "eval_accuracy": 0.08300393117216567, "eval_loss": 9.6875, "eval_runtime": 276.1306, "eval_samples_per_second": 122.286, "eval_steps_per_second": 3.824, "step": 119 }, { "epoch": 0.005812263876780006, "grad_norm": 1.7051466703414917, "learning_rate": 9.999501114017244e-06, "loss": 9.6719, "step": 120 }, { "epoch": 0.005812263876780006, "eval_accuracy": 0.08307213475267416, "eval_loss": 9.6875, "eval_runtime": 276.7696, "eval_samples_per_second": 122.004, "eval_steps_per_second": 3.815, "step": 120 }, { "epoch": 0.005860699409086506, "grad_norm": 1.6584818363189697, "learning_rate": 9.999496270464012e-06, "loss": 9.6953, "step": 121 }, { "epoch": 0.005860699409086506, "eval_accuracy": 0.08307086100159505, "eval_loss": 9.6796875, "eval_runtime": 277.4403, "eval_samples_per_second": 121.709, "eval_steps_per_second": 3.806, "step": 121 }, { "epoch": 0.005909134941393006, "grad_norm": 1.7079665660858154, "learning_rate": 9.999491426910782e-06, "loss": 9.6875, "step": 122 }, { "epoch": 0.005909134941393006, "eval_accuracy": 0.08310041781640795, "eval_loss": 9.6796875, "eval_runtime": 275.7807, "eval_samples_per_second": 122.442, "eval_steps_per_second": 3.829, "step": 122 }, { "epoch": 0.0059575704736995055, "grad_norm": 1.6613987684249878, "learning_rate": 9.999486583357552e-06, "loss": 9.6719, "step": 123 }, { "epoch": 0.0059575704736995055, "eval_accuracy": 0.08318772766310303, "eval_loss": 9.671875, "eval_runtime": 277.1478, "eval_samples_per_second": 121.838, "eval_steps_per_second": 3.81, "step": 123 }, { "epoch": 0.006006006006006006, "grad_norm": 1.5512877702713013, "learning_rate": 9.99948173980432e-06, "loss": 9.6719, "step": 124 }, { "epoch": 0.006006006006006006, "eval_accuracy": 0.08327202682542932, "eval_loss": 9.6640625, "eval_runtime": 276.8801, "eval_samples_per_second": 121.955, "eval_steps_per_second": 3.814, "step": 124 }, { "epoch": 0.006054441538312506, "grad_norm": 1.6818300485610962, "learning_rate": 9.99947689625109e-06, "loss": 9.625, "step": 125 }, { "epoch": 0.006054441538312506, "eval_accuracy": 0.08333837767709547, "eval_loss": 9.6640625, "eval_runtime": 275.4944, "eval_samples_per_second": 122.569, "eval_steps_per_second": 3.833, "step": 125 }, { "epoch": 0.006102877070619006, "grad_norm": 1.497159719467163, "learning_rate": 9.99947205269786e-06, "loss": 9.6719, "step": 126 }, { "epoch": 0.006102877070619006, "eval_accuracy": 0.08344485168775347, "eval_loss": 9.65625, "eval_runtime": 276.6512, "eval_samples_per_second": 122.056, "eval_steps_per_second": 3.817, "step": 126 }, { "epoch": 0.006151312602925506, "grad_norm": 1.4452403783798218, "learning_rate": 9.99946720914463e-06, "loss": 9.6953, "step": 127 }, { "epoch": 0.006151312602925506, "eval_accuracy": 0.08355928664265588, "eval_loss": 9.65625, "eval_runtime": 277.0928, "eval_samples_per_second": 121.862, "eval_steps_per_second": 3.811, "step": 127 }, { "epoch": 0.006199748135232006, "grad_norm": 1.4734400510787964, "learning_rate": 9.9994623655914e-06, "loss": 9.6719, "step": 128 }, { "epoch": 0.006199748135232006, "eval_accuracy": 0.08367464796197946, "eval_loss": 9.6484375, "eval_runtime": 278.2759, "eval_samples_per_second": 121.344, "eval_steps_per_second": 3.795, "step": 128 }, { "epoch": 0.006248183667538506, "grad_norm": 1.4783730506896973, "learning_rate": 9.999457522038168e-06, "loss": 9.6797, "step": 129 }, { "epoch": 0.006248183667538506, "eval_accuracy": 0.08380396264539687, "eval_loss": 9.640625, "eval_runtime": 276.6233, "eval_samples_per_second": 122.069, "eval_steps_per_second": 3.817, "step": 129 }, { "epoch": 0.006296619199845007, "grad_norm": 1.7012325525283813, "learning_rate": 9.999452678484938e-06, "loss": 9.6484, "step": 130 }, { "epoch": 0.006296619199845007, "eval_accuracy": 0.08385152566864622, "eval_loss": 9.640625, "eval_runtime": 276.6061, "eval_samples_per_second": 122.076, "eval_steps_per_second": 3.818, "step": 130 }, { "epoch": 0.006345054732151506, "grad_norm": 1.5358777046203613, "learning_rate": 9.999447834931707e-06, "loss": 9.6719, "step": 131 }, { "epoch": 0.006345054732151506, "eval_accuracy": 0.0839065864539294, "eval_loss": 9.6328125, "eval_runtime": 276.1031, "eval_samples_per_second": 122.299, "eval_steps_per_second": 3.825, "step": 131 }, { "epoch": 0.006393490264458007, "grad_norm": 1.5622602701187134, "learning_rate": 9.999442991378476e-06, "loss": 9.6328, "step": 132 }, { "epoch": 0.006393490264458007, "eval_accuracy": 0.08391194199823927, "eval_loss": 9.6328125, "eval_runtime": 276.4355, "eval_samples_per_second": 122.151, "eval_steps_per_second": 3.82, "step": 132 }, { "epoch": 0.006441925796764506, "grad_norm": 1.5135513544082642, "learning_rate": 9.999438147825245e-06, "loss": 9.6719, "step": 133 }, { "epoch": 0.006441925796764506, "eval_accuracy": 0.08392389788904997, "eval_loss": 9.625, "eval_runtime": 277.1805, "eval_samples_per_second": 121.823, "eval_steps_per_second": 3.81, "step": 133 }, { "epoch": 0.006490361329071007, "grad_norm": 1.4829246997833252, "learning_rate": 9.999433304272015e-06, "loss": 9.6484, "step": 134 }, { "epoch": 0.006490361329071007, "eval_accuracy": 0.08400426000258629, "eval_loss": 9.6171875, "eval_runtime": 276.9095, "eval_samples_per_second": 121.942, "eval_steps_per_second": 3.814, "step": 134 }, { "epoch": 0.006538796861377506, "grad_norm": 1.506585955619812, "learning_rate": 9.999428460718784e-06, "loss": 9.6406, "step": 135 }, { "epoch": 0.006538796861377506, "eval_accuracy": 0.084119823964127, "eval_loss": 9.6171875, "eval_runtime": 275.7802, "eval_samples_per_second": 122.442, "eval_steps_per_second": 3.829, "step": 135 }, { "epoch": 0.0065872323936840066, "grad_norm": 1.597743272781372, "learning_rate": 9.999423617165553e-06, "loss": 9.6094, "step": 136 }, { "epoch": 0.0065872323936840066, "eval_accuracy": 0.08430503895058428, "eval_loss": 9.609375, "eval_runtime": 278.6385, "eval_samples_per_second": 121.186, "eval_steps_per_second": 3.79, "step": 136 }, { "epoch": 0.006635667925990507, "grad_norm": 1.5326935052871704, "learning_rate": 9.999418773612323e-06, "loss": 9.625, "step": 137 }, { "epoch": 0.006635667925990507, "eval_accuracy": 0.08447033710198644, "eval_loss": 9.609375, "eval_runtime": 276.8784, "eval_samples_per_second": 121.956, "eval_steps_per_second": 3.814, "step": 137 }, { "epoch": 0.0066841034582970065, "grad_norm": 1.5170117616653442, "learning_rate": 9.999413930059091e-06, "loss": 9.6562, "step": 138 }, { "epoch": 0.0066841034582970065, "eval_accuracy": 0.0845831509191518, "eval_loss": 9.6015625, "eval_runtime": 276.8242, "eval_samples_per_second": 121.98, "eval_steps_per_second": 3.815, "step": 138 }, { "epoch": 0.006732538990603507, "grad_norm": 1.5148200988769531, "learning_rate": 9.999409086505861e-06, "loss": 9.6172, "step": 139 }, { "epoch": 0.006732538990603507, "eval_accuracy": 0.0846733846035512, "eval_loss": 9.6015625, "eval_runtime": 276.0924, "eval_samples_per_second": 122.303, "eval_steps_per_second": 3.825, "step": 139 }, { "epoch": 0.0067809745229100065, "grad_norm": 1.584030032157898, "learning_rate": 9.999404242952631e-06, "loss": 9.6094, "step": 140 }, { "epoch": 0.0067809745229100065, "eval_accuracy": 0.08471049707817424, "eval_loss": 9.59375, "eval_runtime": 276.8263, "eval_samples_per_second": 121.979, "eval_steps_per_second": 3.815, "step": 140 }, { "epoch": 0.006829410055216507, "grad_norm": 1.5023019313812256, "learning_rate": 9.9993993993994e-06, "loss": 9.6562, "step": 141 }, { "epoch": 0.006829410055216507, "eval_accuracy": 0.08469269351195492, "eval_loss": 9.5859375, "eval_runtime": 276.3818, "eval_samples_per_second": 122.175, "eval_steps_per_second": 3.821, "step": 141 }, { "epoch": 0.006877845587523007, "grad_norm": 1.5090259313583374, "learning_rate": 9.99939455584617e-06, "loss": 9.6562, "step": 142 }, { "epoch": 0.006877845587523007, "eval_accuracy": 0.08472097657568871, "eval_loss": 9.5859375, "eval_runtime": 277.9886, "eval_samples_per_second": 121.469, "eval_steps_per_second": 3.799, "step": 142 }, { "epoch": 0.006926281119829507, "grad_norm": 1.4967498779296875, "learning_rate": 9.999389712292939e-06, "loss": 9.6562, "step": 143 }, { "epoch": 0.006926281119829507, "eval_accuracy": 0.0847566126570155, "eval_loss": 9.578125, "eval_runtime": 276.5132, "eval_samples_per_second": 122.117, "eval_steps_per_second": 3.819, "step": 143 }, { "epoch": 0.006974716652136007, "grad_norm": 1.8095794916152954, "learning_rate": 9.999384868739709e-06, "loss": 9.6016, "step": 144 }, { "epoch": 0.006974716652136007, "eval_accuracy": 0.08490364405998777, "eval_loss": 9.578125, "eval_runtime": 276.1595, "eval_samples_per_second": 122.274, "eval_steps_per_second": 3.824, "step": 144 }, { "epoch": 0.007023152184442507, "grad_norm": 1.7810986042022705, "learning_rate": 9.999380025186479e-06, "loss": 9.6094, "step": 145 }, { "epoch": 0.007023152184442507, "eval_accuracy": 0.08503437723892511, "eval_loss": 9.5703125, "eval_runtime": 276.2917, "eval_samples_per_second": 122.215, "eval_steps_per_second": 3.822, "step": 145 }, { "epoch": 0.007071587716749007, "grad_norm": 1.5788795948028564, "learning_rate": 9.999375181633247e-06, "loss": 9.5938, "step": 146 }, { "epoch": 0.007071587716749007, "eval_accuracy": 0.08506937644471235, "eval_loss": 9.5703125, "eval_runtime": 276.3857, "eval_samples_per_second": 122.173, "eval_steps_per_second": 3.821, "step": 146 }, { "epoch": 0.007120023249055507, "grad_norm": 1.8074451684951782, "learning_rate": 9.999370338080017e-06, "loss": 9.5703, "step": 147 }, { "epoch": 0.007120023249055507, "eval_accuracy": 0.08507481883568672, "eval_loss": 9.5625, "eval_runtime": 274.2982, "eval_samples_per_second": 123.103, "eval_steps_per_second": 3.85, "step": 147 }, { "epoch": 0.007168458781362007, "grad_norm": 1.7187494039535522, "learning_rate": 9.999365494526787e-06, "loss": 9.5859, "step": 148 }, { "epoch": 0.007168458781362007, "eval_accuracy": 0.08513216658313465, "eval_loss": 9.5625, "eval_runtime": 275.5423, "eval_samples_per_second": 122.547, "eval_steps_per_second": 3.832, "step": 148 }, { "epoch": 0.007216894313668508, "grad_norm": 1.6044690608978271, "learning_rate": 9.999360650973555e-06, "loss": 9.625, "step": 149 }, { "epoch": 0.007216894313668508, "eval_accuracy": 0.08522213972754059, "eval_loss": 9.5546875, "eval_runtime": 275.5733, "eval_samples_per_second": 122.534, "eval_steps_per_second": 3.832, "step": 149 }, { "epoch": 0.007265329845975007, "grad_norm": 1.7572296857833862, "learning_rate": 9.999355807420325e-06, "loss": 9.5859, "step": 150 }, { "epoch": 0.007265329845975007, "eval_accuracy": 0.0853520623376094, "eval_loss": 9.546875, "eval_runtime": 275.4002, "eval_samples_per_second": 122.611, "eval_steps_per_second": 3.834, "step": 150 }, { "epoch": 0.0073137653782815075, "grad_norm": 1.5954887866973877, "learning_rate": 9.999350963867095e-06, "loss": 9.5625, "step": 151 }, { "epoch": 0.0073137653782815075, "eval_accuracy": 0.0855014675494109, "eval_loss": 9.546875, "eval_runtime": 275.8854, "eval_samples_per_second": 122.395, "eval_steps_per_second": 3.828, "step": 151 }, { "epoch": 0.007362200910588007, "grad_norm": 1.6131614446640015, "learning_rate": 9.999346120313863e-06, "loss": 9.5547, "step": 152 }, { "epoch": 0.007362200910588007, "eval_accuracy": 0.0856389168704017, "eval_loss": 9.5390625, "eval_runtime": 276.1363, "eval_samples_per_second": 122.284, "eval_steps_per_second": 3.824, "step": 152 }, { "epoch": 0.0074106364428945075, "grad_norm": 1.4832433462142944, "learning_rate": 9.999341276760633e-06, "loss": 9.5703, "step": 153 }, { "epoch": 0.0074106364428945075, "eval_accuracy": 0.08576559720499642, "eval_loss": 9.5390625, "eval_runtime": 274.8867, "eval_samples_per_second": 122.84, "eval_steps_per_second": 3.842, "step": 153 }, { "epoch": 0.007459071975201007, "grad_norm": 1.7311336994171143, "learning_rate": 9.999336433207402e-06, "loss": 9.5391, "step": 154 }, { "epoch": 0.007459071975201007, "eval_accuracy": 0.08582592668792499, "eval_loss": 9.53125, "eval_runtime": 275.7658, "eval_samples_per_second": 122.448, "eval_steps_per_second": 3.829, "step": 154 }, { "epoch": 0.0075075075075075074, "grad_norm": 1.9239146709442139, "learning_rate": 9.99933158965417e-06, "loss": 9.5391, "step": 155 }, { "epoch": 0.0075075075075075074, "eval_accuracy": 0.08592603194318746, "eval_loss": 9.53125, "eval_runtime": 274.2731, "eval_samples_per_second": 123.115, "eval_steps_per_second": 3.85, "step": 155 }, { "epoch": 0.007555943039814008, "grad_norm": 1.8369977474212646, "learning_rate": 9.99932674610094e-06, "loss": 9.5, "step": 156 }, { "epoch": 0.007555943039814008, "eval_accuracy": 0.08610542820312428, "eval_loss": 9.5234375, "eval_runtime": 274.5904, "eval_samples_per_second": 122.972, "eval_steps_per_second": 3.846, "step": 156 }, { "epoch": 0.007604378572120507, "grad_norm": 1.5703845024108887, "learning_rate": 9.99932190254771e-06, "loss": 9.5547, "step": 157 }, { "epoch": 0.007604378572120507, "eval_accuracy": 0.08628861676741025, "eval_loss": 9.515625, "eval_runtime": 275.108, "eval_samples_per_second": 122.741, "eval_steps_per_second": 3.838, "step": 157 }, { "epoch": 0.007652814104427008, "grad_norm": 1.5686722993850708, "learning_rate": 9.999317058994478e-06, "loss": 9.5391, "step": 158 }, { "epoch": 0.007652814104427008, "eval_accuracy": 0.08633840885504802, "eval_loss": 9.515625, "eval_runtime": 274.8868, "eval_samples_per_second": 122.84, "eval_steps_per_second": 3.842, "step": 158 }, { "epoch": 0.007701249636733507, "grad_norm": 1.6259181499481201, "learning_rate": 9.999312215441248e-06, "loss": 9.5312, "step": 159 }, { "epoch": 0.007701249636733507, "eval_accuracy": 0.08635719668346484, "eval_loss": 9.515625, "eval_runtime": 274.9284, "eval_samples_per_second": 122.821, "eval_steps_per_second": 3.841, "step": 159 }, { "epoch": 0.007749685169040008, "grad_norm": 1.6887496709823608, "learning_rate": 9.999307371888018e-06, "loss": 9.5391, "step": 160 }, { "epoch": 0.007749685169040008, "eval_accuracy": 0.08644670664566019, "eval_loss": 9.5078125, "eval_runtime": 274.7492, "eval_samples_per_second": 122.901, "eval_steps_per_second": 3.844, "step": 160 }, { "epoch": 0.007798120701346508, "grad_norm": 1.6951507329940796, "learning_rate": 9.999302528334788e-06, "loss": 9.4688, "step": 161 }, { "epoch": 0.007798120701346508, "eval_accuracy": 0.08658233218669682, "eval_loss": 9.5, "eval_runtime": 275.2855, "eval_samples_per_second": 122.662, "eval_steps_per_second": 3.836, "step": 161 }, { "epoch": 0.007846556233653008, "grad_norm": 1.4970242977142334, "learning_rate": 9.999297684781556e-06, "loss": 9.5547, "step": 162 }, { "epoch": 0.007846556233653008, "eval_accuracy": 0.0867301741585376, "eval_loss": 9.5, "eval_runtime": 272.7321, "eval_samples_per_second": 123.81, "eval_steps_per_second": 3.872, "step": 162 }, { "epoch": 0.007894991765959508, "grad_norm": 1.5665501356124878, "learning_rate": 9.999292841228326e-06, "loss": 9.5078, "step": 163 }, { "epoch": 0.007894991765959508, "eval_accuracy": 0.08686166000856714, "eval_loss": 9.4921875, "eval_runtime": 274.5735, "eval_samples_per_second": 122.98, "eval_steps_per_second": 3.846, "step": 163 }, { "epoch": 0.007943427298266009, "grad_norm": 1.5631929636001587, "learning_rate": 9.999287997675096e-06, "loss": 9.5078, "step": 164 }, { "epoch": 0.007943427298266009, "eval_accuracy": 0.08703300847759506, "eval_loss": 9.4921875, "eval_runtime": 275.4664, "eval_samples_per_second": 122.581, "eval_steps_per_second": 3.833, "step": 164 }, { "epoch": 0.007991862830572507, "grad_norm": 1.5439754724502563, "learning_rate": 9.999283154121864e-06, "loss": 9.5, "step": 165 }, { "epoch": 0.007991862830572507, "eval_accuracy": 0.08722841347268517, "eval_loss": 9.484375, "eval_runtime": 275.1068, "eval_samples_per_second": 122.741, "eval_steps_per_second": 3.839, "step": 165 }, { "epoch": 0.008040298362879008, "grad_norm": 1.5011335611343384, "learning_rate": 9.999278310568634e-06, "loss": 9.5312, "step": 166 }, { "epoch": 0.008040298362879008, "eval_accuracy": 0.08746153886904974, "eval_loss": 9.484375, "eval_runtime": 274.8226, "eval_samples_per_second": 122.868, "eval_steps_per_second": 3.842, "step": 166 }, { "epoch": 0.008088733895185508, "grad_norm": 1.5114489793777466, "learning_rate": 9.999273467015404e-06, "loss": 9.5156, "step": 167 }, { "epoch": 0.008088733895185508, "eval_accuracy": 0.08767709229030025, "eval_loss": 9.4765625, "eval_runtime": 275.1958, "eval_samples_per_second": 122.702, "eval_steps_per_second": 3.837, "step": 167 }, { "epoch": 0.008137169427492008, "grad_norm": 1.6843675374984741, "learning_rate": 9.999268623462172e-06, "loss": 9.4844, "step": 168 }, { "epoch": 0.008137169427492008, "eval_accuracy": 0.08782279204441709, "eval_loss": 9.4765625, "eval_runtime": 275.685, "eval_samples_per_second": 122.484, "eval_steps_per_second": 3.83, "step": 168 }, { "epoch": 0.008185604959798509, "grad_norm": 1.6421033143997192, "learning_rate": 9.999263779908942e-06, "loss": 9.4688, "step": 169 }, { "epoch": 0.008185604959798509, "eval_accuracy": 0.08784629854160422, "eval_loss": 9.46875, "eval_runtime": 276.249, "eval_samples_per_second": 122.234, "eval_steps_per_second": 3.823, "step": 169 }, { "epoch": 0.008234040492105008, "grad_norm": 1.6387994289398193, "learning_rate": 9.999258936355712e-06, "loss": 9.5156, "step": 170 }, { "epoch": 0.008234040492105008, "eval_accuracy": 0.08786224937898121, "eval_loss": 9.4609375, "eval_runtime": 276.6173, "eval_samples_per_second": 122.071, "eval_steps_per_second": 3.818, "step": 170 }, { "epoch": 0.008282476024411508, "grad_norm": 1.5107547044754028, "learning_rate": 9.99925409280248e-06, "loss": 9.4922, "step": 171 }, { "epoch": 0.008282476024411508, "eval_accuracy": 0.08785258045033527, "eval_loss": 9.4609375, "eval_runtime": 276.9505, "eval_samples_per_second": 121.924, "eval_steps_per_second": 3.813, "step": 171 }, { "epoch": 0.008330911556718008, "grad_norm": 1.5190666913986206, "learning_rate": 9.99924924924925e-06, "loss": 9.4844, "step": 172 }, { "epoch": 0.008330911556718008, "eval_accuracy": 0.08782476056881207, "eval_loss": 9.453125, "eval_runtime": 275.3735, "eval_samples_per_second": 122.623, "eval_steps_per_second": 3.835, "step": 172 }, { "epoch": 0.008379347089024509, "grad_norm": 1.560573935508728, "learning_rate": 9.99924440569602e-06, "loss": 9.5234, "step": 173 }, { "epoch": 0.008379347089024509, "eval_accuracy": 0.08786801020772535, "eval_loss": 9.453125, "eval_runtime": 276.2814, "eval_samples_per_second": 122.22, "eval_steps_per_second": 3.822, "step": 173 }, { "epoch": 0.00842778262133101, "grad_norm": 1.7032357454299927, "learning_rate": 9.999239562142788e-06, "loss": 9.4844, "step": 174 }, { "epoch": 0.00842778262133101, "eval_accuracy": 0.08790746754228948, "eval_loss": 9.4453125, "eval_runtime": 275.8479, "eval_samples_per_second": 122.412, "eval_steps_per_second": 3.828, "step": 174 }, { "epoch": 0.008476218153637508, "grad_norm": 1.550713300704956, "learning_rate": 9.999234718589558e-06, "loss": 9.4219, "step": 175 }, { "epoch": 0.008476218153637508, "eval_accuracy": 0.08798829283803639, "eval_loss": 9.4453125, "eval_runtime": 275.7584, "eval_samples_per_second": 122.451, "eval_steps_per_second": 3.829, "step": 175 }, { "epoch": 0.008524653685944008, "grad_norm": 1.6866670846939087, "learning_rate": 9.999229875036328e-06, "loss": 9.4062, "step": 176 }, { "epoch": 0.008524653685944008, "eval_accuracy": 0.08809621429310245, "eval_loss": 9.4375, "eval_runtime": 276.5351, "eval_samples_per_second": 122.107, "eval_steps_per_second": 3.819, "step": 176 }, { "epoch": 0.008573089218250509, "grad_norm": 1.622749924659729, "learning_rate": 9.999225031483096e-06, "loss": 9.4375, "step": 177 }, { "epoch": 0.008573089218250509, "eval_accuracy": 0.08827711589522366, "eval_loss": 9.4375, "eval_runtime": 278.1964, "eval_samples_per_second": 121.378, "eval_steps_per_second": 3.796, "step": 177 }, { "epoch": 0.008621524750557009, "grad_norm": 1.5966665744781494, "learning_rate": 9.999220187929867e-06, "loss": 9.4375, "step": 178 }, { "epoch": 0.008621524750557009, "eval_accuracy": 0.08849081658763186, "eval_loss": 9.4296875, "eval_runtime": 277.208, "eval_samples_per_second": 121.811, "eval_steps_per_second": 3.809, "step": 178 }, { "epoch": 0.00866996028286351, "grad_norm": 1.499353289604187, "learning_rate": 9.999215344376635e-06, "loss": 9.4688, "step": 179 }, { "epoch": 0.00866996028286351, "eval_accuracy": 0.08868196609616225, "eval_loss": 9.4296875, "eval_runtime": 276.9344, "eval_samples_per_second": 121.931, "eval_steps_per_second": 3.813, "step": 179 }, { "epoch": 0.008718395815170008, "grad_norm": 1.5957542657852173, "learning_rate": 9.999210500823405e-06, "loss": 9.4453, "step": 180 }, { "epoch": 0.008718395815170008, "eval_accuracy": 0.08884607734314978, "eval_loss": 9.421875, "eval_runtime": 277.5515, "eval_samples_per_second": 121.66, "eval_steps_per_second": 3.805, "step": 180 }, { "epoch": 0.008766831347476509, "grad_norm": 1.519926905632019, "learning_rate": 9.999205657270175e-06, "loss": 9.4219, "step": 181 }, { "epoch": 0.008766831347476509, "eval_accuracy": 0.0889903586017467, "eval_loss": 9.421875, "eval_runtime": 277.0317, "eval_samples_per_second": 121.889, "eval_steps_per_second": 3.812, "step": 181 }, { "epoch": 0.008815266879783009, "grad_norm": 1.5913316011428833, "learning_rate": 9.999200813716943e-06, "loss": 9.4141, "step": 182 }, { "epoch": 0.008815266879783009, "eval_accuracy": 0.08903491094062725, "eval_loss": 9.4140625, "eval_runtime": 277.095, "eval_samples_per_second": 121.861, "eval_steps_per_second": 3.811, "step": 182 }, { "epoch": 0.00886370241208951, "grad_norm": 1.5328583717346191, "learning_rate": 9.999195970163713e-06, "loss": 9.4375, "step": 183 }, { "epoch": 0.00886370241208951, "eval_accuracy": 0.08903395562731792, "eval_loss": 9.40625, "eval_runtime": 276.2305, "eval_samples_per_second": 122.242, "eval_steps_per_second": 3.823, "step": 183 }, { "epoch": 0.008912137944396008, "grad_norm": 1.5967031717300415, "learning_rate": 9.999191126610483e-06, "loss": 9.3984, "step": 184 }, { "epoch": 0.008912137944396008, "eval_accuracy": 0.08904220606044394, "eval_loss": 9.40625, "eval_runtime": 278.9395, "eval_samples_per_second": 121.055, "eval_steps_per_second": 3.786, "step": 184 }, { "epoch": 0.008960573476702509, "grad_norm": 1.596799612045288, "learning_rate": 9.999186283057251e-06, "loss": 9.4297, "step": 185 }, { "epoch": 0.008960573476702509, "eval_accuracy": 0.08908192393500153, "eval_loss": 9.3984375, "eval_runtime": 277.1703, "eval_samples_per_second": 121.828, "eval_steps_per_second": 3.81, "step": 185 }, { "epoch": 0.009009009009009009, "grad_norm": 1.5406758785247803, "learning_rate": 9.999181439504021e-06, "loss": 9.3984, "step": 186 }, { "epoch": 0.009009009009009009, "eval_accuracy": 0.089139011142456, "eval_loss": 9.3984375, "eval_runtime": 276.7972, "eval_samples_per_second": 121.992, "eval_steps_per_second": 3.815, "step": 186 }, { "epoch": 0.00905744454131551, "grad_norm": 1.6137006282806396, "learning_rate": 9.999176595950791e-06, "loss": 9.3906, "step": 187 }, { "epoch": 0.00905744454131551, "eval_accuracy": 0.08919705366321981, "eval_loss": 9.390625, "eval_runtime": 277.048, "eval_samples_per_second": 121.881, "eval_steps_per_second": 3.812, "step": 187 }, { "epoch": 0.00910588007362201, "grad_norm": 1.5155887603759766, "learning_rate": 9.999171752397559e-06, "loss": 9.4219, "step": 188 }, { "epoch": 0.00910588007362201, "eval_accuracy": 0.08929362715412657, "eval_loss": 9.390625, "eval_runtime": 277.0594, "eval_samples_per_second": 121.876, "eval_steps_per_second": 3.811, "step": 188 }, { "epoch": 0.009154315605928508, "grad_norm": 1.7281869649887085, "learning_rate": 9.999166908844329e-06, "loss": 9.4062, "step": 189 }, { "epoch": 0.009154315605928508, "eval_accuracy": 0.08947652622953092, "eval_loss": 9.3828125, "eval_runtime": 277.1329, "eval_samples_per_second": 121.844, "eval_steps_per_second": 3.81, "step": 189 }, { "epoch": 0.009202751138235009, "grad_norm": 1.5536915063858032, "learning_rate": 9.999162065291099e-06, "loss": 9.375, "step": 190 }, { "epoch": 0.009202751138235009, "eval_accuracy": 0.08965256441844101, "eval_loss": 9.3828125, "eval_runtime": 277.1552, "eval_samples_per_second": 121.834, "eval_steps_per_second": 3.81, "step": 190 }, { "epoch": 0.00925118667054151, "grad_norm": 1.6295173168182373, "learning_rate": 9.999157221737867e-06, "loss": 9.3828, "step": 191 }, { "epoch": 0.00925118667054151, "eval_accuracy": 0.08979913263920268, "eval_loss": 9.375, "eval_runtime": 276.596, "eval_samples_per_second": 122.081, "eval_steps_per_second": 3.818, "step": 191 }, { "epoch": 0.00929962220284801, "grad_norm": 1.5873547792434692, "learning_rate": 9.999152378184637e-06, "loss": 9.3906, "step": 192 }, { "epoch": 0.00929962220284801, "eval_accuracy": 0.08980952529005266, "eval_loss": 9.375, "eval_runtime": 275.7787, "eval_samples_per_second": 122.442, "eval_steps_per_second": 3.829, "step": 192 }, { "epoch": 0.00934805773515451, "grad_norm": 1.4720993041992188, "learning_rate": 9.999147534631407e-06, "loss": 9.3906, "step": 193 }, { "epoch": 0.00934805773515451, "eval_accuracy": 0.08985268808230146, "eval_loss": 9.3671875, "eval_runtime": 276.9238, "eval_samples_per_second": 121.936, "eval_steps_per_second": 3.813, "step": 193 }, { "epoch": 0.009396493267461009, "grad_norm": 1.603896975517273, "learning_rate": 9.999142691078175e-06, "loss": 9.4141, "step": 194 }, { "epoch": 0.009396493267461009, "eval_accuracy": 0.08978121327743072, "eval_loss": 9.3671875, "eval_runtime": 275.8739, "eval_samples_per_second": 122.4, "eval_steps_per_second": 3.828, "step": 194 }, { "epoch": 0.00944492879976751, "grad_norm": 1.6265010833740234, "learning_rate": 9.999137847524946e-06, "loss": 9.3203, "step": 195 }, { "epoch": 0.00944492879976751, "eval_accuracy": 0.08979632459705102, "eval_loss": 9.359375, "eval_runtime": 276.051, "eval_samples_per_second": 122.322, "eval_steps_per_second": 3.825, "step": 195 }, { "epoch": 0.00949336433207401, "grad_norm": 1.609118103981018, "learning_rate": 9.999133003971715e-06, "loss": 9.3906, "step": 196 }, { "epoch": 0.00949336433207401, "eval_accuracy": 0.08980888841451311, "eval_loss": 9.359375, "eval_runtime": 274.9932, "eval_samples_per_second": 122.792, "eval_steps_per_second": 3.84, "step": 196 }, { "epoch": 0.00954179986438051, "grad_norm": 1.6511759757995605, "learning_rate": 9.999128160418484e-06, "loss": 9.3594, "step": 197 }, { "epoch": 0.00954179986438051, "eval_accuracy": 0.08997595244809313, "eval_loss": 9.3515625, "eval_runtime": 275.7963, "eval_samples_per_second": 122.435, "eval_steps_per_second": 3.829, "step": 197 }, { "epoch": 0.00959023539668701, "grad_norm": 1.5398412942886353, "learning_rate": 9.999123316865254e-06, "loss": 9.3516, "step": 198 }, { "epoch": 0.00959023539668701, "eval_accuracy": 0.09011600716901846, "eval_loss": 9.3515625, "eval_runtime": 275.8953, "eval_samples_per_second": 122.391, "eval_steps_per_second": 3.828, "step": 198 }, { "epoch": 0.009638670928993509, "grad_norm": 1.5655171871185303, "learning_rate": 9.999118473312022e-06, "loss": 9.3438, "step": 199 }, { "epoch": 0.009638670928993509, "eval_accuracy": 0.0902209468886039, "eval_loss": 9.34375, "eval_runtime": 276.4152, "eval_samples_per_second": 122.16, "eval_steps_per_second": 3.82, "step": 199 }, { "epoch": 0.00968710646130001, "grad_norm": 1.5900487899780273, "learning_rate": 9.999113629758792e-06, "loss": 9.3516, "step": 200 }, { "epoch": 0.00968710646130001, "eval_accuracy": 0.09037614187803769, "eval_loss": 9.34375, "eval_runtime": 274.9294, "eval_samples_per_second": 122.821, "eval_steps_per_second": 3.841, "step": 200 }, { "epoch": 0.00973554199360651, "grad_norm": 1.549442172050476, "learning_rate": 9.999108786205562e-06, "loss": 9.3125, "step": 201 }, { "epoch": 0.00973554199360651, "eval_accuracy": 0.09055328012469792, "eval_loss": 9.3359375, "eval_runtime": 275.1689, "eval_samples_per_second": 122.714, "eval_steps_per_second": 3.838, "step": 201 }, { "epoch": 0.00978397752591301, "grad_norm": 1.5649633407592773, "learning_rate": 9.99910394265233e-06, "loss": 9.3516, "step": 202 }, { "epoch": 0.00978397752591301, "eval_accuracy": 0.0907463113109588, "eval_loss": 9.3359375, "eval_runtime": 276.2363, "eval_samples_per_second": 122.24, "eval_steps_per_second": 3.823, "step": 202 }, { "epoch": 0.009832413058219509, "grad_norm": 1.6223474740982056, "learning_rate": 9.9990990990991e-06, "loss": 9.3359, "step": 203 }, { "epoch": 0.009832413058219509, "eval_accuracy": 0.09079908513407721, "eval_loss": 9.328125, "eval_runtime": 276.2485, "eval_samples_per_second": 122.234, "eval_steps_per_second": 3.823, "step": 203 }, { "epoch": 0.00988084859052601, "grad_norm": 1.5935430526733398, "learning_rate": 9.999094255545868e-06, "loss": 9.3516, "step": 204 }, { "epoch": 0.00988084859052601, "eval_accuracy": 0.09073201056020702, "eval_loss": 9.328125, "eval_runtime": 277.2721, "eval_samples_per_second": 121.783, "eval_steps_per_second": 3.809, "step": 204 }, { "epoch": 0.00992928412283251, "grad_norm": 1.6288846731185913, "learning_rate": 9.999089411992638e-06, "loss": 9.3281, "step": 205 }, { "epoch": 0.00992928412283251, "eval_accuracy": 0.09064690082901221, "eval_loss": 9.3203125, "eval_runtime": 276.8611, "eval_samples_per_second": 121.964, "eval_steps_per_second": 3.814, "step": 205 }, { "epoch": 0.00997771965513901, "grad_norm": 1.4847911596298218, "learning_rate": 9.999084568439408e-06, "loss": 9.375, "step": 206 }, { "epoch": 0.00997771965513901, "eval_accuracy": 0.0904633648780683, "eval_loss": 9.3125, "eval_runtime": 275.7135, "eval_samples_per_second": 122.471, "eval_steps_per_second": 3.83, "step": 206 }, { "epoch": 0.01002615518744551, "grad_norm": 1.6263271570205688, "learning_rate": 9.999079724886176e-06, "loss": 9.2812, "step": 207 }, { "epoch": 0.01002615518744551, "eval_accuracy": 0.09040885412166019, "eval_loss": 9.3125, "eval_runtime": 275.4549, "eval_samples_per_second": 122.586, "eval_steps_per_second": 3.834, "step": 207 }, { "epoch": 0.01007459071975201, "grad_norm": 1.5669511556625366, "learning_rate": 9.999074881332946e-06, "loss": 9.3281, "step": 208 }, { "epoch": 0.01007459071975201, "eval_accuracy": 0.09057241533977267, "eval_loss": 9.3046875, "eval_runtime": 276.0055, "eval_samples_per_second": 122.342, "eval_steps_per_second": 3.826, "step": 208 }, { "epoch": 0.01012302625205851, "grad_norm": 1.5233213901519775, "learning_rate": 9.999070037779716e-06, "loss": 9.3281, "step": 209 }, { "epoch": 0.01012302625205851, "eval_accuracy": 0.09081425435147383, "eval_loss": 9.3046875, "eval_runtime": 275.4632, "eval_samples_per_second": 122.583, "eval_steps_per_second": 3.834, "step": 209 }, { "epoch": 0.01017146178436501, "grad_norm": 1.6155483722686768, "learning_rate": 9.999065194226484e-06, "loss": 9.3594, "step": 210 }, { "epoch": 0.01017146178436501, "eval_accuracy": 0.0911761733512689, "eval_loss": 9.296875, "eval_runtime": 276.0337, "eval_samples_per_second": 122.329, "eval_steps_per_second": 3.826, "step": 210 }, { "epoch": 0.01021989731667151, "grad_norm": 1.5271143913269043, "learning_rate": 9.999060350673254e-06, "loss": 9.3438, "step": 211 }, { "epoch": 0.01021989731667151, "eval_accuracy": 0.09151863869821945, "eval_loss": 9.296875, "eval_runtime": 274.7088, "eval_samples_per_second": 122.919, "eval_steps_per_second": 3.844, "step": 211 }, { "epoch": 0.010268332848978011, "grad_norm": 1.6638132333755493, "learning_rate": 9.999055507120024e-06, "loss": 9.2891, "step": 212 }, { "epoch": 0.010268332848978011, "eval_accuracy": 0.09163796601522115, "eval_loss": 9.2890625, "eval_runtime": 275.693, "eval_samples_per_second": 122.48, "eval_steps_per_second": 3.83, "step": 212 }, { "epoch": 0.01031676838128451, "grad_norm": 1.5015349388122559, "learning_rate": 9.999050663566794e-06, "loss": 9.3438, "step": 213 }, { "epoch": 0.01031676838128451, "eval_accuracy": 0.09161217255586926, "eval_loss": 9.2890625, "eval_runtime": 274.9875, "eval_samples_per_second": 122.795, "eval_steps_per_second": 3.84, "step": 213 }, { "epoch": 0.01036520391359101, "grad_norm": 1.5039061307907104, "learning_rate": 9.999045820013564e-06, "loss": 9.3047, "step": 214 }, { "epoch": 0.01036520391359101, "eval_accuracy": 0.09152989981571427, "eval_loss": 9.28125, "eval_runtime": 274.8846, "eval_samples_per_second": 122.841, "eval_steps_per_second": 3.842, "step": 214 }, { "epoch": 0.01041363944589751, "grad_norm": 1.6265090703964233, "learning_rate": 9.999040976460332e-06, "loss": 9.2656, "step": 215 }, { "epoch": 0.01041363944589751, "eval_accuracy": 0.09139931138121775, "eval_loss": 9.28125, "eval_runtime": 274.6634, "eval_samples_per_second": 122.94, "eval_steps_per_second": 3.845, "step": 215 }, { "epoch": 0.010462074978204011, "grad_norm": 1.5140306949615479, "learning_rate": 9.999036132907102e-06, "loss": 9.2734, "step": 216 }, { "epoch": 0.010462074978204011, "eval_accuracy": 0.09134954824246813, "eval_loss": 9.2734375, "eval_runtime": 274.7537, "eval_samples_per_second": 122.899, "eval_steps_per_second": 3.843, "step": 216 }, { "epoch": 0.010510510510510511, "grad_norm": 1.5547981262207031, "learning_rate": 9.999031289353872e-06, "loss": 9.2891, "step": 217 }, { "epoch": 0.010510510510510511, "eval_accuracy": 0.09132992089629463, "eval_loss": 9.2734375, "eval_runtime": 275.3328, "eval_samples_per_second": 122.641, "eval_steps_per_second": 3.835, "step": 217 }, { "epoch": 0.01055894604281701, "grad_norm": 1.5140680074691772, "learning_rate": 9.99902644580064e-06, "loss": 9.2969, "step": 218 }, { "epoch": 0.01055894604281701, "eval_accuracy": 0.091310090907904, "eval_loss": 9.265625, "eval_runtime": 275.0207, "eval_samples_per_second": 122.78, "eval_steps_per_second": 3.84, "step": 218 }, { "epoch": 0.01060738157512351, "grad_norm": 1.5878396034240723, "learning_rate": 9.99902160224741e-06, "loss": 9.25, "step": 219 }, { "epoch": 0.01060738157512351, "eval_accuracy": 0.09137461797961599, "eval_loss": 9.265625, "eval_runtime": 275.3479, "eval_samples_per_second": 122.634, "eval_steps_per_second": 3.835, "step": 219 }, { "epoch": 0.01065581710743001, "grad_norm": 1.5309175252914429, "learning_rate": 9.99901675869418e-06, "loss": 9.2578, "step": 220 }, { "epoch": 0.01065581710743001, "eval_accuracy": 0.09149657964544039, "eval_loss": 9.2578125, "eval_runtime": 274.9496, "eval_samples_per_second": 122.812, "eval_steps_per_second": 3.841, "step": 220 }, { "epoch": 0.010704252639736511, "grad_norm": 1.5207297801971436, "learning_rate": 9.999011915140948e-06, "loss": 9.25, "step": 221 }, { "epoch": 0.010704252639736511, "eval_accuracy": 0.09163194464648355, "eval_loss": 9.2578125, "eval_runtime": 274.7916, "eval_samples_per_second": 122.882, "eval_steps_per_second": 3.843, "step": 221 }, { "epoch": 0.010752688172043012, "grad_norm": 1.5458952188491821, "learning_rate": 9.999007071587717e-06, "loss": 9.2656, "step": 222 }, { "epoch": 0.010752688172043012, "eval_accuracy": 0.09197788386001349, "eval_loss": 9.25, "eval_runtime": 274.1902, "eval_samples_per_second": 123.152, "eval_steps_per_second": 3.851, "step": 222 }, { "epoch": 0.01080112370434951, "grad_norm": 1.468177080154419, "learning_rate": 9.999002228034487e-06, "loss": 9.2578, "step": 223 }, { "epoch": 0.01080112370434951, "eval_accuracy": 0.09228760801445338, "eval_loss": 9.25, "eval_runtime": 274.801, "eval_samples_per_second": 122.878, "eval_steps_per_second": 3.843, "step": 223 }, { "epoch": 0.01084955923665601, "grad_norm": 1.466130018234253, "learning_rate": 9.998997384481255e-06, "loss": 9.2734, "step": 224 }, { "epoch": 0.01084955923665601, "eval_accuracy": 0.09260112447324241, "eval_loss": 9.2421875, "eval_runtime": 274.0532, "eval_samples_per_second": 123.213, "eval_steps_per_second": 3.853, "step": 224 }, { "epoch": 0.010897994768962511, "grad_norm": 1.4513353109359741, "learning_rate": 9.998992540928025e-06, "loss": 9.2891, "step": 225 }, { "epoch": 0.010897994768962511, "eval_accuracy": 0.09285208238471446, "eval_loss": 9.2421875, "eval_runtime": 274.2911, "eval_samples_per_second": 123.106, "eval_steps_per_second": 3.85, "step": 225 }, { "epoch": 0.010946430301269012, "grad_norm": 1.6049507856369019, "learning_rate": 9.998987697374795e-06, "loss": 9.25, "step": 226 }, { "epoch": 0.010946430301269012, "eval_accuracy": 0.09283341035185029, "eval_loss": 9.234375, "eval_runtime": 275.8109, "eval_samples_per_second": 122.428, "eval_steps_per_second": 3.829, "step": 226 }, { "epoch": 0.01099486583357551, "grad_norm": 1.6145049333572388, "learning_rate": 9.998982853821563e-06, "loss": 9.2344, "step": 227 }, { "epoch": 0.01099486583357551, "eval_accuracy": 0.09279450304616123, "eval_loss": 9.234375, "eval_runtime": 275.8634, "eval_samples_per_second": 122.405, "eval_steps_per_second": 3.828, "step": 227 }, { "epoch": 0.01104330136588201, "grad_norm": 1.5092509984970093, "learning_rate": 9.998978010268333e-06, "loss": 9.2656, "step": 228 }, { "epoch": 0.01104330136588201, "eval_accuracy": 0.09270768533056489, "eval_loss": 9.2265625, "eval_runtime": 275.1066, "eval_samples_per_second": 122.742, "eval_steps_per_second": 3.839, "step": 228 }, { "epoch": 0.011091736898188511, "grad_norm": 1.6245758533477783, "learning_rate": 9.998973166715103e-06, "loss": 9.2656, "step": 229 }, { "epoch": 0.011091736898188511, "eval_accuracy": 0.09277232819782952, "eval_loss": 9.2265625, "eval_runtime": 274.1156, "eval_samples_per_second": 123.185, "eval_steps_per_second": 3.852, "step": 229 }, { "epoch": 0.011140172430495011, "grad_norm": 1.5349066257476807, "learning_rate": 9.998968323161873e-06, "loss": 9.2656, "step": 230 }, { "epoch": 0.011140172430495011, "eval_accuracy": 0.09297572308605222, "eval_loss": 9.21875, "eval_runtime": 275.053, "eval_samples_per_second": 122.765, "eval_steps_per_second": 3.839, "step": 230 }, { "epoch": 0.011188607962801512, "grad_norm": 1.5491435527801514, "learning_rate": 9.998963479608643e-06, "loss": 9.25, "step": 231 }, { "epoch": 0.011188607962801512, "eval_accuracy": 0.09332889952162252, "eval_loss": 9.21875, "eval_runtime": 275.8534, "eval_samples_per_second": 122.409, "eval_steps_per_second": 3.828, "step": 231 }, { "epoch": 0.01123704349510801, "grad_norm": 1.5584843158721924, "learning_rate": 9.998958636055411e-06, "loss": 9.2891, "step": 232 }, { "epoch": 0.01123704349510801, "eval_accuracy": 0.0936537349956827, "eval_loss": 9.2109375, "eval_runtime": 275.2078, "eval_samples_per_second": 122.696, "eval_steps_per_second": 3.837, "step": 232 }, { "epoch": 0.011285479027414511, "grad_norm": 1.6923131942749023, "learning_rate": 9.99895379250218e-06, "loss": 9.2188, "step": 233 }, { "epoch": 0.011285479027414511, "eval_accuracy": 0.09384227910427856, "eval_loss": 9.203125, "eval_runtime": 275.578, "eval_samples_per_second": 122.532, "eval_steps_per_second": 3.832, "step": 233 }, { "epoch": 0.011333914559721011, "grad_norm": 1.636615514755249, "learning_rate": 9.99894894894895e-06, "loss": 9.2578, "step": 234 }, { "epoch": 0.011333914559721011, "eval_accuracy": 0.09388075217664518, "eval_loss": 9.203125, "eval_runtime": 275.1327, "eval_samples_per_second": 122.73, "eval_steps_per_second": 3.838, "step": 234 }, { "epoch": 0.011382350092027512, "grad_norm": 1.5573487281799316, "learning_rate": 9.998944105395719e-06, "loss": 9.2422, "step": 235 }, { "epoch": 0.011382350092027512, "eval_accuracy": 0.09375699567975478, "eval_loss": 9.1953125, "eval_runtime": 274.1524, "eval_samples_per_second": 123.169, "eval_steps_per_second": 3.852, "step": 235 }, { "epoch": 0.011430785624334012, "grad_norm": 1.758978009223938, "learning_rate": 9.998939261842489e-06, "loss": 9.2109, "step": 236 }, { "epoch": 0.011430785624334012, "eval_accuracy": 0.09350146384395322, "eval_loss": 9.1953125, "eval_runtime": 276.5184, "eval_samples_per_second": 122.115, "eval_steps_per_second": 3.819, "step": 236 }, { "epoch": 0.01147922115664051, "grad_norm": 1.6766207218170166, "learning_rate": 9.998934418289259e-06, "loss": 9.1797, "step": 237 }, { "epoch": 0.01147922115664051, "eval_accuracy": 0.09353747626082612, "eval_loss": 9.1953125, "eval_runtime": 275.7856, "eval_samples_per_second": 122.439, "eval_steps_per_second": 3.829, "step": 237 }, { "epoch": 0.011527656688947011, "grad_norm": 1.7581781148910522, "learning_rate": 9.998929574736027e-06, "loss": 9.1953, "step": 238 }, { "epoch": 0.011527656688947011, "eval_accuracy": 0.09377196225493427, "eval_loss": 9.1875, "eval_runtime": 276.3511, "eval_samples_per_second": 122.189, "eval_steps_per_second": 3.821, "step": 238 }, { "epoch": 0.011576092221253512, "grad_norm": 2.0294253826141357, "learning_rate": 9.998924731182797e-06, "loss": 9.1797, "step": 239 }, { "epoch": 0.011576092221253512, "eval_accuracy": 0.09428256274432681, "eval_loss": 9.1875, "eval_runtime": 275.8013, "eval_samples_per_second": 122.432, "eval_steps_per_second": 3.829, "step": 239 }, { "epoch": 0.011624527753560012, "grad_norm": 1.4771103858947754, "learning_rate": 9.998919887629566e-06, "loss": 9.2266, "step": 240 }, { "epoch": 0.011624527753560012, "eval_accuracy": 0.09478583916501448, "eval_loss": 9.1796875, "eval_runtime": 273.9293, "eval_samples_per_second": 123.269, "eval_steps_per_second": 3.855, "step": 240 }, { "epoch": 0.011672963285866512, "grad_norm": 1.494795322418213, "learning_rate": 9.998915044076335e-06, "loss": 9.2109, "step": 241 }, { "epoch": 0.011672963285866512, "eval_accuracy": 0.09512494644093829, "eval_loss": 9.171875, "eval_runtime": 274.6237, "eval_samples_per_second": 122.957, "eval_steps_per_second": 3.845, "step": 241 }, { "epoch": 0.011721398818173011, "grad_norm": 1.4708678722381592, "learning_rate": 9.998910200523105e-06, "loss": 9.1719, "step": 242 }, { "epoch": 0.011721398818173011, "eval_accuracy": 0.0953564506995658, "eval_loss": 9.171875, "eval_runtime": 274.319, "eval_samples_per_second": 123.094, "eval_steps_per_second": 3.85, "step": 242 }, { "epoch": 0.011769834350479512, "grad_norm": 1.5596672296524048, "learning_rate": 9.998905356969873e-06, "loss": 9.2031, "step": 243 }, { "epoch": 0.011769834350479512, "eval_accuracy": 0.09549230783170773, "eval_loss": 9.171875, "eval_runtime": 276.3644, "eval_samples_per_second": 122.183, "eval_steps_per_second": 3.821, "step": 243 }, { "epoch": 0.011818269882786012, "grad_norm": 1.6623671054840088, "learning_rate": 9.998900513416643e-06, "loss": 9.1953, "step": 244 }, { "epoch": 0.011818269882786012, "eval_accuracy": 0.09535106620636777, "eval_loss": 9.1640625, "eval_runtime": 276.0732, "eval_samples_per_second": 122.312, "eval_steps_per_second": 3.825, "step": 244 }, { "epoch": 0.011866705415092512, "grad_norm": 1.5597991943359375, "learning_rate": 9.998895669863412e-06, "loss": 9.1875, "step": 245 }, { "epoch": 0.011866705415092512, "eval_accuracy": 0.09501265370375983, "eval_loss": 9.1640625, "eval_runtime": 276.6977, "eval_samples_per_second": 122.036, "eval_steps_per_second": 3.816, "step": 245 }, { "epoch": 0.011915140947399011, "grad_norm": 1.540256381034851, "learning_rate": 9.998890826310182e-06, "loss": 9.2031, "step": 246 }, { "epoch": 0.011915140947399011, "eval_accuracy": 0.09489992673325895, "eval_loss": 9.15625, "eval_runtime": 275.9711, "eval_samples_per_second": 122.357, "eval_steps_per_second": 3.826, "step": 246 }, { "epoch": 0.011963576479705511, "grad_norm": 1.7622281312942505, "learning_rate": 9.998885982756952e-06, "loss": 9.1797, "step": 247 }, { "epoch": 0.011963576479705511, "eval_accuracy": 0.09502625968119574, "eval_loss": 9.1484375, "eval_runtime": 276.4265, "eval_samples_per_second": 122.155, "eval_steps_per_second": 3.82, "step": 247 }, { "epoch": 0.012012012012012012, "grad_norm": 1.5139068365097046, "learning_rate": 9.99888113920372e-06, "loss": 9.1484, "step": 248 }, { "epoch": 0.012012012012012012, "eval_accuracy": 0.09517540435300378, "eval_loss": 9.1484375, "eval_runtime": 276.0853, "eval_samples_per_second": 122.306, "eval_steps_per_second": 3.825, "step": 248 }, { "epoch": 0.012060447544318512, "grad_norm": 1.8858153820037842, "learning_rate": 9.99887629565049e-06, "loss": 9.1406, "step": 249 }, { "epoch": 0.012060447544318512, "eval_accuracy": 0.0953735305435811, "eval_loss": 9.1484375, "eval_runtime": 277.0667, "eval_samples_per_second": 121.873, "eval_steps_per_second": 3.811, "step": 249 }, { "epoch": 0.012108883076625013, "grad_norm": 1.5456604957580566, "learning_rate": 9.99887145209726e-06, "loss": 9.1641, "step": 250 }, { "epoch": 0.012108883076625013, "eval_accuracy": 0.09559869499570124, "eval_loss": 9.140625, "eval_runtime": 277.6666, "eval_samples_per_second": 121.61, "eval_steps_per_second": 3.803, "step": 250 }, { "epoch": 0.012157318608931511, "grad_norm": 1.594663143157959, "learning_rate": 9.998866608544028e-06, "loss": 9.1406, "step": 251 }, { "epoch": 0.012157318608931511, "eval_accuracy": 0.09564637381450322, "eval_loss": 9.140625, "eval_runtime": 276.8219, "eval_samples_per_second": 121.981, "eval_steps_per_second": 3.815, "step": 251 }, { "epoch": 0.012205754141238012, "grad_norm": 1.6868451833724976, "learning_rate": 9.998861764990798e-06, "loss": 9.1719, "step": 252 }, { "epoch": 0.012205754141238012, "eval_accuracy": 0.09536970929034377, "eval_loss": 9.1328125, "eval_runtime": 274.9951, "eval_samples_per_second": 122.791, "eval_steps_per_second": 3.84, "step": 252 }, { "epoch": 0.012254189673544512, "grad_norm": 1.5256409645080566, "learning_rate": 9.998856921437568e-06, "loss": 9.125, "step": 253 }, { "epoch": 0.012254189673544512, "eval_accuracy": 0.09525680862651392, "eval_loss": 9.1328125, "eval_runtime": 275.3586, "eval_samples_per_second": 122.629, "eval_steps_per_second": 3.835, "step": 253 }, { "epoch": 0.012302625205851013, "grad_norm": 1.565302848815918, "learning_rate": 9.998852077884336e-06, "loss": 9.1719, "step": 254 }, { "epoch": 0.012302625205851013, "eval_accuracy": 0.09499291056203368, "eval_loss": 9.125, "eval_runtime": 275.0624, "eval_samples_per_second": 122.761, "eval_steps_per_second": 3.839, "step": 254 }, { "epoch": 0.012351060738157513, "grad_norm": 1.4815526008605957, "learning_rate": 9.998847234331106e-06, "loss": 9.1797, "step": 255 }, { "epoch": 0.012351060738157513, "eval_accuracy": 0.09496150101837846, "eval_loss": 9.125, "eval_runtime": 275.5286, "eval_samples_per_second": 122.554, "eval_steps_per_second": 3.833, "step": 255 }, { "epoch": 0.012399496270464012, "grad_norm": 1.6366430521011353, "learning_rate": 9.998842390777876e-06, "loss": 9.0859, "step": 256 }, { "epoch": 0.012399496270464012, "eval_accuracy": 0.09506522488466111, "eval_loss": 9.1171875, "eval_runtime": 276.1079, "eval_samples_per_second": 122.296, "eval_steps_per_second": 3.825, "step": 256 }, { "epoch": 0.012447931802770512, "grad_norm": 1.6034120321273804, "learning_rate": 9.998837547224644e-06, "loss": 9.1875, "step": 257 }, { "epoch": 0.012447931802770512, "eval_accuracy": 0.09566794073618354, "eval_loss": 9.1171875, "eval_runtime": 275.8176, "eval_samples_per_second": 122.425, "eval_steps_per_second": 3.829, "step": 257 }, { "epoch": 0.012496367335077013, "grad_norm": 1.6382652521133423, "learning_rate": 9.998832703671414e-06, "loss": 9.1094, "step": 258 }, { "epoch": 0.012496367335077013, "eval_accuracy": 0.09628628898731317, "eval_loss": 9.109375, "eval_runtime": 275.7044, "eval_samples_per_second": 122.475, "eval_steps_per_second": 3.83, "step": 258 }, { "epoch": 0.012544802867383513, "grad_norm": 1.4967926740646362, "learning_rate": 9.998827860118184e-06, "loss": 9.0938, "step": 259 }, { "epoch": 0.012544802867383513, "eval_accuracy": 0.09678612049030963, "eval_loss": 9.109375, "eval_runtime": 276.0511, "eval_samples_per_second": 122.322, "eval_steps_per_second": 3.825, "step": 259 }, { "epoch": 0.012593238399690013, "grad_norm": 2.137125015258789, "learning_rate": 9.998823016564952e-06, "loss": 9.1016, "step": 260 }, { "epoch": 0.012593238399690013, "eval_accuracy": 0.09689392614982306, "eval_loss": 9.1015625, "eval_runtime": 276.485, "eval_samples_per_second": 122.13, "eval_steps_per_second": 3.819, "step": 260 }, { "epoch": 0.012641673931996512, "grad_norm": 1.655360460281372, "learning_rate": 9.998818173011722e-06, "loss": 9.1406, "step": 261 }, { "epoch": 0.012641673931996512, "eval_accuracy": 0.09685108179534405, "eval_loss": 9.1015625, "eval_runtime": 276.2683, "eval_samples_per_second": 122.225, "eval_steps_per_second": 3.822, "step": 261 }, { "epoch": 0.012690109464303012, "grad_norm": 1.615159273147583, "learning_rate": 9.998813329458492e-06, "loss": 9.0781, "step": 262 }, { "epoch": 0.012690109464303012, "eval_accuracy": 0.09661187713246557, "eval_loss": 9.09375, "eval_runtime": 277.0541, "eval_samples_per_second": 121.879, "eval_steps_per_second": 3.812, "step": 262 }, { "epoch": 0.012738544996609513, "grad_norm": 1.56972074508667, "learning_rate": 9.998808485905261e-06, "loss": 9.1094, "step": 263 }, { "epoch": 0.012738544996609513, "eval_accuracy": 0.09626272459234972, "eval_loss": 9.09375, "eval_runtime": 276.2994, "eval_samples_per_second": 122.212, "eval_steps_per_second": 3.822, "step": 263 }, { "epoch": 0.012786980528916013, "grad_norm": 1.5011804103851318, "learning_rate": 9.998803642352031e-06, "loss": 9.1172, "step": 264 }, { "epoch": 0.012786980528916013, "eval_accuracy": 0.09589736067486343, "eval_loss": 9.0859375, "eval_runtime": 276.7096, "eval_samples_per_second": 122.03, "eval_steps_per_second": 3.816, "step": 264 }, { "epoch": 0.012835416061222512, "grad_norm": 1.6870362758636475, "learning_rate": 9.9987987987988e-06, "loss": 9.1172, "step": 265 }, { "epoch": 0.012835416061222512, "eval_accuracy": 0.09563450477035701, "eval_loss": 9.0859375, "eval_runtime": 275.3219, "eval_samples_per_second": 122.646, "eval_steps_per_second": 3.836, "step": 265 }, { "epoch": 0.012883851593529012, "grad_norm": 1.5479800701141357, "learning_rate": 9.99879395524557e-06, "loss": 9.125, "step": 266 }, { "epoch": 0.012883851593529012, "eval_accuracy": 0.09551361421339459, "eval_loss": 9.0859375, "eval_runtime": 275.3988, "eval_samples_per_second": 122.611, "eval_steps_per_second": 3.834, "step": 266 }, { "epoch": 0.012932287125835513, "grad_norm": 1.5906175374984741, "learning_rate": 9.99878911169234e-06, "loss": 9.1094, "step": 267 }, { "epoch": 0.012932287125835513, "eval_accuracy": 0.09573892340995555, "eval_loss": 9.078125, "eval_runtime": 273.7737, "eval_samples_per_second": 123.339, "eval_steps_per_second": 3.857, "step": 267 }, { "epoch": 0.012980722658142013, "grad_norm": 1.5682505369186401, "learning_rate": 9.998784268139107e-06, "loss": 9.0781, "step": 268 }, { "epoch": 0.012980722658142013, "eval_accuracy": 0.09638306512043707, "eval_loss": 9.078125, "eval_runtime": 272.9963, "eval_samples_per_second": 123.69, "eval_steps_per_second": 3.868, "step": 268 }, { "epoch": 0.013029158190448514, "grad_norm": 1.5259824991226196, "learning_rate": 9.998779424585877e-06, "loss": 9.125, "step": 269 }, { "epoch": 0.013029158190448514, "eval_accuracy": 0.09727434348915333, "eval_loss": 9.0703125, "eval_runtime": 274.1813, "eval_samples_per_second": 123.156, "eval_steps_per_second": 3.851, "step": 269 }, { "epoch": 0.013077593722755012, "grad_norm": 1.5006844997406006, "learning_rate": 9.998774581032647e-06, "loss": 9.0547, "step": 270 }, { "epoch": 0.013077593722755012, "eval_accuracy": 0.09799540239547996, "eval_loss": 9.0703125, "eval_runtime": 272.3774, "eval_samples_per_second": 123.971, "eval_steps_per_second": 3.877, "step": 270 }, { "epoch": 0.013126029255061513, "grad_norm": 1.4817960262298584, "learning_rate": 9.998769737479415e-06, "loss": 9.0781, "step": 271 }, { "epoch": 0.013126029255061513, "eval_accuracy": 0.09830503970325537, "eval_loss": 9.0625, "eval_runtime": 271.8893, "eval_samples_per_second": 124.194, "eval_steps_per_second": 3.884, "step": 271 }, { "epoch": 0.013174464787368013, "grad_norm": 1.6597894430160522, "learning_rate": 9.998764893926185e-06, "loss": 9.1016, "step": 272 }, { "epoch": 0.013174464787368013, "eval_accuracy": 0.09809840253955858, "eval_loss": 9.0625, "eval_runtime": 273.6177, "eval_samples_per_second": 123.409, "eval_steps_per_second": 3.859, "step": 272 }, { "epoch": 0.013222900319674514, "grad_norm": 1.5939491987228394, "learning_rate": 9.998760050372955e-06, "loss": 9.0703, "step": 273 }, { "epoch": 0.013222900319674514, "eval_accuracy": 0.09753586774480431, "eval_loss": 9.0546875, "eval_runtime": 273.2408, "eval_samples_per_second": 123.58, "eval_steps_per_second": 3.865, "step": 273 }, { "epoch": 0.013271335851981014, "grad_norm": 1.5878655910491943, "learning_rate": 9.998755206819723e-06, "loss": 9.0547, "step": 274 }, { "epoch": 0.013271335851981014, "eval_accuracy": 0.09690096072964631, "eval_loss": 9.0546875, "eval_runtime": 272.9584, "eval_samples_per_second": 123.707, "eval_steps_per_second": 3.869, "step": 274 }, { "epoch": 0.013319771384287513, "grad_norm": 1.6010398864746094, "learning_rate": 9.998750363266493e-06, "loss": 9.0312, "step": 275 }, { "epoch": 0.013319771384287513, "eval_accuracy": 0.09638720481144417, "eval_loss": 9.046875, "eval_runtime": 272.7096, "eval_samples_per_second": 123.82, "eval_steps_per_second": 3.872, "step": 275 }, { "epoch": 0.013368206916594013, "grad_norm": 1.7441232204437256, "learning_rate": 9.998745519713263e-06, "loss": 9.0938, "step": 276 }, { "epoch": 0.013368206916594013, "eval_accuracy": 0.09639140240022759, "eval_loss": 9.046875, "eval_runtime": 273.1864, "eval_samples_per_second": 123.604, "eval_steps_per_second": 3.865, "step": 276 }, { "epoch": 0.013416642448900513, "grad_norm": 1.586517095565796, "learning_rate": 9.998740676160031e-06, "loss": 9.0156, "step": 277 }, { "epoch": 0.013416642448900513, "eval_accuracy": 0.09671230082549781, "eval_loss": 9.0390625, "eval_runtime": 272.732, "eval_samples_per_second": 123.81, "eval_steps_per_second": 3.872, "step": 277 }, { "epoch": 0.013465077981207014, "grad_norm": 1.5039782524108887, "learning_rate": 9.998735832606801e-06, "loss": 9.1094, "step": 278 }, { "epoch": 0.013465077981207014, "eval_accuracy": 0.0972694800759422, "eval_loss": 9.0390625, "eval_runtime": 272.288, "eval_samples_per_second": 124.012, "eval_steps_per_second": 3.878, "step": 278 }, { "epoch": 0.013513513513513514, "grad_norm": 1.534090518951416, "learning_rate": 9.99873098905357e-06, "loss": 9.0859, "step": 279 }, { "epoch": 0.013513513513513514, "eval_accuracy": 0.09795999790525846, "eval_loss": 9.03125, "eval_runtime": 273.6685, "eval_samples_per_second": 123.387, "eval_steps_per_second": 3.859, "step": 279 }, { "epoch": 0.013561949045820013, "grad_norm": 1.5849289894104004, "learning_rate": 9.99872614550034e-06, "loss": 9.0234, "step": 280 }, { "epoch": 0.013561949045820013, "eval_accuracy": 0.09837451703484297, "eval_loss": 9.03125, "eval_runtime": 273.5891, "eval_samples_per_second": 123.422, "eval_steps_per_second": 3.86, "step": 280 }, { "epoch": 0.013610384578126513, "grad_norm": 1.523674488067627, "learning_rate": 9.99872130194711e-06, "loss": 9.0781, "step": 281 }, { "epoch": 0.013610384578126513, "eval_accuracy": 0.09843887041322598, "eval_loss": 9.0234375, "eval_runtime": 271.771, "eval_samples_per_second": 124.248, "eval_steps_per_second": 3.886, "step": 281 }, { "epoch": 0.013658820110433014, "grad_norm": 1.646908164024353, "learning_rate": 9.998716458393879e-06, "loss": 9.0547, "step": 282 }, { "epoch": 0.013658820110433014, "eval_accuracy": 0.09825828724887455, "eval_loss": 9.0234375, "eval_runtime": 272.8785, "eval_samples_per_second": 123.744, "eval_steps_per_second": 3.87, "step": 282 }, { "epoch": 0.013707255642739514, "grad_norm": 1.6313369274139404, "learning_rate": 9.998711614840649e-06, "loss": 9.0234, "step": 283 }, { "epoch": 0.013707255642739514, "eval_accuracy": 0.09794141271905878, "eval_loss": 9.015625, "eval_runtime": 273.6116, "eval_samples_per_second": 123.412, "eval_steps_per_second": 3.859, "step": 283 }, { "epoch": 0.013755691175046015, "grad_norm": 1.6014082431793213, "learning_rate": 9.998706771287417e-06, "loss": 9.0312, "step": 284 }, { "epoch": 0.013755691175046015, "eval_accuracy": 0.09782020372432657, "eval_loss": 9.015625, "eval_runtime": 273.8768, "eval_samples_per_second": 123.293, "eval_steps_per_second": 3.856, "step": 284 }, { "epoch": 0.013804126707352513, "grad_norm": 1.5171185731887817, "learning_rate": 9.998701927734187e-06, "loss": 9.0391, "step": 285 }, { "epoch": 0.013804126707352513, "eval_accuracy": 0.09783482291284813, "eval_loss": 9.0078125, "eval_runtime": 273.0741, "eval_samples_per_second": 123.655, "eval_steps_per_second": 3.867, "step": 285 }, { "epoch": 0.013852562239659014, "grad_norm": 1.5492215156555176, "learning_rate": 9.998697084180956e-06, "loss": 9.0312, "step": 286 }, { "epoch": 0.013852562239659014, "eval_accuracy": 0.09799062582893332, "eval_loss": 9.0078125, "eval_runtime": 273.3737, "eval_samples_per_second": 123.52, "eval_steps_per_second": 3.863, "step": 286 }, { "epoch": 0.013900997771965514, "grad_norm": 1.6462546586990356, "learning_rate": 9.998692240627725e-06, "loss": 9.0625, "step": 287 }, { "epoch": 0.013900997771965514, "eval_accuracy": 0.09823570711610857, "eval_loss": 9.0078125, "eval_runtime": 272.8604, "eval_samples_per_second": 123.752, "eval_steps_per_second": 3.87, "step": 287 }, { "epoch": 0.013949433304272014, "grad_norm": 1.6392829418182373, "learning_rate": 9.998687397074494e-06, "loss": 9.0234, "step": 288 }, { "epoch": 0.013949433304272014, "eval_accuracy": 0.09855127894595708, "eval_loss": 9.0, "eval_runtime": 272.8831, "eval_samples_per_second": 123.742, "eval_steps_per_second": 3.87, "step": 288 }, { "epoch": 0.013997868836578513, "grad_norm": 1.5253773927688599, "learning_rate": 9.998682553521264e-06, "loss": 9.0078, "step": 289 }, { "epoch": 0.013997868836578513, "eval_accuracy": 0.09903811239816902, "eval_loss": 9.0, "eval_runtime": 272.9158, "eval_samples_per_second": 123.727, "eval_steps_per_second": 3.869, "step": 289 }, { "epoch": 0.014046304368885014, "grad_norm": 1.496385931968689, "learning_rate": 9.998677709968032e-06, "loss": 9.0, "step": 290 }, { "epoch": 0.014046304368885014, "eval_accuracy": 0.09958177251784199, "eval_loss": 8.9921875, "eval_runtime": 273.8166, "eval_samples_per_second": 123.32, "eval_steps_per_second": 3.857, "step": 290 }, { "epoch": 0.014094739901191514, "grad_norm": 1.5430630445480347, "learning_rate": 9.998672866414802e-06, "loss": 9.0078, "step": 291 }, { "epoch": 0.014094739901191514, "eval_accuracy": 0.09971363470341763, "eval_loss": 8.9921875, "eval_runtime": 272.085, "eval_samples_per_second": 124.105, "eval_steps_per_second": 3.881, "step": 291 }, { "epoch": 0.014143175433498014, "grad_norm": 1.7000993490219116, "learning_rate": 9.998668022861572e-06, "loss": 9.0, "step": 292 }, { "epoch": 0.014143175433498014, "eval_accuracy": 0.09992762488470744, "eval_loss": 8.984375, "eval_runtime": 273.2642, "eval_samples_per_second": 123.569, "eval_steps_per_second": 3.864, "step": 292 }, { "epoch": 0.014191610965804515, "grad_norm": 1.5401760339736938, "learning_rate": 9.99866317930834e-06, "loss": 9.0078, "step": 293 }, { "epoch": 0.014191610965804515, "eval_accuracy": 0.09988570689464958, "eval_loss": 8.984375, "eval_runtime": 271.53, "eval_samples_per_second": 124.358, "eval_steps_per_second": 3.889, "step": 293 }, { "epoch": 0.014240046498111014, "grad_norm": 1.5899308919906616, "learning_rate": 9.99865833575511e-06, "loss": 8.9922, "step": 294 }, { "epoch": 0.014240046498111014, "eval_accuracy": 0.09954338629213985, "eval_loss": 8.9765625, "eval_runtime": 271.4912, "eval_samples_per_second": 124.376, "eval_steps_per_second": 3.89, "step": 294 }, { "epoch": 0.014288482030417514, "grad_norm": 1.5780622959136963, "learning_rate": 9.99865349220188e-06, "loss": 9.0078, "step": 295 }, { "epoch": 0.014288482030417514, "eval_accuracy": 0.09903443588937252, "eval_loss": 8.9765625, "eval_runtime": 272.7386, "eval_samples_per_second": 123.807, "eval_steps_per_second": 3.872, "step": 295 }, { "epoch": 0.014336917562724014, "grad_norm": 1.6593127250671387, "learning_rate": 9.998648648648648e-06, "loss": 8.9844, "step": 296 }, { "epoch": 0.014336917562724014, "eval_accuracy": 0.09852673028879613, "eval_loss": 8.96875, "eval_runtime": 273.5255, "eval_samples_per_second": 123.451, "eval_steps_per_second": 3.861, "step": 296 }, { "epoch": 0.014385353095030515, "grad_norm": 1.5654476881027222, "learning_rate": 9.99864380509542e-06, "loss": 8.9766, "step": 297 }, { "epoch": 0.014385353095030515, "eval_accuracy": 0.09832848830266619, "eval_loss": 8.96875, "eval_runtime": 272.9229, "eval_samples_per_second": 123.724, "eval_steps_per_second": 3.869, "step": 297 }, { "epoch": 0.014433788627337015, "grad_norm": 1.604347586631775, "learning_rate": 9.998638961542188e-06, "loss": 8.9531, "step": 298 }, { "epoch": 0.014433788627337015, "eval_accuracy": 0.09845606605279392, "eval_loss": 8.9609375, "eval_runtime": 274.3511, "eval_samples_per_second": 123.08, "eval_steps_per_second": 3.849, "step": 298 }, { "epoch": 0.014482224159643514, "grad_norm": 1.6617177724838257, "learning_rate": 9.998634117988958e-06, "loss": 8.9688, "step": 299 }, { "epoch": 0.014482224159643514, "eval_accuracy": 0.09881535070376629, "eval_loss": 8.9609375, "eval_runtime": 275.2558, "eval_samples_per_second": 122.675, "eval_steps_per_second": 3.836, "step": 299 }, { "epoch": 0.014530659691950014, "grad_norm": 1.5470112562179565, "learning_rate": 9.998629274435728e-06, "loss": 9.0312, "step": 300 }, { "epoch": 0.014530659691950014, "eval_accuracy": 0.09944139935914688, "eval_loss": 8.953125, "eval_runtime": 276.2061, "eval_samples_per_second": 122.253, "eval_steps_per_second": 3.823, "step": 300 }, { "epoch": 0.014579095224256515, "grad_norm": 1.5366243124008179, "learning_rate": 9.998624430882496e-06, "loss": 9.0156, "step": 301 }, { "epoch": 0.014579095224256515, "eval_accuracy": 0.0997650189799043, "eval_loss": 8.953125, "eval_runtime": 273.6153, "eval_samples_per_second": 123.41, "eval_steps_per_second": 3.859, "step": 301 }, { "epoch": 0.014627530756563015, "grad_norm": 1.8393828868865967, "learning_rate": 9.998619587329266e-06, "loss": 8.9688, "step": 302 }, { "epoch": 0.014627530756563015, "eval_accuracy": 0.09985768437090925, "eval_loss": 8.9453125, "eval_runtime": 272.5368, "eval_samples_per_second": 123.899, "eval_steps_per_second": 3.875, "step": 302 }, { "epoch": 0.014675966288869515, "grad_norm": 1.5231480598449707, "learning_rate": 9.998614743776036e-06, "loss": 9.0, "step": 303 }, { "epoch": 0.014675966288869515, "eval_accuracy": 0.09973019346744601, "eval_loss": 8.9453125, "eval_runtime": 272.9977, "eval_samples_per_second": 123.69, "eval_steps_per_second": 3.868, "step": 303 }, { "epoch": 0.014724401821176014, "grad_norm": 1.4661198854446411, "learning_rate": 9.998609900222804e-06, "loss": 8.9375, "step": 304 }, { "epoch": 0.014724401821176014, "eval_accuracy": 0.09956171093834608, "eval_loss": 8.9375, "eval_runtime": 273.4012, "eval_samples_per_second": 123.507, "eval_steps_per_second": 3.862, "step": 304 }, { "epoch": 0.014772837353482515, "grad_norm": 1.561277985572815, "learning_rate": 9.998605056669574e-06, "loss": 8.9766, "step": 305 }, { "epoch": 0.014772837353482515, "eval_accuracy": 0.09941792181084791, "eval_loss": 8.9375, "eval_runtime": 273.3274, "eval_samples_per_second": 123.54, "eval_steps_per_second": 3.863, "step": 305 }, { "epoch": 0.014821272885789015, "grad_norm": 1.5084242820739746, "learning_rate": 9.998600213116343e-06, "loss": 8.9375, "step": 306 }, { "epoch": 0.014821272885789015, "eval_accuracy": 0.09944964979227292, "eval_loss": 8.9375, "eval_runtime": 272.8323, "eval_samples_per_second": 123.765, "eval_steps_per_second": 3.871, "step": 306 }, { "epoch": 0.014869708418095515, "grad_norm": 1.533602237701416, "learning_rate": 9.998595369563112e-06, "loss": 8.9688, "step": 307 }, { "epoch": 0.014869708418095515, "eval_accuracy": 0.09968031453314374, "eval_loss": 8.9296875, "eval_runtime": 273.2909, "eval_samples_per_second": 123.557, "eval_steps_per_second": 3.864, "step": 307 }, { "epoch": 0.014918143950402014, "grad_norm": 1.527116060256958, "learning_rate": 9.998590526009882e-06, "loss": 8.9531, "step": 308 }, { "epoch": 0.014918143950402014, "eval_accuracy": 0.099938480717768, "eval_loss": 8.9296875, "eval_runtime": 273.5392, "eval_samples_per_second": 123.445, "eval_steps_per_second": 3.861, "step": 308 }, { "epoch": 0.014966579482708514, "grad_norm": 1.5343533754348755, "learning_rate": 9.998585682456651e-06, "loss": 8.9531, "step": 309 }, { "epoch": 0.014966579482708514, "eval_accuracy": 0.10016824804310583, "eval_loss": 8.921875, "eval_runtime": 273.4771, "eval_samples_per_second": 123.473, "eval_steps_per_second": 3.861, "step": 309 }, { "epoch": 0.015015015015015015, "grad_norm": 2.02919602394104, "learning_rate": 9.99858083890342e-06, "loss": 8.9062, "step": 310 }, { "epoch": 0.015015015015015015, "eval_accuracy": 0.10033235929009336, "eval_loss": 8.921875, "eval_runtime": 272.9511, "eval_samples_per_second": 123.711, "eval_steps_per_second": 3.869, "step": 310 }, { "epoch": 0.015063450547321515, "grad_norm": 1.50547456741333, "learning_rate": 9.99857599535019e-06, "loss": 8.9375, "step": 311 }, { "epoch": 0.015063450547321515, "eval_accuracy": 0.10040134449058222, "eval_loss": 8.9140625, "eval_runtime": 274.4167, "eval_samples_per_second": 123.05, "eval_steps_per_second": 3.848, "step": 311 }, { "epoch": 0.015111886079628016, "grad_norm": 1.5935693979263306, "learning_rate": 9.99857115179696e-06, "loss": 8.8828, "step": 312 }, { "epoch": 0.015111886079628016, "eval_accuracy": 0.1003402333876733, "eval_loss": 8.9140625, "eval_runtime": 272.7238, "eval_samples_per_second": 123.814, "eval_steps_per_second": 3.872, "step": 312 }, { "epoch": 0.015160321611934514, "grad_norm": 1.4832584857940674, "learning_rate": 9.998566308243727e-06, "loss": 8.9219, "step": 313 }, { "epoch": 0.015160321611934514, "eval_accuracy": 0.10030963441288658, "eval_loss": 8.90625, "eval_runtime": 272.1097, "eval_samples_per_second": 124.093, "eval_steps_per_second": 3.881, "step": 313 }, { "epoch": 0.015208757144241015, "grad_norm": 1.4832618236541748, "learning_rate": 9.998561464690499e-06, "loss": 8.9219, "step": 314 }, { "epoch": 0.015208757144241015, "eval_accuracy": 0.10044369671396249, "eval_loss": 8.90625, "eval_runtime": 273.1456, "eval_samples_per_second": 123.623, "eval_steps_per_second": 3.866, "step": 314 }, { "epoch": 0.015257192676547515, "grad_norm": 1.5148617029190063, "learning_rate": 9.998556621137267e-06, "loss": 8.9297, "step": 315 }, { "epoch": 0.015257192676547515, "eval_accuracy": 0.10085141285482906, "eval_loss": 8.90625, "eval_runtime": 273.2988, "eval_samples_per_second": 123.553, "eval_steps_per_second": 3.864, "step": 315 }, { "epoch": 0.015305628208854016, "grad_norm": 1.433423638343811, "learning_rate": 9.998551777584037e-06, "loss": 8.9922, "step": 316 }, { "epoch": 0.015305628208854016, "eval_accuracy": 0.10107848793356786, "eval_loss": 8.8984375, "eval_runtime": 273.7559, "eval_samples_per_second": 123.347, "eval_steps_per_second": 3.857, "step": 316 }, { "epoch": 0.015354063741160516, "grad_norm": 1.5502877235412598, "learning_rate": 9.998546934030807e-06, "loss": 8.9062, "step": 317 }, { "epoch": 0.015354063741160516, "eval_accuracy": 0.10110254445963007, "eval_loss": 8.8984375, "eval_runtime": 273.1804, "eval_samples_per_second": 123.607, "eval_steps_per_second": 3.866, "step": 317 }, { "epoch": 0.015402499273467015, "grad_norm": 1.4339704513549805, "learning_rate": 9.998542090477575e-06, "loss": 8.9297, "step": 318 }, { "epoch": 0.015402499273467015, "eval_accuracy": 0.10105709470521651, "eval_loss": 8.890625, "eval_runtime": 272.4751, "eval_samples_per_second": 123.927, "eval_steps_per_second": 3.876, "step": 318 }, { "epoch": 0.015450934805773515, "grad_norm": 1.5828499794006348, "learning_rate": 9.998537246924345e-06, "loss": 8.9531, "step": 319 }, { "epoch": 0.015450934805773515, "eval_accuracy": 0.1008436256039136, "eval_loss": 8.890625, "eval_runtime": 273.6696, "eval_samples_per_second": 123.386, "eval_steps_per_second": 3.859, "step": 319 }, { "epoch": 0.015499370338080016, "grad_norm": 1.4665073156356812, "learning_rate": 9.998532403371115e-06, "loss": 8.9531, "step": 320 }, { "epoch": 0.015499370338080016, "eval_accuracy": 0.10056403724204985, "eval_loss": 8.8828125, "eval_runtime": 273.7537, "eval_samples_per_second": 123.348, "eval_steps_per_second": 3.857, "step": 320 }, { "epoch": 0.015547805870386516, "grad_norm": 1.5187170505523682, "learning_rate": 9.998527559817883e-06, "loss": 8.9375, "step": 321 }, { "epoch": 0.015547805870386516, "eval_accuracy": 0.10036628738701864, "eval_loss": 8.8828125, "eval_runtime": 274.5883, "eval_samples_per_second": 122.973, "eval_steps_per_second": 3.846, "step": 321 }, { "epoch": 0.015596241402693016, "grad_norm": 1.6168104410171509, "learning_rate": 9.998522716264653e-06, "loss": 8.9219, "step": 322 }, { "epoch": 0.015596241402693016, "eval_accuracy": 0.10023019866377143, "eval_loss": 8.875, "eval_runtime": 274.4874, "eval_samples_per_second": 123.018, "eval_steps_per_second": 3.847, "step": 322 }, { "epoch": 0.015644676934999515, "grad_norm": 1.5896227359771729, "learning_rate": 9.998517872711423e-06, "loss": 8.9062, "step": 323 }, { "epoch": 0.015644676934999515, "eval_accuracy": 0.10039622053737764, "eval_loss": 8.875, "eval_runtime": 273.3822, "eval_samples_per_second": 123.516, "eval_steps_per_second": 3.863, "step": 323 }, { "epoch": 0.015693112467306015, "grad_norm": 1.476304531097412, "learning_rate": 9.99851302915819e-06, "loss": 8.8906, "step": 324 }, { "epoch": 0.015693112467306015, "eval_accuracy": 0.10063834903796041, "eval_loss": 8.875, "eval_runtime": 273.0848, "eval_samples_per_second": 123.65, "eval_steps_per_second": 3.867, "step": 324 }, { "epoch": 0.015741547999612516, "grad_norm": 1.493653655052185, "learning_rate": 9.99850818560496e-06, "loss": 8.8906, "step": 325 }, { "epoch": 0.015741547999612516, "eval_accuracy": 0.10110543934844622, "eval_loss": 8.8671875, "eval_runtime": 273.5247, "eval_samples_per_second": 123.451, "eval_steps_per_second": 3.861, "step": 325 }, { "epoch": 0.015789983531919016, "grad_norm": 1.5021024942398071, "learning_rate": 9.998503342051729e-06, "loss": 8.8672, "step": 326 }, { "epoch": 0.015789983531919016, "eval_accuracy": 0.10160095746710661, "eval_loss": 8.8671875, "eval_runtime": 273.3554, "eval_samples_per_second": 123.528, "eval_steps_per_second": 3.863, "step": 326 }, { "epoch": 0.015838419064225517, "grad_norm": 1.4941586256027222, "learning_rate": 9.998498498498499e-06, "loss": 8.875, "step": 327 }, { "epoch": 0.015838419064225517, "eval_accuracy": 0.1018975967240975, "eval_loss": 8.859375, "eval_runtime": 272.1708, "eval_samples_per_second": 124.065, "eval_steps_per_second": 3.88, "step": 327 }, { "epoch": 0.015886854596532017, "grad_norm": 1.484066128730774, "learning_rate": 9.998493654945269e-06, "loss": 8.8516, "step": 328 }, { "epoch": 0.015886854596532017, "eval_accuracy": 0.10215561816428094, "eval_loss": 8.859375, "eval_runtime": 272.903, "eval_samples_per_second": 123.733, "eval_steps_per_second": 3.87, "step": 328 }, { "epoch": 0.015935290128838518, "grad_norm": 1.6349196434020996, "learning_rate": 9.998488811392037e-06, "loss": 8.8672, "step": 329 }, { "epoch": 0.015935290128838518, "eval_accuracy": 0.10202540606533052, "eval_loss": 8.8515625, "eval_runtime": 273.5376, "eval_samples_per_second": 123.446, "eval_steps_per_second": 3.861, "step": 329 }, { "epoch": 0.015983725661145014, "grad_norm": 1.4366816282272339, "learning_rate": 9.998483967838807e-06, "loss": 8.8984, "step": 330 }, { "epoch": 0.015983725661145014, "eval_accuracy": 0.10181083690627749, "eval_loss": 8.8515625, "eval_runtime": 273.8934, "eval_samples_per_second": 123.285, "eval_steps_per_second": 3.856, "step": 330 }, { "epoch": 0.016032161193451515, "grad_norm": 1.4731358289718628, "learning_rate": 9.998479124285576e-06, "loss": 8.875, "step": 331 }, { "epoch": 0.016032161193451515, "eval_accuracy": 0.10155426291050211, "eval_loss": 8.84375, "eval_runtime": 273.3318, "eval_samples_per_second": 123.538, "eval_steps_per_second": 3.863, "step": 331 }, { "epoch": 0.016080596725758015, "grad_norm": 1.4572798013687134, "learning_rate": 9.998474280732346e-06, "loss": 8.8828, "step": 332 }, { "epoch": 0.016080596725758015, "eval_accuracy": 0.10136490823303775, "eval_loss": 8.84375, "eval_runtime": 273.42, "eval_samples_per_second": 123.499, "eval_steps_per_second": 3.862, "step": 332 }, { "epoch": 0.016129032258064516, "grad_norm": 1.5042625665664673, "learning_rate": 9.998469437179116e-06, "loss": 8.8438, "step": 333 }, { "epoch": 0.016129032258064516, "eval_accuracy": 0.10138867527021833, "eval_loss": 8.8359375, "eval_runtime": 273.8134, "eval_samples_per_second": 123.321, "eval_steps_per_second": 3.857, "step": 333 }, { "epoch": 0.016177467790371016, "grad_norm": 1.9862890243530273, "learning_rate": 9.998464593625884e-06, "loss": 8.7969, "step": 334 }, { "epoch": 0.016177467790371016, "eval_accuracy": 0.10165384708577767, "eval_loss": 8.8359375, "eval_runtime": 273.5642, "eval_samples_per_second": 123.434, "eval_steps_per_second": 3.86, "step": 334 }, { "epoch": 0.016225903322677517, "grad_norm": 1.5006351470947266, "learning_rate": 9.998459750072654e-06, "loss": 8.8828, "step": 335 }, { "epoch": 0.016225903322677517, "eval_accuracy": 0.10197017158671838, "eval_loss": 8.828125, "eval_runtime": 273.8604, "eval_samples_per_second": 123.3, "eval_steps_per_second": 3.856, "step": 335 }, { "epoch": 0.016274338854984017, "grad_norm": 1.5454577207565308, "learning_rate": 9.998454906519424e-06, "loss": 8.8281, "step": 336 }, { "epoch": 0.016274338854984017, "eval_accuracy": 0.10246210004324674, "eval_loss": 8.828125, "eval_runtime": 273.8457, "eval_samples_per_second": 123.307, "eval_steps_per_second": 3.856, "step": 336 }, { "epoch": 0.016322774387290517, "grad_norm": 1.6074914932250977, "learning_rate": 9.998450062966192e-06, "loss": 8.8203, "step": 337 }, { "epoch": 0.016322774387290517, "eval_accuracy": 0.10273896720962333, "eval_loss": 8.828125, "eval_runtime": 273.8399, "eval_samples_per_second": 123.309, "eval_steps_per_second": 3.856, "step": 337 }, { "epoch": 0.016371209919597018, "grad_norm": 1.5352425575256348, "learning_rate": 9.998445219412962e-06, "loss": 8.8594, "step": 338 }, { "epoch": 0.016371209919597018, "eval_accuracy": 0.10277469013761462, "eval_loss": 8.8203125, "eval_runtime": 273.8619, "eval_samples_per_second": 123.299, "eval_steps_per_second": 3.856, "step": 338 }, { "epoch": 0.016419645451903518, "grad_norm": 1.5608229637145996, "learning_rate": 9.998440375859732e-06, "loss": 8.8594, "step": 339 }, { "epoch": 0.016419645451903518, "eval_accuracy": 0.10268248792882025, "eval_loss": 8.8203125, "eval_runtime": 273.0524, "eval_samples_per_second": 123.665, "eval_steps_per_second": 3.867, "step": 339 }, { "epoch": 0.016468080984210015, "grad_norm": 1.4532408714294434, "learning_rate": 9.9984355323065e-06, "loss": 8.8203, "step": 340 }, { "epoch": 0.016468080984210015, "eval_accuracy": 0.10246901882751734, "eval_loss": 8.8125, "eval_runtime": 273.0044, "eval_samples_per_second": 123.687, "eval_steps_per_second": 3.868, "step": 340 }, { "epoch": 0.016516516516516516, "grad_norm": 1.5474005937576294, "learning_rate": 9.99843068875327e-06, "loss": 8.8359, "step": 341 }, { "epoch": 0.016516516516516516, "eval_accuracy": 0.10244681503029747, "eval_loss": 8.8125, "eval_runtime": 272.933, "eval_samples_per_second": 123.719, "eval_steps_per_second": 3.869, "step": 341 }, { "epoch": 0.016516516516516516, "step": 341, "total_flos": 1427643152990208.0, "train_loss": 9.567586143695015, "train_runtime": 93928.2806, "train_samples_per_second": 703.348, "train_steps_per_second": 21.981 } ], "logging_steps": 1, "max_steps": 2064600, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1427643152990208.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }