{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.018308631211857017, "eval_steps": 1, "global_step": 336, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 5.448997384481256e-05, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8672, "step": 1 }, { "epoch": 5.448997384481256e-05, "eval_accuracy": 0.004507457682298169, "eval_loss": 10.8671875, "eval_runtime": 283.3287, "eval_samples_per_second": 119.18, "eval_steps_per_second": 3.311, "step": 1 }, { "epoch": 0.00010897994768962511, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8672, "step": 2 }, { "epoch": 0.00010897994768962511, "eval_accuracy": 0.004507457682298169, "eval_loss": 10.8671875, "eval_runtime": 283.1668, "eval_samples_per_second": 119.248, "eval_steps_per_second": 3.313, "step": 2 }, { "epoch": 0.00016346992153443767, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8672, "step": 3 }, { "epoch": 0.00016346992153443767, "eval_accuracy": 0.004507457682298169, "eval_loss": 10.8671875, "eval_runtime": 282.1254, "eval_samples_per_second": 119.688, "eval_steps_per_second": 3.325, "step": 3 }, { "epoch": 0.00021795989537925023, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8672, "step": 4 }, { "epoch": 0.00021795989537925023, "eval_accuracy": 0.004507457682298169, "eval_loss": 10.8671875, "eval_runtime": 281.3948, "eval_samples_per_second": 119.999, "eval_steps_per_second": 3.333, "step": 4 }, { "epoch": 0.00027244986922406276, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8594, "step": 5 }, { "epoch": 0.00027244986922406276, "eval_accuracy": 0.004507457682298169, "eval_loss": 10.8671875, "eval_runtime": 281.7028, "eval_samples_per_second": 119.867, "eval_steps_per_second": 3.33, "step": 5 }, { "epoch": 0.00032693984306887534, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8672, "step": 6 }, { "epoch": 0.00032693984306887534, "eval_accuracy": 0.004507457682298169, "eval_loss": 10.8671875, "eval_runtime": 282.539, "eval_samples_per_second": 119.513, "eval_steps_per_second": 3.32, "step": 6 }, { "epoch": 0.00038142981691368787, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8672, "step": 7 }, { "epoch": 0.00038142981691368787, "eval_accuracy": 0.004507457682298169, "eval_loss": 10.8671875, "eval_runtime": 282.3676, "eval_samples_per_second": 119.585, "eval_steps_per_second": 3.322, "step": 7 }, { "epoch": 0.00043591979075850045, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8672, "step": 8 }, { "epoch": 0.00043591979075850045, "eval_accuracy": 0.004507457682298169, "eval_loss": 10.8671875, "eval_runtime": 281.6041, "eval_samples_per_second": 119.909, "eval_steps_per_second": 3.331, "step": 8 }, { "epoch": 0.000490409764603313, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8672, "step": 9 }, { "epoch": 0.000490409764603313, "eval_accuracy": 0.004507457682298169, "eval_loss": 10.8671875, "eval_runtime": 281.1865, "eval_samples_per_second": 120.088, "eval_steps_per_second": 3.336, "step": 9 }, { "epoch": 0.0005448997384481255, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8594, "step": 10 }, { "epoch": 0.0005448997384481255, "eval_accuracy": 0.004507457682298169, "eval_loss": 10.8671875, "eval_runtime": 282.104, "eval_samples_per_second": 119.697, "eval_steps_per_second": 3.325, "step": 10 }, { "epoch": 0.0005993897122929382, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8672, "step": 11 }, { "epoch": 0.0005993897122929382, "eval_accuracy": 0.004507457682298169, "eval_loss": 10.8671875, "eval_runtime": 280.8618, "eval_samples_per_second": 120.226, "eval_steps_per_second": 3.34, "step": 11 }, { "epoch": 0.0006538796861377507, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8672, "step": 12 }, { "epoch": 0.0006538796861377507, "eval_accuracy": 0.004507457682298169, "eval_loss": 10.8671875, "eval_runtime": 280.475, "eval_samples_per_second": 120.392, "eval_steps_per_second": 3.344, "step": 12 }, { "epoch": 0.0007083696599825632, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8594, "step": 13 }, { "epoch": 0.0007083696599825632, "eval_accuracy": 0.004507457682298169, "eval_loss": 10.8671875, "eval_runtime": 279.8203, "eval_samples_per_second": 120.674, "eval_steps_per_second": 3.352, "step": 13 }, { "epoch": 0.0007628596338273757, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8672, "step": 14 }, { "epoch": 0.0007628596338273757, "eval_accuracy": 0.004507457682298169, "eval_loss": 10.8671875, "eval_runtime": 279.8528, "eval_samples_per_second": 120.66, "eval_steps_per_second": 3.352, "step": 14 }, { "epoch": 0.0008173496076721883, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8594, "step": 15 }, { "epoch": 0.0008173496076721883, "eval_accuracy": 0.004507457682298169, "eval_loss": 10.8671875, "eval_runtime": 281.1019, "eval_samples_per_second": 120.124, "eval_steps_per_second": 3.337, "step": 15 }, { "epoch": 0.0008718395815170009, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8594, "step": 16 }, { "epoch": 0.0008718395815170009, "eval_accuracy": 0.004507457682298169, "eval_loss": 10.8671875, "eval_runtime": 280.3712, "eval_samples_per_second": 120.437, "eval_steps_per_second": 3.346, "step": 16 }, { "epoch": 0.0009263295553618134, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 10.8672, "step": 17 }, { "epoch": 0.0009263295553618134, "eval_accuracy": 0.004507457682298169, "eval_loss": 10.8671875, "eval_runtime": 281.4302, "eval_samples_per_second": 119.984, "eval_steps_per_second": 3.333, "step": 17 }, { "epoch": 0.000980819529206626, "grad_norm": 3.3501086235046387, "learning_rate": 9.999994551002616e-06, "loss": 10.8672, "step": 18 }, { "epoch": 0.000980819529206626, "eval_accuracy": 0.008603349021604294, "eval_loss": 10.8359375, "eval_runtime": 281.0275, "eval_samples_per_second": 120.155, "eval_steps_per_second": 3.338, "step": 18 }, { "epoch": 0.0010353095030514385, "grad_norm": 3.1848981380462646, "learning_rate": 9.999989102005233e-06, "loss": 10.8359, "step": 19 }, { "epoch": 0.0010353095030514385, "eval_accuracy": 0.010782302884632226, "eval_loss": 10.8046875, "eval_runtime": 280.8218, "eval_samples_per_second": 120.244, "eval_steps_per_second": 3.34, "step": 19 }, { "epoch": 0.001089799476896251, "grad_norm": 3.103949546813965, "learning_rate": 9.999983653007848e-06, "loss": 10.8047, "step": 20 }, { "epoch": 0.001089799476896251, "eval_accuracy": 0.011296145649498848, "eval_loss": 10.7734375, "eval_runtime": 280.7519, "eval_samples_per_second": 120.273, "eval_steps_per_second": 3.341, "step": 20 }, { "epoch": 0.0011442894507410636, "grad_norm": 3.007913827896118, "learning_rate": 9.999978204010463e-06, "loss": 10.7891, "step": 21 }, { "epoch": 0.0011442894507410636, "eval_accuracy": 0.011502753864307472, "eval_loss": 10.75, "eval_runtime": 282.1403, "eval_samples_per_second": 119.682, "eval_steps_per_second": 3.325, "step": 21 }, { "epoch": 0.0011987794245858763, "grad_norm": 3.0112082958221436, "learning_rate": 9.99997275501308e-06, "loss": 10.7578, "step": 22 }, { "epoch": 0.0011987794245858763, "eval_accuracy": 0.011935742384539025, "eval_loss": 10.7265625, "eval_runtime": 280.7974, "eval_samples_per_second": 120.254, "eval_steps_per_second": 3.34, "step": 22 }, { "epoch": 0.0012532693984306888, "grad_norm": 3.0085818767547607, "learning_rate": 9.999967306015694e-06, "loss": 10.7188, "step": 23 }, { "epoch": 0.0012532693984306888, "eval_accuracy": 0.012902490504692311, "eval_loss": 10.703125, "eval_runtime": 279.9507, "eval_samples_per_second": 120.618, "eval_steps_per_second": 3.351, "step": 23 }, { "epoch": 0.0013077593722755014, "grad_norm": 2.6498820781707764, "learning_rate": 9.99996185701831e-06, "loss": 10.7188, "step": 24 }, { "epoch": 0.0013077593722755014, "eval_accuracy": 0.014669617484734745, "eval_loss": 10.6796875, "eval_runtime": 284.0388, "eval_samples_per_second": 118.882, "eval_steps_per_second": 3.302, "step": 24 }, { "epoch": 0.0013622493461203139, "grad_norm": 2.525411367416382, "learning_rate": 9.999956408020926e-06, "loss": 10.6953, "step": 25 }, { "epoch": 0.0013622493461203139, "eval_accuracy": 0.017945705260195358, "eval_loss": 10.6640625, "eval_runtime": 279.0802, "eval_samples_per_second": 120.994, "eval_steps_per_second": 3.361, "step": 25 }, { "epoch": 0.0014167393199651264, "grad_norm": 2.438088893890381, "learning_rate": 9.99995095902354e-06, "loss": 10.6719, "step": 26 }, { "epoch": 0.0014167393199651264, "eval_accuracy": 0.023050523249706075, "eval_loss": 10.640625, "eval_runtime": 280.2346, "eval_samples_per_second": 120.495, "eval_steps_per_second": 3.347, "step": 26 }, { "epoch": 0.001471229293809939, "grad_norm": 2.2894654273986816, "learning_rate": 9.999945510026156e-06, "loss": 10.6562, "step": 27 }, { "epoch": 0.001471229293809939, "eval_accuracy": 0.02863128990948001, "eval_loss": 10.625, "eval_runtime": 280.5686, "eval_samples_per_second": 120.352, "eval_steps_per_second": 3.343, "step": 27 }, { "epoch": 0.0015257192676547515, "grad_norm": 2.0723860263824463, "learning_rate": 9.999940061028771e-06, "loss": 10.6641, "step": 28 }, { "epoch": 0.0015257192676547515, "eval_accuracy": 0.03466305129792195, "eval_loss": 10.609375, "eval_runtime": 278.9184, "eval_samples_per_second": 121.064, "eval_steps_per_second": 3.363, "step": 28 }, { "epoch": 0.001580209241499564, "grad_norm": 2.0480406284332275, "learning_rate": 9.999934612031386e-06, "loss": 10.6328, "step": 29 }, { "epoch": 0.001580209241499564, "eval_accuracy": 0.039908271395015946, "eval_loss": 10.59375, "eval_runtime": 278.4399, "eval_samples_per_second": 121.272, "eval_steps_per_second": 3.369, "step": 29 }, { "epoch": 0.0016346992153443765, "grad_norm": 2.0142972469329834, "learning_rate": 9.999929163034003e-06, "loss": 10.6016, "step": 30 }, { "epoch": 0.0016346992153443765, "eval_accuracy": 0.043642156887862514, "eval_loss": 10.578125, "eval_runtime": 277.7144, "eval_samples_per_second": 121.589, "eval_steps_per_second": 3.378, "step": 30 }, { "epoch": 0.0016891891891891893, "grad_norm": 1.8250195980072021, "learning_rate": 9.999923714036618e-06, "loss": 10.6016, "step": 31 }, { "epoch": 0.0016891891891891893, "eval_accuracy": 0.046290082739106744, "eval_loss": 10.5703125, "eval_runtime": 277.9514, "eval_samples_per_second": 121.485, "eval_steps_per_second": 3.375, "step": 31 }, { "epoch": 0.0017436791630340018, "grad_norm": 1.7317852973937988, "learning_rate": 9.999918265039233e-06, "loss": 10.5938, "step": 32 }, { "epoch": 0.0017436791630340018, "eval_accuracy": 0.04786212316182883, "eval_loss": 10.5546875, "eval_runtime": 278.188, "eval_samples_per_second": 121.382, "eval_steps_per_second": 3.372, "step": 32 }, { "epoch": 0.0017981691368788143, "grad_norm": 1.6736972332000732, "learning_rate": 9.99991281604185e-06, "loss": 10.5781, "step": 33 }, { "epoch": 0.0017981691368788143, "eval_accuracy": 0.048402714699356676, "eval_loss": 10.546875, "eval_runtime": 276.9089, "eval_samples_per_second": 121.943, "eval_steps_per_second": 3.387, "step": 33 }, { "epoch": 0.0018526591107236269, "grad_norm": 1.7245711088180542, "learning_rate": 9.999907367044465e-06, "loss": 10.5547, "step": 34 }, { "epoch": 0.0018526591107236269, "eval_accuracy": 0.04842535272989897, "eval_loss": 10.53125, "eval_runtime": 277.5457, "eval_samples_per_second": 121.663, "eval_steps_per_second": 3.38, "step": 34 }, { "epoch": 0.0019071490845684394, "grad_norm": 1.6025965213775635, "learning_rate": 9.99990191804708e-06, "loss": 10.5469, "step": 35 }, { "epoch": 0.0019071490845684394, "eval_accuracy": 0.04835280681616625, "eval_loss": 10.5234375, "eval_runtime": 280.2247, "eval_samples_per_second": 120.5, "eval_steps_per_second": 3.347, "step": 35 }, { "epoch": 0.001961639058413252, "grad_norm": 1.5452033281326294, "learning_rate": 9.999896469049695e-06, "loss": 10.5391, "step": 36 }, { "epoch": 0.001961639058413252, "eval_accuracy": 0.04815010670125943, "eval_loss": 10.515625, "eval_runtime": 278.5802, "eval_samples_per_second": 121.211, "eval_steps_per_second": 3.367, "step": 36 }, { "epoch": 0.0020161290322580645, "grad_norm": 1.5433681011199951, "learning_rate": 9.999891020052312e-06, "loss": 10.5312, "step": 37 }, { "epoch": 0.0020161290322580645, "eval_accuracy": 0.047523855403661705, "eval_loss": 10.5078125, "eval_runtime": 279.3479, "eval_samples_per_second": 120.878, "eval_steps_per_second": 3.358, "step": 37 }, { "epoch": 0.002070619006102877, "grad_norm": 1.4813498258590698, "learning_rate": 9.999885571054927e-06, "loss": 10.5312, "step": 38 }, { "epoch": 0.002070619006102877, "eval_accuracy": 0.047497569813211064, "eval_loss": 10.4921875, "eval_runtime": 279.8874, "eval_samples_per_second": 120.645, "eval_steps_per_second": 3.351, "step": 38 }, { "epoch": 0.0021251089799476895, "grad_norm": 1.5266352891921997, "learning_rate": 9.999880122057542e-06, "loss": 10.4922, "step": 39 }, { "epoch": 0.0021251089799476895, "eval_accuracy": 0.04756021520719255, "eval_loss": 10.484375, "eval_runtime": 278.6312, "eval_samples_per_second": 121.189, "eval_steps_per_second": 3.366, "step": 39 }, { "epoch": 0.002179598953792502, "grad_norm": 1.433236837387085, "learning_rate": 9.999874673060158e-06, "loss": 10.5078, "step": 40 }, { "epoch": 0.002179598953792502, "eval_accuracy": 0.047665212824554305, "eval_loss": 10.484375, "eval_runtime": 278.9837, "eval_samples_per_second": 121.036, "eval_steps_per_second": 3.362, "step": 40 }, { "epoch": 0.0022340889276373146, "grad_norm": 1.4035075902938843, "learning_rate": 9.999869224062774e-06, "loss": 10.4922, "step": 41 }, { "epoch": 0.0022340889276373146, "eval_accuracy": 0.048127468670717134, "eval_loss": 10.4765625, "eval_runtime": 280.2687, "eval_samples_per_second": 120.481, "eval_steps_per_second": 3.347, "step": 41 }, { "epoch": 0.002288578901482127, "grad_norm": 1.392196774482727, "learning_rate": 9.999863775065389e-06, "loss": 10.4844, "step": 42 }, { "epoch": 0.002288578901482127, "eval_accuracy": 0.048623971051575024, "eval_loss": 10.46875, "eval_runtime": 279.283, "eval_samples_per_second": 120.906, "eval_steps_per_second": 3.359, "step": 42 }, { "epoch": 0.0023430688753269396, "grad_norm": 1.4084738492965698, "learning_rate": 9.999858326068004e-06, "loss": 10.4766, "step": 43 }, { "epoch": 0.0023430688753269396, "eval_accuracy": 0.0492689812287014, "eval_loss": 10.4609375, "eval_runtime": 279.956, "eval_samples_per_second": 120.615, "eval_steps_per_second": 3.351, "step": 43 }, { "epoch": 0.0023975588491717526, "grad_norm": 1.3411015272140503, "learning_rate": 9.999852877070619e-06, "loss": 10.4844, "step": 44 }, { "epoch": 0.0023975588491717526, "eval_accuracy": 0.04953739531973483, "eval_loss": 10.453125, "eval_runtime": 278.5767, "eval_samples_per_second": 121.213, "eval_steps_per_second": 3.367, "step": 44 }, { "epoch": 0.002452048823016565, "grad_norm": 1.381066083908081, "learning_rate": 9.999847428073235e-06, "loss": 10.4688, "step": 45 }, { "epoch": 0.002452048823016565, "eval_accuracy": 0.05031273339136427, "eval_loss": 10.4453125, "eval_runtime": 278.3035, "eval_samples_per_second": 121.332, "eval_steps_per_second": 3.37, "step": 45 }, { "epoch": 0.0025065387968613777, "grad_norm": 1.2761576175689697, "learning_rate": 9.99984197907585e-06, "loss": 10.4844, "step": 46 }, { "epoch": 0.0025065387968613777, "eval_accuracy": 0.05126917570733207, "eval_loss": 10.4453125, "eval_runtime": 278.7818, "eval_samples_per_second": 121.123, "eval_steps_per_second": 3.365, "step": 46 }, { "epoch": 0.00256102877070619, "grad_norm": 1.2938231229782104, "learning_rate": 9.999836530078465e-06, "loss": 10.4609, "step": 47 }, { "epoch": 0.00256102877070619, "eval_accuracy": 0.05223774760743953, "eval_loss": 10.4375, "eval_runtime": 280.01, "eval_samples_per_second": 120.592, "eval_steps_per_second": 3.35, "step": 47 }, { "epoch": 0.0026155187445510027, "grad_norm": 1.408036231994629, "learning_rate": 9.999831081081082e-06, "loss": 10.4453, "step": 48 }, { "epoch": 0.0026155187445510027, "eval_accuracy": 0.05263593956410096, "eval_loss": 10.4296875, "eval_runtime": 279.456, "eval_samples_per_second": 120.831, "eval_steps_per_second": 3.357, "step": 48 }, { "epoch": 0.0026700087183958152, "grad_norm": 1.3031139373779297, "learning_rate": 9.999825632083697e-06, "loss": 10.4453, "step": 49 }, { "epoch": 0.0026700087183958152, "eval_accuracy": 0.05322177821382523, "eval_loss": 10.4296875, "eval_runtime": 279.6217, "eval_samples_per_second": 120.76, "eval_steps_per_second": 3.355, "step": 49 }, { "epoch": 0.0027244986922406278, "grad_norm": 1.3556911945343018, "learning_rate": 9.999820183086312e-06, "loss": 10.4297, "step": 50 }, { "epoch": 0.0027244986922406278, "eval_accuracy": 0.05367262819805243, "eval_loss": 10.421875, "eval_runtime": 278.4251, "eval_samples_per_second": 121.279, "eval_steps_per_second": 3.369, "step": 50 }, { "epoch": 0.0027789886660854403, "grad_norm": 1.3359757661819458, "learning_rate": 9.999814734088929e-06, "loss": 10.4219, "step": 51 }, { "epoch": 0.0027789886660854403, "eval_accuracy": 0.05440648251294645, "eval_loss": 10.4140625, "eval_runtime": 278.6487, "eval_samples_per_second": 121.181, "eval_steps_per_second": 3.366, "step": 51 }, { "epoch": 0.002833478639930253, "grad_norm": 1.2961536645889282, "learning_rate": 9.999809285091544e-06, "loss": 10.4297, "step": 52 }, { "epoch": 0.002833478639930253, "eval_accuracy": 0.05480528239625927, "eval_loss": 10.4140625, "eval_runtime": 278.1813, "eval_samples_per_second": 121.385, "eval_steps_per_second": 3.372, "step": 52 }, { "epoch": 0.0028879686137750654, "grad_norm": 1.2359050512313843, "learning_rate": 9.999803836094159e-06, "loss": 10.4375, "step": 53 }, { "epoch": 0.0028879686137750654, "eval_accuracy": 0.055358814086795306, "eval_loss": 10.40625, "eval_runtime": 279.3681, "eval_samples_per_second": 120.869, "eval_steps_per_second": 3.358, "step": 53 }, { "epoch": 0.002942458587619878, "grad_norm": 1.2747548818588257, "learning_rate": 9.999798387096776e-06, "loss": 10.4219, "step": 54 }, { "epoch": 0.002942458587619878, "eval_accuracy": 0.055845676487895415, "eval_loss": 10.40625, "eval_runtime": 280.0513, "eval_samples_per_second": 120.574, "eval_steps_per_second": 3.349, "step": 54 }, { "epoch": 0.0029969485614646904, "grad_norm": 1.304934024810791, "learning_rate": 9.99979293809939e-06, "loss": 10.4141, "step": 55 }, { "epoch": 0.0029969485614646904, "eval_accuracy": 0.05652635169523676, "eval_loss": 10.3984375, "eval_runtime": 280.159, "eval_samples_per_second": 120.528, "eval_steps_per_second": 3.348, "step": 55 }, { "epoch": 0.003051438535309503, "grad_norm": 1.2796216011047363, "learning_rate": 9.999787489102006e-06, "loss": 10.4141, "step": 56 }, { "epoch": 0.003051438535309503, "eval_accuracy": 0.05738361512036325, "eval_loss": 10.390625, "eval_runtime": 280.8546, "eval_samples_per_second": 120.229, "eval_steps_per_second": 3.34, "step": 56 }, { "epoch": 0.0031059285091543155, "grad_norm": 1.273751974105835, "learning_rate": 9.999782040104623e-06, "loss": 10.4219, "step": 57 }, { "epoch": 0.0031059285091543155, "eval_accuracy": 0.05828062536893549, "eval_loss": 10.390625, "eval_runtime": 280.5946, "eval_samples_per_second": 120.341, "eval_steps_per_second": 3.343, "step": 57 }, { "epoch": 0.003160418482999128, "grad_norm": 1.2347089052200317, "learning_rate": 9.999776591107238e-06, "loss": 10.4219, "step": 58 }, { "epoch": 0.003160418482999128, "eval_accuracy": 0.05910523444821581, "eval_loss": 10.3828125, "eval_runtime": 281.2657, "eval_samples_per_second": 120.054, "eval_steps_per_second": 3.335, "step": 58 }, { "epoch": 0.0032149084568439405, "grad_norm": 1.2617988586425781, "learning_rate": 9.999771142109853e-06, "loss": 10.3984, "step": 59 }, { "epoch": 0.0032149084568439405, "eval_accuracy": 0.059817348148100545, "eval_loss": 10.3828125, "eval_runtime": 279.7299, "eval_samples_per_second": 120.713, "eval_steps_per_second": 3.353, "step": 59 }, { "epoch": 0.003269398430688753, "grad_norm": 1.2854173183441162, "learning_rate": 9.999765693112468e-06, "loss": 10.3984, "step": 60 }, { "epoch": 0.003269398430688753, "eval_accuracy": 0.06033486742176367, "eval_loss": 10.375, "eval_runtime": 279.8061, "eval_samples_per_second": 120.68, "eval_steps_per_second": 3.352, "step": 60 }, { "epoch": 0.003323888404533566, "grad_norm": 1.2649012804031372, "learning_rate": 9.999760244115083e-06, "loss": 10.3984, "step": 61 }, { "epoch": 0.003323888404533566, "eval_accuracy": 0.06070726591907321, "eval_loss": 10.375, "eval_runtime": 279.7805, "eval_samples_per_second": 120.691, "eval_steps_per_second": 3.353, "step": 61 }, { "epoch": 0.0033783783783783786, "grad_norm": 1.284862756729126, "learning_rate": 9.9997547951177e-06, "loss": 10.3906, "step": 62 }, { "epoch": 0.0033783783783783786, "eval_accuracy": 0.061101260286951224, "eval_loss": 10.3671875, "eval_runtime": 280.6475, "eval_samples_per_second": 120.318, "eval_steps_per_second": 3.342, "step": 62 }, { "epoch": 0.003432868352223191, "grad_norm": 1.3201900720596313, "learning_rate": 9.999749346120314e-06, "loss": 10.3672, "step": 63 }, { "epoch": 0.003432868352223191, "eval_accuracy": 0.06146384511117401, "eval_loss": 10.3671875, "eval_runtime": 280.4398, "eval_samples_per_second": 120.407, "eval_steps_per_second": 3.345, "step": 63 }, { "epoch": 0.0034873583260680036, "grad_norm": 1.265735387802124, "learning_rate": 9.99974389712293e-06, "loss": 10.3906, "step": 64 }, { "epoch": 0.0034873583260680036, "eval_accuracy": 0.0615552078022117, "eval_loss": 10.359375, "eval_runtime": 280.4115, "eval_samples_per_second": 120.419, "eval_steps_per_second": 3.345, "step": 64 }, { "epoch": 0.003541848299912816, "grad_norm": 1.2759215831756592, "learning_rate": 9.999738448125546e-06, "loss": 10.3828, "step": 65 }, { "epoch": 0.003541848299912816, "eval_accuracy": 0.061487409506137465, "eval_loss": 10.359375, "eval_runtime": 280.0889, "eval_samples_per_second": 120.558, "eval_steps_per_second": 3.349, "step": 65 }, { "epoch": 0.0035963382737576287, "grad_norm": 1.2972633838653564, "learning_rate": 9.999732999128161e-06, "loss": 10.3594, "step": 66 }, { "epoch": 0.0035963382737576287, "eval_accuracy": 0.06136165553596391, "eval_loss": 10.3515625, "eval_runtime": 280.2984, "eval_samples_per_second": 120.468, "eval_steps_per_second": 3.346, "step": 66 }, { "epoch": 0.003650828247602441, "grad_norm": 1.3436386585235596, "learning_rate": 9.999727550130776e-06, "loss": 10.3516, "step": 67 }, { "epoch": 0.003650828247602441, "eval_accuracy": 0.06103861489296974, "eval_loss": 10.34375, "eval_runtime": 280.5172, "eval_samples_per_second": 120.374, "eval_steps_per_second": 3.344, "step": 67 }, { "epoch": 0.0037053182214472537, "grad_norm": 1.3115813732147217, "learning_rate": 9.999722101133393e-06, "loss": 10.3516, "step": 68 }, { "epoch": 0.0037053182214472537, "eval_accuracy": 0.060934138355594886, "eval_loss": 10.34375, "eval_runtime": 279.6787, "eval_samples_per_second": 120.735, "eval_steps_per_second": 3.354, "step": 68 }, { "epoch": 0.0037598081952920663, "grad_norm": 1.3387259244918823, "learning_rate": 9.999716652136008e-06, "loss": 10.3438, "step": 69 }, { "epoch": 0.0037598081952920663, "eval_accuracy": 0.061064147812328176, "eval_loss": 10.3359375, "eval_runtime": 279.3007, "eval_samples_per_second": 120.898, "eval_steps_per_second": 3.358, "step": 69 }, { "epoch": 0.003814298169136879, "grad_norm": 1.507016897201538, "learning_rate": 9.999711203138623e-06, "loss": 10.3594, "step": 70 }, { "epoch": 0.003814298169136879, "eval_accuracy": 0.061038383301864445, "eval_loss": 10.3359375, "eval_runtime": 280.6833, "eval_samples_per_second": 120.303, "eval_steps_per_second": 3.342, "step": 70 }, { "epoch": 0.0038687881429816913, "grad_norm": 1.2702033519744873, "learning_rate": 9.99970575414124e-06, "loss": 10.3594, "step": 71 }, { "epoch": 0.0038687881429816913, "eval_accuracy": 0.06097628793675803, "eval_loss": 10.328125, "eval_runtime": 279.2628, "eval_samples_per_second": 120.915, "eval_steps_per_second": 3.359, "step": 71 }, { "epoch": 0.003923278116826504, "grad_norm": 1.3358463048934937, "learning_rate": 9.999700305143855e-06, "loss": 10.3203, "step": 72 }, { "epoch": 0.003923278116826504, "eval_accuracy": 0.06098071711664674, "eval_loss": 10.328125, "eval_runtime": 280.3079, "eval_samples_per_second": 120.464, "eval_steps_per_second": 3.346, "step": 72 }, { "epoch": 0.003977768090671316, "grad_norm": 1.259023904800415, "learning_rate": 9.99969485614647e-06, "loss": 10.3516, "step": 73 }, { "epoch": 0.003977768090671316, "eval_accuracy": 0.06099933125173458, "eval_loss": 10.3203125, "eval_runtime": 280.2434, "eval_samples_per_second": 120.492, "eval_steps_per_second": 3.347, "step": 73 }, { "epoch": 0.004032258064516129, "grad_norm": 1.458150029182434, "learning_rate": 9.999689407149085e-06, "loss": 10.3203, "step": 74 }, { "epoch": 0.004032258064516129, "eval_accuracy": 0.061083225129626606, "eval_loss": 10.3125, "eval_runtime": 280.0404, "eval_samples_per_second": 120.579, "eval_steps_per_second": 3.35, "step": 74 }, { "epoch": 0.0040867480383609414, "grad_norm": 1.2943834066390991, "learning_rate": 9.999683958151702e-06, "loss": 10.3281, "step": 75 }, { "epoch": 0.0040867480383609414, "eval_accuracy": 0.06123138553923716, "eval_loss": 10.3125, "eval_runtime": 280.4851, "eval_samples_per_second": 120.388, "eval_steps_per_second": 3.344, "step": 75 }, { "epoch": 0.004141238012205754, "grad_norm": 1.2712702751159668, "learning_rate": 9.999678509154317e-06, "loss": 10.3438, "step": 76 }, { "epoch": 0.004141238012205754, "eval_accuracy": 0.061402212928278174, "eval_loss": 10.3046875, "eval_runtime": 279.7875, "eval_samples_per_second": 120.688, "eval_steps_per_second": 3.353, "step": 76 }, { "epoch": 0.0041957279860505665, "grad_norm": 1.3315693140029907, "learning_rate": 9.999673060156932e-06, "loss": 10.2969, "step": 77 }, { "epoch": 0.0041957279860505665, "eval_accuracy": 0.0617529576572429, "eval_loss": 10.3046875, "eval_runtime": 280.9468, "eval_samples_per_second": 120.19, "eval_steps_per_second": 3.339, "step": 77 }, { "epoch": 0.004250217959895379, "grad_norm": 1.2693438529968262, "learning_rate": 9.999667611159547e-06, "loss": 10.3281, "step": 78 }, { "epoch": 0.004250217959895379, "eval_accuracy": 0.06221338972345156, "eval_loss": 10.296875, "eval_runtime": 280.2001, "eval_samples_per_second": 120.51, "eval_steps_per_second": 3.348, "step": 78 }, { "epoch": 0.004304707933740192, "grad_norm": 1.4125028848648071, "learning_rate": 9.999662162162162e-06, "loss": 10.2891, "step": 79 }, { "epoch": 0.004304707933740192, "eval_accuracy": 0.0628054234352424, "eval_loss": 10.296875, "eval_runtime": 280.5046, "eval_samples_per_second": 120.379, "eval_steps_per_second": 3.344, "step": 79 }, { "epoch": 0.004359197907585004, "grad_norm": 1.4127213954925537, "learning_rate": 9.999656713164779e-06, "loss": 10.3047, "step": 80 }, { "epoch": 0.004359197907585004, "eval_accuracy": 0.0632026600785945, "eval_loss": 10.2890625, "eval_runtime": 281.1308, "eval_samples_per_second": 120.111, "eval_steps_per_second": 3.337, "step": 80 }, { "epoch": 0.004413687881429817, "grad_norm": 1.2919211387634277, "learning_rate": 9.999651264167394e-06, "loss": 10.2969, "step": 81 }, { "epoch": 0.004413687881429817, "eval_accuracy": 0.06365513120055874, "eval_loss": 10.28125, "eval_runtime": 280.4747, "eval_samples_per_second": 120.392, "eval_steps_per_second": 3.344, "step": 81 }, { "epoch": 0.004468177855274629, "grad_norm": 1.3636354207992554, "learning_rate": 9.999645815170009e-06, "loss": 10.2891, "step": 82 }, { "epoch": 0.004468177855274629, "eval_accuracy": 0.06432144775937197, "eval_loss": 10.28125, "eval_runtime": 280.6686, "eval_samples_per_second": 120.309, "eval_steps_per_second": 3.342, "step": 82 }, { "epoch": 0.004522667829119442, "grad_norm": 1.2588390111923218, "learning_rate": 9.999640366172625e-06, "loss": 10.3125, "step": 83 }, { "epoch": 0.004522667829119442, "eval_accuracy": 0.06491886596436085, "eval_loss": 10.2734375, "eval_runtime": 280.7018, "eval_samples_per_second": 120.295, "eval_steps_per_second": 3.342, "step": 83 }, { "epoch": 0.004577157802964254, "grad_norm": 1.2841159105300903, "learning_rate": 9.99963491717524e-06, "loss": 10.2891, "step": 84 }, { "epoch": 0.004577157802964254, "eval_accuracy": 0.0653847114726557, "eval_loss": 10.2734375, "eval_runtime": 280.4683, "eval_samples_per_second": 120.395, "eval_steps_per_second": 3.344, "step": 84 }, { "epoch": 0.004631647776809067, "grad_norm": 1.2950408458709717, "learning_rate": 9.999629468177855e-06, "loss": 10.2812, "step": 85 }, { "epoch": 0.004631647776809067, "eval_accuracy": 0.06567657416309995, "eval_loss": 10.265625, "eval_runtime": 279.8035, "eval_samples_per_second": 120.681, "eval_steps_per_second": 3.352, "step": 85 }, { "epoch": 0.004686137750653879, "grad_norm": 1.284853458404541, "learning_rate": 9.999624019180472e-06, "loss": 10.3047, "step": 86 }, { "epoch": 0.004686137750653879, "eval_accuracy": 0.06585295973866796, "eval_loss": 10.265625, "eval_runtime": 280.2049, "eval_samples_per_second": 120.508, "eval_steps_per_second": 3.348, "step": 86 }, { "epoch": 0.004740627724498693, "grad_norm": 1.229882836341858, "learning_rate": 9.999618570183087e-06, "loss": 10.2969, "step": 87 }, { "epoch": 0.004740627724498693, "eval_accuracy": 0.06600204651269968, "eval_loss": 10.2578125, "eval_runtime": 280.4096, "eval_samples_per_second": 120.42, "eval_steps_per_second": 3.345, "step": 87 }, { "epoch": 0.004795117698343505, "grad_norm": 1.3210368156433105, "learning_rate": 9.999613121185702e-06, "loss": 10.2578, "step": 88 }, { "epoch": 0.004795117698343505, "eval_accuracy": 0.06612053431194471, "eval_loss": 10.25, "eval_runtime": 280.0298, "eval_samples_per_second": 120.584, "eval_steps_per_second": 3.35, "step": 88 }, { "epoch": 0.004849607672188318, "grad_norm": 1.2671563625335693, "learning_rate": 9.999607672188319e-06, "loss": 10.2812, "step": 89 }, { "epoch": 0.004849607672188318, "eval_accuracy": 0.06618060325487982, "eval_loss": 10.25, "eval_runtime": 280.1841, "eval_samples_per_second": 120.517, "eval_steps_per_second": 3.348, "step": 89 }, { "epoch": 0.00490409764603313, "grad_norm": 1.2486541271209717, "learning_rate": 9.999602223190934e-06, "loss": 10.2734, "step": 90 }, { "epoch": 0.00490409764603313, "eval_accuracy": 0.06629969898077623, "eval_loss": 10.2421875, "eval_runtime": 279.9304, "eval_samples_per_second": 120.626, "eval_steps_per_second": 3.351, "step": 90 }, { "epoch": 0.004958587619877943, "grad_norm": 1.219537615776062, "learning_rate": 9.999596774193549e-06, "loss": 10.2891, "step": 91 }, { "epoch": 0.004958587619877943, "eval_accuracy": 0.06638553243417508, "eval_loss": 10.2421875, "eval_runtime": 280.2115, "eval_samples_per_second": 120.505, "eval_steps_per_second": 3.347, "step": 91 }, { "epoch": 0.005013077593722755, "grad_norm": 1.3005180358886719, "learning_rate": 9.999591325196166e-06, "loss": 10.2578, "step": 92 }, { "epoch": 0.005013077593722755, "eval_accuracy": 0.06655224908109715, "eval_loss": 10.234375, "eval_runtime": 280.5566, "eval_samples_per_second": 120.357, "eval_steps_per_second": 3.343, "step": 92 }, { "epoch": 0.005067567567567568, "grad_norm": 1.2326613664627075, "learning_rate": 9.99958587619878e-06, "loss": 10.2734, "step": 93 }, { "epoch": 0.005067567567567568, "eval_accuracy": 0.06677831094875031, "eval_loss": 10.234375, "eval_runtime": 279.0494, "eval_samples_per_second": 121.007, "eval_steps_per_second": 3.361, "step": 93 }, { "epoch": 0.00512205754141238, "grad_norm": 1.343145728111267, "learning_rate": 9.999580427201396e-06, "loss": 10.2266, "step": 94 }, { "epoch": 0.00512205754141238, "eval_accuracy": 0.0670887877742824, "eval_loss": 10.2265625, "eval_runtime": 278.9829, "eval_samples_per_second": 121.036, "eval_steps_per_second": 3.362, "step": 94 }, { "epoch": 0.005176547515257193, "grad_norm": 1.2524234056472778, "learning_rate": 9.999574978204011e-06, "loss": 10.2578, "step": 95 }, { "epoch": 0.005176547515257193, "eval_accuracy": 0.06735230950321652, "eval_loss": 10.2265625, "eval_runtime": 278.058, "eval_samples_per_second": 121.439, "eval_steps_per_second": 3.373, "step": 95 }, { "epoch": 0.0052310374891020054, "grad_norm": 1.232895016670227, "learning_rate": 9.999569529206626e-06, "loss": 10.25, "step": 96 }, { "epoch": 0.0052310374891020054, "eval_accuracy": 0.0675672260489275, "eval_loss": 10.21875, "eval_runtime": 280.2022, "eval_samples_per_second": 120.509, "eval_steps_per_second": 3.348, "step": 96 }, { "epoch": 0.005285527462946818, "grad_norm": 1.2870153188705444, "learning_rate": 9.999564080209243e-06, "loss": 10.2266, "step": 97 }, { "epoch": 0.005285527462946818, "eval_accuracy": 0.06775721760193143, "eval_loss": 10.21875, "eval_runtime": 279.5356, "eval_samples_per_second": 120.797, "eval_steps_per_second": 3.356, "step": 97 }, { "epoch": 0.0053400174367916305, "grad_norm": 1.2629221677780151, "learning_rate": 9.999558631211858e-06, "loss": 10.2266, "step": 98 }, { "epoch": 0.0053400174367916305, "eval_accuracy": 0.06793869818181586, "eval_loss": 10.2109375, "eval_runtime": 279.0624, "eval_samples_per_second": 121.002, "eval_steps_per_second": 3.361, "step": 98 }, { "epoch": 0.005394507410636443, "grad_norm": 1.279175043106079, "learning_rate": 9.999553182214473e-06, "loss": 10.2344, "step": 99 }, { "epoch": 0.005394507410636443, "eval_accuracy": 0.06805782285660043, "eval_loss": 10.2109375, "eval_runtime": 278.9598, "eval_samples_per_second": 121.046, "eval_steps_per_second": 3.362, "step": 99 }, { "epoch": 0.0054489973844812556, "grad_norm": 1.21150803565979, "learning_rate": 9.99954773321709e-06, "loss": 10.2422, "step": 100 }, { "epoch": 0.0054489973844812556, "eval_accuracy": 0.0681561332807969, "eval_loss": 10.203125, "eval_runtime": 278.9899, "eval_samples_per_second": 121.033, "eval_steps_per_second": 3.362, "step": 100 }, { "epoch": 0.005503487358326068, "grad_norm": 1.2411643266677856, "learning_rate": 9.999542284219704e-06, "loss": 10.2422, "step": 101 }, { "epoch": 0.005503487358326068, "eval_accuracy": 0.06829934343053183, "eval_loss": 10.203125, "eval_runtime": 278.3501, "eval_samples_per_second": 121.311, "eval_steps_per_second": 3.37, "step": 101 }, { "epoch": 0.005557977332170881, "grad_norm": 1.2620774507522583, "learning_rate": 9.99953683522232e-06, "loss": 10.2266, "step": 102 }, { "epoch": 0.005557977332170881, "eval_accuracy": 0.0684582438776503, "eval_loss": 10.1953125, "eval_runtime": 280.5409, "eval_samples_per_second": 120.364, "eval_steps_per_second": 3.344, "step": 102 }, { "epoch": 0.005612467306015693, "grad_norm": 1.2730474472045898, "learning_rate": 9.999531386224936e-06, "loss": 10.2188, "step": 103 }, { "epoch": 0.005612467306015693, "eval_accuracy": 0.06855808859291931, "eval_loss": 10.1953125, "eval_runtime": 279.3284, "eval_samples_per_second": 120.886, "eval_steps_per_second": 3.358, "step": 103 }, { "epoch": 0.005666957279860506, "grad_norm": 1.250596046447754, "learning_rate": 9.999525937227551e-06, "loss": 10.2109, "step": 104 }, { "epoch": 0.005666957279860506, "eval_accuracy": 0.06870066186711471, "eval_loss": 10.1875, "eval_runtime": 279.5974, "eval_samples_per_second": 120.77, "eval_steps_per_second": 3.355, "step": 104 }, { "epoch": 0.005721447253705318, "grad_norm": 1.3434346914291382, "learning_rate": 9.999520488230166e-06, "loss": 10.1797, "step": 105 }, { "epoch": 0.005721447253705318, "eval_accuracy": 0.06888807696907226, "eval_loss": 10.1875, "eval_runtime": 279.5391, "eval_samples_per_second": 120.795, "eval_steps_per_second": 3.356, "step": 105 }, { "epoch": 0.005775937227550131, "grad_norm": 1.28876793384552, "learning_rate": 9.999515039232783e-06, "loss": 10.1797, "step": 106 }, { "epoch": 0.005775937227550131, "eval_accuracy": 0.06907245243777284, "eval_loss": 10.1796875, "eval_runtime": 279.0416, "eval_samples_per_second": 121.011, "eval_steps_per_second": 3.362, "step": 106 }, { "epoch": 0.005830427201394943, "grad_norm": 1.3488516807556152, "learning_rate": 9.999509590235398e-06, "loss": 10.1719, "step": 107 }, { "epoch": 0.005830427201394943, "eval_accuracy": 0.06933892695329945, "eval_loss": 10.1796875, "eval_runtime": 279.7608, "eval_samples_per_second": 120.7, "eval_steps_per_second": 3.353, "step": 107 }, { "epoch": 0.005884917175239756, "grad_norm": 1.272630214691162, "learning_rate": 9.999504141238013e-06, "loss": 10.1875, "step": 108 }, { "epoch": 0.005884917175239756, "eval_accuracy": 0.06959364822023249, "eval_loss": 10.171875, "eval_runtime": 279.3313, "eval_samples_per_second": 120.885, "eval_steps_per_second": 3.358, "step": 108 }, { "epoch": 0.005939407149084568, "grad_norm": 1.275840401649475, "learning_rate": 9.999498692240628e-06, "loss": 10.1797, "step": 109 }, { "epoch": 0.005939407149084568, "eval_accuracy": 0.06980685678154193, "eval_loss": 10.171875, "eval_runtime": 279.6516, "eval_samples_per_second": 120.747, "eval_steps_per_second": 3.354, "step": 109 }, { "epoch": 0.005993897122929381, "grad_norm": 1.2549141645431519, "learning_rate": 9.999493243243245e-06, "loss": 10.1797, "step": 110 }, { "epoch": 0.005993897122929381, "eval_accuracy": 0.07002058642283829, "eval_loss": 10.1640625, "eval_runtime": 279.1415, "eval_samples_per_second": 120.967, "eval_steps_per_second": 3.36, "step": 110 }, { "epoch": 0.006048387096774193, "grad_norm": 1.3456748723983765, "learning_rate": 9.999487794245858e-06, "loss": 10.1406, "step": 111 }, { "epoch": 0.006048387096774193, "eval_accuracy": 0.07022299704886349, "eval_loss": 10.1640625, "eval_runtime": 279.1359, "eval_samples_per_second": 120.97, "eval_steps_per_second": 3.36, "step": 111 }, { "epoch": 0.006102877070619006, "grad_norm": 1.2664400339126587, "learning_rate": 9.999482345248475e-06, "loss": 10.1719, "step": 112 }, { "epoch": 0.006102877070619006, "eval_accuracy": 0.07042644983486251, "eval_loss": 10.1640625, "eval_runtime": 279.7876, "eval_samples_per_second": 120.688, "eval_steps_per_second": 3.353, "step": 112 }, { "epoch": 0.0061573670444638184, "grad_norm": 1.226805329322815, "learning_rate": 9.99947689625109e-06, "loss": 10.1953, "step": 113 }, { "epoch": 0.0061573670444638184, "eval_accuracy": 0.07062440233211086, "eval_loss": 10.15625, "eval_runtime": 280.1774, "eval_samples_per_second": 120.52, "eval_steps_per_second": 3.348, "step": 113 }, { "epoch": 0.006211857018308631, "grad_norm": 1.2513021230697632, "learning_rate": 9.999471447253705e-06, "loss": 10.1719, "step": 114 }, { "epoch": 0.006211857018308631, "eval_accuracy": 0.070792566423441, "eval_loss": 10.15625, "eval_runtime": 279.7452, "eval_samples_per_second": 120.706, "eval_steps_per_second": 3.353, "step": 114 }, { "epoch": 0.0062663469921534435, "grad_norm": 1.2754563093185425, "learning_rate": 9.999465998256322e-06, "loss": 10.1641, "step": 115 }, { "epoch": 0.0062663469921534435, "eval_accuracy": 0.0709808499920434, "eval_loss": 10.1484375, "eval_runtime": 279.6651, "eval_samples_per_second": 120.741, "eval_steps_per_second": 3.354, "step": 115 }, { "epoch": 0.006320836965998256, "grad_norm": 1.2457561492919922, "learning_rate": 9.999460549258937e-06, "loss": 10.1719, "step": 116 }, { "epoch": 0.006320836965998256, "eval_accuracy": 0.07117214424501458, "eval_loss": 10.1484375, "eval_runtime": 279.2608, "eval_samples_per_second": 120.916, "eval_steps_per_second": 3.359, "step": 116 }, { "epoch": 0.0063753269398430686, "grad_norm": 1.2697603702545166, "learning_rate": 9.999455100261552e-06, "loss": 10.1484, "step": 117 }, { "epoch": 0.0063753269398430686, "eval_accuracy": 0.07134343481626618, "eval_loss": 10.140625, "eval_runtime": 279.2939, "eval_samples_per_second": 120.901, "eval_steps_per_second": 3.358, "step": 117 }, { "epoch": 0.006429816913687881, "grad_norm": 1.2824610471725464, "learning_rate": 9.999449651264169e-06, "loss": 10.1562, "step": 118 }, { "epoch": 0.006429816913687881, "eval_accuracy": 0.07150007725010805, "eval_loss": 10.140625, "eval_runtime": 280.6459, "eval_samples_per_second": 120.319, "eval_steps_per_second": 3.342, "step": 118 }, { "epoch": 0.006484306887532694, "grad_norm": 1.2311967611312866, "learning_rate": 9.999444202266784e-06, "loss": 10.1562, "step": 119 }, { "epoch": 0.006484306887532694, "eval_accuracy": 0.07164412691759968, "eval_loss": 10.1328125, "eval_runtime": 278.8366, "eval_samples_per_second": 121.1, "eval_steps_per_second": 3.364, "step": 119 }, { "epoch": 0.006538796861377506, "grad_norm": 1.260802984237671, "learning_rate": 9.999438753269399e-06, "loss": 10.1484, "step": 120 }, { "epoch": 0.006538796861377506, "eval_accuracy": 0.07180059565811259, "eval_loss": 10.1328125, "eval_runtime": 279.3119, "eval_samples_per_second": 120.894, "eval_steps_per_second": 3.358, "step": 120 }, { "epoch": 0.006593286835222319, "grad_norm": 1.2933627367019653, "learning_rate": 9.999433304272015e-06, "loss": 10.1406, "step": 121 }, { "epoch": 0.006593286835222319, "eval_accuracy": 0.07193274733256984, "eval_loss": 10.125, "eval_runtime": 280.0499, "eval_samples_per_second": 120.575, "eval_steps_per_second": 3.349, "step": 121 }, { "epoch": 0.006647776809067132, "grad_norm": 1.2777124643325806, "learning_rate": 9.99942785527463e-06, "loss": 10.1328, "step": 122 }, { "epoch": 0.006647776809067132, "eval_accuracy": 0.07206449372259283, "eval_loss": 10.125, "eval_runtime": 278.3594, "eval_samples_per_second": 121.307, "eval_steps_per_second": 3.37, "step": 122 }, { "epoch": 0.006702266782911945, "grad_norm": 1.225155234336853, "learning_rate": 9.999422406277245e-06, "loss": 10.1641, "step": 123 }, { "epoch": 0.006702266782911945, "eval_accuracy": 0.07220330364132721, "eval_loss": 10.1171875, "eval_runtime": 279.7375, "eval_samples_per_second": 120.71, "eval_steps_per_second": 3.353, "step": 123 }, { "epoch": 0.006756756756756757, "grad_norm": 1.2431164979934692, "learning_rate": 9.999416957279862e-06, "loss": 10.1328, "step": 124 }, { "epoch": 0.006756756756756757, "eval_accuracy": 0.07233015766925091, "eval_loss": 10.1171875, "eval_runtime": 278.5065, "eval_samples_per_second": 121.243, "eval_steps_per_second": 3.368, "step": 124 }, { "epoch": 0.00681124673060157, "grad_norm": 1.2372488975524902, "learning_rate": 9.999411508282477e-06, "loss": 10.1484, "step": 125 }, { "epoch": 0.00681124673060157, "eval_accuracy": 0.07246410417477417, "eval_loss": 10.109375, "eval_runtime": 278.7434, "eval_samples_per_second": 121.14, "eval_steps_per_second": 3.365, "step": 125 }, { "epoch": 0.006865736704446382, "grad_norm": 1.2289838790893555, "learning_rate": 9.999406059285092e-06, "loss": 10.1406, "step": 126 }, { "epoch": 0.006865736704446382, "eval_accuracy": 0.07261460944432581, "eval_loss": 10.109375, "eval_runtime": 279.5991, "eval_samples_per_second": 120.769, "eval_steps_per_second": 3.355, "step": 126 }, { "epoch": 0.006920226678291195, "grad_norm": 1.231143832206726, "learning_rate": 9.999400610287709e-06, "loss": 10.1406, "step": 127 }, { "epoch": 0.006920226678291195, "eval_accuracy": 0.07277023866708203, "eval_loss": 10.1015625, "eval_runtime": 279.4963, "eval_samples_per_second": 120.814, "eval_steps_per_second": 3.356, "step": 127 }, { "epoch": 0.006974716652136007, "grad_norm": 1.3046151399612427, "learning_rate": 9.999395161290324e-06, "loss": 10.125, "step": 128 }, { "epoch": 0.006974716652136007, "eval_accuracy": 0.07291527259677114, "eval_loss": 10.1015625, "eval_runtime": 280.7304, "eval_samples_per_second": 120.283, "eval_steps_per_second": 3.341, "step": 128 }, { "epoch": 0.00702920662598082, "grad_norm": 1.2855069637298584, "learning_rate": 9.999389712292939e-06, "loss": 10.1172, "step": 129 }, { "epoch": 0.00702920662598082, "eval_accuracy": 0.07305231663332767, "eval_loss": 10.09375, "eval_runtime": 279.8824, "eval_samples_per_second": 120.647, "eval_steps_per_second": 3.351, "step": 129 }, { "epoch": 0.007083696599825632, "grad_norm": 1.282074213027954, "learning_rate": 9.999384263295554e-06, "loss": 10.1016, "step": 130 }, { "epoch": 0.007083696599825632, "eval_accuracy": 0.07317312034362562, "eval_loss": 10.09375, "eval_runtime": 279.9541, "eval_samples_per_second": 120.616, "eval_steps_per_second": 3.351, "step": 130 }, { "epoch": 0.007138186573670445, "grad_norm": 1.218042254447937, "learning_rate": 9.999378814298169e-06, "loss": 10.1172, "step": 131 }, { "epoch": 0.007138186573670445, "eval_accuracy": 0.07330454829587883, "eval_loss": 10.0859375, "eval_runtime": 280.2317, "eval_samples_per_second": 120.497, "eval_steps_per_second": 3.347, "step": 131 }, { "epoch": 0.007192676547515257, "grad_norm": 1.2612570524215698, "learning_rate": 9.999373365300786e-06, "loss": 10.1172, "step": 132 }, { "epoch": 0.007192676547515257, "eval_accuracy": 0.07344506619901475, "eval_loss": 10.0859375, "eval_runtime": 280.7991, "eval_samples_per_second": 120.253, "eval_steps_per_second": 3.34, "step": 132 }, { "epoch": 0.00724716652136007, "grad_norm": 1.2458122968673706, "learning_rate": 9.999367916303401e-06, "loss": 10.1172, "step": 133 }, { "epoch": 0.00724716652136007, "eval_accuracy": 0.07357432298465584, "eval_loss": 10.0859375, "eval_runtime": 281.5847, "eval_samples_per_second": 119.918, "eval_steps_per_second": 3.331, "step": 133 }, { "epoch": 0.007301656495204882, "grad_norm": 1.2749879360198975, "learning_rate": 9.999362467306016e-06, "loss": 10.0938, "step": 134 }, { "epoch": 0.007301656495204882, "eval_accuracy": 0.07369055277062427, "eval_loss": 10.078125, "eval_runtime": 279.1661, "eval_samples_per_second": 120.957, "eval_steps_per_second": 3.36, "step": 134 }, { "epoch": 0.007356146469049695, "grad_norm": 1.2771598100662231, "learning_rate": 9.999357018308633e-06, "loss": 10.1094, "step": 135 }, { "epoch": 0.007356146469049695, "eval_accuracy": 0.07383138911152996, "eval_loss": 10.078125, "eval_runtime": 280.6696, "eval_samples_per_second": 120.309, "eval_steps_per_second": 3.342, "step": 135 }, { "epoch": 0.0074106364428945075, "grad_norm": 1.2409011125564575, "learning_rate": 9.999351569311248e-06, "loss": 10.1094, "step": 136 }, { "epoch": 0.0074106364428945075, "eval_accuracy": 0.07399631092738602, "eval_loss": 10.0703125, "eval_runtime": 279.9187, "eval_samples_per_second": 120.631, "eval_steps_per_second": 3.351, "step": 136 }, { "epoch": 0.00746512641673932, "grad_norm": 1.3785778284072876, "learning_rate": 9.999346120313863e-06, "loss": 10.0703, "step": 137 }, { "epoch": 0.00746512641673932, "eval_accuracy": 0.07418262597159345, "eval_loss": 10.0703125, "eval_runtime": 278.8604, "eval_samples_per_second": 121.089, "eval_steps_per_second": 3.364, "step": 137 }, { "epoch": 0.0075196163905841325, "grad_norm": 1.2543950080871582, "learning_rate": 9.99934067131648e-06, "loss": 10.0781, "step": 138 }, { "epoch": 0.0075196163905841325, "eval_accuracy": 0.07433802360324437, "eval_loss": 10.0625, "eval_runtime": 278.9667, "eval_samples_per_second": 121.043, "eval_steps_per_second": 3.362, "step": 138 }, { "epoch": 0.007574106364428945, "grad_norm": 1.2550749778747559, "learning_rate": 9.999335222319094e-06, "loss": 10.0781, "step": 139 }, { "epoch": 0.007574106364428945, "eval_accuracy": 0.07445636665804858, "eval_loss": 10.0625, "eval_runtime": 280.3577, "eval_samples_per_second": 120.443, "eval_steps_per_second": 3.346, "step": 139 }, { "epoch": 0.007628596338273758, "grad_norm": 1.2808057069778442, "learning_rate": 9.99932977332171e-06, "loss": 10.0781, "step": 140 }, { "epoch": 0.007628596338273758, "eval_accuracy": 0.07457149638626687, "eval_loss": 10.0546875, "eval_runtime": 278.8572, "eval_samples_per_second": 121.091, "eval_steps_per_second": 3.364, "step": 140 }, { "epoch": 0.00768308631211857, "grad_norm": 1.2947779893875122, "learning_rate": 9.999324324324326e-06, "loss": 10.0625, "step": 141 }, { "epoch": 0.00768308631211857, "eval_accuracy": 0.0747105089472184, "eval_loss": 10.0546875, "eval_runtime": 278.2341, "eval_samples_per_second": 121.362, "eval_steps_per_second": 3.371, "step": 141 }, { "epoch": 0.007737576285963383, "grad_norm": 1.2656594514846802, "learning_rate": 9.999318875326941e-06, "loss": 10.0781, "step": 142 }, { "epoch": 0.007737576285963383, "eval_accuracy": 0.07485380594361782, "eval_loss": 10.046875, "eval_runtime": 279.1203, "eval_samples_per_second": 120.976, "eval_steps_per_second": 3.361, "step": 142 }, { "epoch": 0.007792066259808195, "grad_norm": 1.3355940580368042, "learning_rate": 9.999313426329556e-06, "loss": 10.0391, "step": 143 }, { "epoch": 0.007792066259808195, "eval_accuracy": 0.07498022573821908, "eval_loss": 10.046875, "eval_runtime": 279.5611, "eval_samples_per_second": 120.786, "eval_steps_per_second": 3.355, "step": 143 }, { "epoch": 0.007846556233653008, "grad_norm": 1.2356702089309692, "learning_rate": 9.999307977332171e-06, "loss": 10.0703, "step": 144 }, { "epoch": 0.007846556233653008, "eval_accuracy": 0.07511967253249303, "eval_loss": 10.046875, "eval_runtime": 279.9704, "eval_samples_per_second": 120.609, "eval_steps_per_second": 3.35, "step": 144 }, { "epoch": 0.007901046207497821, "grad_norm": 1.2823630571365356, "learning_rate": 9.999302528334788e-06, "loss": 10.0391, "step": 145 }, { "epoch": 0.007901046207497821, "eval_accuracy": 0.07525402432245056, "eval_loss": 10.0390625, "eval_runtime": 279.3818, "eval_samples_per_second": 120.863, "eval_steps_per_second": 3.357, "step": 145 }, { "epoch": 0.007955536181342633, "grad_norm": 1.2719727754592896, "learning_rate": 9.999297079337401e-06, "loss": 10.0469, "step": 146 }, { "epoch": 0.007955536181342633, "eval_accuracy": 0.07538672602578286, "eval_loss": 10.0390625, "eval_runtime": 280.0065, "eval_samples_per_second": 120.594, "eval_steps_per_second": 3.35, "step": 146 }, { "epoch": 0.008010026155187446, "grad_norm": 1.2450186014175415, "learning_rate": 9.999291630340018e-06, "loss": 10.0547, "step": 147 }, { "epoch": 0.008010026155187446, "eval_accuracy": 0.07551505644700279, "eval_loss": 10.03125, "eval_runtime": 279.889, "eval_samples_per_second": 120.644, "eval_steps_per_second": 3.351, "step": 147 }, { "epoch": 0.008064516129032258, "grad_norm": 1.2184418439865112, "learning_rate": 9.999286181342633e-06, "loss": 10.0703, "step": 148 }, { "epoch": 0.008064516129032258, "eval_accuracy": 0.07562749392862206, "eval_loss": 10.03125, "eval_runtime": 278.8225, "eval_samples_per_second": 121.106, "eval_steps_per_second": 3.364, "step": 148 }, { "epoch": 0.008119006102877071, "grad_norm": 1.241905689239502, "learning_rate": 9.999280732345248e-06, "loss": 10.0469, "step": 149 }, { "epoch": 0.008119006102877071, "eval_accuracy": 0.0757480660478147, "eval_loss": 10.0234375, "eval_runtime": 279.6408, "eval_samples_per_second": 120.751, "eval_steps_per_second": 3.354, "step": 149 }, { "epoch": 0.008173496076721883, "grad_norm": 1.2825335264205933, "learning_rate": 9.999275283347865e-06, "loss": 10.0391, "step": 150 }, { "epoch": 0.008173496076721883, "eval_accuracy": 0.07586571432930304, "eval_loss": 10.0234375, "eval_runtime": 279.96, "eval_samples_per_second": 120.614, "eval_steps_per_second": 3.35, "step": 150 }, { "epoch": 0.008227986050566696, "grad_norm": 1.2704492807388306, "learning_rate": 9.99926983435048e-06, "loss": 10.0391, "step": 151 }, { "epoch": 0.008227986050566696, "eval_accuracy": 0.07599531850160207, "eval_loss": 10.015625, "eval_runtime": 278.0769, "eval_samples_per_second": 121.43, "eval_steps_per_second": 3.373, "step": 151 }, { "epoch": 0.008282476024411508, "grad_norm": 1.2123051881790161, "learning_rate": 9.999264385353095e-06, "loss": 10.0391, "step": 152 }, { "epoch": 0.008282476024411508, "eval_accuracy": 0.07611594851857105, "eval_loss": 10.015625, "eval_runtime": 279.8596, "eval_samples_per_second": 120.657, "eval_steps_per_second": 3.352, "step": 152 }, { "epoch": 0.008336965998256321, "grad_norm": 1.258999228477478, "learning_rate": 9.999258936355712e-06, "loss": 10.0391, "step": 153 }, { "epoch": 0.008336965998256321, "eval_accuracy": 0.07623133878678279, "eval_loss": 10.015625, "eval_runtime": 279.3191, "eval_samples_per_second": 120.89, "eval_steps_per_second": 3.358, "step": 153 }, { "epoch": 0.008391455972101133, "grad_norm": 1.194441795349121, "learning_rate": 9.999253487358327e-06, "loss": 10.0469, "step": 154 }, { "epoch": 0.008391455972101133, "eval_accuracy": 0.07634800280607362, "eval_loss": 10.0078125, "eval_runtime": 279.5107, "eval_samples_per_second": 120.808, "eval_steps_per_second": 3.356, "step": 154 }, { "epoch": 0.008445945945945946, "grad_norm": 1.289880633354187, "learning_rate": 9.999248038360942e-06, "loss": 10.0312, "step": 155 }, { "epoch": 0.008445945945945946, "eval_accuracy": 0.07645618480113316, "eval_loss": 10.0078125, "eval_runtime": 279.6787, "eval_samples_per_second": 120.735, "eval_steps_per_second": 3.354, "step": 155 }, { "epoch": 0.008500435919790758, "grad_norm": 1.318518042564392, "learning_rate": 9.999242589363559e-06, "loss": 9.9844, "step": 156 }, { "epoch": 0.008500435919790758, "eval_accuracy": 0.07656841964053529, "eval_loss": 10.0, "eval_runtime": 279.7302, "eval_samples_per_second": 120.713, "eval_steps_per_second": 3.353, "step": 156 }, { "epoch": 0.008554925893635571, "grad_norm": 1.3092557191848755, "learning_rate": 9.999237140366174e-06, "loss": 10.0, "step": 157 }, { "epoch": 0.008554925893635571, "eval_accuracy": 0.0766785701599898, "eval_loss": 10.0, "eval_runtime": 279.5756, "eval_samples_per_second": 120.779, "eval_steps_per_second": 3.355, "step": 157 }, { "epoch": 0.008609415867480383, "grad_norm": 1.2869155406951904, "learning_rate": 9.999231691368789e-06, "loss": 10.0078, "step": 158 }, { "epoch": 0.008609415867480383, "eval_accuracy": 0.07677942808634446, "eval_loss": 9.9921875, "eval_runtime": 281.549, "eval_samples_per_second": 119.933, "eval_steps_per_second": 3.332, "step": 158 }, { "epoch": 0.008663905841325197, "grad_norm": 1.2500320672988892, "learning_rate": 9.999226242371405e-06, "loss": 10.0078, "step": 159 }, { "epoch": 0.008663905841325197, "eval_accuracy": 0.07687990967715302, "eval_loss": 9.9921875, "eval_runtime": 280.7605, "eval_samples_per_second": 120.27, "eval_steps_per_second": 3.341, "step": 159 }, { "epoch": 0.008718395815170008, "grad_norm": 1.2038897275924683, "learning_rate": 9.99922079337402e-06, "loss": 10.0234, "step": 160 }, { "epoch": 0.008718395815170008, "eval_accuracy": 0.07697283560815144, "eval_loss": 9.9921875, "eval_runtime": 278.5618, "eval_samples_per_second": 121.219, "eval_steps_per_second": 3.367, "step": 160 }, { "epoch": 0.008772885789014822, "grad_norm": 1.2646440267562866, "learning_rate": 9.999215344376635e-06, "loss": 9.9922, "step": 161 }, { "epoch": 0.008772885789014822, "eval_accuracy": 0.07707207239676907, "eval_loss": 9.984375, "eval_runtime": 279.0044, "eval_samples_per_second": 121.027, "eval_steps_per_second": 3.362, "step": 161 }, { "epoch": 0.008827375762859633, "grad_norm": 1.2453374862670898, "learning_rate": 9.999209895379252e-06, "loss": 9.9922, "step": 162 }, { "epoch": 0.008827375762859633, "eval_accuracy": 0.07716395616779366, "eval_loss": 9.984375, "eval_runtime": 280.4701, "eval_samples_per_second": 120.394, "eval_steps_per_second": 3.344, "step": 162 }, { "epoch": 0.008881865736704447, "grad_norm": 1.3001788854599, "learning_rate": 9.999204446381865e-06, "loss": 9.9766, "step": 163 }, { "epoch": 0.008881865736704447, "eval_accuracy": 0.07725401615886408, "eval_loss": 9.9765625, "eval_runtime": 280.3097, "eval_samples_per_second": 120.463, "eval_steps_per_second": 3.346, "step": 163 }, { "epoch": 0.008936355710549258, "grad_norm": 1.2622265815734863, "learning_rate": 9.999198997384482e-06, "loss": 9.9922, "step": 164 }, { "epoch": 0.008936355710549258, "eval_accuracy": 0.07734222342109218, "eval_loss": 9.9765625, "eval_runtime": 279.8973, "eval_samples_per_second": 120.641, "eval_steps_per_second": 3.351, "step": 164 }, { "epoch": 0.008990845684394072, "grad_norm": 1.3013081550598145, "learning_rate": 9.999193548387097e-06, "loss": 9.9766, "step": 165 }, { "epoch": 0.008990845684394072, "eval_accuracy": 0.07743468616988, "eval_loss": 9.96875, "eval_runtime": 280.2864, "eval_samples_per_second": 120.473, "eval_steps_per_second": 3.347, "step": 165 }, { "epoch": 0.009045335658238883, "grad_norm": 1.258622407913208, "learning_rate": 9.999188099389712e-06, "loss": 9.9844, "step": 166 }, { "epoch": 0.009045335658238883, "eval_accuracy": 0.07752159073214084, "eval_loss": 9.96875, "eval_runtime": 280.7557, "eval_samples_per_second": 120.272, "eval_steps_per_second": 3.341, "step": 166 }, { "epoch": 0.009099825632083697, "grad_norm": 1.2523554563522339, "learning_rate": 9.999182650392329e-06, "loss": 9.9766, "step": 167 }, { "epoch": 0.009099825632083697, "eval_accuracy": 0.07759642360803831, "eval_loss": 9.96875, "eval_runtime": 281.1831, "eval_samples_per_second": 120.089, "eval_steps_per_second": 3.336, "step": 167 }, { "epoch": 0.009154315605928508, "grad_norm": 1.2290598154067993, "learning_rate": 9.999177201394944e-06, "loss": 9.9844, "step": 168 }, { "epoch": 0.009154315605928508, "eval_accuracy": 0.07767707521045625, "eval_loss": 9.9609375, "eval_runtime": 281.1351, "eval_samples_per_second": 120.11, "eval_steps_per_second": 3.336, "step": 168 }, { "epoch": 0.009208805579773322, "grad_norm": 1.2900922298431396, "learning_rate": 9.999171752397559e-06, "loss": 9.9609, "step": 169 }, { "epoch": 0.009208805579773322, "eval_accuracy": 0.07775179229080108, "eval_loss": 9.9609375, "eval_runtime": 280.4797, "eval_samples_per_second": 120.39, "eval_steps_per_second": 3.344, "step": 169 }, { "epoch": 0.009263295553618133, "grad_norm": 1.246096134185791, "learning_rate": 9.999166303400176e-06, "loss": 9.9766, "step": 170 }, { "epoch": 0.009263295553618133, "eval_accuracy": 0.07782975164662, "eval_loss": 9.953125, "eval_runtime": 278.9976, "eval_samples_per_second": 121.03, "eval_steps_per_second": 3.362, "step": 170 }, { "epoch": 0.009317785527462947, "grad_norm": 1.2970362901687622, "learning_rate": 9.99916085440279e-06, "loss": 9.9531, "step": 171 }, { "epoch": 0.009317785527462947, "eval_accuracy": 0.07790701622912304, "eval_loss": 9.953125, "eval_runtime": 281.1285, "eval_samples_per_second": 120.112, "eval_steps_per_second": 3.337, "step": 171 }, { "epoch": 0.009372275501307759, "grad_norm": 1.204124093055725, "learning_rate": 9.999155405405406e-06, "loss": 9.9922, "step": 172 }, { "epoch": 0.009372275501307759, "eval_accuracy": 0.07798121222948096, "eval_loss": 9.953125, "eval_runtime": 280.5802, "eval_samples_per_second": 120.347, "eval_steps_per_second": 3.343, "step": 172 }, { "epoch": 0.009426765475152572, "grad_norm": 1.29256010055542, "learning_rate": 9.999149956408023e-06, "loss": 9.9531, "step": 173 }, { "epoch": 0.009426765475152572, "eval_accuracy": 0.07805448186541772, "eval_loss": 9.9453125, "eval_runtime": 279.9622, "eval_samples_per_second": 120.613, "eval_steps_per_second": 3.35, "step": 173 }, { "epoch": 0.009481255448997385, "grad_norm": 1.3079378604888916, "learning_rate": 9.999144507410638e-06, "loss": 9.9375, "step": 174 }, { "epoch": 0.009481255448997385, "eval_accuracy": 0.07812670934138066, "eval_loss": 9.9453125, "eval_runtime": 281.3535, "eval_samples_per_second": 120.016, "eval_steps_per_second": 3.334, "step": 174 }, { "epoch": 0.009535745422842197, "grad_norm": 1.2296249866485596, "learning_rate": 9.999139058413253e-06, "loss": 9.9688, "step": 175 }, { "epoch": 0.009535745422842197, "eval_accuracy": 0.07820545031717994, "eval_loss": 9.9375, "eval_runtime": 281.384, "eval_samples_per_second": 120.003, "eval_steps_per_second": 3.334, "step": 175 }, { "epoch": 0.00959023539668701, "grad_norm": 1.2954541444778442, "learning_rate": 9.99913360941587e-06, "loss": 9.9453, "step": 176 }, { "epoch": 0.00959023539668701, "eval_accuracy": 0.07826963000223398, "eval_loss": 9.9375, "eval_runtime": 281.0155, "eval_samples_per_second": 120.161, "eval_steps_per_second": 3.338, "step": 176 }, { "epoch": 0.009644725370531822, "grad_norm": 1.2820461988449097, "learning_rate": 9.999128160418484e-06, "loss": 9.9375, "step": 177 }, { "epoch": 0.009644725370531822, "eval_accuracy": 0.0783392520782624, "eval_loss": 9.9375, "eval_runtime": 280.6598, "eval_samples_per_second": 120.313, "eval_steps_per_second": 3.342, "step": 177 }, { "epoch": 0.009699215344376635, "grad_norm": 1.278702974319458, "learning_rate": 9.9991227114211e-06, "loss": 9.9375, "step": 178 }, { "epoch": 0.009699215344376635, "eval_accuracy": 0.07841200063421225, "eval_loss": 9.9296875, "eval_runtime": 279.7691, "eval_samples_per_second": 120.696, "eval_steps_per_second": 3.353, "step": 178 }, { "epoch": 0.009753705318221447, "grad_norm": 1.253942847251892, "learning_rate": 9.999117262423716e-06, "loss": 9.9453, "step": 179 }, { "epoch": 0.009753705318221447, "eval_accuracy": 0.07850298698970383, "eval_loss": 9.9296875, "eval_runtime": 279.9236, "eval_samples_per_second": 120.629, "eval_steps_per_second": 3.351, "step": 179 }, { "epoch": 0.00980819529206626, "grad_norm": 1.2406857013702393, "learning_rate": 9.99911181342633e-06, "loss": 9.9453, "step": 180 }, { "epoch": 0.00980819529206626, "eval_accuracy": 0.078591744280807, "eval_loss": 9.921875, "eval_runtime": 280.2024, "eval_samples_per_second": 120.509, "eval_steps_per_second": 3.348, "step": 180 }, { "epoch": 0.009862685265911072, "grad_norm": 1.2425099611282349, "learning_rate": 9.999106364428946e-06, "loss": 9.9297, "step": 181 }, { "epoch": 0.009862685265911072, "eval_accuracy": 0.07866854568109945, "eval_loss": 9.921875, "eval_runtime": 278.8945, "eval_samples_per_second": 121.074, "eval_steps_per_second": 3.363, "step": 181 }, { "epoch": 0.009917175239755886, "grad_norm": 1.2617361545562744, "learning_rate": 9.999100915431561e-06, "loss": 9.9375, "step": 182 }, { "epoch": 0.009917175239755886, "eval_accuracy": 0.07874769194133299, "eval_loss": 9.9140625, "eval_runtime": 281.1799, "eval_samples_per_second": 120.09, "eval_steps_per_second": 3.336, "step": 182 }, { "epoch": 0.009971665213600697, "grad_norm": 1.2626757621765137, "learning_rate": 9.999095466434176e-06, "loss": 9.9375, "step": 183 }, { "epoch": 0.009971665213600697, "eval_accuracy": 0.07882507231938868, "eval_loss": 9.9140625, "eval_runtime": 280.0578, "eval_samples_per_second": 120.572, "eval_steps_per_second": 3.349, "step": 183 }, { "epoch": 0.01002615518744551, "grad_norm": 1.312477469444275, "learning_rate": 9.999090017436791e-06, "loss": 9.8984, "step": 184 }, { "epoch": 0.01002615518744551, "eval_accuracy": 0.07890256849299702, "eval_loss": 9.9140625, "eval_runtime": 279.5807, "eval_samples_per_second": 120.777, "eval_steps_per_second": 3.355, "step": 184 }, { "epoch": 0.010080645161290322, "grad_norm": 1.2198225259780884, "learning_rate": 9.999084568439408e-06, "loss": 9.9375, "step": 185 }, { "epoch": 0.010080645161290322, "eval_accuracy": 0.07897815403998669, "eval_loss": 9.90625, "eval_runtime": 281.1433, "eval_samples_per_second": 120.106, "eval_steps_per_second": 3.336, "step": 185 }, { "epoch": 0.010135135135135136, "grad_norm": 1.2455923557281494, "learning_rate": 9.999079119442023e-06, "loss": 9.9297, "step": 186 }, { "epoch": 0.010135135135135136, "eval_accuracy": 0.07905443436029225, "eval_loss": 9.90625, "eval_runtime": 280.4858, "eval_samples_per_second": 120.388, "eval_steps_per_second": 3.344, "step": 186 }, { "epoch": 0.010189625108979947, "grad_norm": 1.2409745454788208, "learning_rate": 9.999073670444638e-06, "loss": 9.9297, "step": 187 }, { "epoch": 0.010189625108979947, "eval_accuracy": 0.07912625655182093, "eval_loss": 9.8984375, "eval_runtime": 280.6787, "eval_samples_per_second": 120.305, "eval_steps_per_second": 3.342, "step": 187 }, { "epoch": 0.01024411508282476, "grad_norm": 1.2540324926376343, "learning_rate": 9.999068221447255e-06, "loss": 9.9141, "step": 188 }, { "epoch": 0.01024411508282476, "eval_accuracy": 0.07919081257242107, "eval_loss": 9.8984375, "eval_runtime": 281.0078, "eval_samples_per_second": 120.164, "eval_steps_per_second": 3.338, "step": 188 }, { "epoch": 0.010298605056669572, "grad_norm": 1.2172249555587769, "learning_rate": 9.99906277244987e-06, "loss": 9.9219, "step": 189 }, { "epoch": 0.010298605056669572, "eval_accuracy": 0.07925064992425089, "eval_loss": 9.8984375, "eval_runtime": 280.6094, "eval_samples_per_second": 120.335, "eval_steps_per_second": 3.343, "step": 189 }, { "epoch": 0.010353095030514386, "grad_norm": 1.2848864793777466, "learning_rate": 9.999057323452485e-06, "loss": 9.8984, "step": 190 }, { "epoch": 0.010353095030514386, "eval_accuracy": 0.07933312530662301, "eval_loss": 9.890625, "eval_runtime": 280.1069, "eval_samples_per_second": 120.55, "eval_steps_per_second": 3.349, "step": 190 }, { "epoch": 0.010407585004359197, "grad_norm": 1.3001790046691895, "learning_rate": 9.999051874455102e-06, "loss": 9.8828, "step": 191 }, { "epoch": 0.010407585004359197, "eval_accuracy": 0.07942619598206223, "eval_loss": 9.890625, "eval_runtime": 279.216, "eval_samples_per_second": 120.935, "eval_steps_per_second": 3.359, "step": 191 }, { "epoch": 0.010462074978204011, "grad_norm": 1.2808548212051392, "learning_rate": 9.999046425457717e-06, "loss": 9.8984, "step": 192 }, { "epoch": 0.010462074978204011, "eval_accuracy": 0.07953382794824668, "eval_loss": 9.8828125, "eval_runtime": 279.7965, "eval_samples_per_second": 120.684, "eval_steps_per_second": 3.352, "step": 192 }, { "epoch": 0.010516564952048823, "grad_norm": 1.2758278846740723, "learning_rate": 9.999040976460332e-06, "loss": 9.8906, "step": 193 }, { "epoch": 0.010516564952048823, "eval_accuracy": 0.07961514537509233, "eval_loss": 9.8828125, "eval_runtime": 280.4068, "eval_samples_per_second": 120.421, "eval_steps_per_second": 3.345, "step": 193 }, { "epoch": 0.010571054925893636, "grad_norm": 1.2733352184295654, "learning_rate": 9.999035527462949e-06, "loss": 9.9062, "step": 194 }, { "epoch": 0.010571054925893636, "eval_accuracy": 0.07969660754637879, "eval_loss": 9.8828125, "eval_runtime": 280.0801, "eval_samples_per_second": 120.562, "eval_steps_per_second": 3.349, "step": 194 }, { "epoch": 0.010625544899738448, "grad_norm": 1.3207552433013916, "learning_rate": 9.999030078465564e-06, "loss": 9.875, "step": 195 }, { "epoch": 0.010625544899738448, "eval_accuracy": 0.07977876449098113, "eval_loss": 9.875, "eval_runtime": 279.7352, "eval_samples_per_second": 120.711, "eval_steps_per_second": 3.353, "step": 195 }, { "epoch": 0.010680034873583261, "grad_norm": 1.3311365842819214, "learning_rate": 9.999024629468179e-06, "loss": 9.8594, "step": 196 }, { "epoch": 0.010680034873583261, "eval_accuracy": 0.07984216255605482, "eval_loss": 9.875, "eval_runtime": 280.0513, "eval_samples_per_second": 120.574, "eval_steps_per_second": 3.349, "step": 196 }, { "epoch": 0.010734524847428073, "grad_norm": 1.2795319557189941, "learning_rate": 9.999019180470794e-06, "loss": 9.8828, "step": 197 }, { "epoch": 0.010734524847428073, "eval_accuracy": 0.07991207412096483, "eval_loss": 9.875, "eval_runtime": 279.4752, "eval_samples_per_second": 120.823, "eval_steps_per_second": 3.356, "step": 197 }, { "epoch": 0.010789014821272886, "grad_norm": 1.2289413213729858, "learning_rate": 9.999013731473409e-06, "loss": 9.8984, "step": 198 }, { "epoch": 0.010789014821272886, "eval_accuracy": 0.07998430159692778, "eval_loss": 9.8671875, "eval_runtime": 279.5837, "eval_samples_per_second": 120.776, "eval_steps_per_second": 3.355, "step": 198 }, { "epoch": 0.010843504795117698, "grad_norm": 1.233830451965332, "learning_rate": 9.999008282476025e-06, "loss": 9.8906, "step": 199 }, { "epoch": 0.010843504795117698, "eval_accuracy": 0.0800536920818509, "eval_loss": 9.8671875, "eval_runtime": 279.9047, "eval_samples_per_second": 120.637, "eval_steps_per_second": 3.351, "step": 199 }, { "epoch": 0.010897994768962511, "grad_norm": 1.2242302894592285, "learning_rate": 9.99900283347864e-06, "loss": 9.9062, "step": 200 }, { "epoch": 0.010897994768962511, "eval_accuracy": 0.08014829704836268, "eval_loss": 9.859375, "eval_runtime": 278.8987, "eval_samples_per_second": 121.073, "eval_steps_per_second": 3.363, "step": 200 }, { "epoch": 0.010952484742807323, "grad_norm": 1.2602059841156006, "learning_rate": 9.998997384481255e-06, "loss": 9.8672, "step": 201 }, { "epoch": 0.010952484742807323, "eval_accuracy": 0.08021710855552257, "eval_loss": 9.859375, "eval_runtime": 279.1708, "eval_samples_per_second": 120.955, "eval_steps_per_second": 3.36, "step": 201 }, { "epoch": 0.011006974716652136, "grad_norm": 1.2747395038604736, "learning_rate": 9.998991935483872e-06, "loss": 9.8672, "step": 202 }, { "epoch": 0.011006974716652136, "eval_accuracy": 0.08028279358276101, "eval_loss": 9.859375, "eval_runtime": 279.6011, "eval_samples_per_second": 120.768, "eval_steps_per_second": 3.355, "step": 202 }, { "epoch": 0.011061464690496948, "grad_norm": 1.2887182235717773, "learning_rate": 9.998986486486487e-06, "loss": 9.8906, "step": 203 }, { "epoch": 0.011061464690496948, "eval_accuracy": 0.08035612111647408, "eval_loss": 9.8515625, "eval_runtime": 278.7511, "eval_samples_per_second": 121.137, "eval_steps_per_second": 3.365, "step": 203 }, { "epoch": 0.011115954664341761, "grad_norm": 1.2620630264282227, "learning_rate": 9.998981037489102e-06, "loss": 9.8828, "step": 204 }, { "epoch": 0.011115954664341761, "eval_accuracy": 0.08042151665483091, "eval_loss": 9.8515625, "eval_runtime": 280.8141, "eval_samples_per_second": 120.247, "eval_steps_per_second": 3.34, "step": 204 }, { "epoch": 0.011170444638186573, "grad_norm": 1.204312801361084, "learning_rate": 9.998975588491719e-06, "loss": 9.8906, "step": 205 }, { "epoch": 0.011170444638186573, "eval_accuracy": 0.08048413309992423, "eval_loss": 9.84375, "eval_runtime": 278.2837, "eval_samples_per_second": 121.34, "eval_steps_per_second": 3.371, "step": 205 }, { "epoch": 0.011224934612031386, "grad_norm": 1.2498817443847656, "learning_rate": 9.998970139494334e-06, "loss": 9.8828, "step": 206 }, { "epoch": 0.011224934612031386, "eval_accuracy": 0.08053965706741799, "eval_loss": 9.84375, "eval_runtime": 280.6201, "eval_samples_per_second": 120.33, "eval_steps_per_second": 3.343, "step": 206 }, { "epoch": 0.011279424585876198, "grad_norm": 1.2628120183944702, "learning_rate": 9.998964690496949e-06, "loss": 9.8594, "step": 207 }, { "epoch": 0.011279424585876198, "eval_accuracy": 0.08057526419985664, "eval_loss": 9.84375, "eval_runtime": 281.1793, "eval_samples_per_second": 120.091, "eval_steps_per_second": 3.336, "step": 207 }, { "epoch": 0.011333914559721011, "grad_norm": 1.286372184753418, "learning_rate": 9.998959241499566e-06, "loss": 9.875, "step": 208 }, { "epoch": 0.011333914559721011, "eval_accuracy": 0.0806213797786979, "eval_loss": 9.8359375, "eval_runtime": 279.1646, "eval_samples_per_second": 120.957, "eval_steps_per_second": 3.36, "step": 208 }, { "epoch": 0.011388404533565825, "grad_norm": 1.2716000080108643, "learning_rate": 9.99895379250218e-06, "loss": 9.8594, "step": 209 }, { "epoch": 0.011388404533565825, "eval_accuracy": 0.0807168821607427, "eval_loss": 9.8359375, "eval_runtime": 279.4887, "eval_samples_per_second": 120.817, "eval_steps_per_second": 3.356, "step": 209 }, { "epoch": 0.011442894507410636, "grad_norm": 1.2634248733520508, "learning_rate": 9.998948343504796e-06, "loss": 9.8516, "step": 210 }, { "epoch": 0.011442894507410636, "eval_accuracy": 0.08081287667388623, "eval_loss": 9.828125, "eval_runtime": 280.2441, "eval_samples_per_second": 120.491, "eval_steps_per_second": 3.347, "step": 210 }, { "epoch": 0.01149738448125545, "grad_norm": 1.2706140279769897, "learning_rate": 9.998942894507413e-06, "loss": 9.8359, "step": 211 }, { "epoch": 0.01149738448125545, "eval_accuracy": 0.08091327141803031, "eval_loss": 9.828125, "eval_runtime": 279.7835, "eval_samples_per_second": 120.69, "eval_steps_per_second": 3.353, "step": 211 }, { "epoch": 0.011551874455100261, "grad_norm": 1.3309648036956787, "learning_rate": 9.998937445510028e-06, "loss": 9.8281, "step": 212 }, { "epoch": 0.011551874455100261, "eval_accuracy": 0.08096497413228675, "eval_loss": 9.828125, "eval_runtime": 279.8407, "eval_samples_per_second": 120.665, "eval_steps_per_second": 3.352, "step": 212 }, { "epoch": 0.011606364428945075, "grad_norm": 1.228298544883728, "learning_rate": 9.998931996512643e-06, "loss": 9.8516, "step": 213 }, { "epoch": 0.011606364428945075, "eval_accuracy": 0.08100848431119348, "eval_loss": 9.8203125, "eval_runtime": 278.724, "eval_samples_per_second": 121.149, "eval_steps_per_second": 3.365, "step": 213 }, { "epoch": 0.011660854402789887, "grad_norm": 1.2184377908706665, "learning_rate": 9.998926547515258e-06, "loss": 9.8516, "step": 214 }, { "epoch": 0.011660854402789887, "eval_accuracy": 0.0810596080476867, "eval_loss": 9.8203125, "eval_runtime": 279.3251, "eval_samples_per_second": 120.888, "eval_steps_per_second": 3.358, "step": 214 }, { "epoch": 0.0117153443766347, "grad_norm": 1.311761498451233, "learning_rate": 9.998921098517873e-06, "loss": 9.8281, "step": 215 }, { "epoch": 0.0117153443766347, "eval_accuracy": 0.08114680209882913, "eval_loss": 9.8203125, "eval_runtime": 279.8598, "eval_samples_per_second": 120.657, "eval_steps_per_second": 3.352, "step": 215 }, { "epoch": 0.011769834350479512, "grad_norm": 1.2210040092468262, "learning_rate": 9.99891564952049e-06, "loss": 9.8438, "step": 216 }, { "epoch": 0.011769834350479512, "eval_accuracy": 0.08124644417188101, "eval_loss": 9.8125, "eval_runtime": 280.305, "eval_samples_per_second": 120.465, "eval_steps_per_second": 3.346, "step": 216 }, { "epoch": 0.011824324324324325, "grad_norm": 1.2239922285079956, "learning_rate": 9.998910200523105e-06, "loss": 9.8359, "step": 217 }, { "epoch": 0.011824324324324325, "eval_accuracy": 0.08132301398106818, "eval_loss": 9.8125, "eval_runtime": 280.4951, "eval_samples_per_second": 120.384, "eval_steps_per_second": 3.344, "step": 217 }, { "epoch": 0.011878814298169137, "grad_norm": 1.26374089717865, "learning_rate": 9.99890475152572e-06, "loss": 9.8281, "step": 218 }, { "epoch": 0.011878814298169137, "eval_accuracy": 0.08139370716595856, "eval_loss": 9.8046875, "eval_runtime": 280.3905, "eval_samples_per_second": 120.428, "eval_steps_per_second": 3.345, "step": 218 }, { "epoch": 0.01193330427201395, "grad_norm": 1.255954384803772, "learning_rate": 9.998899302528335e-06, "loss": 9.8281, "step": 219 }, { "epoch": 0.01193330427201395, "eval_accuracy": 0.08145580253106498, "eval_loss": 9.8046875, "eval_runtime": 278.85, "eval_samples_per_second": 121.094, "eval_steps_per_second": 3.364, "step": 219 }, { "epoch": 0.011987794245858762, "grad_norm": 1.2596890926361084, "learning_rate": 9.998893853530951e-06, "loss": 9.8281, "step": 220 }, { "epoch": 0.011987794245858762, "eval_accuracy": 0.08151057382746654, "eval_loss": 9.8046875, "eval_runtime": 280.1905, "eval_samples_per_second": 120.514, "eval_steps_per_second": 3.348, "step": 220 }, { "epoch": 0.012042284219703575, "grad_norm": 1.3286200761795044, "learning_rate": 9.998888404533566e-06, "loss": 9.7969, "step": 221 }, { "epoch": 0.012042284219703575, "eval_accuracy": 0.08155735523073553, "eval_loss": 9.796875, "eval_runtime": 279.7838, "eval_samples_per_second": 120.69, "eval_steps_per_second": 3.353, "step": 221 }, { "epoch": 0.012096774193548387, "grad_norm": 1.2406117916107178, "learning_rate": 9.998882955536181e-06, "loss": 9.8281, "step": 222 }, { "epoch": 0.012096774193548387, "eval_accuracy": 0.08160451296955061, "eval_loss": 9.796875, "eval_runtime": 279.2237, "eval_samples_per_second": 120.932, "eval_steps_per_second": 3.359, "step": 222 }, { "epoch": 0.0121512641673932, "grad_norm": 1.2828328609466553, "learning_rate": 9.998877506538798e-06, "loss": 9.8047, "step": 223 }, { "epoch": 0.0121512641673932, "eval_accuracy": 0.08167708783217148, "eval_loss": 9.7890625, "eval_runtime": 280.1502, "eval_samples_per_second": 120.532, "eval_steps_per_second": 3.348, "step": 223 }, { "epoch": 0.012205754141238012, "grad_norm": 1.3072834014892578, "learning_rate": 9.998872057541413e-06, "loss": 9.8047, "step": 224 }, { "epoch": 0.012205754141238012, "eval_accuracy": 0.08175794207680655, "eval_loss": 9.7890625, "eval_runtime": 279.8278, "eval_samples_per_second": 120.671, "eval_steps_per_second": 3.352, "step": 224 }, { "epoch": 0.012260244115082825, "grad_norm": 1.2602524757385254, "learning_rate": 9.998866608544028e-06, "loss": 9.8047, "step": 225 }, { "epoch": 0.012260244115082825, "eval_accuracy": 0.0818193426685971, "eval_loss": 9.7890625, "eval_runtime": 279.4782, "eval_samples_per_second": 120.822, "eval_steps_per_second": 3.356, "step": 225 }, { "epoch": 0.012314734088927637, "grad_norm": 1.2501016855239868, "learning_rate": 9.998861159546645e-06, "loss": 9.8047, "step": 226 }, { "epoch": 0.012314734088927637, "eval_accuracy": 0.08185301022552892, "eval_loss": 9.78125, "eval_runtime": 279.6131, "eval_samples_per_second": 120.763, "eval_steps_per_second": 3.355, "step": 226 }, { "epoch": 0.01236922406277245, "grad_norm": 1.2058792114257812, "learning_rate": 9.99885571054926e-06, "loss": 9.8281, "step": 227 }, { "epoch": 0.01236922406277245, "eval_accuracy": 0.08187451924943291, "eval_loss": 9.78125, "eval_runtime": 279.6288, "eval_samples_per_second": 120.757, "eval_steps_per_second": 3.354, "step": 227 }, { "epoch": 0.012423714036617262, "grad_norm": 1.2502729892730713, "learning_rate": 9.998850261551875e-06, "loss": 9.7812, "step": 228 }, { "epoch": 0.012423714036617262, "eval_accuracy": 0.08191142908183882, "eval_loss": 9.78125, "eval_runtime": 279.6067, "eval_samples_per_second": 120.766, "eval_steps_per_second": 3.355, "step": 228 }, { "epoch": 0.012478204010462075, "grad_norm": 1.2606781721115112, "learning_rate": 9.998844812554492e-06, "loss": 9.7891, "step": 229 }, { "epoch": 0.012478204010462075, "eval_accuracy": 0.08197398762915582, "eval_loss": 9.7734375, "eval_runtime": 279.4306, "eval_samples_per_second": 120.842, "eval_steps_per_second": 3.357, "step": 229 }, { "epoch": 0.012532693984306887, "grad_norm": 1.248340368270874, "learning_rate": 9.998839363557107e-06, "loss": 9.7969, "step": 230 }, { "epoch": 0.012532693984306887, "eval_accuracy": 0.08206019741810078, "eval_loss": 9.7734375, "eval_runtime": 278.547, "eval_samples_per_second": 121.226, "eval_steps_per_second": 3.367, "step": 230 }, { "epoch": 0.0125871839581517, "grad_norm": 1.3996437788009644, "learning_rate": 9.998833914559722e-06, "loss": 9.7578, "step": 231 }, { "epoch": 0.0125871839581517, "eval_accuracy": 0.08214620456482859, "eval_loss": 9.765625, "eval_runtime": 279.48, "eval_samples_per_second": 120.821, "eval_steps_per_second": 3.356, "step": 231 }, { "epoch": 0.012641673931996512, "grad_norm": 1.2275718450546265, "learning_rate": 9.998828465562337e-06, "loss": 9.8125, "step": 232 }, { "epoch": 0.012641673931996512, "eval_accuracy": 0.08221081848320505, "eval_loss": 9.765625, "eval_runtime": 280.1352, "eval_samples_per_second": 120.538, "eval_steps_per_second": 3.348, "step": 232 }, { "epoch": 0.012696163905841325, "grad_norm": 1.294257640838623, "learning_rate": 9.998823016564952e-06, "loss": 9.7734, "step": 233 }, { "epoch": 0.012696163905841325, "eval_accuracy": 0.0822595105130927, "eval_loss": 9.765625, "eval_runtime": 278.3308, "eval_samples_per_second": 121.32, "eval_steps_per_second": 3.37, "step": 233 }, { "epoch": 0.012750653879686137, "grad_norm": 1.302538275718689, "learning_rate": 9.998817567567569e-06, "loss": 9.7656, "step": 234 }, { "epoch": 0.012750653879686137, "eval_accuracy": 0.0823088394185199, "eval_loss": 9.7578125, "eval_runtime": 279.0657, "eval_samples_per_second": 121.0, "eval_steps_per_second": 3.361, "step": 234 }, { "epoch": 0.01280514385353095, "grad_norm": 1.2675853967666626, "learning_rate": 9.998812118570184e-06, "loss": 9.7578, "step": 235 }, { "epoch": 0.01280514385353095, "eval_accuracy": 0.0823553023840191, "eval_loss": 9.7578125, "eval_runtime": 279.2742, "eval_samples_per_second": 120.91, "eval_steps_per_second": 3.359, "step": 235 }, { "epoch": 0.012859633827375762, "grad_norm": 1.2685391902923584, "learning_rate": 9.998806669572799e-06, "loss": 9.7891, "step": 236 }, { "epoch": 0.012859633827375762, "eval_accuracy": 0.08240431285167651, "eval_loss": 9.7578125, "eval_runtime": 279.3255, "eval_samples_per_second": 120.888, "eval_steps_per_second": 3.358, "step": 236 }, { "epoch": 0.012914123801220576, "grad_norm": 1.279767632484436, "learning_rate": 9.998801220575415e-06, "loss": 9.7812, "step": 237 }, { "epoch": 0.012914123801220576, "eval_accuracy": 0.08243551975311462, "eval_loss": 9.75, "eval_runtime": 278.6385, "eval_samples_per_second": 121.186, "eval_steps_per_second": 3.366, "step": 237 }, { "epoch": 0.012968613775065387, "grad_norm": 1.2205240726470947, "learning_rate": 9.99879577157803e-06, "loss": 9.7656, "step": 238 }, { "epoch": 0.012968613775065387, "eval_accuracy": 0.08245219431269564, "eval_loss": 9.75, "eval_runtime": 278.8375, "eval_samples_per_second": 121.099, "eval_steps_per_second": 3.364, "step": 238 }, { "epoch": 0.0130231037489102, "grad_norm": 1.1911712884902954, "learning_rate": 9.998790322580645e-06, "loss": 9.7969, "step": 239 }, { "epoch": 0.0130231037489102, "eval_accuracy": 0.08246684245010535, "eval_loss": 9.75, "eval_runtime": 279.7565, "eval_samples_per_second": 120.701, "eval_steps_per_second": 3.353, "step": 239 }, { "epoch": 0.013077593722755012, "grad_norm": 1.250552773475647, "learning_rate": 9.998784873583262e-06, "loss": 9.75, "step": 240 }, { "epoch": 0.013077593722755012, "eval_accuracy": 0.0824862092562854, "eval_loss": 9.7421875, "eval_runtime": 279.3878, "eval_samples_per_second": 120.861, "eval_steps_per_second": 3.357, "step": 240 }, { "epoch": 0.013132083696599826, "grad_norm": 1.1946778297424316, "learning_rate": 9.998779424585877e-06, "loss": 9.7734, "step": 241 }, { "epoch": 0.013132083696599826, "eval_accuracy": 0.08250629978466949, "eval_loss": 9.7421875, "eval_runtime": 278.1923, "eval_samples_per_second": 121.38, "eval_steps_per_second": 3.372, "step": 241 }, { "epoch": 0.013186573670444637, "grad_norm": 1.2602076530456543, "learning_rate": 9.998773975588492e-06, "loss": 9.7578, "step": 242 }, { "epoch": 0.013186573670444637, "eval_accuracy": 0.08251411598447309, "eval_loss": 9.734375, "eval_runtime": 278.443, "eval_samples_per_second": 121.271, "eval_steps_per_second": 3.369, "step": 242 }, { "epoch": 0.01324106364428945, "grad_norm": 1.2365894317626953, "learning_rate": 9.998768526591109e-06, "loss": 9.7656, "step": 243 }, { "epoch": 0.01324106364428945, "eval_accuracy": 0.08253744878833126, "eval_loss": 9.734375, "eval_runtime": 278.0592, "eval_samples_per_second": 121.438, "eval_steps_per_second": 3.373, "step": 243 }, { "epoch": 0.013295553618134264, "grad_norm": 1.2928210496902466, "learning_rate": 9.998763077593724e-06, "loss": 9.7266, "step": 244 }, { "epoch": 0.013295553618134264, "eval_accuracy": 0.0825691478208681, "eval_loss": 9.734375, "eval_runtime": 277.8043, "eval_samples_per_second": 121.55, "eval_steps_per_second": 3.376, "step": 244 }, { "epoch": 0.013350043591979076, "grad_norm": 1.2674510478973389, "learning_rate": 9.998757628596339e-06, "loss": 9.75, "step": 245 }, { "epoch": 0.013350043591979076, "eval_accuracy": 0.08259832830013489, "eval_loss": 9.7265625, "eval_runtime": 278.1561, "eval_samples_per_second": 121.396, "eval_steps_per_second": 3.372, "step": 245 }, { "epoch": 0.01340453356582389, "grad_norm": 1.2813974618911743, "learning_rate": 9.998752179598956e-06, "loss": 9.7422, "step": 246 }, { "epoch": 0.01340453356582389, "eval_accuracy": 0.08266126318299799, "eval_loss": 9.7265625, "eval_runtime": 280.0345, "eval_samples_per_second": 120.582, "eval_steps_per_second": 3.35, "step": 246 }, { "epoch": 0.0134590235396687, "grad_norm": 1.2468206882476807, "learning_rate": 9.99874673060157e-06, "loss": 9.75, "step": 247 }, { "epoch": 0.0134590235396687, "eval_accuracy": 0.08272399542364396, "eval_loss": 9.7265625, "eval_runtime": 277.7706, "eval_samples_per_second": 121.564, "eval_steps_per_second": 3.377, "step": 247 }, { "epoch": 0.013513513513513514, "grad_norm": 1.1757469177246094, "learning_rate": 9.998741281604186e-06, "loss": 9.7656, "step": 248 }, { "epoch": 0.013513513513513514, "eval_accuracy": 0.08277827458894678, "eval_loss": 9.71875, "eval_runtime": 278.8949, "eval_samples_per_second": 121.074, "eval_steps_per_second": 3.363, "step": 248 }, { "epoch": 0.013568003487358326, "grad_norm": 1.2581287622451782, "learning_rate": 9.998735832606801e-06, "loss": 9.7266, "step": 249 }, { "epoch": 0.013568003487358326, "eval_accuracy": 0.08281637132576732, "eval_loss": 9.71875, "eval_runtime": 277.5935, "eval_samples_per_second": 121.642, "eval_steps_per_second": 3.379, "step": 249 }, { "epoch": 0.01362249346120314, "grad_norm": 1.2253607511520386, "learning_rate": 9.998730383609416e-06, "loss": 9.75, "step": 250 }, { "epoch": 0.01362249346120314, "eval_accuracy": 0.08284589919169204, "eval_loss": 9.7109375, "eval_runtime": 277.5385, "eval_samples_per_second": 121.666, "eval_steps_per_second": 3.38, "step": 250 }, { "epoch": 0.013676983435047951, "grad_norm": 1.2590970993041992, "learning_rate": 9.998724934612033e-06, "loss": 9.7266, "step": 251 }, { "epoch": 0.013676983435047951, "eval_accuracy": 0.08286613446451693, "eval_loss": 9.7109375, "eval_runtime": 278.6492, "eval_samples_per_second": 121.181, "eval_steps_per_second": 3.366, "step": 251 }, { "epoch": 0.013731473408892764, "grad_norm": 1.2588908672332764, "learning_rate": 9.998719485614648e-06, "loss": 9.7266, "step": 252 }, { "epoch": 0.013731473408892764, "eval_accuracy": 0.08288451700849948, "eval_loss": 9.7109375, "eval_runtime": 278.1724, "eval_samples_per_second": 121.389, "eval_steps_per_second": 3.372, "step": 252 }, { "epoch": 0.013785963382737576, "grad_norm": 1.2583962678909302, "learning_rate": 9.998714036617263e-06, "loss": 9.7266, "step": 253 }, { "epoch": 0.013785963382737576, "eval_accuracy": 0.0829049549235415, "eval_loss": 9.703125, "eval_runtime": 277.9162, "eval_samples_per_second": 121.501, "eval_steps_per_second": 3.375, "step": 253 }, { "epoch": 0.01384045335658239, "grad_norm": 1.2444450855255127, "learning_rate": 9.998708587619878e-06, "loss": 9.7266, "step": 254 }, { "epoch": 0.01384045335658239, "eval_accuracy": 0.08292394539417544, "eval_loss": 9.703125, "eval_runtime": 279.5035, "eval_samples_per_second": 120.811, "eval_steps_per_second": 3.356, "step": 254 }, { "epoch": 0.013894943330427201, "grad_norm": 1.201521396636963, "learning_rate": 9.998703138622494e-06, "loss": 9.7344, "step": 255 }, { "epoch": 0.013894943330427201, "eval_accuracy": 0.08293364327170955, "eval_loss": 9.703125, "eval_runtime": 278.5449, "eval_samples_per_second": 121.226, "eval_steps_per_second": 3.368, "step": 255 }, { "epoch": 0.013949433304272014, "grad_norm": 1.2879900932312012, "learning_rate": 9.99869768962511e-06, "loss": 9.7109, "step": 256 }, { "epoch": 0.013949433304272014, "eval_accuracy": 0.08292825877851151, "eval_loss": 9.6953125, "eval_runtime": 278.9219, "eval_samples_per_second": 121.063, "eval_steps_per_second": 3.363, "step": 256 }, { "epoch": 0.014003923278116826, "grad_norm": 1.2371270656585693, "learning_rate": 9.998692240627725e-06, "loss": 9.7109, "step": 257 }, { "epoch": 0.014003923278116826, "eval_accuracy": 0.08293071943400523, "eval_loss": 9.6953125, "eval_runtime": 280.4149, "eval_samples_per_second": 120.418, "eval_steps_per_second": 3.345, "step": 257 }, { "epoch": 0.01405841325196164, "grad_norm": 1.2438043355941772, "learning_rate": 9.998686791630341e-06, "loss": 9.7109, "step": 258 }, { "epoch": 0.01405841325196164, "eval_accuracy": 0.08297243478184596, "eval_loss": 9.6953125, "eval_runtime": 279.9479, "eval_samples_per_second": 120.619, "eval_steps_per_second": 3.351, "step": 258 }, { "epoch": 0.014112903225806451, "grad_norm": 1.2312678098678589, "learning_rate": 9.998681342632956e-06, "loss": 9.7031, "step": 259 }, { "epoch": 0.014112903225806451, "eval_accuracy": 0.08300974989868613, "eval_loss": 9.6875, "eval_runtime": 278.9806, "eval_samples_per_second": 121.037, "eval_steps_per_second": 3.362, "step": 259 }, { "epoch": 0.014167393199651265, "grad_norm": 1.289109706878662, "learning_rate": 9.998675893635571e-06, "loss": 9.7109, "step": 260 }, { "epoch": 0.014167393199651265, "eval_accuracy": 0.0830896198811237, "eval_loss": 9.6875, "eval_runtime": 280.8942, "eval_samples_per_second": 120.213, "eval_steps_per_second": 3.339, "step": 260 }, { "epoch": 0.014221883173496076, "grad_norm": 1.2733935117721558, "learning_rate": 9.998670444638188e-06, "loss": 9.6953, "step": 261 }, { "epoch": 0.014221883173496076, "eval_accuracy": 0.08316147102154055, "eval_loss": 9.6796875, "eval_runtime": 279.4498, "eval_samples_per_second": 120.834, "eval_steps_per_second": 3.357, "step": 261 }, { "epoch": 0.01427637314734089, "grad_norm": 1.2618088722229004, "learning_rate": 9.998664995640803e-06, "loss": 9.7031, "step": 262 }, { "epoch": 0.01427637314734089, "eval_accuracy": 0.08321233421804031, "eval_loss": 9.6796875, "eval_runtime": 278.1291, "eval_samples_per_second": 121.408, "eval_steps_per_second": 3.373, "step": 262 }, { "epoch": 0.014330863121185701, "grad_norm": 1.2773399353027344, "learning_rate": 9.998659546643418e-06, "loss": 9.6953, "step": 263 }, { "epoch": 0.014330863121185701, "eval_accuracy": 0.08324872297045931, "eval_loss": 9.6796875, "eval_runtime": 278.6034, "eval_samples_per_second": 121.201, "eval_steps_per_second": 3.367, "step": 263 }, { "epoch": 0.014385353095030515, "grad_norm": 1.246199131011963, "learning_rate": 9.998654097646035e-06, "loss": 9.6875, "step": 264 }, { "epoch": 0.014385353095030515, "eval_accuracy": 0.08326855295884994, "eval_loss": 9.671875, "eval_runtime": 278.4105, "eval_samples_per_second": 121.285, "eval_steps_per_second": 3.369, "step": 264 }, { "epoch": 0.014439843068875326, "grad_norm": 1.2552728652954102, "learning_rate": 9.998648648648648e-06, "loss": 9.6719, "step": 265 }, { "epoch": 0.014439843068875326, "eval_accuracy": 0.08325928931463826, "eval_loss": 9.671875, "eval_runtime": 279.4834, "eval_samples_per_second": 120.819, "eval_steps_per_second": 3.356, "step": 265 }, { "epoch": 0.01449433304272014, "grad_norm": 1.2612943649291992, "learning_rate": 9.998643199651265e-06, "loss": 9.6797, "step": 266 }, { "epoch": 0.01449433304272014, "eval_accuracy": 0.08324947564155151, "eval_loss": 9.671875, "eval_runtime": 277.8081, "eval_samples_per_second": 121.548, "eval_steps_per_second": 3.376, "step": 266 }, { "epoch": 0.014548823016564951, "grad_norm": 1.1741077899932861, "learning_rate": 9.99863775065388e-06, "loss": 9.7188, "step": 267 }, { "epoch": 0.014548823016564951, "eval_accuracy": 0.08325022831264371, "eval_loss": 9.6640625, "eval_runtime": 278.7089, "eval_samples_per_second": 121.155, "eval_steps_per_second": 3.366, "step": 267 }, { "epoch": 0.014603312990409765, "grad_norm": 1.2685978412628174, "learning_rate": 9.998632301656495e-06, "loss": 9.6953, "step": 268 }, { "epoch": 0.014603312990409765, "eval_accuracy": 0.08326976881215271, "eval_loss": 9.6640625, "eval_runtime": 278.6178, "eval_samples_per_second": 121.195, "eval_steps_per_second": 3.367, "step": 268 }, { "epoch": 0.014657802964254577, "grad_norm": 1.329729437828064, "learning_rate": 9.998626852659112e-06, "loss": 9.6797, "step": 269 }, { "epoch": 0.014657802964254577, "eval_accuracy": 0.08330664969567048, "eval_loss": 9.6640625, "eval_runtime": 276.7799, "eval_samples_per_second": 121.999, "eval_steps_per_second": 3.389, "step": 269 }, { "epoch": 0.01471229293809939, "grad_norm": 1.2404329776763916, "learning_rate": 9.998621403661727e-06, "loss": 9.6719, "step": 270 }, { "epoch": 0.01471229293809939, "eval_accuracy": 0.08336324477202621, "eval_loss": 9.65625, "eval_runtime": 277.6213, "eval_samples_per_second": 121.63, "eval_steps_per_second": 3.379, "step": 270 }, { "epoch": 0.014766782911944202, "grad_norm": 1.2414337396621704, "learning_rate": 9.998615954664342e-06, "loss": 9.6875, "step": 271 }, { "epoch": 0.014766782911944202, "eval_accuracy": 0.0834355011968773, "eval_loss": 9.65625, "eval_runtime": 278.0197, "eval_samples_per_second": 121.455, "eval_steps_per_second": 3.374, "step": 271 }, { "epoch": 0.014821272885789015, "grad_norm": 1.241061806678772, "learning_rate": 9.998610505666959e-06, "loss": 9.6641, "step": 272 }, { "epoch": 0.014821272885789015, "eval_accuracy": 0.083517542345927, "eval_loss": 9.6484375, "eval_runtime": 278.7783, "eval_samples_per_second": 121.125, "eval_steps_per_second": 3.365, "step": 272 }, { "epoch": 0.014875762859633827, "grad_norm": 1.2449415922164917, "learning_rate": 9.998605056669574e-06, "loss": 9.6719, "step": 273 }, { "epoch": 0.014875762859633827, "eval_accuracy": 0.08358869871302796, "eval_loss": 9.6484375, "eval_runtime": 278.069, "eval_samples_per_second": 121.434, "eval_steps_per_second": 3.373, "step": 273 }, { "epoch": 0.01493025283347864, "grad_norm": 1.2785589694976807, "learning_rate": 9.998599607672189e-06, "loss": 9.6719, "step": 274 }, { "epoch": 0.01493025283347864, "eval_accuracy": 0.08364344106054136, "eval_loss": 9.6484375, "eval_runtime": 278.0063, "eval_samples_per_second": 121.461, "eval_steps_per_second": 3.374, "step": 274 }, { "epoch": 0.014984742807323452, "grad_norm": 1.2922983169555664, "learning_rate": 9.998594158674805e-06, "loss": 9.6406, "step": 275 }, { "epoch": 0.014984742807323452, "eval_accuracy": 0.08369166990821841, "eval_loss": 9.640625, "eval_runtime": 278.9804, "eval_samples_per_second": 121.037, "eval_steps_per_second": 3.362, "step": 275 }, { "epoch": 0.015039232781168265, "grad_norm": 1.2422248125076294, "learning_rate": 9.99858870967742e-06, "loss": 9.6641, "step": 276 }, { "epoch": 0.015039232781168265, "eval_accuracy": 0.08373764074261889, "eval_loss": 9.640625, "eval_runtime": 279.0599, "eval_samples_per_second": 121.003, "eval_steps_per_second": 3.361, "step": 276 }, { "epoch": 0.015093722755013077, "grad_norm": 1.3546416759490967, "learning_rate": 9.998583260680035e-06, "loss": 9.6328, "step": 277 }, { "epoch": 0.015093722755013077, "eval_accuracy": 0.08375952610206898, "eval_loss": 9.640625, "eval_runtime": 279.3312, "eval_samples_per_second": 120.885, "eval_steps_per_second": 3.358, "step": 277 }, { "epoch": 0.01514821272885789, "grad_norm": 1.3069231510162354, "learning_rate": 9.998577811682652e-06, "loss": 9.6328, "step": 278 }, { "epoch": 0.01514821272885789, "eval_accuracy": 0.08376647383522773, "eval_loss": 9.6328125, "eval_runtime": 277.9079, "eval_samples_per_second": 121.504, "eval_steps_per_second": 3.375, "step": 278 }, { "epoch": 0.015202702702702704, "grad_norm": 1.2460886240005493, "learning_rate": 9.998572362685267e-06, "loss": 9.6484, "step": 279 }, { "epoch": 0.015202702702702704, "eval_accuracy": 0.08376496849304334, "eval_loss": 9.6328125, "eval_runtime": 277.6965, "eval_samples_per_second": 121.597, "eval_steps_per_second": 3.378, "step": 279 }, { "epoch": 0.015257192676547515, "grad_norm": 1.2432016134262085, "learning_rate": 9.998566913687882e-06, "loss": 9.6484, "step": 280 }, { "epoch": 0.015257192676547515, "eval_accuracy": 0.08377012139513608, "eval_loss": 9.6328125, "eval_runtime": 278.8575, "eval_samples_per_second": 121.091, "eval_steps_per_second": 3.364, "step": 280 }, { "epoch": 0.015311682650392329, "grad_norm": 1.1862634420394897, "learning_rate": 9.998561464690499e-06, "loss": 9.6875, "step": 281 }, { "epoch": 0.015311682650392329, "eval_accuracy": 0.08376899238849779, "eval_loss": 9.625, "eval_runtime": 277.3994, "eval_samples_per_second": 121.727, "eval_steps_per_second": 3.381, "step": 281 }, { "epoch": 0.01536617262423714, "grad_norm": 1.2821177244186401, "learning_rate": 9.998556015693112e-06, "loss": 9.6328, "step": 282 }, { "epoch": 0.01536617262423714, "eval_accuracy": 0.08377649015053161, "eval_loss": 9.625, "eval_runtime": 277.0927, "eval_samples_per_second": 121.862, "eval_steps_per_second": 3.385, "step": 282 }, { "epoch": 0.015420662598081954, "grad_norm": 1.2484102249145508, "learning_rate": 9.998550566695729e-06, "loss": 9.6562, "step": 283 }, { "epoch": 0.015420662598081954, "eval_accuracy": 0.08380494690759437, "eval_loss": 9.6171875, "eval_runtime": 277.7775, "eval_samples_per_second": 121.561, "eval_steps_per_second": 3.377, "step": 283 }, { "epoch": 0.015475152571926765, "grad_norm": 1.188653588294983, "learning_rate": 9.998545117698344e-06, "loss": 9.6719, "step": 284 }, { "epoch": 0.015475152571926765, "eval_accuracy": 0.08383511164905864, "eval_loss": 9.6171875, "eval_runtime": 278.6388, "eval_samples_per_second": 121.186, "eval_steps_per_second": 3.366, "step": 284 }, { "epoch": 0.015529642545771579, "grad_norm": 1.2269041538238525, "learning_rate": 9.998539668700959e-06, "loss": 9.6641, "step": 285 }, { "epoch": 0.015529642545771579, "eval_accuracy": 0.08384709648875752, "eval_loss": 9.6171875, "eval_runtime": 278.3677, "eval_samples_per_second": 121.304, "eval_steps_per_second": 3.37, "step": 285 }, { "epoch": 0.01558413251961639, "grad_norm": 1.266385555267334, "learning_rate": 9.998534219703576e-06, "loss": 9.6328, "step": 286 }, { "epoch": 0.01558413251961639, "eval_accuracy": 0.08384223307554638, "eval_loss": 9.609375, "eval_runtime": 278.3739, "eval_samples_per_second": 121.301, "eval_steps_per_second": 3.37, "step": 286 }, { "epoch": 0.015638622493461204, "grad_norm": 1.28980553150177, "learning_rate": 9.998528770706191e-06, "loss": 9.6328, "step": 287 }, { "epoch": 0.015638622493461204, "eval_accuracy": 0.08388331154784755, "eval_loss": 9.609375, "eval_runtime": 279.4156, "eval_samples_per_second": 120.849, "eval_steps_per_second": 3.357, "step": 287 }, { "epoch": 0.015693112467306015, "grad_norm": 1.2614293098449707, "learning_rate": 9.998523321708806e-06, "loss": 9.625, "step": 288 }, { "epoch": 0.015693112467306015, "eval_accuracy": 0.08391469214261461, "eval_loss": 9.609375, "eval_runtime": 278.4116, "eval_samples_per_second": 121.284, "eval_steps_per_second": 3.369, "step": 288 }, { "epoch": 0.015747602441150827, "grad_norm": 1.226367473602295, "learning_rate": 9.998517872711423e-06, "loss": 9.6328, "step": 289 }, { "epoch": 0.015747602441150827, "eval_accuracy": 0.08397444264777995, "eval_loss": 9.6015625, "eval_runtime": 277.9542, "eval_samples_per_second": 121.484, "eval_steps_per_second": 3.375, "step": 289 }, { "epoch": 0.015802092414995642, "grad_norm": 1.2424851655960083, "learning_rate": 9.998512423714038e-06, "loss": 9.6172, "step": 290 }, { "epoch": 0.015802092414995642, "eval_accuracy": 0.08402672433979962, "eval_loss": 9.6015625, "eval_runtime": 277.2733, "eval_samples_per_second": 121.782, "eval_steps_per_second": 3.383, "step": 290 }, { "epoch": 0.015856582388840454, "grad_norm": 1.2392754554748535, "learning_rate": 9.998506974716653e-06, "loss": 9.6172, "step": 291 }, { "epoch": 0.015856582388840454, "eval_accuracy": 0.08409255411147887, "eval_loss": 9.6015625, "eval_runtime": 276.0303, "eval_samples_per_second": 122.331, "eval_steps_per_second": 3.398, "step": 291 }, { "epoch": 0.015911072362685266, "grad_norm": 1.2615457773208618, "learning_rate": 9.998501525719268e-06, "loss": 9.6094, "step": 292 }, { "epoch": 0.015911072362685266, "eval_accuracy": 0.08412827703947016, "eval_loss": 9.59375, "eval_runtime": 278.5317, "eval_samples_per_second": 121.232, "eval_steps_per_second": 3.368, "step": 292 }, { "epoch": 0.015965562336530077, "grad_norm": 1.2395169734954834, "learning_rate": 9.998496076721884e-06, "loss": 9.6172, "step": 293 }, { "epoch": 0.015965562336530077, "eval_accuracy": 0.0841603524075531, "eval_loss": 9.59375, "eval_runtime": 278.5884, "eval_samples_per_second": 121.207, "eval_steps_per_second": 3.367, "step": 293 }, { "epoch": 0.016020052310374892, "grad_norm": 1.2416415214538574, "learning_rate": 9.9984906277245e-06, "loss": 9.6094, "step": 294 }, { "epoch": 0.016020052310374892, "eval_accuracy": 0.08417905338930542, "eval_loss": 9.59375, "eval_runtime": 279.278, "eval_samples_per_second": 120.908, "eval_steps_per_second": 3.359, "step": 294 }, { "epoch": 0.016074542284219704, "grad_norm": 1.2306995391845703, "learning_rate": 9.998485178727115e-06, "loss": 9.6328, "step": 295 }, { "epoch": 0.016074542284219704, "eval_accuracy": 0.08419016976235945, "eval_loss": 9.5859375, "eval_runtime": 279.0203, "eval_samples_per_second": 121.02, "eval_steps_per_second": 3.362, "step": 295 }, { "epoch": 0.016129032258064516, "grad_norm": 1.2327263355255127, "learning_rate": 9.998479729729731e-06, "loss": 9.5938, "step": 296 }, { "epoch": 0.016129032258064516, "eval_accuracy": 0.08418545109358912, "eval_loss": 9.5859375, "eval_runtime": 279.0896, "eval_samples_per_second": 120.99, "eval_steps_per_second": 3.361, "step": 296 }, { "epoch": 0.016183522231909327, "grad_norm": 1.322943091392517, "learning_rate": 9.998474280732346e-06, "loss": 9.5938, "step": 297 }, { "epoch": 0.016183522231909327, "eval_accuracy": 0.08417552162494972, "eval_loss": 9.578125, "eval_runtime": 278.5612, "eval_samples_per_second": 121.219, "eval_steps_per_second": 3.367, "step": 297 }, { "epoch": 0.016238012205754142, "grad_norm": 1.264687180519104, "learning_rate": 9.998468831734961e-06, "loss": 9.6016, "step": 298 }, { "epoch": 0.016238012205754142, "eval_accuracy": 0.08418681169133271, "eval_loss": 9.578125, "eval_runtime": 276.7181, "eval_samples_per_second": 122.027, "eval_steps_per_second": 3.39, "step": 298 }, { "epoch": 0.016292502179598954, "grad_norm": 1.2931830883026123, "learning_rate": 9.998463382737576e-06, "loss": 9.5781, "step": 299 }, { "epoch": 0.016292502179598954, "eval_accuracy": 0.08422736908364697, "eval_loss": 9.578125, "eval_runtime": 278.6618, "eval_samples_per_second": 121.176, "eval_steps_per_second": 3.366, "step": 299 }, { "epoch": 0.016346992153443766, "grad_norm": 1.247689962387085, "learning_rate": 9.998457933740191e-06, "loss": 9.5938, "step": 300 }, { "epoch": 0.016346992153443766, "eval_accuracy": 0.08427264514473155, "eval_loss": 9.5703125, "eval_runtime": 278.3198, "eval_samples_per_second": 121.324, "eval_steps_per_second": 3.37, "step": 300 }, { "epoch": 0.016401482127288577, "grad_norm": 1.269108772277832, "learning_rate": 9.998452484742808e-06, "loss": 9.5938, "step": 301 }, { "epoch": 0.016401482127288577, "eval_accuracy": 0.08434733327618822, "eval_loss": 9.5703125, "eval_runtime": 277.9862, "eval_samples_per_second": 121.47, "eval_steps_per_second": 3.374, "step": 301 }, { "epoch": 0.016455972101133393, "grad_norm": 1.2147337198257446, "learning_rate": 9.998447035745423e-06, "loss": 9.6016, "step": 302 }, { "epoch": 0.016455972101133393, "eval_accuracy": 0.08440334937478074, "eval_loss": 9.5703125, "eval_runtime": 278.4928, "eval_samples_per_second": 121.249, "eval_steps_per_second": 3.368, "step": 302 }, { "epoch": 0.016510462074978204, "grad_norm": 1.2472692728042603, "learning_rate": 9.998441586748038e-06, "loss": 9.5781, "step": 303 }, { "epoch": 0.016510462074978204, "eval_accuracy": 0.08445693376676766, "eval_loss": 9.5625, "eval_runtime": 277.2562, "eval_samples_per_second": 121.79, "eval_steps_per_second": 3.383, "step": 303 }, { "epoch": 0.016564952048823016, "grad_norm": 1.2579458951950073, "learning_rate": 9.998436137750655e-06, "loss": 9.6016, "step": 304 }, { "epoch": 0.016564952048823016, "eval_accuracy": 0.08448973285705465, "eval_loss": 9.5625, "eval_runtime": 277.2268, "eval_samples_per_second": 121.803, "eval_steps_per_second": 3.384, "step": 304 }, { "epoch": 0.016619442022667828, "grad_norm": 1.2708834409713745, "learning_rate": 9.99843068875327e-06, "loss": 9.5703, "step": 305 }, { "epoch": 0.016619442022667828, "eval_accuracy": 0.084512920916472, "eval_loss": 9.5625, "eval_runtime": 277.9135, "eval_samples_per_second": 121.502, "eval_steps_per_second": 3.375, "step": 305 }, { "epoch": 0.016673931996512643, "grad_norm": 1.2249526977539062, "learning_rate": 9.998425239755885e-06, "loss": 9.5781, "step": 306 }, { "epoch": 0.016673931996512643, "eval_accuracy": 0.08454019076912014, "eval_loss": 9.5546875, "eval_runtime": 277.7743, "eval_samples_per_second": 121.563, "eval_steps_per_second": 3.377, "step": 306 }, { "epoch": 0.016728421970357454, "grad_norm": 1.214102029800415, "learning_rate": 9.998419790758502e-06, "loss": 9.5938, "step": 307 }, { "epoch": 0.016728421970357454, "eval_accuracy": 0.08456963178838038, "eval_loss": 9.5546875, "eval_runtime": 277.7599, "eval_samples_per_second": 121.569, "eval_steps_per_second": 3.377, "step": 307 }, { "epoch": 0.016782911944202266, "grad_norm": 1.2691110372543335, "learning_rate": 9.998414341761117e-06, "loss": 9.5391, "step": 308 }, { "epoch": 0.016782911944202266, "eval_accuracy": 0.08458882490123146, "eval_loss": 9.5546875, "eval_runtime": 278.1571, "eval_samples_per_second": 121.395, "eval_steps_per_second": 3.372, "step": 308 }, { "epoch": 0.016837401918047078, "grad_norm": 1.2669826745986938, "learning_rate": 9.998408892763732e-06, "loss": 9.5625, "step": 309 }, { "epoch": 0.016837401918047078, "eval_accuracy": 0.08460961020293142, "eval_loss": 9.546875, "eval_runtime": 277.952, "eval_samples_per_second": 121.485, "eval_steps_per_second": 3.375, "step": 309 }, { "epoch": 0.016891891891891893, "grad_norm": 1.2808586359024048, "learning_rate": 9.998403443766349e-06, "loss": 9.5547, "step": 310 }, { "epoch": 0.016891891891891893, "eval_accuracy": 0.08461615265165591, "eval_loss": 9.546875, "eval_runtime": 277.1252, "eval_samples_per_second": 121.847, "eval_steps_per_second": 3.385, "step": 310 }, { "epoch": 0.016946381865736704, "grad_norm": 1.252959132194519, "learning_rate": 9.998397994768964e-06, "loss": 9.5703, "step": 311 }, { "epoch": 0.016946381865736704, "eval_accuracy": 0.08458865120790249, "eval_loss": 9.546875, "eval_runtime": 278.3797, "eval_samples_per_second": 121.298, "eval_steps_per_second": 3.369, "step": 311 }, { "epoch": 0.017000871839581516, "grad_norm": 1.2392700910568237, "learning_rate": 9.998392545771579e-06, "loss": 9.5625, "step": 312 }, { "epoch": 0.017000871839581516, "eval_accuracy": 0.08457846119926964, "eval_loss": 9.5390625, "eval_runtime": 278.5979, "eval_samples_per_second": 121.203, "eval_steps_per_second": 3.367, "step": 312 }, { "epoch": 0.01705536181342633, "grad_norm": 1.363663673400879, "learning_rate": 9.998387096774195e-06, "loss": 9.5469, "step": 313 }, { "epoch": 0.01705536181342633, "eval_accuracy": 0.0846050362786019, "eval_loss": 9.5390625, "eval_runtime": 279.48, "eval_samples_per_second": 120.821, "eval_steps_per_second": 3.356, "step": 313 }, { "epoch": 0.017109851787271143, "grad_norm": 1.258362889289856, "learning_rate": 9.99838164777681e-06, "loss": 9.5469, "step": 314 }, { "epoch": 0.017109851787271143, "eval_accuracy": 0.08462159504263028, "eval_loss": 9.5390625, "eval_runtime": 280.9528, "eval_samples_per_second": 120.187, "eval_steps_per_second": 3.339, "step": 314 }, { "epoch": 0.017164341761115955, "grad_norm": 1.2624891996383667, "learning_rate": 9.998376198779425e-06, "loss": 9.5391, "step": 315 }, { "epoch": 0.017164341761115955, "eval_accuracy": 0.08465442308180542, "eval_loss": 9.53125, "eval_runtime": 280.6538, "eval_samples_per_second": 120.316, "eval_steps_per_second": 3.342, "step": 315 }, { "epoch": 0.017218831734960766, "grad_norm": 1.2132024765014648, "learning_rate": 9.998370749782042e-06, "loss": 9.5781, "step": 316 }, { "epoch": 0.017218831734960766, "eval_accuracy": 0.08470062550731117, "eval_loss": 9.53125, "eval_runtime": 279.7295, "eval_samples_per_second": 120.713, "eval_steps_per_second": 3.353, "step": 316 }, { "epoch": 0.01727332170880558, "grad_norm": 1.2480347156524658, "learning_rate": 9.998365300784656e-06, "loss": 9.5469, "step": 317 }, { "epoch": 0.01727332170880558, "eval_accuracy": 0.08472815589995276, "eval_loss": 9.53125, "eval_runtime": 278.8002, "eval_samples_per_second": 121.115, "eval_steps_per_second": 3.364, "step": 317 }, { "epoch": 0.017327811682650393, "grad_norm": 1.304702639579773, "learning_rate": 9.998359851787272e-06, "loss": 9.5312, "step": 318 }, { "epoch": 0.017327811682650393, "eval_accuracy": 0.0847501570549555, "eval_loss": 9.5234375, "eval_runtime": 279.8383, "eval_samples_per_second": 120.666, "eval_steps_per_second": 3.352, "step": 318 }, { "epoch": 0.017382301656495205, "grad_norm": 1.221549153327942, "learning_rate": 9.998354402789887e-06, "loss": 9.5703, "step": 319 }, { "epoch": 0.017382301656495205, "eval_accuracy": 0.08476648422787858, "eval_loss": 9.5234375, "eval_runtime": 278.8177, "eval_samples_per_second": 121.108, "eval_steps_per_second": 3.364, "step": 319 }, { "epoch": 0.017436791630340016, "grad_norm": 1.262052297592163, "learning_rate": 9.998348953792502e-06, "loss": 9.5312, "step": 320 }, { "epoch": 0.017436791630340016, "eval_accuracy": 0.08480269928696862, "eval_loss": 9.5234375, "eval_runtime": 278.2907, "eval_samples_per_second": 121.337, "eval_steps_per_second": 3.371, "step": 320 }, { "epoch": 0.01749128160418483, "grad_norm": 1.20883047580719, "learning_rate": 9.998343504795119e-06, "loss": 9.5703, "step": 321 }, { "epoch": 0.01749128160418483, "eval_accuracy": 0.08483929068160476, "eval_loss": 9.515625, "eval_runtime": 278.7692, "eval_samples_per_second": 121.129, "eval_steps_per_second": 3.365, "step": 321 }, { "epoch": 0.017545771578029643, "grad_norm": 1.2607730627059937, "learning_rate": 9.998338055797734e-06, "loss": 9.5312, "step": 322 }, { "epoch": 0.017545771578029643, "eval_accuracy": 0.08487046863415469, "eval_loss": 9.515625, "eval_runtime": 278.4626, "eval_samples_per_second": 121.262, "eval_steps_per_second": 3.368, "step": 322 }, { "epoch": 0.017600261551874455, "grad_norm": 1.2543152570724487, "learning_rate": 9.998332606800349e-06, "loss": 9.5391, "step": 323 }, { "epoch": 0.017600261551874455, "eval_accuracy": 0.08489484359798667, "eval_loss": 9.5078125, "eval_runtime": 278.034, "eval_samples_per_second": 121.449, "eval_steps_per_second": 3.374, "step": 323 }, { "epoch": 0.017654751525719267, "grad_norm": 1.261744737625122, "learning_rate": 9.998327157802966e-06, "loss": 9.5156, "step": 324 }, { "epoch": 0.017654751525719267, "eval_accuracy": 0.08491759742408161, "eval_loss": 9.5078125, "eval_runtime": 278.3626, "eval_samples_per_second": 121.306, "eval_steps_per_second": 3.37, "step": 324 }, { "epoch": 0.01770924149956408, "grad_norm": 1.261576771736145, "learning_rate": 9.998321708805581e-06, "loss": 9.5234, "step": 325 }, { "epoch": 0.01770924149956408, "eval_accuracy": 0.08492923487712253, "eval_loss": 9.5078125, "eval_runtime": 278.7706, "eval_samples_per_second": 121.128, "eval_steps_per_second": 3.365, "step": 325 }, { "epoch": 0.017763731473408893, "grad_norm": 1.2752763032913208, "learning_rate": 9.998316259808196e-06, "loss": 9.5391, "step": 326 }, { "epoch": 0.017763731473408893, "eval_accuracy": 0.08490972332650168, "eval_loss": 9.5, "eval_runtime": 277.4247, "eval_samples_per_second": 121.716, "eval_steps_per_second": 3.381, "step": 326 }, { "epoch": 0.017818221447253705, "grad_norm": 1.2762752771377563, "learning_rate": 9.998310810810811e-06, "loss": 9.5078, "step": 327 }, { "epoch": 0.017818221447253705, "eval_accuracy": 0.08489041441809797, "eval_loss": 9.5, "eval_runtime": 278.791, "eval_samples_per_second": 121.119, "eval_steps_per_second": 3.365, "step": 327 }, { "epoch": 0.017872711421098517, "grad_norm": 1.2357895374298096, "learning_rate": 9.998305361813428e-06, "loss": 9.5312, "step": 328 }, { "epoch": 0.017872711421098517, "eval_accuracy": 0.08484099866600628, "eval_loss": 9.5, "eval_runtime": 278.1228, "eval_samples_per_second": 121.41, "eval_steps_per_second": 3.373, "step": 328 }, { "epoch": 0.017927201394943332, "grad_norm": 1.2696473598480225, "learning_rate": 9.998299912816043e-06, "loss": 9.5078, "step": 329 }, { "epoch": 0.017927201394943332, "eval_accuracy": 0.08476593419900351, "eval_loss": 9.4921875, "eval_runtime": 279.1969, "eval_samples_per_second": 120.943, "eval_steps_per_second": 3.36, "step": 329 }, { "epoch": 0.017981691368788143, "grad_norm": 1.2310556173324585, "learning_rate": 9.998294463818658e-06, "loss": 9.5234, "step": 330 }, { "epoch": 0.017981691368788143, "eval_accuracy": 0.08473466939978909, "eval_loss": 9.4921875, "eval_runtime": 278.5891, "eval_samples_per_second": 121.207, "eval_steps_per_second": 3.367, "step": 330 }, { "epoch": 0.018036181342632955, "grad_norm": 1.2741543054580688, "learning_rate": 9.998289014821274e-06, "loss": 9.5078, "step": 331 }, { "epoch": 0.018036181342632955, "eval_accuracy": 0.08475229927267945, "eval_loss": 9.4921875, "eval_runtime": 278.1693, "eval_samples_per_second": 121.39, "eval_steps_per_second": 3.372, "step": 331 }, { "epoch": 0.018090671316477767, "grad_norm": 1.2879000902175903, "learning_rate": 9.99828356582389e-06, "loss": 9.4922, "step": 332 }, { "epoch": 0.018090671316477767, "eval_accuracy": 0.08481100761787097, "eval_loss": 9.484375, "eval_runtime": 279.2612, "eval_samples_per_second": 120.915, "eval_steps_per_second": 3.359, "step": 332 }, { "epoch": 0.018145161290322582, "grad_norm": 1.2958968877792358, "learning_rate": 9.998278116826505e-06, "loss": 9.5, "step": 333 }, { "epoch": 0.018145161290322582, "eval_accuracy": 0.08491096812869263, "eval_loss": 9.484375, "eval_runtime": 278.8921, "eval_samples_per_second": 121.076, "eval_steps_per_second": 3.363, "step": 333 }, { "epoch": 0.018199651264167394, "grad_norm": 1.2337111234664917, "learning_rate": 9.99827266782912e-06, "loss": 9.5078, "step": 334 }, { "epoch": 0.018199651264167394, "eval_accuracy": 0.08498186395580014, "eval_loss": 9.484375, "eval_runtime": 279.3346, "eval_samples_per_second": 120.884, "eval_steps_per_second": 3.358, "step": 334 }, { "epoch": 0.018254141238012205, "grad_norm": 1.3057183027267456, "learning_rate": 9.998267218831735e-06, "loss": 9.4766, "step": 335 }, { "epoch": 0.018254141238012205, "eval_accuracy": 0.0850502412296376, "eval_loss": 9.4765625, "eval_runtime": 278.5471, "eval_samples_per_second": 121.225, "eval_steps_per_second": 3.367, "step": 335 }, { "epoch": 0.018308631211857017, "grad_norm": 1.2413270473480225, "learning_rate": 9.998261769834351e-06, "loss": 9.5, "step": 336 }, { "epoch": 0.018308631211857017, "eval_accuracy": 0.08509852797509099, "eval_loss": 9.4765625, "eval_runtime": 277.2384, "eval_samples_per_second": 121.798, "eval_steps_per_second": 3.383, "step": 336 }, { "epoch": 0.018308631211857017, "step": 336, "total_flos": 352197518819328.0, "train_loss": 10.035667782738095, "train_runtime": 94125.7847, "train_samples_per_second": 701.873, "train_steps_per_second": 19.497 } ], "logging_steps": 1, "max_steps": 1835200, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 352197518819328.0, "train_batch_size": 36, "trial_name": null, "trial_params": null }