{ "best_metric": 0.8401518346689161, "best_model_checkpoint": "MLM_checkpoint/vinai_phobert-base-v2-2024-04-30_01-13-56/checkpoint-140200", "epoch": 9.997413347128816, "eval_steps": 200, "global_step": 154600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 156.40440368652344, "learning_rate": 1.0000000000000001e-07, "loss": 3.205, "step": 100 }, { "epoch": 0.01, "grad_norm": 39.709747314453125, "learning_rate": 2.0000000000000002e-07, "loss": 2.709, "step": 200 }, { "epoch": 0.01, "eval_accuracy": 0.5741666666666667, "eval_loss": 2.260972738265991, "eval_runtime": 7.6532, "eval_samples_per_second": 130.665, "eval_steps_per_second": 8.232, "step": 200 }, { "epoch": 0.02, "grad_norm": 81.07353973388672, "learning_rate": 3.0000000000000004e-07, "loss": 2.2557, "step": 300 }, { "epoch": 0.03, "grad_norm": 35.715919494628906, "learning_rate": 4.0000000000000003e-07, "loss": 1.994, "step": 400 }, { "epoch": 0.03, "eval_accuracy": 0.6437786100707449, "eval_loss": 1.8734396696090698, "eval_runtime": 7.669, "eval_samples_per_second": 130.395, "eval_steps_per_second": 8.215, "step": 400 }, { "epoch": 0.03, "grad_norm": 26.764568328857422, "learning_rate": 5.000000000000001e-07, "loss": 1.8308, "step": 500 }, { "epoch": 0.04, "grad_norm": 14.45901107788086, "learning_rate": 6.000000000000001e-07, "loss": 1.8269, "step": 600 }, { "epoch": 0.04, "eval_accuracy": 0.6686340640809444, "eval_loss": 1.5490355491638184, "eval_runtime": 7.3353, "eval_samples_per_second": 136.327, "eval_steps_per_second": 8.589, "step": 600 }, { "epoch": 0.05, "grad_norm": 26.489681243896484, "learning_rate": 7.000000000000001e-07, "loss": 1.7919, "step": 700 }, { "epoch": 0.05, "grad_norm": 19.49435806274414, "learning_rate": 8.000000000000001e-07, "loss": 1.6877, "step": 800 }, { "epoch": 0.05, "eval_accuracy": 0.6658249158249159, "eval_loss": 1.6493752002716064, "eval_runtime": 7.553, "eval_samples_per_second": 132.397, "eval_steps_per_second": 8.341, "step": 800 }, { "epoch": 0.06, "grad_norm": 17.025978088378906, "learning_rate": 9e-07, "loss": 1.7023, "step": 900 }, { "epoch": 0.06, "grad_norm": 14.813047409057617, "learning_rate": 1.0000000000000002e-06, "loss": 1.642, "step": 1000 }, { "epoch": 0.06, "eval_accuracy": 0.6631711409395973, "eval_loss": 1.612972378730774, "eval_runtime": 7.3229, "eval_samples_per_second": 136.557, "eval_steps_per_second": 8.603, "step": 1000 }, { "epoch": 0.07, "grad_norm": 13.18350887298584, "learning_rate": 1.1e-06, "loss": 1.6782, "step": 1100 }, { "epoch": 0.08, "grad_norm": 15.253301620483398, "learning_rate": 1.2000000000000002e-06, "loss": 1.7055, "step": 1200 }, { "epoch": 0.08, "eval_accuracy": 0.6684873949579831, "eval_loss": 1.6126316785812378, "eval_runtime": 7.8338, "eval_samples_per_second": 127.652, "eval_steps_per_second": 8.042, "step": 1200 }, { "epoch": 0.08, "grad_norm": 56.080711364746094, "learning_rate": 1.3e-06, "loss": 1.6292, "step": 1300 }, { "epoch": 0.09, "grad_norm": 15.526861190795898, "learning_rate": 1.4000000000000001e-06, "loss": 1.5669, "step": 1400 }, { "epoch": 0.09, "eval_accuracy": 0.6861252115059222, "eval_loss": 1.5611977577209473, "eval_runtime": 7.7482, "eval_samples_per_second": 129.063, "eval_steps_per_second": 8.131, "step": 1400 }, { "epoch": 0.1, "grad_norm": 14.525699615478516, "learning_rate": 1.5e-06, "loss": 1.5238, "step": 1500 }, { "epoch": 0.1, "grad_norm": 15.056904792785645, "learning_rate": 1.6000000000000001e-06, "loss": 1.5751, "step": 1600 }, { "epoch": 0.1, "eval_accuracy": 0.675, "eval_loss": 1.576570749282837, "eval_runtime": 7.5186, "eval_samples_per_second": 133.003, "eval_steps_per_second": 8.379, "step": 1600 }, { "epoch": 0.11, "grad_norm": 21.880970001220703, "learning_rate": 1.7000000000000002e-06, "loss": 1.5939, "step": 1700 }, { "epoch": 0.12, "grad_norm": 18.907203674316406, "learning_rate": 1.8e-06, "loss": 1.5305, "step": 1800 }, { "epoch": 0.12, "eval_accuracy": 0.693929173693086, "eval_loss": 1.4478867053985596, "eval_runtime": 7.9597, "eval_samples_per_second": 125.633, "eval_steps_per_second": 7.915, "step": 1800 }, { "epoch": 0.12, "grad_norm": 17.099138259887695, "learning_rate": 1.9e-06, "loss": 1.524, "step": 1900 }, { "epoch": 0.13, "grad_norm": 19.232114791870117, "learning_rate": 2.0000000000000003e-06, "loss": 1.5338, "step": 2000 }, { "epoch": 0.13, "eval_accuracy": 0.6875, "eval_loss": 1.5188933610916138, "eval_runtime": 7.5782, "eval_samples_per_second": 131.958, "eval_steps_per_second": 8.313, "step": 2000 }, { "epoch": 0.14, "grad_norm": 14.048361778259277, "learning_rate": 2.1000000000000002e-06, "loss": 1.5064, "step": 2100 }, { "epoch": 0.14, "grad_norm": 15.210893630981445, "learning_rate": 2.2e-06, "loss": 1.5157, "step": 2200 }, { "epoch": 0.14, "eval_accuracy": 0.6854838709677419, "eval_loss": 1.488882303237915, "eval_runtime": 7.4414, "eval_samples_per_second": 134.383, "eval_steps_per_second": 8.466, "step": 2200 }, { "epoch": 0.15, "grad_norm": 18.424758911132812, "learning_rate": 2.3e-06, "loss": 1.5364, "step": 2300 }, { "epoch": 0.16, "grad_norm": 17.106618881225586, "learning_rate": 2.4000000000000003e-06, "loss": 1.5079, "step": 2400 }, { "epoch": 0.16, "eval_accuracy": 0.70042194092827, "eval_loss": 1.4502460956573486, "eval_runtime": 7.5853, "eval_samples_per_second": 131.833, "eval_steps_per_second": 8.306, "step": 2400 }, { "epoch": 0.16, "grad_norm": 15.618694305419922, "learning_rate": 2.5e-06, "loss": 1.5284, "step": 2500 }, { "epoch": 0.17, "grad_norm": 18.13161849975586, "learning_rate": 2.6e-06, "loss": 1.55, "step": 2600 }, { "epoch": 0.17, "eval_accuracy": 0.6980491942324003, "eval_loss": 1.451174259185791, "eval_runtime": 7.5268, "eval_samples_per_second": 132.858, "eval_steps_per_second": 8.37, "step": 2600 }, { "epoch": 0.17, "grad_norm": 18.6927433013916, "learning_rate": 2.7e-06, "loss": 1.4815, "step": 2700 }, { "epoch": 0.18, "grad_norm": 16.301359176635742, "learning_rate": 2.8000000000000003e-06, "loss": 1.5037, "step": 2800 }, { "epoch": 0.18, "eval_accuracy": 0.6808960270498732, "eval_loss": 1.5238386392593384, "eval_runtime": 7.8956, "eval_samples_per_second": 126.653, "eval_steps_per_second": 7.979, "step": 2800 }, { "epoch": 0.19, "grad_norm": 17.90258026123047, "learning_rate": 2.9e-06, "loss": 1.523, "step": 2900 }, { "epoch": 0.19, "grad_norm": 12.541860580444336, "learning_rate": 3e-06, "loss": 1.495, "step": 3000 }, { "epoch": 0.19, "eval_accuracy": 0.6983993260320135, "eval_loss": 1.4101921319961548, "eval_runtime": 7.94, "eval_samples_per_second": 125.944, "eval_steps_per_second": 7.934, "step": 3000 }, { "epoch": 0.2, "grad_norm": 17.367326736450195, "learning_rate": 3.1e-06, "loss": 1.4121, "step": 3100 }, { "epoch": 0.21, "grad_norm": 11.267683029174805, "learning_rate": 3.2000000000000003e-06, "loss": 1.446, "step": 3200 }, { "epoch": 0.21, "eval_accuracy": 0.7063323416914578, "eval_loss": 1.3454183340072632, "eval_runtime": 7.6514, "eval_samples_per_second": 130.694, "eval_steps_per_second": 8.234, "step": 3200 }, { "epoch": 0.21, "grad_norm": 15.869690895080566, "learning_rate": 3.3e-06, "loss": 1.397, "step": 3300 }, { "epoch": 0.22, "grad_norm": 13.50574016571045, "learning_rate": 3.4000000000000005e-06, "loss": 1.4333, "step": 3400 }, { "epoch": 0.22, "eval_accuracy": 0.7166178317287568, "eval_loss": 1.32160222530365, "eval_runtime": 7.5029, "eval_samples_per_second": 133.281, "eval_steps_per_second": 8.397, "step": 3400 }, { "epoch": 0.23, "grad_norm": 15.4827299118042, "learning_rate": 3.5000000000000004e-06, "loss": 1.4372, "step": 3500 }, { "epoch": 0.23, "grad_norm": 17.598169326782227, "learning_rate": 3.6e-06, "loss": 1.3994, "step": 3600 }, { "epoch": 0.23, "eval_accuracy": 0.7084745762711865, "eval_loss": 1.4431147575378418, "eval_runtime": 7.8934, "eval_samples_per_second": 126.687, "eval_steps_per_second": 7.981, "step": 3600 }, { "epoch": 0.24, "grad_norm": 14.381958961486816, "learning_rate": 3.7e-06, "loss": 1.4467, "step": 3700 }, { "epoch": 0.25, "grad_norm": 13.359013557434082, "learning_rate": 3.8e-06, "loss": 1.3867, "step": 3800 }, { "epoch": 0.25, "eval_accuracy": 0.7152400835073068, "eval_loss": 1.4078177213668823, "eval_runtime": 7.4058, "eval_samples_per_second": 135.028, "eval_steps_per_second": 8.507, "step": 3800 }, { "epoch": 0.25, "grad_norm": 9.557452201843262, "learning_rate": 3.9e-06, "loss": 1.4325, "step": 3900 }, { "epoch": 0.26, "grad_norm": 15.813143730163574, "learning_rate": 4.000000000000001e-06, "loss": 1.4402, "step": 4000 }, { "epoch": 0.26, "eval_accuracy": 0.7165820642978004, "eval_loss": 1.30904221534729, "eval_runtime": 7.4406, "eval_samples_per_second": 134.398, "eval_steps_per_second": 8.467, "step": 4000 }, { "epoch": 0.27, "grad_norm": 16.537343978881836, "learning_rate": 4.1000000000000006e-06, "loss": 1.4095, "step": 4100 }, { "epoch": 0.27, "grad_norm": 17.09938621520996, "learning_rate": 4.2000000000000004e-06, "loss": 1.404, "step": 4200 }, { "epoch": 0.27, "eval_accuracy": 0.7237569060773481, "eval_loss": 1.2822626829147339, "eval_runtime": 8.1063, "eval_samples_per_second": 123.361, "eval_steps_per_second": 7.772, "step": 4200 }, { "epoch": 0.28, "grad_norm": 12.476930618286133, "learning_rate": 4.2999999999999995e-06, "loss": 1.3248, "step": 4300 }, { "epoch": 0.28, "grad_norm": 16.779998779296875, "learning_rate": 4.4e-06, "loss": 1.346, "step": 4400 }, { "epoch": 0.28, "eval_accuracy": 0.7197640117994101, "eval_loss": 1.3060036897659302, "eval_runtime": 7.5604, "eval_samples_per_second": 132.269, "eval_steps_per_second": 8.333, "step": 4400 }, { "epoch": 0.29, "grad_norm": 15.0446195602417, "learning_rate": 4.5e-06, "loss": 1.3594, "step": 4500 }, { "epoch": 0.3, "grad_norm": 16.34784507751465, "learning_rate": 4.6e-06, "loss": 1.3674, "step": 4600 }, { "epoch": 0.3, "eval_accuracy": 0.7126050420168067, "eval_loss": 1.2938878536224365, "eval_runtime": 7.8318, "eval_samples_per_second": 127.684, "eval_steps_per_second": 8.044, "step": 4600 }, { "epoch": 0.3, "grad_norm": 10.535191535949707, "learning_rate": 4.7e-06, "loss": 1.3448, "step": 4700 }, { "epoch": 0.31, "grad_norm": 10.501317977905273, "learning_rate": 4.800000000000001e-06, "loss": 1.3927, "step": 4800 }, { "epoch": 0.31, "eval_accuracy": 0.7189487070792708, "eval_loss": 1.2837878465652466, "eval_runtime": 7.6801, "eval_samples_per_second": 130.206, "eval_steps_per_second": 8.203, "step": 4800 }, { "epoch": 0.32, "grad_norm": 13.949997901916504, "learning_rate": 4.9000000000000005e-06, "loss": 1.3248, "step": 4900 }, { "epoch": 0.32, "grad_norm": 9.897923469543457, "learning_rate": 5e-06, "loss": 1.3361, "step": 5000 }, { "epoch": 0.32, "eval_accuracy": 0.7085980516730199, "eval_loss": 1.3222663402557373, "eval_runtime": 7.7781, "eval_samples_per_second": 128.566, "eval_steps_per_second": 8.1, "step": 5000 }, { "epoch": 0.33, "grad_norm": 16.822832107543945, "learning_rate": 5.1e-06, "loss": 1.3649, "step": 5100 }, { "epoch": 0.34, "grad_norm": 12.656830787658691, "learning_rate": 5.2e-06, "loss": 1.3764, "step": 5200 }, { "epoch": 0.34, "eval_accuracy": 0.7032503165892782, "eval_loss": 1.3635510206222534, "eval_runtime": 8.5569, "eval_samples_per_second": 116.865, "eval_steps_per_second": 7.362, "step": 5200 }, { "epoch": 0.34, "grad_norm": 19.14701271057129, "learning_rate": 5.3e-06, "loss": 1.3508, "step": 5300 }, { "epoch": 0.35, "grad_norm": 17.192317962646484, "learning_rate": 5.4e-06, "loss": 1.3714, "step": 5400 }, { "epoch": 0.35, "eval_accuracy": 0.7200338839474799, "eval_loss": 1.2490332126617432, "eval_runtime": 7.8877, "eval_samples_per_second": 126.78, "eval_steps_per_second": 7.987, "step": 5400 }, { "epoch": 0.36, "grad_norm": 14.284676551818848, "learning_rate": 5.500000000000001e-06, "loss": 1.3004, "step": 5500 }, { "epoch": 0.36, "grad_norm": 13.010009765625, "learning_rate": 5.600000000000001e-06, "loss": 1.3331, "step": 5600 }, { "epoch": 0.36, "eval_accuracy": 0.7102177554438861, "eval_loss": 1.2608476877212524, "eval_runtime": 8.0559, "eval_samples_per_second": 124.133, "eval_steps_per_second": 7.82, "step": 5600 }, { "epoch": 0.37, "grad_norm": 15.056848526000977, "learning_rate": 5.7000000000000005e-06, "loss": 1.3222, "step": 5700 }, { "epoch": 0.38, "grad_norm": 15.013456344604492, "learning_rate": 5.8e-06, "loss": 1.3252, "step": 5800 }, { "epoch": 0.38, "eval_accuracy": 0.7099365750528541, "eval_loss": 1.2714890241622925, "eval_runtime": 7.4527, "eval_samples_per_second": 134.179, "eval_steps_per_second": 8.453, "step": 5800 }, { "epoch": 0.38, "grad_norm": 12.554753303527832, "learning_rate": 5.9e-06, "loss": 1.3235, "step": 5900 }, { "epoch": 0.39, "grad_norm": 18.752586364746094, "learning_rate": 6e-06, "loss": 1.3088, "step": 6000 }, { "epoch": 0.39, "eval_accuracy": 0.7091295116772823, "eval_loss": 1.3206772804260254, "eval_runtime": 7.5438, "eval_samples_per_second": 132.56, "eval_steps_per_second": 8.351, "step": 6000 }, { "epoch": 0.39, "grad_norm": 13.065604209899902, "learning_rate": 6.1e-06, "loss": 1.3135, "step": 6100 }, { "epoch": 0.4, "grad_norm": 11.735289573669434, "learning_rate": 6.2e-06, "loss": 1.2787, "step": 6200 }, { "epoch": 0.4, "eval_accuracy": 0.7031183255019222, "eval_loss": 1.35593581199646, "eval_runtime": 7.6749, "eval_samples_per_second": 130.296, "eval_steps_per_second": 8.209, "step": 6200 }, { "epoch": 0.41, "grad_norm": 19.505950927734375, "learning_rate": 6.300000000000001e-06, "loss": 1.3569, "step": 6300 }, { "epoch": 0.41, "grad_norm": 16.465797424316406, "learning_rate": 6.4000000000000006e-06, "loss": 1.2659, "step": 6400 }, { "epoch": 0.41, "eval_accuracy": 0.6968306922435363, "eval_loss": 1.3438141345977783, "eval_runtime": 7.7541, "eval_samples_per_second": 128.964, "eval_steps_per_second": 8.125, "step": 6400 }, { "epoch": 0.42, "grad_norm": 19.823511123657227, "learning_rate": 6.5000000000000004e-06, "loss": 1.2971, "step": 6500 }, { "epoch": 0.43, "grad_norm": 19.428119659423828, "learning_rate": 6.6e-06, "loss": 1.3097, "step": 6600 }, { "epoch": 0.43, "eval_accuracy": 0.7292101964061847, "eval_loss": 1.270797610282898, "eval_runtime": 7.734, "eval_samples_per_second": 129.3, "eval_steps_per_second": 8.146, "step": 6600 }, { "epoch": 0.43, "grad_norm": 13.666605949401855, "learning_rate": 6.700000000000001e-06, "loss": 1.2968, "step": 6700 }, { "epoch": 0.44, "grad_norm": 15.296406745910645, "learning_rate": 6.800000000000001e-06, "loss": 1.3577, "step": 6800 }, { "epoch": 0.44, "eval_accuracy": 0.7153652392947103, "eval_loss": 1.2499496936798096, "eval_runtime": 7.6289, "eval_samples_per_second": 131.081, "eval_steps_per_second": 8.258, "step": 6800 }, { "epoch": 0.45, "grad_norm": 10.916403770446777, "learning_rate": 6.900000000000001e-06, "loss": 1.35, "step": 6900 }, { "epoch": 0.45, "grad_norm": 12.807979583740234, "learning_rate": 7.000000000000001e-06, "loss": 1.2549, "step": 7000 }, { "epoch": 0.45, "eval_accuracy": 0.742603550295858, "eval_loss": 1.1825101375579834, "eval_runtime": 7.6468, "eval_samples_per_second": 130.774, "eval_steps_per_second": 8.239, "step": 7000 }, { "epoch": 0.46, "grad_norm": 13.269944190979004, "learning_rate": 7.1e-06, "loss": 1.318, "step": 7100 }, { "epoch": 0.47, "grad_norm": 12.807806968688965, "learning_rate": 7.2e-06, "loss": 1.2921, "step": 7200 }, { "epoch": 0.47, "eval_accuracy": 0.7197965239508266, "eval_loss": 1.2917015552520752, "eval_runtime": 8.0184, "eval_samples_per_second": 124.713, "eval_steps_per_second": 7.857, "step": 7200 }, { "epoch": 0.47, "grad_norm": 12.33742904663086, "learning_rate": 7.2999999999999996e-06, "loss": 1.3001, "step": 7300 }, { "epoch": 0.48, "grad_norm": 12.739049911499023, "learning_rate": 7.4e-06, "loss": 1.2714, "step": 7400 }, { "epoch": 0.48, "eval_accuracy": 0.7267736486486487, "eval_loss": 1.2178571224212646, "eval_runtime": 7.8463, "eval_samples_per_second": 127.448, "eval_steps_per_second": 8.029, "step": 7400 }, { "epoch": 0.48, "grad_norm": 12.491984367370605, "learning_rate": 7.5e-06, "loss": 1.281, "step": 7500 }, { "epoch": 0.49, "grad_norm": 19.986621856689453, "learning_rate": 7.6e-06, "loss": 1.317, "step": 7600 }, { "epoch": 0.49, "eval_accuracy": 0.7211942809083264, "eval_loss": 1.2496095895767212, "eval_runtime": 7.6782, "eval_samples_per_second": 130.238, "eval_steps_per_second": 8.205, "step": 7600 }, { "epoch": 0.5, "grad_norm": 13.35618782043457, "learning_rate": 7.7e-06, "loss": 1.2528, "step": 7700 }, { "epoch": 0.5, "grad_norm": 16.770999908447266, "learning_rate": 7.8e-06, "loss": 1.2986, "step": 7800 }, { "epoch": 0.5, "eval_accuracy": 0.7220141489804411, "eval_loss": 1.2436802387237549, "eval_runtime": 7.531, "eval_samples_per_second": 132.785, "eval_steps_per_second": 8.365, "step": 7800 }, { "epoch": 0.51, "grad_norm": 13.932780265808105, "learning_rate": 7.9e-06, "loss": 1.2873, "step": 7900 }, { "epoch": 0.52, "grad_norm": 12.718230247497559, "learning_rate": 8.000000000000001e-06, "loss": 1.2759, "step": 8000 }, { "epoch": 0.52, "eval_accuracy": 0.7353313634444913, "eval_loss": 1.235177993774414, "eval_runtime": 7.6324, "eval_samples_per_second": 131.02, "eval_steps_per_second": 8.254, "step": 8000 }, { "epoch": 0.52, "grad_norm": 22.3817081451416, "learning_rate": 8.1e-06, "loss": 1.2673, "step": 8100 }, { "epoch": 0.53, "grad_norm": 10.6796236038208, "learning_rate": 8.200000000000001e-06, "loss": 1.2783, "step": 8200 }, { "epoch": 0.53, "eval_accuracy": 0.727580372250423, "eval_loss": 1.292532205581665, "eval_runtime": 7.5132, "eval_samples_per_second": 133.099, "eval_steps_per_second": 8.385, "step": 8200 }, { "epoch": 0.54, "grad_norm": 16.627254486083984, "learning_rate": 8.3e-06, "loss": 1.2643, "step": 8300 }, { "epoch": 0.54, "grad_norm": 11.936634063720703, "learning_rate": 8.400000000000001e-06, "loss": 1.2583, "step": 8400 }, { "epoch": 0.54, "eval_accuracy": 0.7167728237791932, "eval_loss": 1.278795599937439, "eval_runtime": 7.3442, "eval_samples_per_second": 136.161, "eval_steps_per_second": 8.578, "step": 8400 }, { "epoch": 0.55, "grad_norm": 9.319226264953613, "learning_rate": 8.500000000000002e-06, "loss": 1.2244, "step": 8500 }, { "epoch": 0.56, "grad_norm": 11.814308166503906, "learning_rate": 8.599999999999999e-06, "loss": 1.2183, "step": 8600 }, { "epoch": 0.56, "eval_accuracy": 0.7247899159663865, "eval_loss": 1.2171862125396729, "eval_runtime": 7.4832, "eval_samples_per_second": 133.632, "eval_steps_per_second": 8.419, "step": 8600 }, { "epoch": 0.56, "grad_norm": 14.227639198303223, "learning_rate": 8.7e-06, "loss": 1.2814, "step": 8700 }, { "epoch": 0.57, "grad_norm": 17.64058494567871, "learning_rate": 8.8e-06, "loss": 1.2652, "step": 8800 }, { "epoch": 0.57, "eval_accuracy": 0.7468776019983348, "eval_loss": 1.1148425340652466, "eval_runtime": 7.537, "eval_samples_per_second": 132.68, "eval_steps_per_second": 8.359, "step": 8800 }, { "epoch": 0.58, "grad_norm": 15.882342338562012, "learning_rate": 8.9e-06, "loss": 1.2227, "step": 8900 }, { "epoch": 0.58, "grad_norm": 13.85197925567627, "learning_rate": 9e-06, "loss": 1.2586, "step": 9000 }, { "epoch": 0.58, "eval_accuracy": 0.7293519695044473, "eval_loss": 1.2517894506454468, "eval_runtime": 7.4951, "eval_samples_per_second": 133.42, "eval_steps_per_second": 8.405, "step": 9000 }, { "epoch": 0.59, "grad_norm": 14.178157806396484, "learning_rate": 9.100000000000001e-06, "loss": 1.2204, "step": 9100 }, { "epoch": 0.59, "grad_norm": 15.603850364685059, "learning_rate": 9.2e-06, "loss": 1.2544, "step": 9200 }, { "epoch": 0.59, "eval_accuracy": 0.7316253674926502, "eval_loss": 1.1838884353637695, "eval_runtime": 7.5421, "eval_samples_per_second": 132.589, "eval_steps_per_second": 8.353, "step": 9200 }, { "epoch": 0.6, "grad_norm": 9.535536766052246, "learning_rate": 9.3e-06, "loss": 1.2598, "step": 9300 }, { "epoch": 0.61, "grad_norm": 9.306912422180176, "learning_rate": 9.4e-06, "loss": 1.2255, "step": 9400 }, { "epoch": 0.61, "eval_accuracy": 0.736235595390525, "eval_loss": 1.1923021078109741, "eval_runtime": 7.6875, "eval_samples_per_second": 130.082, "eval_steps_per_second": 8.195, "step": 9400 }, { "epoch": 0.61, "grad_norm": 12.15056324005127, "learning_rate": 9.5e-06, "loss": 1.2946, "step": 9500 }, { "epoch": 0.62, "grad_norm": 11.175763130187988, "learning_rate": 9.600000000000001e-06, "loss": 1.2248, "step": 9600 }, { "epoch": 0.62, "eval_accuracy": 0.7338099243061396, "eval_loss": 1.2125071287155151, "eval_runtime": 7.8254, "eval_samples_per_second": 127.788, "eval_steps_per_second": 8.051, "step": 9600 }, { "epoch": 0.63, "grad_norm": 12.284031867980957, "learning_rate": 9.7e-06, "loss": 1.2014, "step": 9700 }, { "epoch": 0.63, "grad_norm": 15.557089805603027, "learning_rate": 9.800000000000001e-06, "loss": 1.2569, "step": 9800 }, { "epoch": 0.63, "eval_accuracy": 0.7333333333333333, "eval_loss": 1.1994158029556274, "eval_runtime": 7.6207, "eval_samples_per_second": 131.222, "eval_steps_per_second": 8.267, "step": 9800 }, { "epoch": 0.64, "grad_norm": 15.111207962036133, "learning_rate": 9.900000000000002e-06, "loss": 1.2826, "step": 9900 }, { "epoch": 0.65, "grad_norm": 12.63683032989502, "learning_rate": 1e-05, "loss": 1.2169, "step": 10000 }, { "epoch": 0.65, "eval_accuracy": 0.7263113367174281, "eval_loss": 1.211767554283142, "eval_runtime": 7.706, "eval_samples_per_second": 129.77, "eval_steps_per_second": 8.175, "step": 10000 }, { "epoch": 0.65, "grad_norm": 15.41868782043457, "learning_rate": 1.0100000000000002e-05, "loss": 1.2172, "step": 10100 }, { "epoch": 0.66, "grad_norm": 13.323341369628906, "learning_rate": 1.02e-05, "loss": 1.263, "step": 10200 }, { "epoch": 0.66, "eval_accuracy": 0.7326271186440678, "eval_loss": 1.2031772136688232, "eval_runtime": 7.5159, "eval_samples_per_second": 133.051, "eval_steps_per_second": 8.382, "step": 10200 }, { "epoch": 0.67, "grad_norm": 13.208155632019043, "learning_rate": 1.03e-05, "loss": 1.2079, "step": 10300 }, { "epoch": 0.67, "grad_norm": 10.65457820892334, "learning_rate": 1.04e-05, "loss": 1.2537, "step": 10400 }, { "epoch": 0.67, "eval_accuracy": 0.7174925878864887, "eval_loss": 1.216259241104126, "eval_runtime": 7.655, "eval_samples_per_second": 130.633, "eval_steps_per_second": 8.23, "step": 10400 }, { "epoch": 0.68, "grad_norm": 13.063742637634277, "learning_rate": 1.05e-05, "loss": 1.1957, "step": 10500 }, { "epoch": 0.69, "grad_norm": 14.485901832580566, "learning_rate": 1.06e-05, "loss": 1.1958, "step": 10600 }, { "epoch": 0.69, "eval_accuracy": 0.7372665534804754, "eval_loss": 1.1745487451553345, "eval_runtime": 7.8634, "eval_samples_per_second": 127.171, "eval_steps_per_second": 8.012, "step": 10600 }, { "epoch": 0.69, "grad_norm": 13.576177597045898, "learning_rate": 1.0700000000000001e-05, "loss": 1.2193, "step": 10700 }, { "epoch": 0.7, "grad_norm": 15.742484092712402, "learning_rate": 1.08e-05, "loss": 1.2114, "step": 10800 }, { "epoch": 0.7, "eval_accuracy": 0.7380151387720774, "eval_loss": 1.1721088886260986, "eval_runtime": 7.8706, "eval_samples_per_second": 127.055, "eval_steps_per_second": 8.004, "step": 10800 }, { "epoch": 0.7, "grad_norm": 11.027225494384766, "learning_rate": 1.09e-05, "loss": 1.2598, "step": 10900 }, { "epoch": 0.71, "grad_norm": 10.360236167907715, "learning_rate": 1.1000000000000001e-05, "loss": 1.2417, "step": 11000 }, { "epoch": 0.71, "eval_accuracy": 0.7306566290255123, "eval_loss": 1.187396764755249, "eval_runtime": 8.5096, "eval_samples_per_second": 117.514, "eval_steps_per_second": 7.403, "step": 11000 }, { "epoch": 0.72, "grad_norm": 10.885235786437988, "learning_rate": 1.11e-05, "loss": 1.1955, "step": 11100 }, { "epoch": 0.72, "grad_norm": 13.50262451171875, "learning_rate": 1.1200000000000001e-05, "loss": 1.1913, "step": 11200 }, { "epoch": 0.72, "eval_accuracy": 0.7389844733529165, "eval_loss": 1.1755015850067139, "eval_runtime": 7.7228, "eval_samples_per_second": 129.487, "eval_steps_per_second": 8.158, "step": 11200 }, { "epoch": 0.73, "grad_norm": 8.812333106994629, "learning_rate": 1.13e-05, "loss": 1.2618, "step": 11300 }, { "epoch": 0.74, "grad_norm": 15.112972259521484, "learning_rate": 1.1400000000000001e-05, "loss": 1.2096, "step": 11400 }, { "epoch": 0.74, "eval_accuracy": 0.7413647851727043, "eval_loss": 1.1737055778503418, "eval_runtime": 7.6282, "eval_samples_per_second": 131.092, "eval_steps_per_second": 8.259, "step": 11400 }, { "epoch": 0.74, "grad_norm": 14.144980430603027, "learning_rate": 1.1500000000000002e-05, "loss": 1.1934, "step": 11500 }, { "epoch": 0.75, "grad_norm": 10.164591789245605, "learning_rate": 1.16e-05, "loss": 1.2417, "step": 11600 }, { "epoch": 0.75, "eval_accuracy": 0.728311468472281, "eval_loss": 1.1666622161865234, "eval_runtime": 7.7138, "eval_samples_per_second": 129.637, "eval_steps_per_second": 8.167, "step": 11600 }, { "epoch": 0.76, "grad_norm": 13.889814376831055, "learning_rate": 1.1700000000000001e-05, "loss": 1.1908, "step": 11700 }, { "epoch": 0.76, "grad_norm": 12.097757339477539, "learning_rate": 1.18e-05, "loss": 1.162, "step": 11800 }, { "epoch": 0.76, "eval_accuracy": 0.7376842105263158, "eval_loss": 1.1521003246307373, "eval_runtime": 7.4769, "eval_samples_per_second": 133.746, "eval_steps_per_second": 8.426, "step": 11800 }, { "epoch": 0.77, "grad_norm": 13.475775718688965, "learning_rate": 1.19e-05, "loss": 1.2021, "step": 11900 }, { "epoch": 0.78, "grad_norm": 9.567869186401367, "learning_rate": 1.2e-05, "loss": 1.1981, "step": 12000 }, { "epoch": 0.78, "eval_accuracy": 0.7328853422931542, "eval_loss": 1.1664577722549438, "eval_runtime": 7.3748, "eval_samples_per_second": 135.597, "eval_steps_per_second": 8.543, "step": 12000 }, { "epoch": 0.78, "grad_norm": 10.136677742004395, "learning_rate": 1.2100000000000001e-05, "loss": 1.2105, "step": 12100 }, { "epoch": 0.79, "grad_norm": 14.930061340332031, "learning_rate": 1.22e-05, "loss": 1.2077, "step": 12200 }, { "epoch": 0.79, "eval_accuracy": 0.7607515657620042, "eval_loss": 1.0652177333831787, "eval_runtime": 7.6211, "eval_samples_per_second": 131.215, "eval_steps_per_second": 8.267, "step": 12200 }, { "epoch": 0.8, "grad_norm": 13.338777542114258, "learning_rate": 1.23e-05, "loss": 1.2137, "step": 12300 }, { "epoch": 0.8, "grad_norm": 12.495413780212402, "learning_rate": 1.24e-05, "loss": 1.1953, "step": 12400 }, { "epoch": 0.8, "eval_accuracy": 0.7382154882154882, "eval_loss": 1.1922123432159424, "eval_runtime": 7.6215, "eval_samples_per_second": 131.207, "eval_steps_per_second": 8.266, "step": 12400 }, { "epoch": 0.81, "grad_norm": 11.065675735473633, "learning_rate": 1.25e-05, "loss": 1.2504, "step": 12500 }, { "epoch": 0.81, "grad_norm": 12.817282676696777, "learning_rate": 1.2600000000000001e-05, "loss": 1.1677, "step": 12600 }, { "epoch": 0.81, "eval_accuracy": 0.7436974789915967, "eval_loss": 1.1598080396652222, "eval_runtime": 7.6405, "eval_samples_per_second": 130.882, "eval_steps_per_second": 8.246, "step": 12600 }, { "epoch": 0.82, "grad_norm": 13.032870292663574, "learning_rate": 1.27e-05, "loss": 1.1692, "step": 12700 }, { "epoch": 0.83, "grad_norm": 12.33238410949707, "learning_rate": 1.2800000000000001e-05, "loss": 1.1797, "step": 12800 }, { "epoch": 0.83, "eval_accuracy": 0.7370417193426043, "eval_loss": 1.154846429824829, "eval_runtime": 7.7112, "eval_samples_per_second": 129.681, "eval_steps_per_second": 8.17, "step": 12800 }, { "epoch": 0.83, "grad_norm": 12.509281158447266, "learning_rate": 1.29e-05, "loss": 1.2011, "step": 12900 }, { "epoch": 0.84, "grad_norm": 10.382403373718262, "learning_rate": 1.3000000000000001e-05, "loss": 1.1807, "step": 13000 }, { "epoch": 0.84, "eval_accuracy": 0.7362913352867309, "eval_loss": 1.1921539306640625, "eval_runtime": 7.762, "eval_samples_per_second": 128.832, "eval_steps_per_second": 8.116, "step": 13000 }, { "epoch": 0.85, "grad_norm": 12.345420837402344, "learning_rate": 1.3100000000000002e-05, "loss": 1.2138, "step": 13100 }, { "epoch": 0.85, "grad_norm": 11.673186302185059, "learning_rate": 1.32e-05, "loss": 1.2504, "step": 13200 }, { "epoch": 0.85, "eval_accuracy": 0.7419491525423729, "eval_loss": 1.1572134494781494, "eval_runtime": 8.1566, "eval_samples_per_second": 122.601, "eval_steps_per_second": 7.724, "step": 13200 }, { "epoch": 0.86, "grad_norm": 19.029870986938477, "learning_rate": 1.3300000000000001e-05, "loss": 1.2068, "step": 13300 }, { "epoch": 0.87, "grad_norm": 13.8318510055542, "learning_rate": 1.3400000000000002e-05, "loss": 1.2225, "step": 13400 }, { "epoch": 0.87, "eval_accuracy": 0.7446808510638298, "eval_loss": 1.175363302230835, "eval_runtime": 7.8331, "eval_samples_per_second": 127.663, "eval_steps_per_second": 8.043, "step": 13400 }, { "epoch": 0.87, "grad_norm": 14.477069854736328, "learning_rate": 1.3500000000000001e-05, "loss": 1.2003, "step": 13500 }, { "epoch": 0.88, "grad_norm": 12.100154876708984, "learning_rate": 1.3600000000000002e-05, "loss": 1.2413, "step": 13600 }, { "epoch": 0.88, "eval_accuracy": 0.7559322033898305, "eval_loss": 1.1391655206680298, "eval_runtime": 7.6663, "eval_samples_per_second": 130.44, "eval_steps_per_second": 8.218, "step": 13600 }, { "epoch": 0.89, "grad_norm": 9.015435218811035, "learning_rate": 1.3700000000000001e-05, "loss": 1.1525, "step": 13700 }, { "epoch": 0.89, "grad_norm": 10.668168067932129, "learning_rate": 1.3800000000000002e-05, "loss": 1.1641, "step": 13800 }, { "epoch": 0.89, "eval_accuracy": 0.7334736842105263, "eval_loss": 1.1633808612823486, "eval_runtime": 7.8904, "eval_samples_per_second": 126.736, "eval_steps_per_second": 7.984, "step": 13800 }, { "epoch": 0.9, "grad_norm": 13.636515617370605, "learning_rate": 1.3900000000000002e-05, "loss": 1.1562, "step": 13900 }, { "epoch": 0.91, "grad_norm": 10.076444625854492, "learning_rate": 1.4000000000000001e-05, "loss": 1.166, "step": 14000 }, { "epoch": 0.91, "eval_accuracy": 0.7364893171344784, "eval_loss": 1.172212839126587, "eval_runtime": 7.5552, "eval_samples_per_second": 132.358, "eval_steps_per_second": 8.339, "step": 14000 }, { "epoch": 0.91, "grad_norm": 16.906394958496094, "learning_rate": 1.4099999999999999e-05, "loss": 1.2044, "step": 14100 }, { "epoch": 0.92, "grad_norm": 8.25418758392334, "learning_rate": 1.42e-05, "loss": 1.1512, "step": 14200 }, { "epoch": 0.92, "eval_accuracy": 0.7449748743718593, "eval_loss": 1.200823187828064, "eval_runtime": 7.631, "eval_samples_per_second": 131.045, "eval_steps_per_second": 8.256, "step": 14200 }, { "epoch": 0.92, "grad_norm": 9.793438911437988, "learning_rate": 1.43e-05, "loss": 1.2238, "step": 14300 }, { "epoch": 0.93, "grad_norm": 11.874217987060547, "learning_rate": 1.44e-05, "loss": 1.1916, "step": 14400 }, { "epoch": 0.93, "eval_accuracy": 0.7462437395659433, "eval_loss": 1.1213889122009277, "eval_runtime": 7.7884, "eval_samples_per_second": 128.396, "eval_steps_per_second": 8.089, "step": 14400 }, { "epoch": 0.94, "grad_norm": 9.943106651306152, "learning_rate": 1.45e-05, "loss": 1.1829, "step": 14500 }, { "epoch": 0.94, "grad_norm": 12.617436408996582, "learning_rate": 1.4599999999999999e-05, "loss": 1.1923, "step": 14600 }, { "epoch": 0.94, "eval_accuracy": 0.7366428270929744, "eval_loss": 1.1874052286148071, "eval_runtime": 7.3592, "eval_samples_per_second": 135.885, "eval_steps_per_second": 8.561, "step": 14600 }, { "epoch": 0.95, "grad_norm": 11.707819938659668, "learning_rate": 1.47e-05, "loss": 1.1456, "step": 14700 }, { "epoch": 0.96, "grad_norm": 10.929788589477539, "learning_rate": 1.48e-05, "loss": 1.1432, "step": 14800 }, { "epoch": 0.96, "eval_accuracy": 0.7326607818411097, "eval_loss": 1.2036888599395752, "eval_runtime": 7.6718, "eval_samples_per_second": 130.348, "eval_steps_per_second": 8.212, "step": 14800 }, { "epoch": 0.96, "grad_norm": 10.014945030212402, "learning_rate": 1.49e-05, "loss": 1.2222, "step": 14900 }, { "epoch": 0.97, "grad_norm": 13.785578727722168, "learning_rate": 1.5e-05, "loss": 1.2027, "step": 15000 }, { "epoch": 0.97, "eval_accuracy": 0.732681682957926, "eval_loss": 1.198772668838501, "eval_runtime": 7.7979, "eval_samples_per_second": 128.24, "eval_steps_per_second": 8.079, "step": 15000 }, { "epoch": 0.98, "grad_norm": 12.50344467163086, "learning_rate": 1.51e-05, "loss": 1.2257, "step": 15100 }, { "epoch": 0.98, "grad_norm": 13.581918716430664, "learning_rate": 1.52e-05, "loss": 1.1964, "step": 15200 }, { "epoch": 0.98, "eval_accuracy": 0.7221043699618158, "eval_loss": 1.2152262926101685, "eval_runtime": 7.6917, "eval_samples_per_second": 130.01, "eval_steps_per_second": 8.191, "step": 15200 }, { "epoch": 0.99, "grad_norm": 11.095573425292969, "learning_rate": 1.53e-05, "loss": 1.1643, "step": 15300 }, { "epoch": 1.0, "grad_norm": 14.753010749816895, "learning_rate": 1.54e-05, "loss": 1.2062, "step": 15400 }, { "epoch": 1.0, "eval_accuracy": 0.7407878017789072, "eval_loss": 1.1139509677886963, "eval_runtime": 7.4853, "eval_samples_per_second": 133.596, "eval_steps_per_second": 8.417, "step": 15400 }, { "epoch": 1.0, "grad_norm": 8.08065128326416, "learning_rate": 1.55e-05, "loss": 1.1685, "step": 15500 }, { "epoch": 1.01, "grad_norm": 12.285977363586426, "learning_rate": 1.56e-05, "loss": 1.174, "step": 15600 }, { "epoch": 1.01, "eval_accuracy": 0.7377815554611135, "eval_loss": 1.1095993518829346, "eval_runtime": 7.4868, "eval_samples_per_second": 133.569, "eval_steps_per_second": 8.415, "step": 15600 }, { "epoch": 1.02, "grad_norm": 13.4451265335083, "learning_rate": 1.5700000000000002e-05, "loss": 1.1586, "step": 15700 }, { "epoch": 1.02, "grad_norm": 10.0227632522583, "learning_rate": 1.58e-05, "loss": 1.1631, "step": 15800 }, { "epoch": 1.02, "eval_accuracy": 0.7411467116357504, "eval_loss": 1.1756731271743774, "eval_runtime": 7.5108, "eval_samples_per_second": 133.142, "eval_steps_per_second": 8.388, "step": 15800 }, { "epoch": 1.03, "grad_norm": 11.059329986572266, "learning_rate": 1.59e-05, "loss": 1.1613, "step": 15900 }, { "epoch": 1.03, "grad_norm": 12.956367492675781, "learning_rate": 1.6000000000000003e-05, "loss": 1.1966, "step": 16000 }, { "epoch": 1.03, "eval_accuracy": 0.7306880540312368, "eval_loss": 1.2064409255981445, "eval_runtime": 7.6465, "eval_samples_per_second": 130.779, "eval_steps_per_second": 8.239, "step": 16000 }, { "epoch": 1.04, "grad_norm": 11.291032791137695, "learning_rate": 1.6100000000000002e-05, "loss": 1.1531, "step": 16100 }, { "epoch": 1.05, "grad_norm": 10.946525573730469, "learning_rate": 1.62e-05, "loss": 1.1524, "step": 16200 }, { "epoch": 1.05, "eval_accuracy": 0.7358171041490262, "eval_loss": 1.1751559972763062, "eval_runtime": 7.9075, "eval_samples_per_second": 126.462, "eval_steps_per_second": 7.967, "step": 16200 }, { "epoch": 1.05, "grad_norm": 13.230547904968262, "learning_rate": 1.63e-05, "loss": 1.1603, "step": 16300 }, { "epoch": 1.06, "grad_norm": 13.08030891418457, "learning_rate": 1.6400000000000002e-05, "loss": 1.1534, "step": 16400 }, { "epoch": 1.06, "eval_accuracy": 0.7308970099667774, "eval_loss": 1.1890840530395508, "eval_runtime": 7.4899, "eval_samples_per_second": 133.513, "eval_steps_per_second": 8.411, "step": 16400 }, { "epoch": 1.07, "grad_norm": 11.769508361816406, "learning_rate": 1.65e-05, "loss": 1.1296, "step": 16500 }, { "epoch": 1.07, "grad_norm": 10.801959037780762, "learning_rate": 1.66e-05, "loss": 1.1841, "step": 16600 }, { "epoch": 1.07, "eval_accuracy": 0.7430962343096235, "eval_loss": 1.1619294881820679, "eval_runtime": 7.6487, "eval_samples_per_second": 130.742, "eval_steps_per_second": 8.237, "step": 16600 }, { "epoch": 1.08, "grad_norm": 14.76418399810791, "learning_rate": 1.6700000000000003e-05, "loss": 1.1737, "step": 16700 }, { "epoch": 1.09, "grad_norm": 12.782590866088867, "learning_rate": 1.6800000000000002e-05, "loss": 1.1589, "step": 16800 }, { "epoch": 1.09, "eval_accuracy": 0.7516863406408094, "eval_loss": 1.163698673248291, "eval_runtime": 7.4401, "eval_samples_per_second": 134.407, "eval_steps_per_second": 8.468, "step": 16800 }, { "epoch": 1.09, "grad_norm": 12.755038261413574, "learning_rate": 1.69e-05, "loss": 1.1837, "step": 16900 }, { "epoch": 1.1, "grad_norm": 10.796310424804688, "learning_rate": 1.7000000000000003e-05, "loss": 1.1669, "step": 17000 }, { "epoch": 1.1, "eval_accuracy": 0.7329140461215933, "eval_loss": 1.21775221824646, "eval_runtime": 7.5189, "eval_samples_per_second": 132.999, "eval_steps_per_second": 8.379, "step": 17000 }, { "epoch": 1.11, "grad_norm": 14.265445709228516, "learning_rate": 1.7100000000000002e-05, "loss": 1.1314, "step": 17100 }, { "epoch": 1.11, "grad_norm": 20.69686508178711, "learning_rate": 1.7199999999999998e-05, "loss": 1.158, "step": 17200 }, { "epoch": 1.11, "eval_accuracy": 0.7325680272108843, "eval_loss": 1.2260463237762451, "eval_runtime": 7.4181, "eval_samples_per_second": 134.805, "eval_steps_per_second": 8.493, "step": 17200 }, { "epoch": 1.12, "grad_norm": 8.71137809753418, "learning_rate": 1.73e-05, "loss": 1.1526, "step": 17300 }, { "epoch": 1.13, "grad_norm": 12.519315719604492, "learning_rate": 1.74e-05, "loss": 1.1568, "step": 17400 }, { "epoch": 1.13, "eval_accuracy": 0.751784964300714, "eval_loss": 1.1110048294067383, "eval_runtime": 7.305, "eval_samples_per_second": 136.893, "eval_steps_per_second": 8.624, "step": 17400 }, { "epoch": 1.13, "grad_norm": 8.747578620910645, "learning_rate": 1.75e-05, "loss": 1.1389, "step": 17500 }, { "epoch": 1.14, "grad_norm": 17.33192253112793, "learning_rate": 1.76e-05, "loss": 1.164, "step": 17600 }, { "epoch": 1.14, "eval_accuracy": 0.740276035131744, "eval_loss": 1.1672453880310059, "eval_runtime": 7.3466, "eval_samples_per_second": 136.117, "eval_steps_per_second": 8.575, "step": 17600 }, { "epoch": 1.14, "grad_norm": 11.385080337524414, "learning_rate": 1.77e-05, "loss": 1.1722, "step": 17700 }, { "epoch": 1.15, "grad_norm": 12.3568696975708, "learning_rate": 1.78e-05, "loss": 1.1504, "step": 17800 }, { "epoch": 1.15, "eval_accuracy": 0.7491568296795953, "eval_loss": 1.1096458435058594, "eval_runtime": 7.4455, "eval_samples_per_second": 134.309, "eval_steps_per_second": 8.461, "step": 17800 }, { "epoch": 1.16, "grad_norm": 12.122318267822266, "learning_rate": 1.79e-05, "loss": 1.1562, "step": 17900 }, { "epoch": 1.16, "grad_norm": 13.048823356628418, "learning_rate": 1.8e-05, "loss": 1.1357, "step": 18000 }, { "epoch": 1.16, "eval_accuracy": 0.7356514453288647, "eval_loss": 1.204459309577942, "eval_runtime": 7.552, "eval_samples_per_second": 132.415, "eval_steps_per_second": 8.342, "step": 18000 }, { "epoch": 1.17, "grad_norm": 19.512624740600586, "learning_rate": 1.81e-05, "loss": 1.1317, "step": 18100 }, { "epoch": 1.18, "grad_norm": 10.814772605895996, "learning_rate": 1.8200000000000002e-05, "loss": 1.1211, "step": 18200 }, { "epoch": 1.18, "eval_accuracy": 0.7384161752316765, "eval_loss": 1.1490548849105835, "eval_runtime": 7.5719, "eval_samples_per_second": 132.067, "eval_steps_per_second": 8.32, "step": 18200 }, { "epoch": 1.18, "grad_norm": 13.774679183959961, "learning_rate": 1.83e-05, "loss": 1.1698, "step": 18300 }, { "epoch": 1.19, "grad_norm": 10.213746070861816, "learning_rate": 1.84e-05, "loss": 1.1157, "step": 18400 }, { "epoch": 1.19, "eval_accuracy": 0.7315436241610739, "eval_loss": 1.1492282152175903, "eval_runtime": 7.7796, "eval_samples_per_second": 128.541, "eval_steps_per_second": 8.098, "step": 18400 }, { "epoch": 1.2, "grad_norm": 13.143231391906738, "learning_rate": 1.85e-05, "loss": 1.1945, "step": 18500 }, { "epoch": 1.2, "grad_norm": 12.258892059326172, "learning_rate": 1.86e-05, "loss": 1.1399, "step": 18600 }, { "epoch": 1.2, "eval_accuracy": 0.7291139240506329, "eval_loss": 1.2119060754776, "eval_runtime": 7.5745, "eval_samples_per_second": 132.022, "eval_steps_per_second": 8.317, "step": 18600 }, { "epoch": 1.21, "grad_norm": 14.464848518371582, "learning_rate": 1.87e-05, "loss": 1.1135, "step": 18700 }, { "epoch": 1.22, "grad_norm": 11.347909927368164, "learning_rate": 1.88e-05, "loss": 1.1741, "step": 18800 }, { "epoch": 1.22, "eval_accuracy": 0.7544154751892347, "eval_loss": 1.1045746803283691, "eval_runtime": 7.6209, "eval_samples_per_second": 131.217, "eval_steps_per_second": 8.267, "step": 18800 }, { "epoch": 1.22, "grad_norm": 9.721733093261719, "learning_rate": 1.8900000000000002e-05, "loss": 1.1766, "step": 18900 }, { "epoch": 1.23, "grad_norm": 10.328642845153809, "learning_rate": 1.9e-05, "loss": 1.16, "step": 19000 }, { "epoch": 1.23, "eval_accuracy": 0.7409129332206256, "eval_loss": 1.1312413215637207, "eval_runtime": 7.345, "eval_samples_per_second": 136.146, "eval_steps_per_second": 8.577, "step": 19000 }, { "epoch": 1.24, "grad_norm": 11.374202728271484, "learning_rate": 1.91e-05, "loss": 1.2189, "step": 19100 }, { "epoch": 1.24, "grad_norm": 13.374282836914062, "learning_rate": 1.9200000000000003e-05, "loss": 1.1101, "step": 19200 }, { "epoch": 1.24, "eval_accuracy": 0.7457983193277311, "eval_loss": 1.0982531309127808, "eval_runtime": 7.3813, "eval_samples_per_second": 135.478, "eval_steps_per_second": 8.535, "step": 19200 }, { "epoch": 1.25, "grad_norm": 12.928315162658691, "learning_rate": 1.93e-05, "loss": 1.152, "step": 19300 }, { "epoch": 1.25, "grad_norm": 9.786002159118652, "learning_rate": 1.94e-05, "loss": 1.1563, "step": 19400 }, { "epoch": 1.25, "eval_accuracy": 0.7470338983050847, "eval_loss": 1.1225014925003052, "eval_runtime": 7.4732, "eval_samples_per_second": 133.812, "eval_steps_per_second": 8.43, "step": 19400 }, { "epoch": 1.26, "grad_norm": 13.590020179748535, "learning_rate": 1.9500000000000003e-05, "loss": 1.118, "step": 19500 }, { "epoch": 1.27, "grad_norm": 9.428672790527344, "learning_rate": 1.9600000000000002e-05, "loss": 1.126, "step": 19600 }, { "epoch": 1.27, "eval_accuracy": 0.7514919011082694, "eval_loss": 1.157934308052063, "eval_runtime": 7.7323, "eval_samples_per_second": 129.327, "eval_steps_per_second": 8.148, "step": 19600 }, { "epoch": 1.27, "grad_norm": 8.893349647521973, "learning_rate": 1.97e-05, "loss": 1.157, "step": 19700 }, { "epoch": 1.28, "grad_norm": 11.936838150024414, "learning_rate": 1.9800000000000004e-05, "loss": 1.1539, "step": 19800 }, { "epoch": 1.28, "eval_accuracy": 0.7589170605930382, "eval_loss": 1.1194639205932617, "eval_runtime": 7.6008, "eval_samples_per_second": 131.565, "eval_steps_per_second": 8.289, "step": 19800 }, { "epoch": 1.29, "grad_norm": 8.221677780151367, "learning_rate": 1.9900000000000003e-05, "loss": 1.0897, "step": 19900 }, { "epoch": 1.29, "grad_norm": 14.703478813171387, "learning_rate": 2e-05, "loss": 1.1968, "step": 20000 }, { "epoch": 1.29, "eval_accuracy": 0.7575885328836425, "eval_loss": 1.0268995761871338, "eval_runtime": 7.3597, "eval_samples_per_second": 135.875, "eval_steps_per_second": 8.56, "step": 20000 }, { "epoch": 1.3, "grad_norm": 11.197503089904785, "learning_rate": 2.01e-05, "loss": 1.1742, "step": 20100 }, { "epoch": 1.31, "grad_norm": 13.026580810546875, "learning_rate": 2.0200000000000003e-05, "loss": 1.1385, "step": 20200 }, { "epoch": 1.31, "eval_accuracy": 0.7440878378378378, "eval_loss": 1.1008957624435425, "eval_runtime": 7.7086, "eval_samples_per_second": 129.725, "eval_steps_per_second": 8.173, "step": 20200 }, { "epoch": 1.31, "grad_norm": 9.536117553710938, "learning_rate": 2.0300000000000002e-05, "loss": 1.1016, "step": 20300 }, { "epoch": 1.32, "grad_norm": 12.881437301635742, "learning_rate": 2.04e-05, "loss": 1.1371, "step": 20400 }, { "epoch": 1.32, "eval_accuracy": 0.7487201365187713, "eval_loss": 1.0560767650604248, "eval_runtime": 7.7037, "eval_samples_per_second": 129.807, "eval_steps_per_second": 8.178, "step": 20400 }, { "epoch": 1.33, "grad_norm": 10.493437767028809, "learning_rate": 2.05e-05, "loss": 1.1543, "step": 20500 }, { "epoch": 1.33, "grad_norm": 18.344404220581055, "learning_rate": 2.06e-05, "loss": 1.1359, "step": 20600 }, { "epoch": 1.33, "eval_accuracy": 0.7429651406971861, "eval_loss": 1.1305087804794312, "eval_runtime": 8.0328, "eval_samples_per_second": 124.489, "eval_steps_per_second": 7.843, "step": 20600 }, { "epoch": 1.34, "grad_norm": 11.087918281555176, "learning_rate": 2.07e-05, "loss": 1.1423, "step": 20700 }, { "epoch": 1.35, "grad_norm": 13.202859878540039, "learning_rate": 2.08e-05, "loss": 1.1408, "step": 20800 }, { "epoch": 1.35, "eval_accuracy": 0.7311241610738255, "eval_loss": 1.1746296882629395, "eval_runtime": 7.7069, "eval_samples_per_second": 129.754, "eval_steps_per_second": 8.175, "step": 20800 }, { "epoch": 1.35, "grad_norm": 12.734246253967285, "learning_rate": 2.09e-05, "loss": 1.1701, "step": 20900 }, { "epoch": 1.36, "grad_norm": 12.001795768737793, "learning_rate": 2.1e-05, "loss": 1.0804, "step": 21000 }, { "epoch": 1.36, "eval_accuracy": 0.7455845248107653, "eval_loss": 1.1285468339920044, "eval_runtime": 8.7314, "eval_samples_per_second": 114.529, "eval_steps_per_second": 7.215, "step": 21000 }, { "epoch": 1.36, "grad_norm": 10.932193756103516, "learning_rate": 2.11e-05, "loss": 1.1555, "step": 21100 }, { "epoch": 1.37, "grad_norm": 11.78433609008789, "learning_rate": 2.12e-05, "loss": 1.1198, "step": 21200 }, { "epoch": 1.37, "eval_accuracy": 0.7476635514018691, "eval_loss": 1.1027910709381104, "eval_runtime": 7.6085, "eval_samples_per_second": 131.432, "eval_steps_per_second": 8.28, "step": 21200 }, { "epoch": 1.38, "grad_norm": 15.300663948059082, "learning_rate": 2.13e-05, "loss": 1.1644, "step": 21300 }, { "epoch": 1.38, "grad_norm": 13.899880409240723, "learning_rate": 2.1400000000000002e-05, "loss": 1.4261, "step": 21400 }, { "epoch": 1.38, "eval_accuracy": 0.6564597315436241, "eval_loss": 1.769320011138916, "eval_runtime": 7.4942, "eval_samples_per_second": 133.436, "eval_steps_per_second": 8.406, "step": 21400 }, { "epoch": 1.39, "grad_norm": 11.263799667358398, "learning_rate": 2.15e-05, "loss": 1.1835, "step": 21500 }, { "epoch": 1.4, "grad_norm": 9.347624778747559, "learning_rate": 2.16e-05, "loss": 1.1655, "step": 21600 }, { "epoch": 1.4, "eval_accuracy": 0.7535864978902953, "eval_loss": 1.092917561531067, "eval_runtime": 7.415, "eval_samples_per_second": 134.863, "eval_steps_per_second": 8.496, "step": 21600 }, { "epoch": 1.4, "grad_norm": 9.499000549316406, "learning_rate": 2.1700000000000002e-05, "loss": 1.0895, "step": 21700 }, { "epoch": 1.41, "grad_norm": 12.558609008789062, "learning_rate": 2.18e-05, "loss": 1.1301, "step": 21800 }, { "epoch": 1.41, "eval_accuracy": 0.7359050445103857, "eval_loss": 1.1463769674301147, "eval_runtime": 7.3293, "eval_samples_per_second": 136.439, "eval_steps_per_second": 8.596, "step": 21800 }, { "epoch": 1.42, "grad_norm": 16.517595291137695, "learning_rate": 2.19e-05, "loss": 1.1401, "step": 21900 }, { "epoch": 1.42, "grad_norm": 12.985093116760254, "learning_rate": 2.2000000000000003e-05, "loss": 1.1158, "step": 22000 }, { "epoch": 1.42, "eval_accuracy": 0.7455002092925911, "eval_loss": 1.16570246219635, "eval_runtime": 7.5569, "eval_samples_per_second": 132.33, "eval_steps_per_second": 8.337, "step": 22000 }, { "epoch": 1.43, "grad_norm": 10.754157066345215, "learning_rate": 2.2100000000000002e-05, "loss": 1.1085, "step": 22100 }, { "epoch": 1.44, "grad_norm": 12.940065383911133, "learning_rate": 2.22e-05, "loss": 1.0969, "step": 22200 }, { "epoch": 1.44, "eval_accuracy": 0.7329376854599406, "eval_loss": 1.1993482112884521, "eval_runtime": 7.677, "eval_samples_per_second": 130.259, "eval_steps_per_second": 8.206, "step": 22200 }, { "epoch": 1.44, "grad_norm": 10.116480827331543, "learning_rate": 2.23e-05, "loss": 1.1529, "step": 22300 }, { "epoch": 1.45, "grad_norm": 6.805116176605225, "learning_rate": 2.2400000000000002e-05, "loss": 1.086, "step": 22400 }, { "epoch": 1.45, "eval_accuracy": 0.7561912894961571, "eval_loss": 1.1107536554336548, "eval_runtime": 7.5244, "eval_samples_per_second": 132.901, "eval_steps_per_second": 8.373, "step": 22400 }, { "epoch": 1.45, "grad_norm": 9.98903751373291, "learning_rate": 2.25e-05, "loss": 1.1626, "step": 22500 }, { "epoch": 1.46, "grad_norm": 13.178084373474121, "learning_rate": 2.26e-05, "loss": 1.1413, "step": 22600 }, { "epoch": 1.46, "eval_accuracy": 0.7322635135135135, "eval_loss": 1.1902875900268555, "eval_runtime": 7.6829, "eval_samples_per_second": 130.159, "eval_steps_per_second": 8.2, "step": 22600 }, { "epoch": 1.47, "grad_norm": 13.556154251098633, "learning_rate": 2.2700000000000003e-05, "loss": 1.1843, "step": 22700 }, { "epoch": 1.47, "grad_norm": 9.974491119384766, "learning_rate": 2.2800000000000002e-05, "loss": 1.1272, "step": 22800 }, { "epoch": 1.47, "eval_accuracy": 0.7671404682274248, "eval_loss": 1.0496642589569092, "eval_runtime": 7.4992, "eval_samples_per_second": 133.348, "eval_steps_per_second": 8.401, "step": 22800 }, { "epoch": 1.48, "grad_norm": 9.760144233703613, "learning_rate": 2.29e-05, "loss": 1.1401, "step": 22900 }, { "epoch": 1.49, "grad_norm": 10.568373680114746, "learning_rate": 2.3000000000000003e-05, "loss": 1.115, "step": 23000 }, { "epoch": 1.49, "eval_accuracy": 0.7537154989384289, "eval_loss": 1.1267863512039185, "eval_runtime": 7.69, "eval_samples_per_second": 130.038, "eval_steps_per_second": 8.192, "step": 23000 }, { "epoch": 1.49, "grad_norm": 10.421135902404785, "learning_rate": 2.3100000000000002e-05, "loss": 1.1206, "step": 23100 }, { "epoch": 1.5, "grad_norm": 10.937152862548828, "learning_rate": 2.32e-05, "loss": 1.1165, "step": 23200 }, { "epoch": 1.5, "eval_accuracy": 0.7455546147332769, "eval_loss": 1.150146484375, "eval_runtime": 7.7693, "eval_samples_per_second": 128.712, "eval_steps_per_second": 8.109, "step": 23200 }, { "epoch": 1.51, "grad_norm": 17.392396926879883, "learning_rate": 2.3300000000000004e-05, "loss": 1.1063, "step": 23300 }, { "epoch": 1.51, "grad_norm": 10.282617568969727, "learning_rate": 2.3400000000000003e-05, "loss": 1.1117, "step": 23400 }, { "epoch": 1.51, "eval_accuracy": 0.7535744322960471, "eval_loss": 1.0926055908203125, "eval_runtime": 7.8237, "eval_samples_per_second": 127.817, "eval_steps_per_second": 8.052, "step": 23400 }, { "epoch": 1.52, "grad_norm": 14.730294227600098, "learning_rate": 2.35e-05, "loss": 1.1385, "step": 23500 }, { "epoch": 1.53, "grad_norm": 17.951629638671875, "learning_rate": 2.36e-05, "loss": 1.1125, "step": 23600 }, { "epoch": 1.53, "eval_accuracy": 0.7456622936944562, "eval_loss": 1.1630539894104004, "eval_runtime": 7.6216, "eval_samples_per_second": 131.206, "eval_steps_per_second": 8.266, "step": 23600 }, { "epoch": 1.53, "grad_norm": 10.027335166931152, "learning_rate": 2.37e-05, "loss": 1.12, "step": 23700 }, { "epoch": 1.54, "grad_norm": 9.728846549987793, "learning_rate": 2.38e-05, "loss": 1.0936, "step": 23800 }, { "epoch": 1.54, "eval_accuracy": 0.7413793103448276, "eval_loss": 1.1332049369812012, "eval_runtime": 7.9607, "eval_samples_per_second": 125.617, "eval_steps_per_second": 7.914, "step": 23800 }, { "epoch": 1.55, "grad_norm": 10.458120346069336, "learning_rate": 2.39e-05, "loss": 1.08, "step": 23900 }, { "epoch": 1.55, "grad_norm": 8.669998168945312, "learning_rate": 2.4e-05, "loss": 1.1357, "step": 24000 }, { "epoch": 1.55, "eval_accuracy": 0.758576874205845, "eval_loss": 1.0502053499221802, "eval_runtime": 7.8812, "eval_samples_per_second": 126.884, "eval_steps_per_second": 7.994, "step": 24000 }, { "epoch": 1.56, "grad_norm": 13.406643867492676, "learning_rate": 2.41e-05, "loss": 1.1352, "step": 24100 }, { "epoch": 1.56, "grad_norm": 9.6428861618042, "learning_rate": 2.4200000000000002e-05, "loss": 1.1518, "step": 24200 }, { "epoch": 1.56, "eval_accuracy": 0.7511618081960287, "eval_loss": 1.117685317993164, "eval_runtime": 7.6404, "eval_samples_per_second": 130.883, "eval_steps_per_second": 8.246, "step": 24200 }, { "epoch": 1.57, "grad_norm": 10.48865795135498, "learning_rate": 2.43e-05, "loss": 1.1715, "step": 24300 }, { "epoch": 1.58, "grad_norm": 8.924627304077148, "learning_rate": 2.44e-05, "loss": 1.1151, "step": 24400 }, { "epoch": 1.58, "eval_accuracy": 0.7343684431389006, "eval_loss": 1.1920886039733887, "eval_runtime": 7.7097, "eval_samples_per_second": 129.707, "eval_steps_per_second": 8.172, "step": 24400 }, { "epoch": 1.58, "grad_norm": 30.047603607177734, "learning_rate": 2.45e-05, "loss": 1.2793, "step": 24500 }, { "epoch": 1.59, "grad_norm": 11.05445671081543, "learning_rate": 2.46e-05, "loss": 1.1875, "step": 24600 }, { "epoch": 1.59, "eval_accuracy": 0.7718487394957984, "eval_loss": 1.0164238214492798, "eval_runtime": 7.7781, "eval_samples_per_second": 128.567, "eval_steps_per_second": 8.1, "step": 24600 }, { "epoch": 1.6, "grad_norm": 8.205564498901367, "learning_rate": 2.47e-05, "loss": 1.0428, "step": 24700 }, { "epoch": 1.6, "grad_norm": 11.378378868103027, "learning_rate": 2.48e-05, "loss": 1.2126, "step": 24800 }, { "epoch": 1.6, "eval_accuracy": 0.748395378690629, "eval_loss": 1.1039670705795288, "eval_runtime": 7.7292, "eval_samples_per_second": 129.38, "eval_steps_per_second": 8.151, "step": 24800 }, { "epoch": 1.61, "grad_norm": 13.806981086730957, "learning_rate": 2.4900000000000002e-05, "loss": 1.1398, "step": 24900 }, { "epoch": 1.62, "grad_norm": 13.544731140136719, "learning_rate": 2.5e-05, "loss": 1.1322, "step": 25000 }, { "epoch": 1.62, "eval_accuracy": 0.6930193439865433, "eval_loss": 1.457659125328064, "eval_runtime": 8.0163, "eval_samples_per_second": 124.745, "eval_steps_per_second": 7.859, "step": 25000 }, { "epoch": 1.62, "grad_norm": 9.809267044067383, "learning_rate": 2.51e-05, "loss": 1.2083, "step": 25100 }, { "epoch": 1.63, "grad_norm": 8.45928955078125, "learning_rate": 2.5200000000000003e-05, "loss": 1.1957, "step": 25200 }, { "epoch": 1.63, "eval_accuracy": 0.7506276150627615, "eval_loss": 1.1271296739578247, "eval_runtime": 7.3789, "eval_samples_per_second": 135.521, "eval_steps_per_second": 8.538, "step": 25200 }, { "epoch": 1.64, "grad_norm": 13.24619197845459, "learning_rate": 2.5300000000000002e-05, "loss": 1.1571, "step": 25300 }, { "epoch": 1.64, "grad_norm": 9.83452033996582, "learning_rate": 2.54e-05, "loss": 1.1575, "step": 25400 }, { "epoch": 1.64, "eval_accuracy": 0.7353795105765243, "eval_loss": 1.1618658304214478, "eval_runtime": 7.568, "eval_samples_per_second": 132.135, "eval_steps_per_second": 8.324, "step": 25400 }, { "epoch": 1.65, "grad_norm": 9.756081581115723, "learning_rate": 2.5500000000000003e-05, "loss": 1.1406, "step": 25500 }, { "epoch": 1.66, "grad_norm": 10.580145835876465, "learning_rate": 2.5600000000000002e-05, "loss": 1.1399, "step": 25600 }, { "epoch": 1.66, "eval_accuracy": 0.7291404612159329, "eval_loss": 1.130333423614502, "eval_runtime": 7.5906, "eval_samples_per_second": 131.741, "eval_steps_per_second": 8.3, "step": 25600 }, { "epoch": 1.66, "grad_norm": 11.397805213928223, "learning_rate": 2.57e-05, "loss": 1.1283, "step": 25700 }, { "epoch": 1.67, "grad_norm": 9.237979888916016, "learning_rate": 2.58e-05, "loss": 1.159, "step": 25800 }, { "epoch": 1.67, "eval_accuracy": 0.7570951585976627, "eval_loss": 1.1032042503356934, "eval_runtime": 7.5076, "eval_samples_per_second": 133.198, "eval_steps_per_second": 8.391, "step": 25800 }, { "epoch": 1.67, "grad_norm": 7.186769962310791, "learning_rate": 2.5900000000000003e-05, "loss": 1.3174, "step": 25900 }, { "epoch": 1.68, "grad_norm": 8.622028350830078, "learning_rate": 2.6000000000000002e-05, "loss": 1.2525, "step": 26000 }, { "epoch": 1.68, "eval_accuracy": 0.7331095258078053, "eval_loss": 1.1704577207565308, "eval_runtime": 7.862, "eval_samples_per_second": 127.193, "eval_steps_per_second": 8.013, "step": 26000 }, { "epoch": 1.69, "grad_norm": 13.186617851257324, "learning_rate": 2.61e-05, "loss": 1.1385, "step": 26100 }, { "epoch": 1.69, "grad_norm": 8.932796478271484, "learning_rate": 2.6200000000000003e-05, "loss": 1.1212, "step": 26200 }, { "epoch": 1.69, "eval_accuracy": 0.7449832775919732, "eval_loss": 1.1775517463684082, "eval_runtime": 7.4514, "eval_samples_per_second": 134.203, "eval_steps_per_second": 8.455, "step": 26200 }, { "epoch": 1.7, "grad_norm": 8.868170738220215, "learning_rate": 2.6300000000000002e-05, "loss": 1.2007, "step": 26300 }, { "epoch": 1.71, "grad_norm": 11.973946571350098, "learning_rate": 2.64e-05, "loss": 1.2174, "step": 26400 }, { "epoch": 1.71, "eval_accuracy": 0.7410714285714286, "eval_loss": 1.184034824371338, "eval_runtime": 7.736, "eval_samples_per_second": 129.267, "eval_steps_per_second": 8.144, "step": 26400 }, { "epoch": 1.71, "grad_norm": 12.894060134887695, "learning_rate": 2.6500000000000004e-05, "loss": 1.1789, "step": 26500 }, { "epoch": 1.72, "grad_norm": 11.496538162231445, "learning_rate": 2.6600000000000003e-05, "loss": 1.126, "step": 26600 }, { "epoch": 1.72, "eval_accuracy": 0.7580712788259958, "eval_loss": 1.1284701824188232, "eval_runtime": 7.766, "eval_samples_per_second": 128.767, "eval_steps_per_second": 8.112, "step": 26600 }, { "epoch": 1.73, "grad_norm": 11.373061180114746, "learning_rate": 2.6700000000000002e-05, "loss": 1.1216, "step": 26700 }, { "epoch": 1.73, "grad_norm": 12.828230857849121, "learning_rate": 2.6800000000000004e-05, "loss": 1.1293, "step": 26800 }, { "epoch": 1.73, "eval_accuracy": 0.7550063911376225, "eval_loss": 1.1065834760665894, "eval_runtime": 7.8216, "eval_samples_per_second": 127.851, "eval_steps_per_second": 8.055, "step": 26800 }, { "epoch": 1.74, "grad_norm": 9.637664794921875, "learning_rate": 2.6900000000000003e-05, "loss": 1.1236, "step": 26900 }, { "epoch": 1.75, "grad_norm": 6.880826473236084, "learning_rate": 2.7000000000000002e-05, "loss": 1.1377, "step": 27000 }, { "epoch": 1.75, "eval_accuracy": 0.7458773784355179, "eval_loss": 1.1328409910202026, "eval_runtime": 7.3538, "eval_samples_per_second": 135.984, "eval_steps_per_second": 8.567, "step": 27000 }, { "epoch": 1.75, "grad_norm": 12.280415534973145, "learning_rate": 2.7100000000000005e-05, "loss": 1.1213, "step": 27100 }, { "epoch": 1.76, "grad_norm": 11.152043342590332, "learning_rate": 2.7200000000000004e-05, "loss": 1.151, "step": 27200 }, { "epoch": 1.76, "eval_accuracy": 0.7522123893805309, "eval_loss": 1.03293776512146, "eval_runtime": 7.716, "eval_samples_per_second": 129.6, "eval_steps_per_second": 8.165, "step": 27200 }, { "epoch": 1.77, "grad_norm": 11.722097396850586, "learning_rate": 2.7300000000000003e-05, "loss": 1.1103, "step": 27300 }, { "epoch": 1.77, "grad_norm": 12.057791709899902, "learning_rate": 2.7400000000000002e-05, "loss": 1.1623, "step": 27400 }, { "epoch": 1.77, "eval_accuracy": 0.7508333333333334, "eval_loss": 1.059880256652832, "eval_runtime": 7.4958, "eval_samples_per_second": 133.409, "eval_steps_per_second": 8.405, "step": 27400 }, { "epoch": 1.78, "grad_norm": 12.654067039489746, "learning_rate": 2.7500000000000004e-05, "loss": 1.1562, "step": 27500 }, { "epoch": 1.78, "grad_norm": 9.439441680908203, "learning_rate": 2.7600000000000003e-05, "loss": 1.0716, "step": 27600 }, { "epoch": 1.78, "eval_accuracy": 0.7651515151515151, "eval_loss": 1.056451678276062, "eval_runtime": 7.5751, "eval_samples_per_second": 132.011, "eval_steps_per_second": 8.317, "step": 27600 }, { "epoch": 1.79, "grad_norm": 8.25975227355957, "learning_rate": 2.7700000000000002e-05, "loss": 1.1035, "step": 27700 }, { "epoch": 1.8, "grad_norm": 10.720022201538086, "learning_rate": 2.7800000000000005e-05, "loss": 1.1471, "step": 27800 }, { "epoch": 1.8, "eval_accuracy": 0.7577768560763168, "eval_loss": 1.0603574514389038, "eval_runtime": 7.5987, "eval_samples_per_second": 131.601, "eval_steps_per_second": 8.291, "step": 27800 }, { "epoch": 1.8, "grad_norm": 8.874134063720703, "learning_rate": 2.7900000000000004e-05, "loss": 1.1357, "step": 27900 }, { "epoch": 1.81, "grad_norm": 13.509000778198242, "learning_rate": 2.8000000000000003e-05, "loss": 1.1396, "step": 28000 }, { "epoch": 1.81, "eval_accuracy": 0.751364972700546, "eval_loss": 1.136346459388733, "eval_runtime": 7.6424, "eval_samples_per_second": 130.848, "eval_steps_per_second": 8.243, "step": 28000 }, { "epoch": 1.82, "grad_norm": 10.136117935180664, "learning_rate": 2.8100000000000005e-05, "loss": 1.1243, "step": 28100 }, { "epoch": 1.82, "grad_norm": 10.646615028381348, "learning_rate": 2.8199999999999998e-05, "loss": 1.0524, "step": 28200 }, { "epoch": 1.82, "eval_accuracy": 0.7406473308112652, "eval_loss": 1.1281919479370117, "eval_runtime": 7.4445, "eval_samples_per_second": 134.327, "eval_steps_per_second": 8.463, "step": 28200 }, { "epoch": 1.83, "grad_norm": 12.16605281829834, "learning_rate": 2.83e-05, "loss": 1.1325, "step": 28300 }, { "epoch": 1.84, "grad_norm": 9.168853759765625, "learning_rate": 2.84e-05, "loss": 1.0997, "step": 28400 }, { "epoch": 1.84, "eval_accuracy": 0.759764804703906, "eval_loss": 1.0691206455230713, "eval_runtime": 7.5619, "eval_samples_per_second": 132.242, "eval_steps_per_second": 8.331, "step": 28400 }, { "epoch": 1.84, "grad_norm": 11.044485092163086, "learning_rate": 2.8499999999999998e-05, "loss": 1.1238, "step": 28500 }, { "epoch": 1.85, "grad_norm": 8.619512557983398, "learning_rate": 2.86e-05, "loss": 1.0538, "step": 28600 }, { "epoch": 1.85, "eval_accuracy": 0.7455770850884583, "eval_loss": 1.1327553987503052, "eval_runtime": 7.5946, "eval_samples_per_second": 131.673, "eval_steps_per_second": 8.295, "step": 28600 }, { "epoch": 1.86, "grad_norm": 11.542591094970703, "learning_rate": 2.87e-05, "loss": 1.1291, "step": 28700 }, { "epoch": 1.86, "grad_norm": 9.123075485229492, "learning_rate": 2.88e-05, "loss": 1.0894, "step": 28800 }, { "epoch": 1.86, "eval_accuracy": 0.7412410299704517, "eval_loss": 1.1725280284881592, "eval_runtime": 7.6725, "eval_samples_per_second": 130.336, "eval_steps_per_second": 8.211, "step": 28800 }, { "epoch": 1.87, "grad_norm": 11.408528327941895, "learning_rate": 2.8899999999999998e-05, "loss": 1.144, "step": 28900 }, { "epoch": 1.88, "grad_norm": 10.130888938903809, "learning_rate": 2.9e-05, "loss": 1.1425, "step": 29000 }, { "epoch": 1.88, "eval_accuracy": 0.7409129332206256, "eval_loss": 1.1403433084487915, "eval_runtime": 7.7827, "eval_samples_per_second": 128.49, "eval_steps_per_second": 8.095, "step": 29000 }, { "epoch": 1.88, "grad_norm": 11.617779731750488, "learning_rate": 2.91e-05, "loss": 1.1899, "step": 29100 }, { "epoch": 1.89, "grad_norm": 10.055193901062012, "learning_rate": 2.9199999999999998e-05, "loss": 1.1475, "step": 29200 }, { "epoch": 1.89, "eval_accuracy": 0.7436219155165202, "eval_loss": 1.1234990358352661, "eval_runtime": 7.488, "eval_samples_per_second": 133.547, "eval_steps_per_second": 8.413, "step": 29200 }, { "epoch": 1.89, "grad_norm": 11.876099586486816, "learning_rate": 2.93e-05, "loss": 1.1455, "step": 29300 }, { "epoch": 1.9, "grad_norm": 8.671998023986816, "learning_rate": 2.94e-05, "loss": 1.2081, "step": 29400 }, { "epoch": 1.9, "eval_accuracy": 0.760268231349539, "eval_loss": 1.1001174449920654, "eval_runtime": 7.783, "eval_samples_per_second": 128.485, "eval_steps_per_second": 8.095, "step": 29400 }, { "epoch": 1.91, "grad_norm": 13.381967544555664, "learning_rate": 2.95e-05, "loss": 1.0936, "step": 29500 }, { "epoch": 1.91, "grad_norm": 12.834200859069824, "learning_rate": 2.96e-05, "loss": 1.2018, "step": 29600 }, { "epoch": 1.91, "eval_accuracy": 0.729936305732484, "eval_loss": 1.18804931640625, "eval_runtime": 7.7635, "eval_samples_per_second": 128.807, "eval_steps_per_second": 8.115, "step": 29600 }, { "epoch": 1.92, "grad_norm": 11.972233772277832, "learning_rate": 2.97e-05, "loss": 1.2104, "step": 29700 }, { "epoch": 1.93, "grad_norm": 12.10122299194336, "learning_rate": 2.98e-05, "loss": 1.2012, "step": 29800 }, { "epoch": 1.93, "eval_accuracy": 0.7623637887678122, "eval_loss": 1.0554293394088745, "eval_runtime": 7.9312, "eval_samples_per_second": 126.084, "eval_steps_per_second": 7.943, "step": 29800 }, { "epoch": 1.93, "grad_norm": 12.172104835510254, "learning_rate": 2.9900000000000002e-05, "loss": 1.157, "step": 29900 }, { "epoch": 1.94, "grad_norm": 10.62611198425293, "learning_rate": 3e-05, "loss": 1.124, "step": 30000 }, { "epoch": 1.94, "eval_accuracy": 0.7442148760330578, "eval_loss": 1.0906718969345093, "eval_runtime": 7.8461, "eval_samples_per_second": 127.453, "eval_steps_per_second": 8.03, "step": 30000 }, { "epoch": 1.95, "grad_norm": 12.013021469116211, "learning_rate": 3.01e-05, "loss": 1.114, "step": 30100 }, { "epoch": 1.95, "grad_norm": 8.232975959777832, "learning_rate": 3.02e-05, "loss": 1.1256, "step": 30200 }, { "epoch": 1.95, "eval_accuracy": 0.7737195633921075, "eval_loss": 0.981860339641571, "eval_runtime": 8.0953, "eval_samples_per_second": 123.528, "eval_steps_per_second": 7.782, "step": 30200 }, { "epoch": 1.96, "grad_norm": 12.06392765045166, "learning_rate": 3.03e-05, "loss": 1.1148, "step": 30300 }, { "epoch": 1.97, "grad_norm": 10.367729187011719, "learning_rate": 3.04e-05, "loss": 1.1459, "step": 30400 }, { "epoch": 1.97, "eval_accuracy": 0.7435133985538069, "eval_loss": 1.1510205268859863, "eval_runtime": 7.9644, "eval_samples_per_second": 125.559, "eval_steps_per_second": 7.91, "step": 30400 }, { "epoch": 1.97, "grad_norm": 6.962823867797852, "learning_rate": 3.05e-05, "loss": 1.0973, "step": 30500 }, { "epoch": 1.98, "grad_norm": 6.782848834991455, "learning_rate": 3.06e-05, "loss": 1.0834, "step": 30600 }, { "epoch": 1.98, "eval_accuracy": 0.7382352941176471, "eval_loss": 1.1189897060394287, "eval_runtime": 7.562, "eval_samples_per_second": 132.241, "eval_steps_per_second": 8.331, "step": 30600 }, { "epoch": 1.99, "grad_norm": 13.43688678741455, "learning_rate": 3.07e-05, "loss": 1.1178, "step": 30700 }, { "epoch": 1.99, "grad_norm": 9.378607749938965, "learning_rate": 3.08e-05, "loss": 1.1417, "step": 30800 }, { "epoch": 1.99, "eval_accuracy": 0.7416943521594684, "eval_loss": 1.1617268323898315, "eval_runtime": 7.5308, "eval_samples_per_second": 132.788, "eval_steps_per_second": 8.366, "step": 30800 }, { "epoch": 2.0, "grad_norm": 11.303683280944824, "learning_rate": 3.09e-05, "loss": 1.1223, "step": 30900 }, { "epoch": 2.0, "grad_norm": 6.087547779083252, "learning_rate": 3.1e-05, "loss": 1.0331, "step": 31000 }, { "epoch": 2.0, "eval_accuracy": 0.7523012552301255, "eval_loss": 1.055106520652771, "eval_runtime": 7.8392, "eval_samples_per_second": 127.564, "eval_steps_per_second": 8.037, "step": 31000 }, { "epoch": 2.01, "grad_norm": 9.039276123046875, "learning_rate": 3.1100000000000004e-05, "loss": 1.1099, "step": 31100 }, { "epoch": 2.02, "grad_norm": 14.244400024414062, "learning_rate": 3.12e-05, "loss": 1.1455, "step": 31200 }, { "epoch": 2.02, "eval_accuracy": 0.7577927548441449, "eval_loss": 1.0954103469848633, "eval_runtime": 7.5506, "eval_samples_per_second": 132.439, "eval_steps_per_second": 8.344, "step": 31200 }, { "epoch": 2.02, "grad_norm": 38.228694915771484, "learning_rate": 3.13e-05, "loss": 1.06, "step": 31300 }, { "epoch": 2.03, "grad_norm": 11.858959197998047, "learning_rate": 3.1400000000000004e-05, "loss": 1.1086, "step": 31400 }, { "epoch": 2.03, "eval_accuracy": 0.7462750106428268, "eval_loss": 1.1535460948944092, "eval_runtime": 7.7947, "eval_samples_per_second": 128.292, "eval_steps_per_second": 8.082, "step": 31400 }, { "epoch": 2.04, "grad_norm": 8.420319557189941, "learning_rate": 3.15e-05, "loss": 1.1241, "step": 31500 }, { "epoch": 2.04, "grad_norm": 7.6553053855896, "learning_rate": 3.16e-05, "loss": 1.1027, "step": 31600 }, { "epoch": 2.04, "eval_accuracy": 0.7574862927035007, "eval_loss": 1.0462350845336914, "eval_runtime": 7.7134, "eval_samples_per_second": 129.644, "eval_steps_per_second": 8.168, "step": 31600 }, { "epoch": 2.05, "grad_norm": 11.27103328704834, "learning_rate": 3.1700000000000005e-05, "loss": 1.0904, "step": 31700 }, { "epoch": 2.06, "grad_norm": 10.934687614440918, "learning_rate": 3.18e-05, "loss": 1.0852, "step": 31800 }, { "epoch": 2.06, "eval_accuracy": 0.7545493017350825, "eval_loss": 1.0606493949890137, "eval_runtime": 7.557, "eval_samples_per_second": 132.327, "eval_steps_per_second": 8.337, "step": 31800 }, { "epoch": 2.06, "grad_norm": 8.664750099182129, "learning_rate": 3.19e-05, "loss": 1.077, "step": 31900 }, { "epoch": 2.07, "grad_norm": 9.220561027526855, "learning_rate": 3.2000000000000005e-05, "loss": 1.0752, "step": 32000 }, { "epoch": 2.07, "eval_accuracy": 0.7676595744680851, "eval_loss": 1.0547922849655151, "eval_runtime": 7.665, "eval_samples_per_second": 130.463, "eval_steps_per_second": 8.219, "step": 32000 }, { "epoch": 2.08, "grad_norm": 7.378421783447266, "learning_rate": 3.21e-05, "loss": 1.0949, "step": 32100 }, { "epoch": 2.08, "grad_norm": 8.4024658203125, "learning_rate": 3.2200000000000003e-05, "loss": 1.1399, "step": 32200 }, { "epoch": 2.08, "eval_accuracy": 0.757996632996633, "eval_loss": 1.0713438987731934, "eval_runtime": 7.5462, "eval_samples_per_second": 132.516, "eval_steps_per_second": 8.349, "step": 32200 }, { "epoch": 2.09, "grad_norm": 12.625773429870605, "learning_rate": 3.2300000000000006e-05, "loss": 1.097, "step": 32300 }, { "epoch": 2.1, "grad_norm": 8.04590892791748, "learning_rate": 3.24e-05, "loss": 1.1162, "step": 32400 }, { "epoch": 2.1, "eval_accuracy": 0.7551884794578568, "eval_loss": 1.142746090888977, "eval_runtime": 7.9266, "eval_samples_per_second": 126.157, "eval_steps_per_second": 7.948, "step": 32400 }, { "epoch": 2.1, "grad_norm": 12.640024185180664, "learning_rate": 3.2500000000000004e-05, "loss": 1.0972, "step": 32500 }, { "epoch": 2.11, "grad_norm": 11.165950775146484, "learning_rate": 3.26e-05, "loss": 1.0808, "step": 32600 }, { "epoch": 2.11, "eval_accuracy": 0.7418810628426824, "eval_loss": 1.1398913860321045, "eval_runtime": 7.5681, "eval_samples_per_second": 132.134, "eval_steps_per_second": 8.324, "step": 32600 }, { "epoch": 2.11, "grad_norm": 11.864821434020996, "learning_rate": 3.27e-05, "loss": 1.1259, "step": 32700 }, { "epoch": 2.12, "grad_norm": 13.403204917907715, "learning_rate": 3.2800000000000004e-05, "loss": 1.0975, "step": 32800 }, { "epoch": 2.12, "eval_accuracy": 0.7447168216398986, "eval_loss": 1.1701534986495972, "eval_runtime": 7.7017, "eval_samples_per_second": 129.842, "eval_steps_per_second": 8.18, "step": 32800 }, { "epoch": 2.13, "grad_norm": 12.985321044921875, "learning_rate": 3.29e-05, "loss": 1.1246, "step": 32900 }, { "epoch": 2.13, "grad_norm": 12.446426391601562, "learning_rate": 3.3e-05, "loss": 1.0942, "step": 33000 }, { "epoch": 2.13, "eval_accuracy": 0.7699412258606213, "eval_loss": 0.9965775012969971, "eval_runtime": 7.8063, "eval_samples_per_second": 128.101, "eval_steps_per_second": 8.07, "step": 33000 }, { "epoch": 2.14, "grad_norm": 11.182689666748047, "learning_rate": 3.3100000000000005e-05, "loss": 1.0712, "step": 33100 }, { "epoch": 2.15, "grad_norm": 11.654029846191406, "learning_rate": 3.32e-05, "loss": 1.1358, "step": 33200 }, { "epoch": 2.15, "eval_accuracy": 0.7515683814303639, "eval_loss": 1.1039252281188965, "eval_runtime": 7.848, "eval_samples_per_second": 127.422, "eval_steps_per_second": 8.028, "step": 33200 }, { "epoch": 2.15, "grad_norm": 12.950011253356934, "learning_rate": 3.33e-05, "loss": 1.0796, "step": 33300 }, { "epoch": 2.16, "grad_norm": 8.27685546875, "learning_rate": 3.3400000000000005e-05, "loss": 1.1059, "step": 33400 }, { "epoch": 2.16, "eval_accuracy": 0.7335036185610898, "eval_loss": 1.1502079963684082, "eval_runtime": 7.5219, "eval_samples_per_second": 132.945, "eval_steps_per_second": 8.376, "step": 33400 }, { "epoch": 2.17, "grad_norm": 11.640264511108398, "learning_rate": 3.35e-05, "loss": 1.1054, "step": 33500 }, { "epoch": 2.17, "grad_norm": 8.917283058166504, "learning_rate": 3.3600000000000004e-05, "loss": 1.1367, "step": 33600 }, { "epoch": 2.17, "eval_accuracy": 0.7472848788638262, "eval_loss": 1.1379823684692383, "eval_runtime": 7.7552, "eval_samples_per_second": 128.947, "eval_steps_per_second": 8.124, "step": 33600 }, { "epoch": 2.18, "grad_norm": 9.214638710021973, "learning_rate": 3.3700000000000006e-05, "loss": 1.0948, "step": 33700 }, { "epoch": 2.19, "grad_norm": 10.602049827575684, "learning_rate": 3.38e-05, "loss": 1.0949, "step": 33800 }, { "epoch": 2.19, "eval_accuracy": 0.7566204287515763, "eval_loss": 1.0765697956085205, "eval_runtime": 7.7442, "eval_samples_per_second": 129.128, "eval_steps_per_second": 8.135, "step": 33800 }, { "epoch": 2.19, "grad_norm": 10.224041938781738, "learning_rate": 3.3900000000000004e-05, "loss": 1.0751, "step": 33900 }, { "epoch": 2.2, "grad_norm": 11.632186889648438, "learning_rate": 3.4000000000000007e-05, "loss": 1.0852, "step": 34000 }, { "epoch": 2.2, "eval_accuracy": 0.7399486740804107, "eval_loss": 1.09164559841156, "eval_runtime": 6.76, "eval_samples_per_second": 147.928, "eval_steps_per_second": 9.319, "step": 34000 }, { "epoch": 2.21, "grad_norm": 12.6830472946167, "learning_rate": 3.41e-05, "loss": 1.0923, "step": 34100 }, { "epoch": 2.21, "grad_norm": 9.087968826293945, "learning_rate": 3.4200000000000005e-05, "loss": 1.064, "step": 34200 }, { "epoch": 2.21, "eval_accuracy": 0.7523206751054853, "eval_loss": 1.0744291543960571, "eval_runtime": 7.7949, "eval_samples_per_second": 128.288, "eval_steps_per_second": 8.082, "step": 34200 }, { "epoch": 2.22, "grad_norm": 9.449531555175781, "learning_rate": 3.430000000000001e-05, "loss": 1.1124, "step": 34300 }, { "epoch": 2.22, "grad_norm": 6.715410232543945, "learning_rate": 3.4399999999999996e-05, "loss": 1.1172, "step": 34400 }, { "epoch": 2.22, "eval_accuracy": 0.7645065650148243, "eval_loss": 1.0395547151565552, "eval_runtime": 7.7403, "eval_samples_per_second": 129.194, "eval_steps_per_second": 8.139, "step": 34400 }, { "epoch": 2.23, "grad_norm": 11.503610610961914, "learning_rate": 3.45e-05, "loss": 1.08, "step": 34500 }, { "epoch": 2.24, "grad_norm": 12.468766212463379, "learning_rate": 3.46e-05, "loss": 1.0931, "step": 34600 }, { "epoch": 2.24, "eval_accuracy": 0.7633012149141182, "eval_loss": 1.0587788820266724, "eval_runtime": 7.6893, "eval_samples_per_second": 130.051, "eval_steps_per_second": 8.193, "step": 34600 }, { "epoch": 2.24, "grad_norm": 8.791740417480469, "learning_rate": 3.4699999999999996e-05, "loss": 1.1221, "step": 34700 }, { "epoch": 2.25, "grad_norm": 7.91445255279541, "learning_rate": 3.48e-05, "loss": 1.0528, "step": 34800 }, { "epoch": 2.25, "eval_accuracy": 0.760705289672544, "eval_loss": 1.058076024055481, "eval_runtime": 7.7444, "eval_samples_per_second": 129.126, "eval_steps_per_second": 8.135, "step": 34800 }, { "epoch": 2.26, "grad_norm": 10.103760719299316, "learning_rate": 3.49e-05, "loss": 1.1044, "step": 34900 }, { "epoch": 2.26, "grad_norm": 9.25192928314209, "learning_rate": 3.5e-05, "loss": 1.0899, "step": 35000 }, { "epoch": 2.26, "eval_accuracy": 0.7493702770780857, "eval_loss": 1.096838355064392, "eval_runtime": 7.4416, "eval_samples_per_second": 134.379, "eval_steps_per_second": 8.466, "step": 35000 }, { "epoch": 2.27, "grad_norm": 12.285004615783691, "learning_rate": 3.51e-05, "loss": 1.0919, "step": 35100 }, { "epoch": 2.28, "grad_norm": 8.19707202911377, "learning_rate": 3.52e-05, "loss": 1.1016, "step": 35200 }, { "epoch": 2.28, "eval_accuracy": 0.7318718381112985, "eval_loss": 1.143736481666565, "eval_runtime": 8.0431, "eval_samples_per_second": 124.331, "eval_steps_per_second": 7.833, "step": 35200 }, { "epoch": 2.28, "grad_norm": 11.359623908996582, "learning_rate": 3.53e-05, "loss": 1.1153, "step": 35300 }, { "epoch": 2.29, "grad_norm": 8.557467460632324, "learning_rate": 3.54e-05, "loss": 1.1164, "step": 35400 }, { "epoch": 2.29, "eval_accuracy": 0.7552417629439452, "eval_loss": 1.0568954944610596, "eval_runtime": 7.6868, "eval_samples_per_second": 130.093, "eval_steps_per_second": 8.196, "step": 35400 }, { "epoch": 2.3, "grad_norm": 7.062678813934326, "learning_rate": 3.55e-05, "loss": 1.0714, "step": 35500 }, { "epoch": 2.3, "grad_norm": 10.855430603027344, "learning_rate": 3.56e-05, "loss": 1.1311, "step": 35600 }, { "epoch": 2.3, "eval_accuracy": 0.7555742532604123, "eval_loss": 1.1163753271102905, "eval_runtime": 7.6264, "eval_samples_per_second": 131.123, "eval_steps_per_second": 8.261, "step": 35600 }, { "epoch": 2.31, "grad_norm": 11.73338794708252, "learning_rate": 3.57e-05, "loss": 1.0964, "step": 35700 }, { "epoch": 2.32, "grad_norm": 9.526000022888184, "learning_rate": 3.58e-05, "loss": 1.1198, "step": 35800 }, { "epoch": 2.32, "eval_accuracy": 0.7507368421052631, "eval_loss": 1.0975955724716187, "eval_runtime": 7.8644, "eval_samples_per_second": 127.156, "eval_steps_per_second": 8.011, "step": 35800 }, { "epoch": 2.32, "grad_norm": 10.167031288146973, "learning_rate": 3.59e-05, "loss": 1.0631, "step": 35900 }, { "epoch": 2.33, "grad_norm": 10.823612213134766, "learning_rate": 3.6e-05, "loss": 1.0745, "step": 36000 }, { "epoch": 2.33, "eval_accuracy": 0.755247691015953, "eval_loss": 1.0984485149383545, "eval_runtime": 7.427, "eval_samples_per_second": 134.644, "eval_steps_per_second": 8.483, "step": 36000 }, { "epoch": 2.33, "grad_norm": 10.233257293701172, "learning_rate": 3.61e-05, "loss": 1.1011, "step": 36100 }, { "epoch": 2.34, "grad_norm": 11.928458213806152, "learning_rate": 3.62e-05, "loss": 1.1038, "step": 36200 }, { "epoch": 2.34, "eval_accuracy": 0.7479235880398671, "eval_loss": 1.0806572437286377, "eval_runtime": 7.7228, "eval_samples_per_second": 129.486, "eval_steps_per_second": 8.158, "step": 36200 }, { "epoch": 2.35, "grad_norm": 14.004356384277344, "learning_rate": 3.63e-05, "loss": 1.0319, "step": 36300 }, { "epoch": 2.35, "grad_norm": 9.574338912963867, "learning_rate": 3.6400000000000004e-05, "loss": 1.1316, "step": 36400 }, { "epoch": 2.35, "eval_accuracy": 0.7413355874894336, "eval_loss": 1.1153086423873901, "eval_runtime": 7.3917, "eval_samples_per_second": 135.287, "eval_steps_per_second": 8.523, "step": 36400 }, { "epoch": 2.36, "grad_norm": 10.358060836791992, "learning_rate": 3.65e-05, "loss": 1.1179, "step": 36500 }, { "epoch": 2.37, "grad_norm": 10.381155967712402, "learning_rate": 3.66e-05, "loss": 1.1289, "step": 36600 }, { "epoch": 2.37, "eval_accuracy": 0.7534707614640302, "eval_loss": 1.1133631467819214, "eval_runtime": 7.736, "eval_samples_per_second": 129.265, "eval_steps_per_second": 8.144, "step": 36600 }, { "epoch": 2.37, "grad_norm": 8.149444580078125, "learning_rate": 3.6700000000000004e-05, "loss": 1.0878, "step": 36700 }, { "epoch": 2.38, "grad_norm": 7.2654032707214355, "learning_rate": 3.68e-05, "loss": 1.1009, "step": 36800 }, { "epoch": 2.38, "eval_accuracy": 0.7479096989966555, "eval_loss": 1.104419231414795, "eval_runtime": 8.0129, "eval_samples_per_second": 124.799, "eval_steps_per_second": 7.862, "step": 36800 }, { "epoch": 2.39, "grad_norm": 11.244707107543945, "learning_rate": 3.69e-05, "loss": 1.0489, "step": 36900 }, { "epoch": 2.39, "grad_norm": 9.0493745803833, "learning_rate": 3.7e-05, "loss": 1.1371, "step": 37000 }, { "epoch": 2.39, "eval_accuracy": 0.7525083612040134, "eval_loss": 1.0959182977676392, "eval_runtime": 7.8223, "eval_samples_per_second": 127.84, "eval_steps_per_second": 8.054, "step": 37000 }, { "epoch": 2.4, "grad_norm": 11.599352836608887, "learning_rate": 3.71e-05, "loss": 1.0655, "step": 37100 }, { "epoch": 2.41, "grad_norm": 9.981133460998535, "learning_rate": 3.72e-05, "loss": 1.1291, "step": 37200 }, { "epoch": 2.41, "eval_accuracy": 0.7551709582102153, "eval_loss": 1.1107460260391235, "eval_runtime": 7.8313, "eval_samples_per_second": 127.692, "eval_steps_per_second": 8.045, "step": 37200 }, { "epoch": 2.41, "grad_norm": 12.121379852294922, "learning_rate": 3.73e-05, "loss": 1.0995, "step": 37300 }, { "epoch": 2.42, "grad_norm": 11.133427619934082, "learning_rate": 3.74e-05, "loss": 1.0671, "step": 37400 }, { "epoch": 2.42, "eval_accuracy": 0.7518828451882845, "eval_loss": 1.065983533859253, "eval_runtime": 7.7489, "eval_samples_per_second": 129.05, "eval_steps_per_second": 8.13, "step": 37400 }, { "epoch": 2.42, "grad_norm": 10.433098793029785, "learning_rate": 3.7500000000000003e-05, "loss": 1.1293, "step": 37500 }, { "epoch": 2.43, "grad_norm": 9.084454536437988, "learning_rate": 3.76e-05, "loss": 1.0897, "step": 37600 }, { "epoch": 2.43, "eval_accuracy": 0.7457059069962296, "eval_loss": 1.1104426383972168, "eval_runtime": 7.5669, "eval_samples_per_second": 132.154, "eval_steps_per_second": 8.326, "step": 37600 }, { "epoch": 2.44, "grad_norm": 10.528663635253906, "learning_rate": 3.77e-05, "loss": 1.1086, "step": 37700 }, { "epoch": 2.44, "grad_norm": 9.945916175842285, "learning_rate": 3.7800000000000004e-05, "loss": 1.0603, "step": 37800 }, { "epoch": 2.44, "eval_accuracy": 0.7536842105263157, "eval_loss": 1.0666491985321045, "eval_runtime": 7.7722, "eval_samples_per_second": 128.664, "eval_steps_per_second": 8.106, "step": 37800 }, { "epoch": 2.45, "grad_norm": 15.356410026550293, "learning_rate": 3.79e-05, "loss": 1.1101, "step": 37900 }, { "epoch": 2.46, "grad_norm": 10.070672988891602, "learning_rate": 3.8e-05, "loss": 1.1295, "step": 38000 }, { "epoch": 2.46, "eval_accuracy": 0.7538461538461538, "eval_loss": 1.0828039646148682, "eval_runtime": 7.692, "eval_samples_per_second": 130.005, "eval_steps_per_second": 8.19, "step": 38000 }, { "epoch": 2.46, "grad_norm": 14.158147811889648, "learning_rate": 3.8100000000000005e-05, "loss": 1.0718, "step": 38100 }, { "epoch": 2.47, "grad_norm": 10.959081649780273, "learning_rate": 3.82e-05, "loss": 1.1213, "step": 38200 }, { "epoch": 2.47, "eval_accuracy": 0.7624842635333613, "eval_loss": 1.0719354152679443, "eval_runtime": 7.9715, "eval_samples_per_second": 125.448, "eval_steps_per_second": 7.903, "step": 38200 }, { "epoch": 2.48, "grad_norm": 10.373156547546387, "learning_rate": 3.83e-05, "loss": 1.1181, "step": 38300 }, { "epoch": 2.48, "grad_norm": 13.541415214538574, "learning_rate": 3.8400000000000005e-05, "loss": 1.1357, "step": 38400 }, { "epoch": 2.48, "eval_accuracy": 0.7470389170896785, "eval_loss": 1.1772955656051636, "eval_runtime": 7.7962, "eval_samples_per_second": 128.267, "eval_steps_per_second": 8.081, "step": 38400 }, { "epoch": 2.49, "grad_norm": 10.464506149291992, "learning_rate": 3.85e-05, "loss": 1.1006, "step": 38500 }, { "epoch": 2.5, "grad_norm": 12.202038764953613, "learning_rate": 3.86e-05, "loss": 1.1634, "step": 38600 }, { "epoch": 2.5, "eval_accuracy": 0.747585048299034, "eval_loss": 1.1097861528396606, "eval_runtime": 7.8152, "eval_samples_per_second": 127.956, "eval_steps_per_second": 8.061, "step": 38600 }, { "epoch": 2.5, "grad_norm": 13.328570365905762, "learning_rate": 3.8700000000000006e-05, "loss": 1.14, "step": 38700 }, { "epoch": 2.51, "grad_norm": 9.957459449768066, "learning_rate": 3.88e-05, "loss": 1.0878, "step": 38800 }, { "epoch": 2.51, "eval_accuracy": 0.7559121621621622, "eval_loss": 1.1080191135406494, "eval_runtime": 7.8614, "eval_samples_per_second": 127.204, "eval_steps_per_second": 8.014, "step": 38800 }, { "epoch": 2.52, "grad_norm": 10.643271446228027, "learning_rate": 3.8900000000000004e-05, "loss": 1.0822, "step": 38900 }, { "epoch": 2.52, "grad_norm": 10.243337631225586, "learning_rate": 3.9000000000000006e-05, "loss": 1.1342, "step": 39000 }, { "epoch": 2.52, "eval_accuracy": 0.7510530749789385, "eval_loss": 1.124193549156189, "eval_runtime": 7.5876, "eval_samples_per_second": 131.794, "eval_steps_per_second": 8.303, "step": 39000 }, { "epoch": 2.53, "grad_norm": 6.701710224151611, "learning_rate": 3.91e-05, "loss": 1.0981, "step": 39100 }, { "epoch": 2.53, "grad_norm": 8.547497749328613, "learning_rate": 3.9200000000000004e-05, "loss": 1.0913, "step": 39200 }, { "epoch": 2.53, "eval_accuracy": 0.7535211267605634, "eval_loss": 1.140058994293213, "eval_runtime": 7.6489, "eval_samples_per_second": 130.739, "eval_steps_per_second": 8.237, "step": 39200 }, { "epoch": 2.54, "grad_norm": 6.717520236968994, "learning_rate": 3.9300000000000007e-05, "loss": 1.1443, "step": 39300 }, { "epoch": 2.55, "grad_norm": 9.493661880493164, "learning_rate": 3.94e-05, "loss": 1.119, "step": 39400 }, { "epoch": 2.55, "eval_accuracy": 0.7305916911456147, "eval_loss": 1.1504932641983032, "eval_runtime": 7.6106, "eval_samples_per_second": 131.395, "eval_steps_per_second": 8.278, "step": 39400 }, { "epoch": 2.55, "grad_norm": 8.087804794311523, "learning_rate": 3.9500000000000005e-05, "loss": 1.1136, "step": 39500 }, { "epoch": 2.56, "grad_norm": 7.295677661895752, "learning_rate": 3.960000000000001e-05, "loss": 1.0652, "step": 39600 }, { "epoch": 2.56, "eval_accuracy": 0.7593448131037379, "eval_loss": 1.084837794303894, "eval_runtime": 7.4956, "eval_samples_per_second": 133.411, "eval_steps_per_second": 8.405, "step": 39600 }, { "epoch": 2.57, "grad_norm": 8.077702522277832, "learning_rate": 3.97e-05, "loss": 1.1009, "step": 39700 }, { "epoch": 2.57, "grad_norm": 15.48818588256836, "learning_rate": 3.9800000000000005e-05, "loss": 1.136, "step": 39800 }, { "epoch": 2.57, "eval_accuracy": 0.7615838247683235, "eval_loss": 0.99775230884552, "eval_runtime": 7.476, "eval_samples_per_second": 133.762, "eval_steps_per_second": 8.427, "step": 39800 }, { "epoch": 2.58, "grad_norm": 13.369636535644531, "learning_rate": 3.99e-05, "loss": 1.1156, "step": 39900 }, { "epoch": 2.59, "grad_norm": 13.321182250976562, "learning_rate": 4e-05, "loss": 1.0696, "step": 40000 }, { "epoch": 2.59, "eval_accuracy": 0.7405999155048585, "eval_loss": 1.0958701372146606, "eval_runtime": 7.7723, "eval_samples_per_second": 128.662, "eval_steps_per_second": 8.106, "step": 40000 }, { "epoch": 2.59, "grad_norm": 7.979740619659424, "learning_rate": 4.0100000000000006e-05, "loss": 1.1042, "step": 40100 }, { "epoch": 2.6, "grad_norm": 7.108384609222412, "learning_rate": 4.02e-05, "loss": 1.093, "step": 40200 }, { "epoch": 2.6, "eval_accuracy": 0.7613255033557047, "eval_loss": 1.0795037746429443, "eval_runtime": 7.8028, "eval_samples_per_second": 128.159, "eval_steps_per_second": 8.074, "step": 40200 }, { "epoch": 2.61, "grad_norm": 9.330721855163574, "learning_rate": 4.0300000000000004e-05, "loss": 1.1165, "step": 40300 }, { "epoch": 2.61, "grad_norm": 12.292078971862793, "learning_rate": 4.0400000000000006e-05, "loss": 1.1003, "step": 40400 }, { "epoch": 2.61, "eval_accuracy": 0.7495784148397976, "eval_loss": 1.090809941291809, "eval_runtime": 7.7285, "eval_samples_per_second": 129.391, "eval_steps_per_second": 8.152, "step": 40400 }, { "epoch": 2.62, "grad_norm": 8.170096397399902, "learning_rate": 4.05e-05, "loss": 1.1189, "step": 40500 }, { "epoch": 2.63, "grad_norm": 9.297009468078613, "learning_rate": 4.0600000000000004e-05, "loss": 1.1012, "step": 40600 }, { "epoch": 2.63, "eval_accuracy": 0.7636594663278272, "eval_loss": 1.0440165996551514, "eval_runtime": 7.7623, "eval_samples_per_second": 128.827, "eval_steps_per_second": 8.116, "step": 40600 }, { "epoch": 2.63, "grad_norm": 14.187376022338867, "learning_rate": 4.07e-05, "loss": 1.0778, "step": 40700 }, { "epoch": 2.64, "grad_norm": 12.531803131103516, "learning_rate": 4.08e-05, "loss": 1.1015, "step": 40800 }, { "epoch": 2.64, "eval_accuracy": 0.7600337980566118, "eval_loss": 1.0454752445220947, "eval_runtime": 7.5957, "eval_samples_per_second": 131.653, "eval_steps_per_second": 8.294, "step": 40800 }, { "epoch": 2.64, "grad_norm": 10.577985763549805, "learning_rate": 4.09e-05, "loss": 1.1141, "step": 40900 }, { "epoch": 2.65, "grad_norm": 13.15516185760498, "learning_rate": 4.1e-05, "loss": 1.0848, "step": 41000 }, { "epoch": 2.65, "eval_accuracy": 0.7489486963835156, "eval_loss": 1.0633922815322876, "eval_runtime": 7.7456, "eval_samples_per_second": 129.106, "eval_steps_per_second": 8.134, "step": 41000 }, { "epoch": 2.66, "grad_norm": 7.842673301696777, "learning_rate": 4.11e-05, "loss": 1.0883, "step": 41100 }, { "epoch": 2.66, "grad_norm": 10.88692855834961, "learning_rate": 4.12e-05, "loss": 1.1261, "step": 41200 }, { "epoch": 2.66, "eval_accuracy": 0.7599164926931107, "eval_loss": 1.1041170358657837, "eval_runtime": 7.8879, "eval_samples_per_second": 126.777, "eval_steps_per_second": 7.987, "step": 41200 }, { "epoch": 2.67, "grad_norm": 10.067037582397461, "learning_rate": 4.13e-05, "loss": 1.1043, "step": 41300 }, { "epoch": 2.68, "grad_norm": 9.606361389160156, "learning_rate": 4.14e-05, "loss": 1.05, "step": 41400 }, { "epoch": 2.68, "eval_accuracy": 0.7587663709336713, "eval_loss": 1.0718350410461426, "eval_runtime": 7.7316, "eval_samples_per_second": 129.34, "eval_steps_per_second": 8.148, "step": 41400 }, { "epoch": 2.68, "grad_norm": 8.934192657470703, "learning_rate": 4.15e-05, "loss": 1.0964, "step": 41500 }, { "epoch": 2.69, "grad_norm": 10.315465927124023, "learning_rate": 4.16e-05, "loss": 1.1012, "step": 41600 }, { "epoch": 2.69, "eval_accuracy": 0.7526337968815845, "eval_loss": 1.0622448921203613, "eval_runtime": 7.3547, "eval_samples_per_second": 135.968, "eval_steps_per_second": 8.566, "step": 41600 }, { "epoch": 2.7, "grad_norm": 9.315914154052734, "learning_rate": 4.17e-05, "loss": 1.0678, "step": 41700 }, { "epoch": 2.7, "grad_norm": 12.095351219177246, "learning_rate": 4.18e-05, "loss": 1.102, "step": 41800 }, { "epoch": 2.7, "eval_accuracy": 0.7484197218710493, "eval_loss": 1.0895180702209473, "eval_runtime": 7.8085, "eval_samples_per_second": 128.065, "eval_steps_per_second": 8.068, "step": 41800 }, { "epoch": 2.71, "grad_norm": 5.950236797332764, "learning_rate": 4.19e-05, "loss": 1.0611, "step": 41900 }, { "epoch": 2.72, "grad_norm": 7.806346893310547, "learning_rate": 4.2e-05, "loss": 1.1318, "step": 42000 }, { "epoch": 2.72, "eval_accuracy": 0.7427966101694915, "eval_loss": 1.1227227449417114, "eval_runtime": 7.5729, "eval_samples_per_second": 132.05, "eval_steps_per_second": 8.319, "step": 42000 }, { "epoch": 2.72, "grad_norm": 10.583379745483398, "learning_rate": 4.21e-05, "loss": 1.1032, "step": 42100 }, { "epoch": 2.73, "grad_norm": 10.153890609741211, "learning_rate": 4.22e-05, "loss": 1.1057, "step": 42200 }, { "epoch": 2.73, "eval_accuracy": 0.7569037656903765, "eval_loss": 1.0890016555786133, "eval_runtime": 7.6004, "eval_samples_per_second": 131.571, "eval_steps_per_second": 8.289, "step": 42200 }, { "epoch": 2.74, "grad_norm": 8.20639705657959, "learning_rate": 4.23e-05, "loss": 1.1089, "step": 42300 }, { "epoch": 2.74, "grad_norm": 10.519354820251465, "learning_rate": 4.24e-05, "loss": 1.1067, "step": 42400 }, { "epoch": 2.74, "eval_accuracy": 0.7523484201537147, "eval_loss": 1.0930747985839844, "eval_runtime": 7.5282, "eval_samples_per_second": 132.834, "eval_steps_per_second": 8.369, "step": 42400 }, { "epoch": 2.75, "grad_norm": 10.942216873168945, "learning_rate": 4.25e-05, "loss": 1.1124, "step": 42500 }, { "epoch": 2.75, "grad_norm": 9.135039329528809, "learning_rate": 4.26e-05, "loss": 1.1844, "step": 42600 }, { "epoch": 2.75, "eval_accuracy": 0.7441077441077442, "eval_loss": 1.1915295124053955, "eval_runtime": 7.5001, "eval_samples_per_second": 133.331, "eval_steps_per_second": 8.4, "step": 42600 }, { "epoch": 2.76, "grad_norm": 11.652338981628418, "learning_rate": 4.27e-05, "loss": 1.1388, "step": 42700 }, { "epoch": 2.77, "grad_norm": 8.130218505859375, "learning_rate": 4.2800000000000004e-05, "loss": 1.1263, "step": 42800 }, { "epoch": 2.77, "eval_accuracy": 0.7473639814424293, "eval_loss": 1.1032050848007202, "eval_runtime": 7.5393, "eval_samples_per_second": 132.638, "eval_steps_per_second": 8.356, "step": 42800 }, { "epoch": 2.77, "grad_norm": 11.946032524108887, "learning_rate": 4.29e-05, "loss": 1.1135, "step": 42900 }, { "epoch": 2.78, "grad_norm": 13.916319847106934, "learning_rate": 4.3e-05, "loss": 1.0821, "step": 43000 }, { "epoch": 2.78, "eval_accuracy": 0.75, "eval_loss": 1.0566545724868774, "eval_runtime": 7.4915, "eval_samples_per_second": 133.485, "eval_steps_per_second": 8.41, "step": 43000 }, { "epoch": 2.79, "grad_norm": 12.735472679138184, "learning_rate": 4.3100000000000004e-05, "loss": 1.1195, "step": 43100 }, { "epoch": 2.79, "grad_norm": 9.368264198303223, "learning_rate": 4.32e-05, "loss": 1.1142, "step": 43200 }, { "epoch": 2.79, "eval_accuracy": 0.7506382978723404, "eval_loss": 1.1277565956115723, "eval_runtime": 7.4156, "eval_samples_per_second": 134.85, "eval_steps_per_second": 8.496, "step": 43200 }, { "epoch": 2.8, "grad_norm": 10.667840003967285, "learning_rate": 4.33e-05, "loss": 1.1477, "step": 43300 }, { "epoch": 2.81, "grad_norm": 10.895872116088867, "learning_rate": 4.3400000000000005e-05, "loss": 1.1129, "step": 43400 }, { "epoch": 2.81, "eval_accuracy": 0.7658227848101266, "eval_loss": 1.041944980621338, "eval_runtime": 7.3094, "eval_samples_per_second": 136.811, "eval_steps_per_second": 8.619, "step": 43400 }, { "epoch": 2.81, "grad_norm": 14.71028995513916, "learning_rate": 4.35e-05, "loss": 1.0964, "step": 43500 }, { "epoch": 2.82, "grad_norm": 7.094457149505615, "learning_rate": 4.36e-05, "loss": 1.1125, "step": 43600 }, { "epoch": 2.82, "eval_accuracy": 0.746218487394958, "eval_loss": 1.1333186626434326, "eval_runtime": 8.4108, "eval_samples_per_second": 118.895, "eval_steps_per_second": 7.49, "step": 43600 }, { "epoch": 2.83, "grad_norm": 9.003418922424316, "learning_rate": 4.3700000000000005e-05, "loss": 1.1112, "step": 43700 }, { "epoch": 2.83, "grad_norm": 9.069958686828613, "learning_rate": 4.38e-05, "loss": 1.1416, "step": 43800 }, { "epoch": 2.83, "eval_accuracy": 0.7511520737327189, "eval_loss": 1.1540571451187134, "eval_runtime": 7.6127, "eval_samples_per_second": 131.36, "eval_steps_per_second": 8.276, "step": 43800 }, { "epoch": 2.84, "grad_norm": 9.585530281066895, "learning_rate": 4.39e-05, "loss": 1.125, "step": 43900 }, { "epoch": 2.85, "grad_norm": 6.809269428253174, "learning_rate": 4.4000000000000006e-05, "loss": 1.107, "step": 44000 }, { "epoch": 2.85, "eval_accuracy": 0.7582878724297104, "eval_loss": 1.0346765518188477, "eval_runtime": 7.4729, "eval_samples_per_second": 133.818, "eval_steps_per_second": 8.431, "step": 44000 }, { "epoch": 2.85, "grad_norm": 9.628134727478027, "learning_rate": 4.41e-05, "loss": 1.1332, "step": 44100 }, { "epoch": 2.86, "grad_norm": 10.028313636779785, "learning_rate": 4.4200000000000004e-05, "loss": 1.1271, "step": 44200 }, { "epoch": 2.86, "eval_accuracy": 0.7685497470489039, "eval_loss": 1.0826408863067627, "eval_runtime": 7.6887, "eval_samples_per_second": 130.062, "eval_steps_per_second": 8.194, "step": 44200 }, { "epoch": 2.86, "grad_norm": 12.443769454956055, "learning_rate": 4.43e-05, "loss": 1.0881, "step": 44300 }, { "epoch": 2.87, "grad_norm": 10.289525032043457, "learning_rate": 4.44e-05, "loss": 1.1017, "step": 44400 }, { "epoch": 2.87, "eval_accuracy": 0.7640591966173361, "eval_loss": 1.0918318033218384, "eval_runtime": 7.494, "eval_samples_per_second": 133.441, "eval_steps_per_second": 8.407, "step": 44400 }, { "epoch": 2.88, "grad_norm": 8.332845687866211, "learning_rate": 4.4500000000000004e-05, "loss": 1.0857, "step": 44500 }, { "epoch": 2.88, "grad_norm": 8.218147277832031, "learning_rate": 4.46e-05, "loss": 1.0822, "step": 44600 }, { "epoch": 2.88, "eval_accuracy": 0.7597977243994943, "eval_loss": 1.06198251247406, "eval_runtime": 7.667, "eval_samples_per_second": 130.429, "eval_steps_per_second": 8.217, "step": 44600 }, { "epoch": 2.89, "grad_norm": 13.623686790466309, "learning_rate": 4.47e-05, "loss": 1.0637, "step": 44700 }, { "epoch": 2.9, "grad_norm": 9.567965507507324, "learning_rate": 4.4800000000000005e-05, "loss": 1.1057, "step": 44800 }, { "epoch": 2.9, "eval_accuracy": 0.7635756056808688, "eval_loss": 1.0926151275634766, "eval_runtime": 7.3969, "eval_samples_per_second": 135.192, "eval_steps_per_second": 8.517, "step": 44800 }, { "epoch": 2.9, "grad_norm": 11.995281219482422, "learning_rate": 4.49e-05, "loss": 1.0943, "step": 44900 }, { "epoch": 2.91, "grad_norm": 7.070038795471191, "learning_rate": 4.5e-05, "loss": 1.0934, "step": 45000 }, { "epoch": 2.91, "eval_accuracy": 0.7485232067510549, "eval_loss": 1.1024444103240967, "eval_runtime": 7.6812, "eval_samples_per_second": 130.188, "eval_steps_per_second": 8.202, "step": 45000 }, { "epoch": 2.92, "grad_norm": 8.663105010986328, "learning_rate": 4.5100000000000005e-05, "loss": 1.1021, "step": 45100 }, { "epoch": 2.92, "grad_norm": 10.228675842285156, "learning_rate": 4.52e-05, "loss": 1.0526, "step": 45200 }, { "epoch": 2.92, "eval_accuracy": 0.7457983193277311, "eval_loss": 1.1439721584320068, "eval_runtime": 7.37, "eval_samples_per_second": 135.686, "eval_steps_per_second": 8.548, "step": 45200 }, { "epoch": 2.93, "grad_norm": 12.349609375, "learning_rate": 4.53e-05, "loss": 1.1205, "step": 45300 }, { "epoch": 2.94, "grad_norm": 10.934174537658691, "learning_rate": 4.5400000000000006e-05, "loss": 1.1149, "step": 45400 }, { "epoch": 2.94, "eval_accuracy": 0.7541736227045075, "eval_loss": 1.142331838607788, "eval_runtime": 7.4991, "eval_samples_per_second": 133.349, "eval_steps_per_second": 8.401, "step": 45400 }, { "epoch": 2.94, "grad_norm": 6.647462368011475, "learning_rate": 4.55e-05, "loss": 1.0724, "step": 45500 }, { "epoch": 2.95, "grad_norm": 9.966377258300781, "learning_rate": 4.5600000000000004e-05, "loss": 1.0981, "step": 45600 }, { "epoch": 2.95, "eval_accuracy": 0.7528373266078184, "eval_loss": 1.1308395862579346, "eval_runtime": 7.3358, "eval_samples_per_second": 136.317, "eval_steps_per_second": 8.588, "step": 45600 }, { "epoch": 2.96, "grad_norm": 11.750727653503418, "learning_rate": 4.5700000000000006e-05, "loss": 1.0862, "step": 45700 }, { "epoch": 2.96, "grad_norm": 6.274352550506592, "learning_rate": 4.58e-05, "loss": 1.0945, "step": 45800 }, { "epoch": 2.96, "eval_accuracy": 0.7562315166877904, "eval_loss": 1.0805420875549316, "eval_runtime": 7.5716, "eval_samples_per_second": 132.073, "eval_steps_per_second": 8.321, "step": 45800 }, { "epoch": 2.97, "grad_norm": 9.97281551361084, "learning_rate": 4.5900000000000004e-05, "loss": 1.1058, "step": 45900 }, { "epoch": 2.97, "grad_norm": 15.384854316711426, "learning_rate": 4.600000000000001e-05, "loss": 1.0634, "step": 46000 }, { "epoch": 2.97, "eval_accuracy": 0.7542407943731899, "eval_loss": 1.058525562286377, "eval_runtime": 7.79, "eval_samples_per_second": 128.37, "eval_steps_per_second": 8.087, "step": 46000 }, { "epoch": 2.98, "grad_norm": 7.97821044921875, "learning_rate": 4.61e-05, "loss": 1.1217, "step": 46100 }, { "epoch": 2.99, "grad_norm": 11.064678192138672, "learning_rate": 4.6200000000000005e-05, "loss": 1.1364, "step": 46200 }, { "epoch": 2.99, "eval_accuracy": 0.7592748735244519, "eval_loss": 1.0892812013626099, "eval_runtime": 7.6511, "eval_samples_per_second": 130.699, "eval_steps_per_second": 8.234, "step": 46200 }, { "epoch": 2.99, "grad_norm": 11.729290962219238, "learning_rate": 4.630000000000001e-05, "loss": 1.0635, "step": 46300 }, { "epoch": 3.0, "grad_norm": 9.48564338684082, "learning_rate": 4.64e-05, "loss": 1.1017, "step": 46400 }, { "epoch": 3.0, "eval_accuracy": 0.7639593908629442, "eval_loss": 1.086236834526062, "eval_runtime": 8.0149, "eval_samples_per_second": 124.768, "eval_steps_per_second": 7.86, "step": 46400 }, { "epoch": 3.01, "grad_norm": 10.503665924072266, "learning_rate": 4.6500000000000005e-05, "loss": 1.1182, "step": 46500 }, { "epoch": 3.01, "grad_norm": 8.116458892822266, "learning_rate": 4.660000000000001e-05, "loss": 1.0955, "step": 46600 }, { "epoch": 3.01, "eval_accuracy": 0.7559021922428331, "eval_loss": 1.1823790073394775, "eval_runtime": 7.3501, "eval_samples_per_second": 136.053, "eval_steps_per_second": 8.571, "step": 46600 }, { "epoch": 3.02, "grad_norm": 10.496063232421875, "learning_rate": 4.6700000000000003e-05, "loss": 1.0541, "step": 46700 }, { "epoch": 3.03, "grad_norm": 9.730162620544434, "learning_rate": 4.6800000000000006e-05, "loss": 1.0823, "step": 46800 }, { "epoch": 3.03, "eval_accuracy": 0.752204955900882, "eval_loss": 1.1216745376586914, "eval_runtime": 7.633, "eval_samples_per_second": 131.01, "eval_steps_per_second": 8.254, "step": 46800 }, { "epoch": 3.03, "grad_norm": 13.034643173217773, "learning_rate": 4.69e-05, "loss": 1.1126, "step": 46900 }, { "epoch": 3.04, "grad_norm": 12.074089050292969, "learning_rate": 4.7e-05, "loss": 1.1024, "step": 47000 }, { "epoch": 3.04, "eval_accuracy": 0.7477894736842106, "eval_loss": 1.0727558135986328, "eval_runtime": 8.0325, "eval_samples_per_second": 124.494, "eval_steps_per_second": 7.843, "step": 47000 }, { "epoch": 3.05, "grad_norm": 6.91002082824707, "learning_rate": 4.71e-05, "loss": 1.093, "step": 47100 }, { "epoch": 3.05, "grad_norm": 11.491669654846191, "learning_rate": 4.72e-05, "loss": 1.0652, "step": 47200 }, { "epoch": 3.05, "eval_accuracy": 0.7492637778712663, "eval_loss": 1.0745832920074463, "eval_runtime": 7.8673, "eval_samples_per_second": 127.108, "eval_steps_per_second": 8.008, "step": 47200 }, { "epoch": 3.06, "grad_norm": 11.350371360778809, "learning_rate": 4.73e-05, "loss": 1.093, "step": 47300 }, { "epoch": 3.07, "grad_norm": 9.675811767578125, "learning_rate": 4.74e-05, "loss": 1.0796, "step": 47400 }, { "epoch": 3.07, "eval_accuracy": 0.7505258729490955, "eval_loss": 1.0906462669372559, "eval_runtime": 7.6523, "eval_samples_per_second": 130.68, "eval_steps_per_second": 8.233, "step": 47400 }, { "epoch": 3.07, "grad_norm": 9.536125183105469, "learning_rate": 4.75e-05, "loss": 1.1082, "step": 47500 }, { "epoch": 3.08, "grad_norm": 11.2163724899292, "learning_rate": 4.76e-05, "loss": 1.146, "step": 47600 }, { "epoch": 3.08, "eval_accuracy": 0.7498952660242982, "eval_loss": 1.1773439645767212, "eval_runtime": 7.7609, "eval_samples_per_second": 128.851, "eval_steps_per_second": 8.118, "step": 47600 }, { "epoch": 3.08, "grad_norm": 11.2960786819458, "learning_rate": 4.77e-05, "loss": 1.0904, "step": 47700 }, { "epoch": 3.09, "grad_norm": 8.260519981384277, "learning_rate": 4.78e-05, "loss": 1.0746, "step": 47800 }, { "epoch": 3.09, "eval_accuracy": 0.7473639814424293, "eval_loss": 1.1027828454971313, "eval_runtime": 7.9355, "eval_samples_per_second": 126.016, "eval_steps_per_second": 7.939, "step": 47800 }, { "epoch": 3.1, "grad_norm": 9.213499069213867, "learning_rate": 4.79e-05, "loss": 1.0681, "step": 47900 }, { "epoch": 3.1, "grad_norm": 12.201976776123047, "learning_rate": 4.8e-05, "loss": 1.1104, "step": 48000 }, { "epoch": 3.1, "eval_accuracy": 0.7519899455383326, "eval_loss": 1.094162106513977, "eval_runtime": 7.5944, "eval_samples_per_second": 131.675, "eval_steps_per_second": 8.296, "step": 48000 }, { "epoch": 3.11, "grad_norm": 8.49892807006836, "learning_rate": 4.8100000000000004e-05, "loss": 1.079, "step": 48100 }, { "epoch": 3.12, "grad_norm": 11.533262252807617, "learning_rate": 4.82e-05, "loss": 1.1028, "step": 48200 }, { "epoch": 3.12, "eval_accuracy": 0.7523167649536647, "eval_loss": 1.0598485469818115, "eval_runtime": 7.7616, "eval_samples_per_second": 128.839, "eval_steps_per_second": 8.117, "step": 48200 }, { "epoch": 3.12, "grad_norm": 13.448162078857422, "learning_rate": 4.83e-05, "loss": 1.0736, "step": 48300 }, { "epoch": 3.13, "grad_norm": 9.824170112609863, "learning_rate": 4.8400000000000004e-05, "loss": 1.0314, "step": 48400 }, { "epoch": 3.13, "eval_accuracy": 0.7477894736842106, "eval_loss": 1.094994068145752, "eval_runtime": 7.5662, "eval_samples_per_second": 132.166, "eval_steps_per_second": 8.326, "step": 48400 }, { "epoch": 3.14, "grad_norm": 10.677599906921387, "learning_rate": 4.85e-05, "loss": 1.1087, "step": 48500 }, { "epoch": 3.14, "grad_norm": 10.457268714904785, "learning_rate": 4.86e-05, "loss": 1.1043, "step": 48600 }, { "epoch": 3.14, "eval_accuracy": 0.7427474402730375, "eval_loss": 1.189258337020874, "eval_runtime": 7.5288, "eval_samples_per_second": 132.824, "eval_steps_per_second": 8.368, "step": 48600 }, { "epoch": 3.15, "grad_norm": 9.427942276000977, "learning_rate": 4.87e-05, "loss": 1.1128, "step": 48700 }, { "epoch": 3.16, "grad_norm": 10.203646659851074, "learning_rate": 4.88e-05, "loss": 1.1311, "step": 48800 }, { "epoch": 3.16, "eval_accuracy": 0.7520067596113224, "eval_loss": 1.08732271194458, "eval_runtime": 7.9165, "eval_samples_per_second": 126.318, "eval_steps_per_second": 7.958, "step": 48800 }, { "epoch": 3.16, "grad_norm": 7.653264999389648, "learning_rate": 4.89e-05, "loss": 1.0982, "step": 48900 }, { "epoch": 3.17, "grad_norm": 12.875405311584473, "learning_rate": 4.9e-05, "loss": 1.0939, "step": 49000 }, { "epoch": 3.17, "eval_accuracy": 0.7524129248845992, "eval_loss": 1.1097220182418823, "eval_runtime": 7.8602, "eval_samples_per_second": 127.224, "eval_steps_per_second": 8.015, "step": 49000 }, { "epoch": 3.18, "grad_norm": 9.38310718536377, "learning_rate": 4.91e-05, "loss": 1.1145, "step": 49100 }, { "epoch": 3.18, "grad_norm": 10.719407081604004, "learning_rate": 4.92e-05, "loss": 1.1067, "step": 49200 }, { "epoch": 3.18, "eval_accuracy": 0.7335874629394324, "eval_loss": 1.1549091339111328, "eval_runtime": 8.1064, "eval_samples_per_second": 123.359, "eval_steps_per_second": 7.772, "step": 49200 }, { "epoch": 3.19, "grad_norm": 8.970664978027344, "learning_rate": 4.93e-05, "loss": 1.0829, "step": 49300 }, { "epoch": 3.19, "grad_norm": 14.89026927947998, "learning_rate": 4.94e-05, "loss": 1.1037, "step": 49400 }, { "epoch": 3.19, "eval_accuracy": 0.7534533277521975, "eval_loss": 1.0937861204147339, "eval_runtime": 7.4457, "eval_samples_per_second": 134.305, "eval_steps_per_second": 8.461, "step": 49400 }, { "epoch": 3.2, "grad_norm": 9.5610990524292, "learning_rate": 4.9500000000000004e-05, "loss": 1.05, "step": 49500 }, { "epoch": 3.21, "grad_norm": 8.428321838378906, "learning_rate": 4.96e-05, "loss": 1.0947, "step": 49600 }, { "epoch": 3.21, "eval_accuracy": 0.7497872340425532, "eval_loss": 1.106512188911438, "eval_runtime": 7.6193, "eval_samples_per_second": 131.246, "eval_steps_per_second": 8.269, "step": 49600 }, { "epoch": 3.21, "grad_norm": 9.604186058044434, "learning_rate": 4.97e-05, "loss": 1.0691, "step": 49700 }, { "epoch": 3.22, "grad_norm": 8.213460922241211, "learning_rate": 4.9800000000000004e-05, "loss": 1.1154, "step": 49800 }, { "epoch": 3.22, "eval_accuracy": 0.7657961246840775, "eval_loss": 1.0275248289108276, "eval_runtime": 7.5429, "eval_samples_per_second": 132.575, "eval_steps_per_second": 8.352, "step": 49800 }, { "epoch": 3.23, "grad_norm": 9.115046501159668, "learning_rate": 4.99e-05, "loss": 1.0976, "step": 49900 }, { "epoch": 3.23, "grad_norm": 8.49641227722168, "learning_rate": 5e-05, "loss": 1.0734, "step": 50000 }, { "epoch": 3.23, "eval_accuracy": 0.7477707006369426, "eval_loss": 1.099561333656311, "eval_runtime": 7.5784, "eval_samples_per_second": 131.953, "eval_steps_per_second": 8.313, "step": 50000 }, { "epoch": 3.24, "grad_norm": 8.749719619750977, "learning_rate": 4.995221712538227e-05, "loss": 1.0941, "step": 50100 }, { "epoch": 3.25, "grad_norm": 7.778064727783203, "learning_rate": 4.990443425076453e-05, "loss": 1.0794, "step": 50200 }, { "epoch": 3.25, "eval_accuracy": 0.755741127348643, "eval_loss": 1.0712189674377441, "eval_runtime": 8.0621, "eval_samples_per_second": 124.037, "eval_steps_per_second": 7.814, "step": 50200 }, { "epoch": 3.25, "grad_norm": 7.256039619445801, "learning_rate": 4.985665137614679e-05, "loss": 1.0948, "step": 50300 }, { "epoch": 3.26, "grad_norm": 8.533563613891602, "learning_rate": 4.980886850152906e-05, "loss": 1.0599, "step": 50400 }, { "epoch": 3.26, "eval_accuracy": 0.7587075115400755, "eval_loss": 1.0337634086608887, "eval_runtime": 7.7334, "eval_samples_per_second": 129.31, "eval_steps_per_second": 8.147, "step": 50400 }, { "epoch": 3.27, "grad_norm": 8.167562484741211, "learning_rate": 4.9761085626911316e-05, "loss": 1.0827, "step": 50500 }, { "epoch": 3.27, "grad_norm": 8.494314193725586, "learning_rate": 4.9713302752293575e-05, "loss": 1.0841, "step": 50600 }, { "epoch": 3.27, "eval_accuracy": 0.7442636629119733, "eval_loss": 1.0798754692077637, "eval_runtime": 7.6464, "eval_samples_per_second": 130.781, "eval_steps_per_second": 8.239, "step": 50600 }, { "epoch": 3.28, "grad_norm": 11.059128761291504, "learning_rate": 4.966551987767584e-05, "loss": 1.0942, "step": 50700 }, { "epoch": 3.29, "grad_norm": 10.59390640258789, "learning_rate": 4.9617737003058106e-05, "loss": 1.1163, "step": 50800 }, { "epoch": 3.29, "eval_accuracy": 0.7463126843657817, "eval_loss": 1.1159123182296753, "eval_runtime": 7.573, "eval_samples_per_second": 132.047, "eval_steps_per_second": 8.319, "step": 50800 }, { "epoch": 3.29, "grad_norm": 9.684558868408203, "learning_rate": 4.956995412844037e-05, "loss": 1.0176, "step": 50900 }, { "epoch": 3.3, "grad_norm": 10.771846771240234, "learning_rate": 4.952217125382263e-05, "loss": 1.0802, "step": 51000 }, { "epoch": 3.3, "eval_accuracy": 0.7507356031946196, "eval_loss": 1.087752103805542, "eval_runtime": 7.7037, "eval_samples_per_second": 129.808, "eval_steps_per_second": 8.178, "step": 51000 }, { "epoch": 3.3, "grad_norm": 9.980944633483887, "learning_rate": 4.9474388379204896e-05, "loss": 1.0827, "step": 51100 }, { "epoch": 3.31, "grad_norm": 9.02600383758545, "learning_rate": 4.942660550458716e-05, "loss": 1.0658, "step": 51200 }, { "epoch": 3.31, "eval_accuracy": 0.7668789808917198, "eval_loss": 1.0151196718215942, "eval_runtime": 8.5326, "eval_samples_per_second": 117.198, "eval_steps_per_second": 7.383, "step": 51200 }, { "epoch": 3.32, "grad_norm": 10.482467651367188, "learning_rate": 4.937882262996942e-05, "loss": 1.053, "step": 51300 }, { "epoch": 3.32, "grad_norm": 10.7754487991333, "learning_rate": 4.933103975535168e-05, "loss": 1.0826, "step": 51400 }, { "epoch": 3.32, "eval_accuracy": 0.755500207555002, "eval_loss": 1.0952320098876953, "eval_runtime": 7.8126, "eval_samples_per_second": 127.999, "eval_steps_per_second": 8.064, "step": 51400 }, { "epoch": 3.33, "grad_norm": 10.212149620056152, "learning_rate": 4.9283256880733945e-05, "loss": 1.0775, "step": 51500 }, { "epoch": 3.34, "grad_norm": 8.147611618041992, "learning_rate": 4.923547400611621e-05, "loss": 1.0556, "step": 51600 }, { "epoch": 3.34, "eval_accuracy": 0.7426999576809141, "eval_loss": 1.1387273073196411, "eval_runtime": 8.0444, "eval_samples_per_second": 124.31, "eval_steps_per_second": 7.832, "step": 51600 }, { "epoch": 3.34, "grad_norm": 8.843451499938965, "learning_rate": 4.9187691131498476e-05, "loss": 1.068, "step": 51700 }, { "epoch": 3.35, "grad_norm": 9.595107078552246, "learning_rate": 4.9139908256880734e-05, "loss": 1.0829, "step": 51800 }, { "epoch": 3.35, "eval_accuracy": 0.7485380116959064, "eval_loss": 1.0434927940368652, "eval_runtime": 7.6624, "eval_samples_per_second": 130.508, "eval_steps_per_second": 8.222, "step": 51800 }, { "epoch": 3.36, "grad_norm": 11.910065650939941, "learning_rate": 4.9092125382263e-05, "loss": 1.0489, "step": 51900 }, { "epoch": 3.36, "grad_norm": 9.84347915649414, "learning_rate": 4.9044342507645265e-05, "loss": 1.0813, "step": 52000 }, { "epoch": 3.36, "eval_accuracy": 0.7582325969153814, "eval_loss": 1.1042767763137817, "eval_runtime": 7.8835, "eval_samples_per_second": 126.847, "eval_steps_per_second": 7.991, "step": 52000 }, { "epoch": 3.37, "grad_norm": 8.326629638671875, "learning_rate": 4.899655963302753e-05, "loss": 1.1325, "step": 52100 }, { "epoch": 3.38, "grad_norm": 10.416407585144043, "learning_rate": 4.894877675840979e-05, "loss": 1.1169, "step": 52200 }, { "epoch": 3.38, "eval_accuracy": 0.7723611699872828, "eval_loss": 0.992525041103363, "eval_runtime": 7.909, "eval_samples_per_second": 126.439, "eval_steps_per_second": 7.966, "step": 52200 }, { "epoch": 3.38, "grad_norm": 8.360919952392578, "learning_rate": 4.890099388379205e-05, "loss": 1.1516, "step": 52300 }, { "epoch": 3.39, "grad_norm": 18.127708435058594, "learning_rate": 4.8853211009174314e-05, "loss": 1.2196, "step": 52400 }, { "epoch": 3.39, "eval_accuracy": 0.6604493429419246, "eval_loss": 1.846261739730835, "eval_runtime": 7.7724, "eval_samples_per_second": 128.66, "eval_steps_per_second": 8.106, "step": 52400 }, { "epoch": 3.39, "grad_norm": 10.633153915405273, "learning_rate": 4.880542813455658e-05, "loss": 1.6355, "step": 52500 }, { "epoch": 3.4, "grad_norm": 7.995849132537842, "learning_rate": 4.875764525993884e-05, "loss": 1.1472, "step": 52600 }, { "epoch": 3.4, "eval_accuracy": 0.751167728237792, "eval_loss": 1.109209418296814, "eval_runtime": 7.5642, "eval_samples_per_second": 132.201, "eval_steps_per_second": 8.329, "step": 52600 }, { "epoch": 3.41, "grad_norm": 9.700560569763184, "learning_rate": 4.8709862385321104e-05, "loss": 1.1671, "step": 52700 }, { "epoch": 3.41, "grad_norm": 8.057334899902344, "learning_rate": 4.866207951070337e-05, "loss": 1.1211, "step": 52800 }, { "epoch": 3.41, "eval_accuracy": 0.7507356031946196, "eval_loss": 1.1020898818969727, "eval_runtime": 7.4826, "eval_samples_per_second": 133.643, "eval_steps_per_second": 8.42, "step": 52800 }, { "epoch": 3.42, "grad_norm": 8.370006561279297, "learning_rate": 4.861429663608563e-05, "loss": 1.0852, "step": 52900 }, { "epoch": 3.43, "grad_norm": 8.273048400878906, "learning_rate": 4.8566513761467894e-05, "loss": 1.1418, "step": 53000 }, { "epoch": 3.43, "eval_accuracy": 0.7487394957983193, "eval_loss": 1.1890805959701538, "eval_runtime": 8.2009, "eval_samples_per_second": 121.939, "eval_steps_per_second": 7.682, "step": 53000 }, { "epoch": 3.43, "grad_norm": 11.383450508117676, "learning_rate": 4.851873088685015e-05, "loss": 1.0664, "step": 53100 }, { "epoch": 3.44, "grad_norm": 7.2834930419921875, "learning_rate": 4.847094801223242e-05, "loss": 1.152, "step": 53200 }, { "epoch": 3.44, "eval_accuracy": 0.7516863406408094, "eval_loss": 1.0853623151779175, "eval_runtime": 6.1661, "eval_samples_per_second": 162.176, "eval_steps_per_second": 10.217, "step": 53200 }, { "epoch": 3.45, "grad_norm": 8.712018966674805, "learning_rate": 4.8423165137614677e-05, "loss": 1.0813, "step": 53300 }, { "epoch": 3.45, "grad_norm": 9.760322570800781, "learning_rate": 4.837538226299694e-05, "loss": 1.0727, "step": 53400 }, { "epoch": 3.45, "eval_accuracy": 0.7369082530372852, "eval_loss": 1.1139088869094849, "eval_runtime": 7.9722, "eval_samples_per_second": 125.436, "eval_steps_per_second": 7.902, "step": 53400 }, { "epoch": 3.46, "grad_norm": 7.164800643920898, "learning_rate": 4.832759938837921e-05, "loss": 1.0666, "step": 53500 }, { "epoch": 3.47, "grad_norm": 8.785635948181152, "learning_rate": 4.827981651376147e-05, "loss": 1.0558, "step": 53600 }, { "epoch": 3.47, "eval_accuracy": 0.7493692178301093, "eval_loss": 1.1060575246810913, "eval_runtime": 7.9053, "eval_samples_per_second": 126.497, "eval_steps_per_second": 7.969, "step": 53600 }, { "epoch": 3.47, "grad_norm": 11.133256912231445, "learning_rate": 4.823203363914373e-05, "loss": 1.0718, "step": 53700 }, { "epoch": 3.48, "grad_norm": 13.262248992919922, "learning_rate": 4.8184250764526e-05, "loss": 1.0884, "step": 53800 }, { "epoch": 3.48, "eval_accuracy": 0.7624010004168403, "eval_loss": 1.0213675498962402, "eval_runtime": 7.4042, "eval_samples_per_second": 135.058, "eval_steps_per_second": 8.509, "step": 53800 }, { "epoch": 3.49, "grad_norm": 9.733431816101074, "learning_rate": 4.813646788990826e-05, "loss": 1.0935, "step": 53900 }, { "epoch": 3.49, "grad_norm": 9.724926948547363, "learning_rate": 4.808868501529052e-05, "loss": 1.1193, "step": 54000 }, { "epoch": 3.49, "eval_accuracy": 0.7465263157894737, "eval_loss": 1.0979362726211548, "eval_runtime": 7.7522, "eval_samples_per_second": 128.996, "eval_steps_per_second": 8.127, "step": 54000 }, { "epoch": 3.5, "grad_norm": 9.661033630371094, "learning_rate": 4.804090214067278e-05, "loss": 1.1035, "step": 54100 }, { "epoch": 3.5, "grad_norm": 7.575204372406006, "learning_rate": 4.7993119266055046e-05, "loss": 1.0549, "step": 54200 }, { "epoch": 3.5, "eval_accuracy": 0.7633684210526316, "eval_loss": 1.031359076499939, "eval_runtime": 7.8559, "eval_samples_per_second": 127.294, "eval_steps_per_second": 8.019, "step": 54200 }, { "epoch": 3.51, "grad_norm": 7.711136817932129, "learning_rate": 4.794533639143731e-05, "loss": 1.0835, "step": 54300 }, { "epoch": 3.52, "grad_norm": 10.317800521850586, "learning_rate": 4.789755351681958e-05, "loss": 1.086, "step": 54400 }, { "epoch": 3.52, "eval_accuracy": 0.7449748743718593, "eval_loss": 1.1570295095443726, "eval_runtime": 7.8027, "eval_samples_per_second": 128.161, "eval_steps_per_second": 8.074, "step": 54400 }, { "epoch": 3.52, "grad_norm": 10.803679466247559, "learning_rate": 4.7849770642201836e-05, "loss": 1.0837, "step": 54500 }, { "epoch": 3.53, "grad_norm": 9.72790241241455, "learning_rate": 4.78019877675841e-05, "loss": 1.1324, "step": 54600 }, { "epoch": 3.53, "eval_accuracy": 0.7474789915966387, "eval_loss": 1.1258342266082764, "eval_runtime": 7.5357, "eval_samples_per_second": 132.702, "eval_steps_per_second": 8.36, "step": 54600 }, { "epoch": 3.54, "grad_norm": 13.201569557189941, "learning_rate": 4.775420489296637e-05, "loss": 1.0819, "step": 54700 }, { "epoch": 3.54, "grad_norm": 10.45574951171875, "learning_rate": 4.7706422018348626e-05, "loss": 1.0607, "step": 54800 }, { "epoch": 3.54, "eval_accuracy": 0.7553282182438192, "eval_loss": 1.1277656555175781, "eval_runtime": 7.5032, "eval_samples_per_second": 133.276, "eval_steps_per_second": 8.396, "step": 54800 }, { "epoch": 3.55, "grad_norm": 10.999916076660156, "learning_rate": 4.7658639143730884e-05, "loss": 1.0747, "step": 54900 }, { "epoch": 3.56, "grad_norm": 9.783334732055664, "learning_rate": 4.761085626911315e-05, "loss": 1.0802, "step": 55000 }, { "epoch": 3.56, "eval_accuracy": 0.7523444160272805, "eval_loss": 1.1184240579605103, "eval_runtime": 7.5391, "eval_samples_per_second": 132.641, "eval_steps_per_second": 8.356, "step": 55000 }, { "epoch": 3.56, "grad_norm": 5.806687831878662, "learning_rate": 4.7563073394495415e-05, "loss": 1.0797, "step": 55100 }, { "epoch": 3.57, "grad_norm": 9.69478988647461, "learning_rate": 4.7515290519877674e-05, "loss": 1.1152, "step": 55200 }, { "epoch": 3.57, "eval_accuracy": 0.7443037974683544, "eval_loss": 1.1427620649337769, "eval_runtime": 8.3604, "eval_samples_per_second": 119.612, "eval_steps_per_second": 7.536, "step": 55200 }, { "epoch": 3.58, "grad_norm": 12.861019134521484, "learning_rate": 4.746750764525994e-05, "loss": 1.0704, "step": 55300 }, { "epoch": 3.58, "grad_norm": 12.82900619506836, "learning_rate": 4.7419724770642205e-05, "loss": 1.0567, "step": 55400 }, { "epoch": 3.58, "eval_accuracy": 0.7771236333052985, "eval_loss": 0.9889304041862488, "eval_runtime": 7.8908, "eval_samples_per_second": 126.73, "eval_steps_per_second": 7.984, "step": 55400 }, { "epoch": 3.59, "grad_norm": 9.796523094177246, "learning_rate": 4.737194189602447e-05, "loss": 1.0705, "step": 55500 }, { "epoch": 3.6, "grad_norm": 7.853588104248047, "learning_rate": 4.732415902140673e-05, "loss": 1.1064, "step": 55600 }, { "epoch": 3.6, "eval_accuracy": 0.7527242246437552, "eval_loss": 1.1006696224212646, "eval_runtime": 7.9336, "eval_samples_per_second": 126.046, "eval_steps_per_second": 7.941, "step": 55600 }, { "epoch": 3.6, "grad_norm": 9.183232307434082, "learning_rate": 4.7276376146788995e-05, "loss": 0.9929, "step": 55700 }, { "epoch": 3.61, "grad_norm": 7.295994758605957, "learning_rate": 4.7228593272171254e-05, "loss": 1.0731, "step": 55800 }, { "epoch": 3.61, "eval_accuracy": 0.7505294366793731, "eval_loss": 1.1186912059783936, "eval_runtime": 7.83, "eval_samples_per_second": 127.714, "eval_steps_per_second": 8.046, "step": 55800 }, { "epoch": 3.61, "grad_norm": 31.710182189941406, "learning_rate": 4.718081039755352e-05, "loss": 1.0483, "step": 55900 }, { "epoch": 3.62, "grad_norm": 10.91593074798584, "learning_rate": 4.713302752293578e-05, "loss": 1.0206, "step": 56000 }, { "epoch": 3.62, "eval_accuracy": 0.7650273224043715, "eval_loss": 1.0444929599761963, "eval_runtime": 7.8389, "eval_samples_per_second": 127.569, "eval_steps_per_second": 8.037, "step": 56000 }, { "epoch": 3.63, "grad_norm": 9.551151275634766, "learning_rate": 4.7085244648318044e-05, "loss": 1.1503, "step": 56100 }, { "epoch": 3.63, "grad_norm": 9.768546104431152, "learning_rate": 4.703746177370031e-05, "loss": 1.0852, "step": 56200 }, { "epoch": 3.63, "eval_accuracy": 0.740787269681742, "eval_loss": 1.1176525354385376, "eval_runtime": 8.0793, "eval_samples_per_second": 123.773, "eval_steps_per_second": 7.798, "step": 56200 }, { "epoch": 3.64, "grad_norm": 11.523877143859863, "learning_rate": 4.6989678899082575e-05, "loss": 1.0826, "step": 56300 }, { "epoch": 3.65, "grad_norm": 24.44474220275879, "learning_rate": 4.694189602446483e-05, "loss": 1.3795, "step": 56400 }, { "epoch": 3.65, "eval_accuracy": 0.6322084579239641, "eval_loss": 1.7602471113204956, "eval_runtime": 7.6485, "eval_samples_per_second": 130.744, "eval_steps_per_second": 8.237, "step": 56400 }, { "epoch": 3.65, "grad_norm": 11.254035949707031, "learning_rate": 4.68941131498471e-05, "loss": 1.2582, "step": 56500 }, { "epoch": 3.66, "grad_norm": 9.820259094238281, "learning_rate": 4.684633027522936e-05, "loss": 1.6473, "step": 56600 }, { "epoch": 3.66, "eval_accuracy": 0.7061293031066331, "eval_loss": 1.3152257204055786, "eval_runtime": 7.7306, "eval_samples_per_second": 129.356, "eval_steps_per_second": 8.149, "step": 56600 }, { "epoch": 3.67, "grad_norm": 16.91270637512207, "learning_rate": 4.679854740061162e-05, "loss": 1.2617, "step": 56700 }, { "epoch": 3.67, "grad_norm": 9.77519416809082, "learning_rate": 4.675076452599388e-05, "loss": 1.261, "step": 56800 }, { "epoch": 3.67, "eval_accuracy": 0.7495867768595041, "eval_loss": 1.1347088813781738, "eval_runtime": 7.6204, "eval_samples_per_second": 131.226, "eval_steps_per_second": 8.267, "step": 56800 }, { "epoch": 3.68, "grad_norm": 9.02775764465332, "learning_rate": 4.670298165137615e-05, "loss": 1.0929, "step": 56900 }, { "epoch": 3.69, "grad_norm": 9.367201805114746, "learning_rate": 4.665519877675841e-05, "loss": 1.0474, "step": 57000 }, { "epoch": 3.69, "eval_accuracy": 0.7580645161290323, "eval_loss": 1.1005109548568726, "eval_runtime": 7.9374, "eval_samples_per_second": 125.985, "eval_steps_per_second": 7.937, "step": 57000 }, { "epoch": 3.69, "grad_norm": 7.824542045593262, "learning_rate": 4.660741590214068e-05, "loss": 1.0577, "step": 57100 }, { "epoch": 3.7, "grad_norm": 9.367128372192383, "learning_rate": 4.655963302752294e-05, "loss": 1.1096, "step": 57200 }, { "epoch": 3.7, "eval_accuracy": 0.7543786488740617, "eval_loss": 1.080618143081665, "eval_runtime": 8.1222, "eval_samples_per_second": 123.119, "eval_steps_per_second": 7.756, "step": 57200 }, { "epoch": 3.71, "grad_norm": 8.868852615356445, "learning_rate": 4.65118501529052e-05, "loss": 1.0459, "step": 57300 }, { "epoch": 3.71, "grad_norm": 10.00037956237793, "learning_rate": 4.646406727828747e-05, "loss": 1.0256, "step": 57400 }, { "epoch": 3.71, "eval_accuracy": 0.7531592249368155, "eval_loss": 1.0906511545181274, "eval_runtime": 8.3077, "eval_samples_per_second": 120.37, "eval_steps_per_second": 7.583, "step": 57400 }, { "epoch": 3.72, "grad_norm": 13.429753303527832, "learning_rate": 4.641628440366973e-05, "loss": 1.0834, "step": 57500 }, { "epoch": 3.72, "grad_norm": 7.028581142425537, "learning_rate": 4.6368501529051986e-05, "loss": 1.0153, "step": 57600 }, { "epoch": 3.72, "eval_accuracy": 0.7590717299578059, "eval_loss": 1.0553700923919678, "eval_runtime": 7.5695, "eval_samples_per_second": 132.109, "eval_steps_per_second": 8.323, "step": 57600 }, { "epoch": 3.73, "grad_norm": 7.833512783050537, "learning_rate": 4.632071865443425e-05, "loss": 1.0315, "step": 57700 }, { "epoch": 3.74, "grad_norm": 7.073586463928223, "learning_rate": 4.627293577981652e-05, "loss": 1.0552, "step": 57800 }, { "epoch": 3.74, "eval_accuracy": 0.7441471571906354, "eval_loss": 1.1244982481002808, "eval_runtime": 8.3511, "eval_samples_per_second": 119.745, "eval_steps_per_second": 7.544, "step": 57800 }, { "epoch": 3.74, "grad_norm": 13.926541328430176, "learning_rate": 4.6225152905198776e-05, "loss": 1.0826, "step": 57900 }, { "epoch": 3.75, "grad_norm": 10.42960262298584, "learning_rate": 4.617737003058104e-05, "loss": 1.0496, "step": 58000 }, { "epoch": 3.75, "eval_accuracy": 0.7656641604010025, "eval_loss": 1.0922205448150635, "eval_runtime": 7.7397, "eval_samples_per_second": 129.204, "eval_steps_per_second": 8.14, "step": 58000 }, { "epoch": 3.76, "grad_norm": 10.133858680725098, "learning_rate": 4.6129587155963307e-05, "loss": 1.0548, "step": 58100 }, { "epoch": 3.76, "grad_norm": 10.383139610290527, "learning_rate": 4.608180428134557e-05, "loss": 1.046, "step": 58200 }, { "epoch": 3.76, "eval_accuracy": 0.7423799582463465, "eval_loss": 1.1240386962890625, "eval_runtime": 7.4974, "eval_samples_per_second": 133.38, "eval_steps_per_second": 8.403, "step": 58200 }, { "epoch": 3.77, "grad_norm": 9.541145324707031, "learning_rate": 4.603402140672783e-05, "loss": 1.0704, "step": 58300 }, { "epoch": 3.78, "grad_norm": 9.198772430419922, "learning_rate": 4.5986238532110096e-05, "loss": 1.0352, "step": 58400 }, { "epoch": 3.78, "eval_accuracy": 0.7708508845829823, "eval_loss": 1.025152564048767, "eval_runtime": 7.8359, "eval_samples_per_second": 127.618, "eval_steps_per_second": 8.04, "step": 58400 }, { "epoch": 3.78, "grad_norm": 11.298945426940918, "learning_rate": 4.5938455657492355e-05, "loss": 1.0484, "step": 58500 }, { "epoch": 3.79, "grad_norm": 9.25523567199707, "learning_rate": 4.589067278287462e-05, "loss": 1.0713, "step": 58600 }, { "epoch": 3.79, "eval_accuracy": 0.7614447711045779, "eval_loss": 1.0602465867996216, "eval_runtime": 7.8318, "eval_samples_per_second": 127.685, "eval_steps_per_second": 8.044, "step": 58600 }, { "epoch": 3.8, "grad_norm": 7.596445083618164, "learning_rate": 4.584288990825688e-05, "loss": 1.0568, "step": 58700 }, { "epoch": 3.8, "grad_norm": 8.321885108947754, "learning_rate": 4.5795107033639145e-05, "loss": 1.0865, "step": 58800 }, { "epoch": 3.8, "eval_accuracy": 0.7639358108108109, "eval_loss": 1.0062285661697388, "eval_runtime": 7.7815, "eval_samples_per_second": 128.51, "eval_steps_per_second": 8.096, "step": 58800 }, { "epoch": 3.81, "grad_norm": 8.764801979064941, "learning_rate": 4.574732415902141e-05, "loss": 1.0427, "step": 58900 }, { "epoch": 3.82, "grad_norm": 7.0769944190979, "learning_rate": 4.5699541284403676e-05, "loss": 1.0456, "step": 59000 }, { "epoch": 3.82, "eval_accuracy": 0.7709473684210526, "eval_loss": 1.0416101217269897, "eval_runtime": 7.7351, "eval_samples_per_second": 129.28, "eval_steps_per_second": 8.145, "step": 59000 }, { "epoch": 3.82, "grad_norm": 7.2752532958984375, "learning_rate": 4.5651758409785935e-05, "loss": 1.0213, "step": 59100 }, { "epoch": 3.83, "grad_norm": 14.521905899047852, "learning_rate": 4.56039755351682e-05, "loss": 0.9981, "step": 59200 }, { "epoch": 3.83, "eval_accuracy": 0.7725358045492839, "eval_loss": 0.9810414910316467, "eval_runtime": 7.7994, "eval_samples_per_second": 128.215, "eval_steps_per_second": 8.078, "step": 59200 }, { "epoch": 3.83, "grad_norm": 8.639435768127441, "learning_rate": 4.555619266055046e-05, "loss": 1.0492, "step": 59300 }, { "epoch": 3.84, "grad_norm": 9.351419448852539, "learning_rate": 4.5508409785932724e-05, "loss": 0.9991, "step": 59400 }, { "epoch": 3.84, "eval_accuracy": 0.7781970649895178, "eval_loss": 1.0035151243209839, "eval_runtime": 8.1491, "eval_samples_per_second": 122.712, "eval_steps_per_second": 7.731, "step": 59400 }, { "epoch": 3.85, "grad_norm": 12.961902618408203, "learning_rate": 4.546062691131498e-05, "loss": 1.0894, "step": 59500 }, { "epoch": 3.85, "grad_norm": 8.489011764526367, "learning_rate": 4.541284403669725e-05, "loss": 1.009, "step": 59600 }, { "epoch": 3.85, "eval_accuracy": 0.7660297239915075, "eval_loss": 1.065154790878296, "eval_runtime": 8.3879, "eval_samples_per_second": 119.22, "eval_steps_per_second": 7.511, "step": 59600 }, { "epoch": 3.86, "grad_norm": 10.644379615783691, "learning_rate": 4.5365061162079514e-05, "loss": 1.0193, "step": 59700 }, { "epoch": 3.87, "grad_norm": 11.224438667297363, "learning_rate": 4.531727828746178e-05, "loss": 0.9928, "step": 59800 }, { "epoch": 3.87, "eval_accuracy": 0.7462121212121212, "eval_loss": 1.0793315172195435, "eval_runtime": 8.2629, "eval_samples_per_second": 121.023, "eval_steps_per_second": 7.624, "step": 59800 }, { "epoch": 3.87, "grad_norm": 7.241248607635498, "learning_rate": 4.526949541284404e-05, "loss": 1.0614, "step": 59900 }, { "epoch": 3.88, "grad_norm": 8.136162757873535, "learning_rate": 4.5221712538226304e-05, "loss": 1.004, "step": 60000 }, { "epoch": 3.88, "eval_accuracy": 0.7762326169405815, "eval_loss": 0.9910482168197632, "eval_runtime": 7.9585, "eval_samples_per_second": 125.651, "eval_steps_per_second": 7.916, "step": 60000 }, { "epoch": 3.89, "grad_norm": 13.011497497558594, "learning_rate": 4.517392966360857e-05, "loss": 1.0328, "step": 60100 }, { "epoch": 3.89, "grad_norm": 8.685037612915039, "learning_rate": 4.512614678899083e-05, "loss": 1.0641, "step": 60200 }, { "epoch": 3.89, "eval_accuracy": 0.7535744322960471, "eval_loss": 1.0718779563903809, "eval_runtime": 7.9852, "eval_samples_per_second": 125.232, "eval_steps_per_second": 7.89, "step": 60200 }, { "epoch": 3.9, "grad_norm": 17.49058723449707, "learning_rate": 4.507836391437309e-05, "loss": 1.0718, "step": 60300 }, { "epoch": 3.91, "grad_norm": 45.173362731933594, "learning_rate": 4.503058103975535e-05, "loss": 1.1109, "step": 60400 }, { "epoch": 3.91, "eval_accuracy": 0.7388724035608308, "eval_loss": 1.188225507736206, "eval_runtime": 7.4676, "eval_samples_per_second": 133.911, "eval_steps_per_second": 8.436, "step": 60400 }, { "epoch": 3.91, "grad_norm": 8.36913776397705, "learning_rate": 4.498279816513762e-05, "loss": 1.1557, "step": 60500 }, { "epoch": 3.92, "grad_norm": 10.927862167358398, "learning_rate": 4.493501529051988e-05, "loss": 1.1252, "step": 60600 }, { "epoch": 3.92, "eval_accuracy": 0.7636061487328625, "eval_loss": 1.0712857246398926, "eval_runtime": 7.466, "eval_samples_per_second": 133.941, "eval_steps_per_second": 8.438, "step": 60600 }, { "epoch": 3.93, "grad_norm": 9.629243850708008, "learning_rate": 4.488723241590214e-05, "loss": 1.0044, "step": 60700 }, { "epoch": 3.93, "grad_norm": 7.9458465576171875, "learning_rate": 4.483944954128441e-05, "loss": 1.0069, "step": 60800 }, { "epoch": 3.93, "eval_accuracy": 0.7723991507430998, "eval_loss": 1.0241872072219849, "eval_runtime": 7.6432, "eval_samples_per_second": 130.834, "eval_steps_per_second": 8.243, "step": 60800 }, { "epoch": 3.94, "grad_norm": 11.803157806396484, "learning_rate": 4.4791666666666673e-05, "loss": 1.0294, "step": 60900 }, { "epoch": 3.94, "grad_norm": 8.053366661071777, "learning_rate": 4.474388379204893e-05, "loss": 1.0178, "step": 61000 }, { "epoch": 3.94, "eval_accuracy": 0.7698242606086584, "eval_loss": 0.9822267293930054, "eval_runtime": 6.2399, "eval_samples_per_second": 160.258, "eval_steps_per_second": 10.096, "step": 61000 }, { "epoch": 3.95, "grad_norm": 9.405101776123047, "learning_rate": 4.469610091743119e-05, "loss": 1.0232, "step": 61100 }, { "epoch": 3.96, "grad_norm": 9.385193824768066, "learning_rate": 4.4648318042813456e-05, "loss": 1.0328, "step": 61200 }, { "epoch": 3.96, "eval_accuracy": 0.766750418760469, "eval_loss": 0.9997309446334839, "eval_runtime": 8.0527, "eval_samples_per_second": 124.181, "eval_steps_per_second": 7.823, "step": 61200 }, { "epoch": 3.96, "grad_norm": 7.465459823608398, "learning_rate": 4.460053516819572e-05, "loss": 1.0556, "step": 61300 }, { "epoch": 3.97, "grad_norm": 9.944977760314941, "learning_rate": 4.455275229357798e-05, "loss": 1.0709, "step": 61400 }, { "epoch": 3.97, "eval_accuracy": 0.7724399494310998, "eval_loss": 1.0083246231079102, "eval_runtime": 8.1122, "eval_samples_per_second": 123.271, "eval_steps_per_second": 7.766, "step": 61400 }, { "epoch": 3.98, "grad_norm": 10.79748249053955, "learning_rate": 4.4504969418960246e-05, "loss": 1.0559, "step": 61500 }, { "epoch": 3.98, "grad_norm": 7.305400371551514, "learning_rate": 4.445718654434251e-05, "loss": 1.0136, "step": 61600 }, { "epoch": 3.98, "eval_accuracy": 0.77428810720268, "eval_loss": 0.9401131868362427, "eval_runtime": 7.5312, "eval_samples_per_second": 132.782, "eval_steps_per_second": 8.365, "step": 61600 }, { "epoch": 3.99, "grad_norm": 11.166438102722168, "learning_rate": 4.440940366972478e-05, "loss": 1.0089, "step": 61700 }, { "epoch": 4.0, "grad_norm": 7.1273932456970215, "learning_rate": 4.4361620795107036e-05, "loss": 1.0513, "step": 61800 }, { "epoch": 4.0, "eval_accuracy": 0.759235668789809, "eval_loss": 1.0744144916534424, "eval_runtime": 7.5916, "eval_samples_per_second": 131.724, "eval_steps_per_second": 8.299, "step": 61800 }, { "epoch": 4.0, "grad_norm": 8.272632598876953, "learning_rate": 4.43138379204893e-05, "loss": 1.0443, "step": 61900 }, { "epoch": 4.01, "grad_norm": 6.36247444152832, "learning_rate": 4.426605504587156e-05, "loss": 0.9628, "step": 62000 }, { "epoch": 4.01, "eval_accuracy": 0.7582001682085786, "eval_loss": 1.0813343524932861, "eval_runtime": 7.4488, "eval_samples_per_second": 134.25, "eval_steps_per_second": 8.458, "step": 62000 }, { "epoch": 4.02, "grad_norm": 8.797049522399902, "learning_rate": 4.4218272171253826e-05, "loss": 1.0016, "step": 62100 }, { "epoch": 4.02, "grad_norm": 8.456846237182617, "learning_rate": 4.4170489296636085e-05, "loss": 0.9725, "step": 62200 }, { "epoch": 4.02, "eval_accuracy": 0.7573715248525695, "eval_loss": 1.1145051717758179, "eval_runtime": 7.9176, "eval_samples_per_second": 126.301, "eval_steps_per_second": 7.957, "step": 62200 }, { "epoch": 4.03, "grad_norm": 9.03228759765625, "learning_rate": 4.412270642201835e-05, "loss": 0.9483, "step": 62300 }, { "epoch": 4.04, "grad_norm": 8.897140502929688, "learning_rate": 4.4074923547400616e-05, "loss": 0.9814, "step": 62400 }, { "epoch": 4.04, "eval_accuracy": 0.7647804054054054, "eval_loss": 1.0255275964736938, "eval_runtime": 7.7274, "eval_samples_per_second": 129.409, "eval_steps_per_second": 8.153, "step": 62400 }, { "epoch": 4.04, "grad_norm": 6.818185329437256, "learning_rate": 4.402714067278288e-05, "loss": 0.9949, "step": 62500 }, { "epoch": 4.05, "grad_norm": 8.852161407470703, "learning_rate": 4.397935779816514e-05, "loss": 1.0205, "step": 62600 }, { "epoch": 4.05, "eval_accuracy": 0.7617039223956137, "eval_loss": 1.0527292490005493, "eval_runtime": 7.8681, "eval_samples_per_second": 127.096, "eval_steps_per_second": 8.007, "step": 62600 }, { "epoch": 4.05, "grad_norm": 6.465674877166748, "learning_rate": 4.3931574923547405e-05, "loss": 0.9978, "step": 62700 }, { "epoch": 4.06, "grad_norm": 4.8625593185424805, "learning_rate": 4.3883792048929664e-05, "loss": 1.0196, "step": 62800 }, { "epoch": 4.06, "eval_accuracy": 0.7667231160033869, "eval_loss": 1.0493184328079224, "eval_runtime": 7.9925, "eval_samples_per_second": 125.118, "eval_steps_per_second": 7.882, "step": 62800 }, { "epoch": 4.07, "grad_norm": 8.816508293151855, "learning_rate": 4.383600917431192e-05, "loss": 0.9851, "step": 62900 }, { "epoch": 4.07, "grad_norm": 10.162187576293945, "learning_rate": 4.378822629969419e-05, "loss": 0.9812, "step": 63000 }, { "epoch": 4.07, "eval_accuracy": 0.7726890756302521, "eval_loss": 1.0193111896514893, "eval_runtime": 8.3402, "eval_samples_per_second": 119.901, "eval_steps_per_second": 7.554, "step": 63000 }, { "epoch": 4.08, "grad_norm": 8.398265838623047, "learning_rate": 4.3740443425076454e-05, "loss": 1.0087, "step": 63100 }, { "epoch": 4.09, "grad_norm": 7.310011386871338, "learning_rate": 4.369266055045872e-05, "loss": 1.0041, "step": 63200 }, { "epoch": 4.09, "eval_accuracy": 0.7662337662337663, "eval_loss": 1.0352623462677002, "eval_runtime": 7.7178, "eval_samples_per_second": 129.57, "eval_steps_per_second": 8.163, "step": 63200 }, { "epoch": 4.09, "grad_norm": 5.977463722229004, "learning_rate": 4.364487767584098e-05, "loss": 0.9731, "step": 63300 }, { "epoch": 4.1, "grad_norm": 11.757678985595703, "learning_rate": 4.3597094801223244e-05, "loss": 1.0116, "step": 63400 }, { "epoch": 4.1, "eval_accuracy": 0.7707808564231738, "eval_loss": 1.0053455829620361, "eval_runtime": 7.8535, "eval_samples_per_second": 127.333, "eval_steps_per_second": 8.022, "step": 63400 }, { "epoch": 4.11, "grad_norm": 6.815821647644043, "learning_rate": 4.354931192660551e-05, "loss": 1.0514, "step": 63500 }, { "epoch": 4.11, "grad_norm": 7.498113632202148, "learning_rate": 4.3501529051987775e-05, "loss": 0.9778, "step": 63600 }, { "epoch": 4.11, "eval_accuracy": 0.7710021321961621, "eval_loss": 0.9971421957015991, "eval_runtime": 7.9692, "eval_samples_per_second": 125.483, "eval_steps_per_second": 7.905, "step": 63600 }, { "epoch": 4.12, "grad_norm": 7.811466693878174, "learning_rate": 4.3453746177370034e-05, "loss": 1.039, "step": 63700 }, { "epoch": 4.13, "grad_norm": 7.920493125915527, "learning_rate": 4.340596330275229e-05, "loss": 0.9701, "step": 63800 }, { "epoch": 4.13, "eval_accuracy": 0.7584905660377359, "eval_loss": 1.0754740238189697, "eval_runtime": 8.2732, "eval_samples_per_second": 120.872, "eval_steps_per_second": 7.615, "step": 63800 }, { "epoch": 4.13, "grad_norm": 9.953316688537598, "learning_rate": 4.335818042813456e-05, "loss": 1.0232, "step": 63900 }, { "epoch": 4.14, "grad_norm": 8.336577415466309, "learning_rate": 4.3310397553516823e-05, "loss": 1.045, "step": 64000 }, { "epoch": 4.14, "eval_accuracy": 0.7622641509433963, "eval_loss": 1.0283209085464478, "eval_runtime": 8.2049, "eval_samples_per_second": 121.878, "eval_steps_per_second": 7.678, "step": 64000 }, { "epoch": 4.15, "grad_norm": 13.166764259338379, "learning_rate": 4.326261467889908e-05, "loss": 1.0154, "step": 64100 }, { "epoch": 4.15, "grad_norm": 9.266497611999512, "learning_rate": 4.321483180428135e-05, "loss": 1.0344, "step": 64200 }, { "epoch": 4.15, "eval_accuracy": 0.7615097469929489, "eval_loss": 1.057710886001587, "eval_runtime": 8.144, "eval_samples_per_second": 122.789, "eval_steps_per_second": 7.736, "step": 64200 }, { "epoch": 4.16, "grad_norm": 23.1909236907959, "learning_rate": 4.316704892966361e-05, "loss": 0.9845, "step": 64300 }, { "epoch": 4.16, "grad_norm": 9.27863597869873, "learning_rate": 4.311926605504588e-05, "loss": 1.0069, "step": 64400 }, { "epoch": 4.16, "eval_accuracy": 0.7592592592592593, "eval_loss": 1.0770145654678345, "eval_runtime": 7.5449, "eval_samples_per_second": 132.539, "eval_steps_per_second": 8.35, "step": 64400 }, { "epoch": 4.17, "grad_norm": 9.161259651184082, "learning_rate": 4.307148318042814e-05, "loss": 0.9927, "step": 64500 }, { "epoch": 4.18, "grad_norm": 6.4487409591674805, "learning_rate": 4.3023700305810396e-05, "loss": 1.021, "step": 64600 }, { "epoch": 4.18, "eval_accuracy": 0.7632583792957149, "eval_loss": 1.0408591032028198, "eval_runtime": 7.738, "eval_samples_per_second": 129.232, "eval_steps_per_second": 8.142, "step": 64600 }, { "epoch": 4.18, "grad_norm": 8.323763847351074, "learning_rate": 4.297591743119266e-05, "loss": 0.996, "step": 64700 }, { "epoch": 4.19, "grad_norm": 13.744288444519043, "learning_rate": 4.292813455657493e-05, "loss": 0.9755, "step": 64800 }, { "epoch": 4.19, "eval_accuracy": 0.7767519932857743, "eval_loss": 1.0298298597335815, "eval_runtime": 8.3558, "eval_samples_per_second": 119.677, "eval_steps_per_second": 7.54, "step": 64800 }, { "epoch": 4.2, "grad_norm": 7.578272819519043, "learning_rate": 4.2880351681957186e-05, "loss": 1.0077, "step": 64900 }, { "epoch": 4.2, "grad_norm": 123.07152557373047, "learning_rate": 4.283256880733945e-05, "loss": 1.0358, "step": 65000 }, { "epoch": 4.2, "eval_accuracy": 0.7766091712242322, "eval_loss": 0.9902980923652649, "eval_runtime": 8.3068, "eval_samples_per_second": 120.384, "eval_steps_per_second": 7.584, "step": 65000 }, { "epoch": 4.21, "grad_norm": 9.0626220703125, "learning_rate": 4.278478593272172e-05, "loss": 0.9713, "step": 65100 }, { "epoch": 4.22, "grad_norm": 10.862616539001465, "learning_rate": 4.2737003058103976e-05, "loss": 0.9529, "step": 65200 }, { "epoch": 4.22, "eval_accuracy": 0.7713692946058092, "eval_loss": 1.0319756269454956, "eval_runtime": 7.9375, "eval_samples_per_second": 125.985, "eval_steps_per_second": 7.937, "step": 65200 }, { "epoch": 4.22, "grad_norm": 8.288646697998047, "learning_rate": 4.268922018348624e-05, "loss": 0.9703, "step": 65300 }, { "epoch": 4.23, "grad_norm": 6.460119247436523, "learning_rate": 4.264143730886851e-05, "loss": 1.0254, "step": 65400 }, { "epoch": 4.23, "eval_accuracy": 0.751892346509672, "eval_loss": 1.0815229415893555, "eval_runtime": 7.5864, "eval_samples_per_second": 131.815, "eval_steps_per_second": 8.304, "step": 65400 }, { "epoch": 4.24, "grad_norm": 10.511157035827637, "learning_rate": 4.2593654434250766e-05, "loss": 0.9966, "step": 65500 }, { "epoch": 4.24, "grad_norm": 6.696932792663574, "learning_rate": 4.2545871559633024e-05, "loss": 1.0491, "step": 65600 }, { "epoch": 4.24, "eval_accuracy": 0.7635447291054179, "eval_loss": 1.0510512590408325, "eval_runtime": 7.6305, "eval_samples_per_second": 131.053, "eval_steps_per_second": 8.256, "step": 65600 }, { "epoch": 4.25, "grad_norm": 11.408631324768066, "learning_rate": 4.249808868501529e-05, "loss": 0.9789, "step": 65700 }, { "epoch": 4.26, "grad_norm": 11.041851997375488, "learning_rate": 4.2450305810397555e-05, "loss": 0.9874, "step": 65800 }, { "epoch": 4.26, "eval_accuracy": 0.7659484579636671, "eval_loss": 1.0184226036071777, "eval_runtime": 8.0877, "eval_samples_per_second": 123.645, "eval_steps_per_second": 7.79, "step": 65800 }, { "epoch": 4.26, "grad_norm": 10.023484230041504, "learning_rate": 4.240252293577982e-05, "loss": 1.0154, "step": 65900 }, { "epoch": 4.27, "grad_norm": 11.408568382263184, "learning_rate": 4.235474006116208e-05, "loss": 1.0018, "step": 66000 }, { "epoch": 4.27, "eval_accuracy": 0.7666385846672283, "eval_loss": 0.9983900189399719, "eval_runtime": 7.9762, "eval_samples_per_second": 125.373, "eval_steps_per_second": 7.899, "step": 66000 }, { "epoch": 4.27, "grad_norm": 9.738842010498047, "learning_rate": 4.2306957186544345e-05, "loss": 1.0185, "step": 66100 }, { "epoch": 4.28, "grad_norm": 16.175048828125, "learning_rate": 4.225917431192661e-05, "loss": 0.9921, "step": 66200 }, { "epoch": 4.28, "eval_accuracy": 0.7591819699499165, "eval_loss": 1.0717992782592773, "eval_runtime": 7.9182, "eval_samples_per_second": 126.292, "eval_steps_per_second": 7.956, "step": 66200 }, { "epoch": 4.29, "grad_norm": 7.359999179840088, "learning_rate": 4.221139143730887e-05, "loss": 1.0255, "step": 66300 }, { "epoch": 4.29, "grad_norm": 8.455622673034668, "learning_rate": 4.216360856269113e-05, "loss": 0.9559, "step": 66400 }, { "epoch": 4.29, "eval_accuracy": 0.776890756302521, "eval_loss": 1.0402127504348755, "eval_runtime": 7.7267, "eval_samples_per_second": 129.421, "eval_steps_per_second": 8.154, "step": 66400 }, { "epoch": 4.3, "grad_norm": 6.954375267028809, "learning_rate": 4.2115825688073394e-05, "loss": 0.9488, "step": 66500 }, { "epoch": 4.31, "grad_norm": 9.676368713378906, "learning_rate": 4.206804281345566e-05, "loss": 1.0038, "step": 66600 }, { "epoch": 4.31, "eval_accuracy": 0.7763157894736842, "eval_loss": 1.030269742012024, "eval_runtime": 8.3311, "eval_samples_per_second": 120.033, "eval_steps_per_second": 7.562, "step": 66600 }, { "epoch": 4.31, "grad_norm": 6.689573287963867, "learning_rate": 4.2020259938837925e-05, "loss": 0.9876, "step": 66700 }, { "epoch": 4.32, "grad_norm": 10.445016860961914, "learning_rate": 4.1972477064220184e-05, "loss": 1.0217, "step": 66800 }, { "epoch": 4.32, "eval_accuracy": 0.7770700636942676, "eval_loss": 1.0110101699829102, "eval_runtime": 7.9165, "eval_samples_per_second": 126.318, "eval_steps_per_second": 7.958, "step": 66800 }, { "epoch": 4.33, "grad_norm": 8.755440711975098, "learning_rate": 4.192469418960245e-05, "loss": 0.9668, "step": 66900 }, { "epoch": 4.33, "grad_norm": 8.265666961669922, "learning_rate": 4.1876911314984715e-05, "loss": 0.9986, "step": 67000 }, { "epoch": 4.33, "eval_accuracy": 0.7621824239900041, "eval_loss": 1.0677824020385742, "eval_runtime": 7.904, "eval_samples_per_second": 126.518, "eval_steps_per_second": 7.971, "step": 67000 }, { "epoch": 4.34, "grad_norm": 10.270692825317383, "learning_rate": 4.182912844036698e-05, "loss": 0.9884, "step": 67100 }, { "epoch": 4.35, "grad_norm": 8.45980167388916, "learning_rate": 4.178134556574924e-05, "loss": 1.0475, "step": 67200 }, { "epoch": 4.35, "eval_accuracy": 0.7761506276150628, "eval_loss": 1.0028026103973389, "eval_runtime": 8.14, "eval_samples_per_second": 122.85, "eval_steps_per_second": 7.74, "step": 67200 }, { "epoch": 4.35, "grad_norm": 6.8346357345581055, "learning_rate": 4.17335626911315e-05, "loss": 0.9768, "step": 67300 }, { "epoch": 4.36, "grad_norm": 10.122456550598145, "learning_rate": 4.168577981651376e-05, "loss": 0.9703, "step": 67400 }, { "epoch": 4.36, "eval_accuracy": 0.7602854743912678, "eval_loss": 1.0382269620895386, "eval_runtime": 7.8004, "eval_samples_per_second": 128.198, "eval_steps_per_second": 8.077, "step": 67400 }, { "epoch": 4.36, "grad_norm": 7.192403793334961, "learning_rate": 4.163799694189603e-05, "loss": 1.0159, "step": 67500 }, { "epoch": 4.37, "grad_norm": 9.270212173461914, "learning_rate": 4.159021406727829e-05, "loss": 1.0178, "step": 67600 }, { "epoch": 4.37, "eval_accuracy": 0.7648054145516074, "eval_loss": 1.038000226020813, "eval_runtime": 7.6871, "eval_samples_per_second": 130.089, "eval_steps_per_second": 8.196, "step": 67600 }, { "epoch": 4.38, "grad_norm": 8.335596084594727, "learning_rate": 4.154243119266055e-05, "loss": 1.0309, "step": 67700 }, { "epoch": 4.38, "grad_norm": 10.838501930236816, "learning_rate": 4.149464831804282e-05, "loss": 0.9872, "step": 67800 }, { "epoch": 4.38, "eval_accuracy": 0.7658148303309593, "eval_loss": 1.0197337865829468, "eval_runtime": 7.957, "eval_samples_per_second": 125.676, "eval_steps_per_second": 7.918, "step": 67800 }, { "epoch": 4.39, "grad_norm": 12.41421127319336, "learning_rate": 4.144686544342508e-05, "loss": 1.0116, "step": 67900 }, { "epoch": 4.4, "grad_norm": 11.448400497436523, "learning_rate": 4.139908256880734e-05, "loss": 0.9974, "step": 68000 }, { "epoch": 4.4, "eval_accuracy": 0.7712966848510281, "eval_loss": 0.9887294769287109, "eval_runtime": 7.4571, "eval_samples_per_second": 134.1, "eval_steps_per_second": 8.448, "step": 68000 }, { "epoch": 4.4, "grad_norm": 5.945106506347656, "learning_rate": 4.13512996941896e-05, "loss": 0.9693, "step": 68100 }, { "epoch": 4.41, "grad_norm": 7.767606735229492, "learning_rate": 4.130351681957187e-05, "loss": 0.9747, "step": 68200 }, { "epoch": 4.41, "eval_accuracy": 0.7746419545071609, "eval_loss": 0.9959123730659485, "eval_runtime": 8.2183, "eval_samples_per_second": 121.68, "eval_steps_per_second": 7.666, "step": 68200 }, { "epoch": 4.42, "grad_norm": 9.02371597290039, "learning_rate": 4.1255733944954126e-05, "loss": 0.9983, "step": 68300 }, { "epoch": 4.42, "grad_norm": 8.155447959899902, "learning_rate": 4.120795107033639e-05, "loss": 0.9677, "step": 68400 }, { "epoch": 4.42, "eval_accuracy": 0.7727463312368973, "eval_loss": 0.9675217270851135, "eval_runtime": 7.7939, "eval_samples_per_second": 128.305, "eval_steps_per_second": 8.083, "step": 68400 }, { "epoch": 4.43, "grad_norm": 9.404272079467773, "learning_rate": 4.116016819571866e-05, "loss": 0.9513, "step": 68500 }, { "epoch": 4.44, "grad_norm": 10.09421443939209, "learning_rate": 4.111238532110092e-05, "loss": 0.9988, "step": 68600 }, { "epoch": 4.44, "eval_accuracy": 0.7652464494569757, "eval_loss": 1.0191264152526855, "eval_runtime": 7.3044, "eval_samples_per_second": 136.903, "eval_steps_per_second": 8.625, "step": 68600 }, { "epoch": 4.44, "grad_norm": 9.399133682250977, "learning_rate": 4.106460244648318e-05, "loss": 0.967, "step": 68700 }, { "epoch": 4.45, "grad_norm": 10.803784370422363, "learning_rate": 4.1016819571865447e-05, "loss": 0.9658, "step": 68800 }, { "epoch": 4.45, "eval_accuracy": 0.7827181208053692, "eval_loss": 0.9787444472312927, "eval_runtime": 6.6935, "eval_samples_per_second": 149.4, "eval_steps_per_second": 9.412, "step": 68800 }, { "epoch": 4.46, "grad_norm": 9.836651802062988, "learning_rate": 4.096903669724771e-05, "loss": 0.9487, "step": 68900 }, { "epoch": 4.46, "grad_norm": 8.918514251708984, "learning_rate": 4.092125382262997e-05, "loss": 1.0008, "step": 69000 }, { "epoch": 4.46, "eval_accuracy": 0.7699412258606213, "eval_loss": 1.0747356414794922, "eval_runtime": 7.8987, "eval_samples_per_second": 126.604, "eval_steps_per_second": 7.976, "step": 69000 }, { "epoch": 4.47, "grad_norm": 4.558477401733398, "learning_rate": 4.087347094801223e-05, "loss": 0.8773, "step": 69100 }, { "epoch": 4.47, "grad_norm": 8.765199661254883, "learning_rate": 4.0825688073394495e-05, "loss": 1.0313, "step": 69200 }, { "epoch": 4.47, "eval_accuracy": 0.7629255989911727, "eval_loss": 1.0748530626296997, "eval_runtime": 7.2132, "eval_samples_per_second": 138.635, "eval_steps_per_second": 8.734, "step": 69200 }, { "epoch": 4.48, "grad_norm": 12.341398239135742, "learning_rate": 4.077790519877676e-05, "loss": 1.0027, "step": 69300 }, { "epoch": 4.49, "grad_norm": 10.638558387756348, "learning_rate": 4.0730122324159026e-05, "loss": 0.9476, "step": 69400 }, { "epoch": 4.49, "eval_accuracy": 0.7643824027072758, "eval_loss": 1.0196694135665894, "eval_runtime": 7.8454, "eval_samples_per_second": 127.463, "eval_steps_per_second": 8.03, "step": 69400 }, { "epoch": 4.49, "grad_norm": 8.990233421325684, "learning_rate": 4.0682339449541285e-05, "loss": 0.9734, "step": 69500 }, { "epoch": 4.5, "grad_norm": 10.145893096923828, "learning_rate": 4.063455657492355e-05, "loss": 0.9691, "step": 69600 }, { "epoch": 4.5, "eval_accuracy": 0.7844387755102041, "eval_loss": 0.9472571611404419, "eval_runtime": 7.7937, "eval_samples_per_second": 128.308, "eval_steps_per_second": 8.083, "step": 69600 }, { "epoch": 4.51, "grad_norm": 4.93452787399292, "learning_rate": 4.0586773700305816e-05, "loss": 0.9566, "step": 69700 }, { "epoch": 4.51, "grad_norm": 7.357171058654785, "learning_rate": 4.0538990825688075e-05, "loss": 0.9642, "step": 69800 }, { "epoch": 4.51, "eval_accuracy": 0.7786644267114657, "eval_loss": 0.9629709124565125, "eval_runtime": 7.9366, "eval_samples_per_second": 125.998, "eval_steps_per_second": 7.938, "step": 69800 }, { "epoch": 4.52, "grad_norm": 9.370051383972168, "learning_rate": 4.0491207951070333e-05, "loss": 0.9283, "step": 69900 }, { "epoch": 4.53, "grad_norm": 7.569904804229736, "learning_rate": 4.04434250764526e-05, "loss": 1.0233, "step": 70000 }, { "epoch": 4.53, "eval_accuracy": 0.7628607277289837, "eval_loss": 1.0460400581359863, "eval_runtime": 7.8896, "eval_samples_per_second": 126.749, "eval_steps_per_second": 7.985, "step": 70000 }, { "epoch": 4.53, "grad_norm": 8.774346351623535, "learning_rate": 4.0395642201834865e-05, "loss": 0.9904, "step": 70100 }, { "epoch": 4.54, "grad_norm": 7.50544548034668, "learning_rate": 4.034785932721712e-05, "loss": 0.9832, "step": 70200 }, { "epoch": 4.54, "eval_accuracy": 0.775963149078727, "eval_loss": 0.9848018884658813, "eval_runtime": 8.4852, "eval_samples_per_second": 117.853, "eval_steps_per_second": 7.425, "step": 70200 }, { "epoch": 4.55, "grad_norm": 8.70753288269043, "learning_rate": 4.030007645259939e-05, "loss": 0.9805, "step": 70300 }, { "epoch": 4.55, "grad_norm": 7.547196865081787, "learning_rate": 4.0252293577981654e-05, "loss": 0.975, "step": 70400 }, { "epoch": 4.55, "eval_accuracy": 0.7824989482541018, "eval_loss": 0.9843223094940186, "eval_runtime": 8.074, "eval_samples_per_second": 123.855, "eval_steps_per_second": 7.803, "step": 70400 }, { "epoch": 4.56, "grad_norm": 9.106057167053223, "learning_rate": 4.020451070336392e-05, "loss": 0.9732, "step": 70500 }, { "epoch": 4.57, "grad_norm": 6.807091236114502, "learning_rate": 4.015672782874618e-05, "loss": 0.9463, "step": 70600 }, { "epoch": 4.57, "eval_accuracy": 0.767819924968737, "eval_loss": 0.978091299533844, "eval_runtime": 8.2655, "eval_samples_per_second": 120.985, "eval_steps_per_second": 7.622, "step": 70600 }, { "epoch": 4.57, "grad_norm": 8.480233192443848, "learning_rate": 4.0108944954128444e-05, "loss": 0.9969, "step": 70700 }, { "epoch": 4.58, "grad_norm": 7.747964859008789, "learning_rate": 4.00611620795107e-05, "loss": 0.9864, "step": 70800 }, { "epoch": 4.58, "eval_accuracy": 0.7877637130801688, "eval_loss": 0.9459249377250671, "eval_runtime": 8.1991, "eval_samples_per_second": 121.965, "eval_steps_per_second": 7.684, "step": 70800 }, { "epoch": 4.58, "grad_norm": 8.146021842956543, "learning_rate": 4.001337920489297e-05, "loss": 0.9416, "step": 70900 }, { "epoch": 4.59, "grad_norm": 7.969533443450928, "learning_rate": 3.996559633027523e-05, "loss": 0.9547, "step": 71000 }, { "epoch": 4.59, "eval_accuracy": 0.7768456375838926, "eval_loss": 0.9732393026351929, "eval_runtime": 7.5465, "eval_samples_per_second": 132.512, "eval_steps_per_second": 8.348, "step": 71000 }, { "epoch": 4.6, "grad_norm": 9.68906021118164, "learning_rate": 3.991781345565749e-05, "loss": 0.9769, "step": 71100 }, { "epoch": 4.6, "grad_norm": 8.883282661437988, "learning_rate": 3.987003058103976e-05, "loss": 1.0073, "step": 71200 }, { "epoch": 4.6, "eval_accuracy": 0.7531539108494533, "eval_loss": 1.0698323249816895, "eval_runtime": 8.2083, "eval_samples_per_second": 121.828, "eval_steps_per_second": 7.675, "step": 71200 }, { "epoch": 4.61, "grad_norm": 15.839872360229492, "learning_rate": 3.9822247706422024e-05, "loss": 0.9973, "step": 71300 }, { "epoch": 4.62, "grad_norm": 9.487722396850586, "learning_rate": 3.977446483180428e-05, "loss": 1.0028, "step": 71400 }, { "epoch": 4.62, "eval_accuracy": 0.7735690235690236, "eval_loss": 1.0179448127746582, "eval_runtime": 8.4119, "eval_samples_per_second": 118.88, "eval_steps_per_second": 7.489, "step": 71400 }, { "epoch": 4.62, "grad_norm": 12.71741771697998, "learning_rate": 3.972668195718655e-05, "loss": 1.0301, "step": 71500 }, { "epoch": 4.63, "grad_norm": 7.949224948883057, "learning_rate": 3.967889908256881e-05, "loss": 1.0047, "step": 71600 }, { "epoch": 4.63, "eval_accuracy": 0.770568278201866, "eval_loss": 0.9725746512413025, "eval_runtime": 7.652, "eval_samples_per_second": 130.685, "eval_steps_per_second": 8.233, "step": 71600 }, { "epoch": 4.64, "grad_norm": 12.07673168182373, "learning_rate": 3.963111620795107e-05, "loss": 1.0005, "step": 71700 }, { "epoch": 4.64, "grad_norm": 9.03133773803711, "learning_rate": 3.958333333333333e-05, "loss": 0.9762, "step": 71800 }, { "epoch": 4.64, "eval_accuracy": 0.7797319932998324, "eval_loss": 1.0198901891708374, "eval_runtime": 8.1725, "eval_samples_per_second": 122.362, "eval_steps_per_second": 7.709, "step": 71800 }, { "epoch": 4.65, "grad_norm": 9.173961639404297, "learning_rate": 3.9535550458715597e-05, "loss": 0.9689, "step": 71900 }, { "epoch": 4.66, "grad_norm": 8.073040962219238, "learning_rate": 3.948776758409786e-05, "loss": 0.9936, "step": 72000 }, { "epoch": 4.66, "eval_accuracy": 0.7756653992395437, "eval_loss": 0.9666666388511658, "eval_runtime": 8.2464, "eval_samples_per_second": 121.266, "eval_steps_per_second": 7.64, "step": 72000 }, { "epoch": 4.66, "grad_norm": 14.806440353393555, "learning_rate": 3.943998470948013e-05, "loss": 0.9394, "step": 72100 }, { "epoch": 4.67, "grad_norm": 7.622402667999268, "learning_rate": 3.9392201834862386e-05, "loss": 0.9938, "step": 72200 }, { "epoch": 4.67, "eval_accuracy": 0.7787197965239508, "eval_loss": 1.0114634037017822, "eval_runtime": 7.6765, "eval_samples_per_second": 130.267, "eval_steps_per_second": 8.207, "step": 72200 }, { "epoch": 4.68, "grad_norm": 9.757914543151855, "learning_rate": 3.934441896024465e-05, "loss": 0.9789, "step": 72300 }, { "epoch": 4.68, "grad_norm": 7.3266496658325195, "learning_rate": 3.929663608562692e-05, "loss": 0.9639, "step": 72400 }, { "epoch": 4.68, "eval_accuracy": 0.7724801338352154, "eval_loss": 1.001409888267517, "eval_runtime": 8.2681, "eval_samples_per_second": 120.946, "eval_steps_per_second": 7.62, "step": 72400 }, { "epoch": 4.69, "grad_norm": 6.522299289703369, "learning_rate": 3.9248853211009176e-05, "loss": 0.9339, "step": 72500 }, { "epoch": 4.69, "grad_norm": 6.694498062133789, "learning_rate": 3.9201070336391435e-05, "loss": 1.015, "step": 72600 }, { "epoch": 4.69, "eval_accuracy": 0.7887678122380554, "eval_loss": 0.9502221345901489, "eval_runtime": 8.3775, "eval_samples_per_second": 119.367, "eval_steps_per_second": 7.52, "step": 72600 }, { "epoch": 4.7, "grad_norm": 10.880620002746582, "learning_rate": 3.91532874617737e-05, "loss": 1.0112, "step": 72700 }, { "epoch": 4.71, "grad_norm": 8.189009666442871, "learning_rate": 3.9105504587155966e-05, "loss": 0.9808, "step": 72800 }, { "epoch": 4.71, "eval_accuracy": 0.7862920391656024, "eval_loss": 0.9433594942092896, "eval_runtime": 7.8871, "eval_samples_per_second": 126.79, "eval_steps_per_second": 7.988, "step": 72800 }, { "epoch": 4.71, "grad_norm": 9.456833839416504, "learning_rate": 3.9057721712538225e-05, "loss": 0.9791, "step": 72900 }, { "epoch": 4.72, "grad_norm": 11.8285493850708, "learning_rate": 3.900993883792049e-05, "loss": 0.9628, "step": 73000 }, { "epoch": 4.72, "eval_accuracy": 0.7612876254180602, "eval_loss": 1.0842130184173584, "eval_runtime": 7.3388, "eval_samples_per_second": 136.262, "eval_steps_per_second": 8.585, "step": 73000 }, { "epoch": 4.73, "grad_norm": 7.6782708168029785, "learning_rate": 3.8962155963302756e-05, "loss": 0.9594, "step": 73100 }, { "epoch": 4.73, "grad_norm": 10.721222877502441, "learning_rate": 3.891437308868502e-05, "loss": 0.9884, "step": 73200 }, { "epoch": 4.73, "eval_accuracy": 0.7600840336134453, "eval_loss": 1.1069767475128174, "eval_runtime": 8.0507, "eval_samples_per_second": 124.213, "eval_steps_per_second": 7.825, "step": 73200 }, { "epoch": 4.74, "grad_norm": 19.396963119506836, "learning_rate": 3.886659021406728e-05, "loss": 1.1128, "step": 73300 }, { "epoch": 4.75, "grad_norm": 9.731720924377441, "learning_rate": 3.8818807339449545e-05, "loss": 1.1239, "step": 73400 }, { "epoch": 4.75, "eval_accuracy": 0.7353187919463087, "eval_loss": 1.2144737243652344, "eval_runtime": 7.809, "eval_samples_per_second": 128.057, "eval_steps_per_second": 8.068, "step": 73400 }, { "epoch": 4.75, "grad_norm": 11.014097213745117, "learning_rate": 3.8771024464831804e-05, "loss": 1.0812, "step": 73500 }, { "epoch": 4.76, "grad_norm": 10.584521293640137, "learning_rate": 3.872324159021407e-05, "loss": 0.976, "step": 73600 }, { "epoch": 4.76, "eval_accuracy": 0.7680976430976431, "eval_loss": 1.026493787765503, "eval_runtime": 7.9179, "eval_samples_per_second": 126.296, "eval_steps_per_second": 7.957, "step": 73600 }, { "epoch": 4.77, "grad_norm": 9.65191650390625, "learning_rate": 3.867545871559633e-05, "loss": 0.9508, "step": 73700 }, { "epoch": 4.77, "grad_norm": 6.49526834487915, "learning_rate": 3.8627675840978594e-05, "loss": 1.0132, "step": 73800 }, { "epoch": 4.77, "eval_accuracy": 0.7806014400677679, "eval_loss": 0.9851163625717163, "eval_runtime": 7.7706, "eval_samples_per_second": 128.69, "eval_steps_per_second": 8.107, "step": 73800 }, { "epoch": 4.78, "grad_norm": 10.15756607055664, "learning_rate": 3.857989296636086e-05, "loss": 1.0897, "step": 73900 }, { "epoch": 4.79, "grad_norm": 8.659029960632324, "learning_rate": 3.8532110091743125e-05, "loss": 1.1237, "step": 74000 }, { "epoch": 4.79, "eval_accuracy": 0.7636971978251778, "eval_loss": 1.030717372894287, "eval_runtime": 7.9387, "eval_samples_per_second": 125.966, "eval_steps_per_second": 7.936, "step": 74000 }, { "epoch": 4.79, "grad_norm": 11.54944133758545, "learning_rate": 3.8484327217125384e-05, "loss": 0.983, "step": 74100 }, { "epoch": 4.8, "grad_norm": 7.232990264892578, "learning_rate": 3.843654434250765e-05, "loss": 1.0343, "step": 74200 }, { "epoch": 4.8, "eval_accuracy": 0.7606405394016014, "eval_loss": 1.0653371810913086, "eval_runtime": 7.8275, "eval_samples_per_second": 127.754, "eval_steps_per_second": 8.048, "step": 74200 }, { "epoch": 4.8, "grad_norm": 12.886724472045898, "learning_rate": 3.838876146788991e-05, "loss": 0.9761, "step": 74300 }, { "epoch": 4.81, "grad_norm": 9.514069557189941, "learning_rate": 3.8340978593272174e-05, "loss": 1.0015, "step": 74400 }, { "epoch": 4.81, "eval_accuracy": 0.7731305449936628, "eval_loss": 1.013373613357544, "eval_runtime": 8.4386, "eval_samples_per_second": 118.504, "eval_steps_per_second": 7.466, "step": 74400 }, { "epoch": 4.82, "grad_norm": 8.296751976013184, "learning_rate": 3.829319571865443e-05, "loss": 0.9731, "step": 74500 }, { "epoch": 4.82, "grad_norm": 8.945280075073242, "learning_rate": 3.82454128440367e-05, "loss": 0.9845, "step": 74600 }, { "epoch": 4.82, "eval_accuracy": 0.7728415758591786, "eval_loss": 1.021104097366333, "eval_runtime": 7.8636, "eval_samples_per_second": 127.168, "eval_steps_per_second": 8.012, "step": 74600 }, { "epoch": 4.83, "grad_norm": 8.86462116241455, "learning_rate": 3.8197629969418963e-05, "loss": 0.988, "step": 74700 }, { "epoch": 4.84, "grad_norm": 9.48746395111084, "learning_rate": 3.814984709480123e-05, "loss": 0.9389, "step": 74800 }, { "epoch": 4.84, "eval_accuracy": 0.7748845023099538, "eval_loss": 0.9560744762420654, "eval_runtime": 8.271, "eval_samples_per_second": 120.904, "eval_steps_per_second": 7.617, "step": 74800 }, { "epoch": 4.84, "grad_norm": 9.44326114654541, "learning_rate": 3.810206422018349e-05, "loss": 1.0232, "step": 74900 }, { "epoch": 4.85, "grad_norm": 7.291296482086182, "learning_rate": 3.805428134556575e-05, "loss": 0.9894, "step": 75000 }, { "epoch": 4.85, "eval_accuracy": 0.7769753610875106, "eval_loss": 0.9872747659683228, "eval_runtime": 7.7138, "eval_samples_per_second": 129.638, "eval_steps_per_second": 8.167, "step": 75000 }, { "epoch": 4.86, "grad_norm": 7.511613845825195, "learning_rate": 3.800649847094802e-05, "loss": 0.944, "step": 75100 }, { "epoch": 4.86, "grad_norm": 10.044756889343262, "learning_rate": 3.795871559633028e-05, "loss": 1.0179, "step": 75200 }, { "epoch": 4.86, "eval_accuracy": 0.7379949452401011, "eval_loss": 1.2330255508422852, "eval_runtime": 7.8028, "eval_samples_per_second": 128.159, "eval_steps_per_second": 8.074, "step": 75200 }, { "epoch": 4.87, "grad_norm": 8.356000900268555, "learning_rate": 3.7910932721712536e-05, "loss": 1.0322, "step": 75300 }, { "epoch": 4.88, "grad_norm": 13.360698699951172, "learning_rate": 3.78631498470948e-05, "loss": 1.04, "step": 75400 }, { "epoch": 4.88, "eval_accuracy": 0.78125, "eval_loss": 0.9776120781898499, "eval_runtime": 7.9093, "eval_samples_per_second": 126.433, "eval_steps_per_second": 7.965, "step": 75400 }, { "epoch": 4.88, "grad_norm": 9.66020393371582, "learning_rate": 3.781536697247707e-05, "loss": 0.9733, "step": 75500 }, { "epoch": 4.89, "grad_norm": 6.86496639251709, "learning_rate": 3.7767584097859326e-05, "loss": 0.9711, "step": 75600 }, { "epoch": 4.89, "eval_accuracy": 0.7736486486486487, "eval_loss": 1.0055361986160278, "eval_runtime": 8.491, "eval_samples_per_second": 117.771, "eval_steps_per_second": 7.42, "step": 75600 }, { "epoch": 4.9, "grad_norm": 14.185347557067871, "learning_rate": 3.771980122324159e-05, "loss": 0.934, "step": 75700 }, { "epoch": 4.9, "grad_norm": 7.5016374588012695, "learning_rate": 3.767201834862386e-05, "loss": 1.0196, "step": 75800 }, { "epoch": 4.9, "eval_accuracy": 0.7741393786733837, "eval_loss": 0.9855289459228516, "eval_runtime": 8.2412, "eval_samples_per_second": 121.341, "eval_steps_per_second": 7.644, "step": 75800 }, { "epoch": 4.91, "grad_norm": 7.798384666442871, "learning_rate": 3.762423547400612e-05, "loss": 0.9841, "step": 75900 }, { "epoch": 4.91, "grad_norm": 10.278339385986328, "learning_rate": 3.757645259938838e-05, "loss": 0.9363, "step": 76000 }, { "epoch": 4.91, "eval_accuracy": 0.7698446031079378, "eval_loss": 0.9834262728691101, "eval_runtime": 7.7208, "eval_samples_per_second": 129.52, "eval_steps_per_second": 8.16, "step": 76000 }, { "epoch": 4.92, "grad_norm": 9.339378356933594, "learning_rate": 3.752866972477064e-05, "loss": 1.0015, "step": 76100 }, { "epoch": 4.93, "grad_norm": 10.695347785949707, "learning_rate": 3.7480886850152906e-05, "loss": 0.9768, "step": 76200 }, { "epoch": 4.93, "eval_accuracy": 0.7671290458175704, "eval_loss": 1.0341378450393677, "eval_runtime": 8.0648, "eval_samples_per_second": 123.996, "eval_steps_per_second": 7.812, "step": 76200 }, { "epoch": 4.93, "grad_norm": 11.344563484191895, "learning_rate": 3.743310397553517e-05, "loss": 0.9929, "step": 76300 }, { "epoch": 4.94, "grad_norm": 7.927706718444824, "learning_rate": 3.738532110091743e-05, "loss": 0.9948, "step": 76400 }, { "epoch": 4.94, "eval_accuracy": 0.760268231349539, "eval_loss": 1.0710370540618896, "eval_runtime": 7.6484, "eval_samples_per_second": 130.746, "eval_steps_per_second": 8.237, "step": 76400 }, { "epoch": 4.95, "grad_norm": 6.267642498016357, "learning_rate": 3.7337538226299695e-05, "loss": 0.9844, "step": 76500 }, { "epoch": 4.95, "grad_norm": 6.786309719085693, "learning_rate": 3.728975535168196e-05, "loss": 0.978, "step": 76600 }, { "epoch": 4.95, "eval_accuracy": 0.7860134003350083, "eval_loss": 0.997105598449707, "eval_runtime": 6.2714, "eval_samples_per_second": 159.454, "eval_steps_per_second": 10.046, "step": 76600 }, { "epoch": 4.96, "grad_norm": 6.824797630310059, "learning_rate": 3.7241972477064226e-05, "loss": 0.9845, "step": 76700 }, { "epoch": 4.97, "grad_norm": 7.04786491394043, "learning_rate": 3.7194189602446485e-05, "loss": 0.9405, "step": 76800 }, { "epoch": 4.97, "eval_accuracy": 0.7890724269377383, "eval_loss": 0.9879198670387268, "eval_runtime": 7.4399, "eval_samples_per_second": 134.411, "eval_steps_per_second": 8.468, "step": 76800 }, { "epoch": 4.97, "grad_norm": 15.592721939086914, "learning_rate": 3.714640672782875e-05, "loss": 0.9757, "step": 76900 }, { "epoch": 4.98, "grad_norm": 10.215802192687988, "learning_rate": 3.709862385321101e-05, "loss": 0.9618, "step": 77000 }, { "epoch": 4.98, "eval_accuracy": 0.7859249894648125, "eval_loss": 0.9632707834243774, "eval_runtime": 8.0535, "eval_samples_per_second": 124.17, "eval_steps_per_second": 7.823, "step": 77000 }, { "epoch": 4.99, "grad_norm": 9.940308570861816, "learning_rate": 3.7050840978593275e-05, "loss": 0.9567, "step": 77100 }, { "epoch": 4.99, "grad_norm": 12.076956748962402, "learning_rate": 3.7003058103975534e-05, "loss": 0.9726, "step": 77200 }, { "epoch": 4.99, "eval_accuracy": 0.7714762301918265, "eval_loss": 1.0063599348068237, "eval_runtime": 8.2167, "eval_samples_per_second": 121.703, "eval_steps_per_second": 7.667, "step": 77200 }, { "epoch": 5.0, "grad_norm": 8.833041191101074, "learning_rate": 3.69552752293578e-05, "loss": 0.9768, "step": 77300 }, { "epoch": 5.01, "grad_norm": 7.536500453948975, "learning_rate": 3.6907492354740065e-05, "loss": 0.9243, "step": 77400 }, { "epoch": 5.01, "eval_accuracy": 0.7756813417190775, "eval_loss": 0.9875549674034119, "eval_runtime": 7.8624, "eval_samples_per_second": 127.188, "eval_steps_per_second": 8.013, "step": 77400 }, { "epoch": 5.01, "grad_norm": 10.045833587646484, "learning_rate": 3.685970948012233e-05, "loss": 0.9311, "step": 77500 }, { "epoch": 5.02, "grad_norm": 8.147746086120605, "learning_rate": 3.681192660550459e-05, "loss": 0.9616, "step": 77600 }, { "epoch": 5.02, "eval_accuracy": 0.7842306061890632, "eval_loss": 0.9469566941261292, "eval_runtime": 8.1185, "eval_samples_per_second": 123.175, "eval_steps_per_second": 7.76, "step": 77600 }, { "epoch": 5.02, "grad_norm": 8.476813316345215, "learning_rate": 3.6764143730886855e-05, "loss": 0.9276, "step": 77700 }, { "epoch": 5.03, "grad_norm": 15.78298568725586, "learning_rate": 3.671636085626911e-05, "loss": 0.9309, "step": 77800 }, { "epoch": 5.03, "eval_accuracy": 0.7664570230607967, "eval_loss": 0.9893642663955688, "eval_runtime": 8.2683, "eval_samples_per_second": 120.944, "eval_steps_per_second": 7.619, "step": 77800 }, { "epoch": 5.04, "grad_norm": 6.2742533683776855, "learning_rate": 3.666857798165137e-05, "loss": 1.0043, "step": 77900 }, { "epoch": 5.04, "grad_norm": 13.026144027709961, "learning_rate": 3.662079510703364e-05, "loss": 1.0114, "step": 78000 }, { "epoch": 5.04, "eval_accuracy": 0.7655259822560203, "eval_loss": 1.0401846170425415, "eval_runtime": 7.8858, "eval_samples_per_second": 126.81, "eval_steps_per_second": 7.989, "step": 78000 }, { "epoch": 5.05, "grad_norm": 9.533039093017578, "learning_rate": 3.65730122324159e-05, "loss": 0.9775, "step": 78100 }, { "epoch": 5.06, "grad_norm": 9.72433853149414, "learning_rate": 3.652522935779817e-05, "loss": 0.9084, "step": 78200 }, { "epoch": 5.06, "eval_accuracy": 0.7908249158249159, "eval_loss": 0.8913083076477051, "eval_runtime": 8.4214, "eval_samples_per_second": 118.746, "eval_steps_per_second": 7.481, "step": 78200 }, { "epoch": 5.06, "grad_norm": 12.880206108093262, "learning_rate": 3.647744648318043e-05, "loss": 0.954, "step": 78300 }, { "epoch": 5.07, "grad_norm": 7.068869590759277, "learning_rate": 3.642966360856269e-05, "loss": 0.8962, "step": 78400 }, { "epoch": 5.07, "eval_accuracy": 0.7859574468085107, "eval_loss": 0.9073309302330017, "eval_runtime": 7.549, "eval_samples_per_second": 132.468, "eval_steps_per_second": 8.345, "step": 78400 }, { "epoch": 5.08, "grad_norm": 7.86995267868042, "learning_rate": 3.638188073394496e-05, "loss": 0.9441, "step": 78500 }, { "epoch": 5.08, "grad_norm": 8.041131973266602, "learning_rate": 3.6334097859327224e-05, "loss": 0.9285, "step": 78600 }, { "epoch": 5.08, "eval_accuracy": 0.7738500851788757, "eval_loss": 0.9955961108207703, "eval_runtime": 7.7145, "eval_samples_per_second": 129.625, "eval_steps_per_second": 8.166, "step": 78600 }, { "epoch": 5.09, "grad_norm": 14.696179389953613, "learning_rate": 3.628631498470948e-05, "loss": 0.968, "step": 78700 }, { "epoch": 5.1, "grad_norm": 11.034079551696777, "learning_rate": 3.623853211009174e-05, "loss": 0.9445, "step": 78800 }, { "epoch": 5.1, "eval_accuracy": 0.7939698492462312, "eval_loss": 0.9135068655014038, "eval_runtime": 8.2798, "eval_samples_per_second": 120.777, "eval_steps_per_second": 7.609, "step": 78800 }, { "epoch": 5.1, "grad_norm": 10.858016967773438, "learning_rate": 3.619074923547401e-05, "loss": 0.9152, "step": 78900 }, { "epoch": 5.11, "grad_norm": 7.905811309814453, "learning_rate": 3.614296636085627e-05, "loss": 0.976, "step": 79000 }, { "epoch": 5.11, "eval_accuracy": 0.7687788501888376, "eval_loss": 1.0114176273345947, "eval_runtime": 7.9521, "eval_samples_per_second": 125.753, "eval_steps_per_second": 7.922, "step": 79000 }, { "epoch": 5.12, "grad_norm": 7.815107822418213, "learning_rate": 3.609518348623853e-05, "loss": 1.1041, "step": 79100 }, { "epoch": 5.12, "grad_norm": 12.222792625427246, "learning_rate": 3.60474006116208e-05, "loss": 1.1777, "step": 79200 }, { "epoch": 5.12, "eval_accuracy": 0.7285774588433939, "eval_loss": 1.302636981010437, "eval_runtime": 7.8799, "eval_samples_per_second": 126.905, "eval_steps_per_second": 7.995, "step": 79200 }, { "epoch": 5.13, "grad_norm": 11.148119926452637, "learning_rate": 3.599961773700306e-05, "loss": 1.0935, "step": 79300 }, { "epoch": 5.13, "grad_norm": 7.047947883605957, "learning_rate": 3.595183486238533e-05, "loss": 0.9449, "step": 79400 }, { "epoch": 5.13, "eval_accuracy": 0.7882901994060246, "eval_loss": 0.9401076436042786, "eval_runtime": 7.719, "eval_samples_per_second": 129.55, "eval_steps_per_second": 8.162, "step": 79400 }, { "epoch": 5.14, "grad_norm": 6.8881025314331055, "learning_rate": 3.590405198776759e-05, "loss": 0.9685, "step": 79500 }, { "epoch": 5.15, "grad_norm": 7.733551502227783, "learning_rate": 3.5856269113149845e-05, "loss": 0.9406, "step": 79600 }, { "epoch": 5.15, "eval_accuracy": 0.786488740617181, "eval_loss": 0.9090317487716675, "eval_runtime": 7.9069, "eval_samples_per_second": 126.472, "eval_steps_per_second": 7.968, "step": 79600 }, { "epoch": 5.15, "grad_norm": 5.8939337730407715, "learning_rate": 3.580848623853211e-05, "loss": 0.9449, "step": 79700 }, { "epoch": 5.16, "grad_norm": 7.594625473022461, "learning_rate": 3.5760703363914376e-05, "loss": 0.9177, "step": 79800 }, { "epoch": 5.16, "eval_accuracy": 0.78562421185372, "eval_loss": 0.9216437339782715, "eval_runtime": 8.1776, "eval_samples_per_second": 122.285, "eval_steps_per_second": 7.704, "step": 79800 }, { "epoch": 5.17, "grad_norm": 8.1451997756958, "learning_rate": 3.5712920489296635e-05, "loss": 0.9001, "step": 79900 }, { "epoch": 5.17, "grad_norm": 9.533666610717773, "learning_rate": 3.56651376146789e-05, "loss": 0.9027, "step": 80000 }, { "epoch": 5.17, "eval_accuracy": 0.7849958088851634, "eval_loss": 0.9562706351280212, "eval_runtime": 8.2351, "eval_samples_per_second": 121.431, "eval_steps_per_second": 7.65, "step": 80000 }, { "epoch": 5.18, "grad_norm": 10.21576976776123, "learning_rate": 3.5617354740061166e-05, "loss": 0.9369, "step": 80100 }, { "epoch": 5.19, "grad_norm": 8.954793930053711, "learning_rate": 3.5569571865443425e-05, "loss": 0.9841, "step": 80200 }, { "epoch": 5.19, "eval_accuracy": 0.7562105263157894, "eval_loss": 1.0245416164398193, "eval_runtime": 8.3012, "eval_samples_per_second": 120.465, "eval_steps_per_second": 7.589, "step": 80200 }, { "epoch": 5.19, "grad_norm": 11.038591384887695, "learning_rate": 3.552178899082569e-05, "loss": 0.9495, "step": 80300 }, { "epoch": 5.2, "grad_norm": 7.728280544281006, "learning_rate": 3.5474006116207956e-05, "loss": 0.9249, "step": 80400 }, { "epoch": 5.2, "eval_accuracy": 0.7685223943072416, "eval_loss": 1.031061053276062, "eval_runtime": 7.7607, "eval_samples_per_second": 128.855, "eval_steps_per_second": 8.118, "step": 80400 }, { "epoch": 5.21, "grad_norm": 9.668965339660645, "learning_rate": 3.5426223241590215e-05, "loss": 0.9836, "step": 80500 }, { "epoch": 5.21, "grad_norm": 11.04190444946289, "learning_rate": 3.5378440366972473e-05, "loss": 1.0035, "step": 80600 }, { "epoch": 5.21, "eval_accuracy": 0.7244725738396625, "eval_loss": 1.214395523071289, "eval_runtime": 7.8728, "eval_samples_per_second": 127.019, "eval_steps_per_second": 8.002, "step": 80600 }, { "epoch": 5.22, "grad_norm": 13.495457649230957, "learning_rate": 3.533065749235474e-05, "loss": 1.1877, "step": 80700 }, { "epoch": 5.23, "grad_norm": 7.8721418380737305, "learning_rate": 3.5282874617737005e-05, "loss": 1.2407, "step": 80800 }, { "epoch": 5.23, "eval_accuracy": 0.759439966058549, "eval_loss": 1.0647492408752441, "eval_runtime": 7.7132, "eval_samples_per_second": 129.648, "eval_steps_per_second": 8.168, "step": 80800 }, { "epoch": 5.23, "grad_norm": 19.532407760620117, "learning_rate": 3.523509174311927e-05, "loss": 0.9967, "step": 80900 }, { "epoch": 5.24, "grad_norm": 9.506444931030273, "learning_rate": 3.518730886850153e-05, "loss": 0.9772, "step": 81000 }, { "epoch": 5.24, "eval_accuracy": 0.7701967350355797, "eval_loss": 0.9932196140289307, "eval_runtime": 8.4299, "eval_samples_per_second": 118.626, "eval_steps_per_second": 7.473, "step": 81000 }, { "epoch": 5.24, "grad_norm": 7.377074718475342, "learning_rate": 3.5139525993883794e-05, "loss": 0.9677, "step": 81100 }, { "epoch": 5.25, "grad_norm": 6.975955486297607, "learning_rate": 3.509174311926606e-05, "loss": 0.9064, "step": 81200 }, { "epoch": 5.25, "eval_accuracy": 0.7707193941943626, "eval_loss": 1.0310475826263428, "eval_runtime": 7.4204, "eval_samples_per_second": 134.763, "eval_steps_per_second": 8.49, "step": 81200 }, { "epoch": 5.26, "grad_norm": 48.970985412597656, "learning_rate": 3.504396024464832e-05, "loss": 0.9204, "step": 81300 }, { "epoch": 5.26, "grad_norm": 15.577527046203613, "learning_rate": 3.499617737003058e-05, "loss": 0.925, "step": 81400 }, { "epoch": 5.26, "eval_accuracy": 0.7825722664432342, "eval_loss": 0.9727858304977417, "eval_runtime": 8.2884, "eval_samples_per_second": 120.65, "eval_steps_per_second": 7.601, "step": 81400 }, { "epoch": 5.27, "grad_norm": 8.15576457977295, "learning_rate": 3.494839449541284e-05, "loss": 0.9023, "step": 81500 }, { "epoch": 5.28, "grad_norm": 10.179049491882324, "learning_rate": 3.490061162079511e-05, "loss": 0.9286, "step": 81600 }, { "epoch": 5.28, "eval_accuracy": 0.7602885023334748, "eval_loss": 1.0272160768508911, "eval_runtime": 8.1271, "eval_samples_per_second": 123.045, "eval_steps_per_second": 7.752, "step": 81600 }, { "epoch": 5.28, "grad_norm": 10.03041934967041, "learning_rate": 3.4852828746177374e-05, "loss": 0.9302, "step": 81700 }, { "epoch": 5.29, "grad_norm": 10.167094230651855, "learning_rate": 3.480504587155963e-05, "loss": 0.8952, "step": 81800 }, { "epoch": 5.29, "eval_accuracy": 0.779804836656767, "eval_loss": 0.9469917416572571, "eval_runtime": 8.2134, "eval_samples_per_second": 121.752, "eval_steps_per_second": 7.67, "step": 81800 }, { "epoch": 5.3, "grad_norm": 9.40182876586914, "learning_rate": 3.47572629969419e-05, "loss": 0.878, "step": 81900 }, { "epoch": 5.3, "grad_norm": 15.262103080749512, "learning_rate": 3.4709480122324164e-05, "loss": 0.9642, "step": 82000 }, { "epoch": 5.3, "eval_accuracy": 0.7912133891213389, "eval_loss": 0.9394938349723816, "eval_runtime": 8.5481, "eval_samples_per_second": 116.986, "eval_steps_per_second": 7.37, "step": 82000 }, { "epoch": 5.31, "grad_norm": 8.52119255065918, "learning_rate": 3.466169724770643e-05, "loss": 0.9027, "step": 82100 }, { "epoch": 5.32, "grad_norm": 7.9369916915893555, "learning_rate": 3.461391437308869e-05, "loss": 0.905, "step": 82200 }, { "epoch": 5.32, "eval_accuracy": 0.7910944652517686, "eval_loss": 0.9243242740631104, "eval_runtime": 8.3673, "eval_samples_per_second": 119.513, "eval_steps_per_second": 7.529, "step": 82200 }, { "epoch": 5.32, "grad_norm": 60.414737701416016, "learning_rate": 3.456613149847095e-05, "loss": 0.908, "step": 82300 }, { "epoch": 5.33, "grad_norm": 7.450528144836426, "learning_rate": 3.451834862385321e-05, "loss": 0.9337, "step": 82400 }, { "epoch": 5.33, "eval_accuracy": 0.765479219677693, "eval_loss": 1.0324569940567017, "eval_runtime": 7.9296, "eval_samples_per_second": 126.11, "eval_steps_per_second": 7.945, "step": 82400 }, { "epoch": 5.33, "grad_norm": 19.149539947509766, "learning_rate": 3.447056574923548e-05, "loss": 0.8967, "step": 82500 }, { "epoch": 5.34, "grad_norm": 7.677934646606445, "learning_rate": 3.4422782874617737e-05, "loss": 0.903, "step": 82600 }, { "epoch": 5.34, "eval_accuracy": 0.7689393939393939, "eval_loss": 0.9923136830329895, "eval_runtime": 7.7436, "eval_samples_per_second": 129.139, "eval_steps_per_second": 8.136, "step": 82600 }, { "epoch": 5.35, "grad_norm": 8.481142044067383, "learning_rate": 3.4375e-05, "loss": 0.9345, "step": 82700 }, { "epoch": 5.35, "grad_norm": 10.160356521606445, "learning_rate": 3.432721712538227e-05, "loss": 0.8974, "step": 82800 }, { "epoch": 5.35, "eval_accuracy": 0.7836010143702451, "eval_loss": 0.9833623170852661, "eval_runtime": 8.0576, "eval_samples_per_second": 124.106, "eval_steps_per_second": 7.819, "step": 82800 }, { "epoch": 5.36, "grad_norm": 10.77476978302002, "learning_rate": 3.4279434250764526e-05, "loss": 0.9652, "step": 82900 }, { "epoch": 5.37, "grad_norm": 8.436919212341309, "learning_rate": 3.423165137614679e-05, "loss": 0.919, "step": 83000 }, { "epoch": 5.37, "eval_accuracy": 0.7750519750519751, "eval_loss": 0.9782270193099976, "eval_runtime": 8.2552, "eval_samples_per_second": 121.135, "eval_steps_per_second": 7.632, "step": 83000 }, { "epoch": 5.37, "grad_norm": 10.50770092010498, "learning_rate": 3.418386850152905e-05, "loss": 0.8549, "step": 83100 }, { "epoch": 5.38, "grad_norm": 7.287281513214111, "learning_rate": 3.4136085626911316e-05, "loss": 0.9233, "step": 83200 }, { "epoch": 5.38, "eval_accuracy": 0.7829166666666667, "eval_loss": 0.9793387651443481, "eval_runtime": 7.8527, "eval_samples_per_second": 127.345, "eval_steps_per_second": 8.023, "step": 83200 }, { "epoch": 5.39, "grad_norm": 9.222356796264648, "learning_rate": 3.4088302752293575e-05, "loss": 0.9468, "step": 83300 }, { "epoch": 5.39, "grad_norm": 8.27151870727539, "learning_rate": 3.404051987767584e-05, "loss": 0.894, "step": 83400 }, { "epoch": 5.39, "eval_accuracy": 0.7796538623891938, "eval_loss": 0.9310882687568665, "eval_runtime": 7.5684, "eval_samples_per_second": 132.128, "eval_steps_per_second": 8.324, "step": 83400 }, { "epoch": 5.4, "grad_norm": 7.489036560058594, "learning_rate": 3.3992737003058106e-05, "loss": 0.9176, "step": 83500 }, { "epoch": 5.41, "grad_norm": 10.03458023071289, "learning_rate": 3.394495412844037e-05, "loss": 0.9505, "step": 83600 }, { "epoch": 5.41, "eval_accuracy": 0.7788944723618091, "eval_loss": 0.9906249046325684, "eval_runtime": 8.4169, "eval_samples_per_second": 118.808, "eval_steps_per_second": 7.485, "step": 83600 }, { "epoch": 5.41, "grad_norm": 8.205994606018066, "learning_rate": 3.389717125382263e-05, "loss": 0.8876, "step": 83700 }, { "epoch": 5.42, "grad_norm": 7.652585506439209, "learning_rate": 3.3849388379204896e-05, "loss": 0.8805, "step": 83800 }, { "epoch": 5.42, "eval_accuracy": 0.7829521829521829, "eval_loss": 0.9870421290397644, "eval_runtime": 7.5535, "eval_samples_per_second": 132.388, "eval_steps_per_second": 8.34, "step": 83800 }, { "epoch": 5.43, "grad_norm": 9.724054336547852, "learning_rate": 3.380160550458716e-05, "loss": 0.8821, "step": 83900 }, { "epoch": 5.43, "grad_norm": 8.065447807312012, "learning_rate": 3.375382262996942e-05, "loss": 0.9066, "step": 84000 }, { "epoch": 5.43, "eval_accuracy": 0.7873322147651006, "eval_loss": 0.9227031469345093, "eval_runtime": 8.2325, "eval_samples_per_second": 121.469, "eval_steps_per_second": 7.653, "step": 84000 }, { "epoch": 5.44, "grad_norm": 10.38203239440918, "learning_rate": 3.370603975535168e-05, "loss": 0.9217, "step": 84100 }, { "epoch": 5.44, "grad_norm": 7.027584552764893, "learning_rate": 3.3658256880733944e-05, "loss": 0.8784, "step": 84200 }, { "epoch": 5.44, "eval_accuracy": 0.7857142857142857, "eval_loss": 0.9321097135543823, "eval_runtime": 8.0502, "eval_samples_per_second": 124.221, "eval_steps_per_second": 7.826, "step": 84200 }, { "epoch": 5.45, "grad_norm": 7.678597450256348, "learning_rate": 3.361047400611621e-05, "loss": 0.8721, "step": 84300 }, { "epoch": 5.46, "grad_norm": 7.529230117797852, "learning_rate": 3.3562691131498475e-05, "loss": 0.9098, "step": 84400 }, { "epoch": 5.46, "eval_accuracy": 0.7820782498948254, "eval_loss": 0.9505414366722107, "eval_runtime": 8.0699, "eval_samples_per_second": 123.918, "eval_steps_per_second": 7.807, "step": 84400 }, { "epoch": 5.46, "grad_norm": 10.329743385314941, "learning_rate": 3.3514908256880734e-05, "loss": 0.9439, "step": 84500 }, { "epoch": 5.47, "grad_norm": 8.409812927246094, "learning_rate": 3.3467125382263e-05, "loss": 0.9252, "step": 84600 }, { "epoch": 5.47, "eval_accuracy": 0.7889637742207245, "eval_loss": 0.887751579284668, "eval_runtime": 8.0478, "eval_samples_per_second": 124.258, "eval_steps_per_second": 7.828, "step": 84600 }, { "epoch": 5.48, "grad_norm": 13.971770286560059, "learning_rate": 3.3419342507645265e-05, "loss": 0.9136, "step": 84700 }, { "epoch": 5.48, "grad_norm": 8.07504940032959, "learning_rate": 3.3371559633027524e-05, "loss": 0.8808, "step": 84800 }, { "epoch": 5.48, "eval_accuracy": 0.7876512307050479, "eval_loss": 0.9458027482032776, "eval_runtime": 8.0618, "eval_samples_per_second": 124.042, "eval_steps_per_second": 7.815, "step": 84800 }, { "epoch": 5.49, "grad_norm": 7.722691059112549, "learning_rate": 3.332377675840978e-05, "loss": 0.9161, "step": 84900 }, { "epoch": 5.5, "grad_norm": 9.495827674865723, "learning_rate": 3.327599388379205e-05, "loss": 0.9064, "step": 85000 }, { "epoch": 5.5, "eval_accuracy": 0.7749263777871266, "eval_loss": 0.9309916496276855, "eval_runtime": 8.1552, "eval_samples_per_second": 122.621, "eval_steps_per_second": 7.725, "step": 85000 }, { "epoch": 5.5, "grad_norm": 7.678147792816162, "learning_rate": 3.3228211009174314e-05, "loss": 0.9036, "step": 85100 }, { "epoch": 5.51, "grad_norm": 9.377488136291504, "learning_rate": 3.318042813455658e-05, "loss": 0.8988, "step": 85200 }, { "epoch": 5.51, "eval_accuracy": 0.7797818791946308, "eval_loss": 0.940589964389801, "eval_runtime": 8.1611, "eval_samples_per_second": 122.533, "eval_steps_per_second": 7.72, "step": 85200 }, { "epoch": 5.52, "grad_norm": 12.125085830688477, "learning_rate": 3.313264525993884e-05, "loss": 0.8793, "step": 85300 }, { "epoch": 5.52, "grad_norm": 13.322086334228516, "learning_rate": 3.3084862385321103e-05, "loss": 0.9525, "step": 85400 }, { "epoch": 5.52, "eval_accuracy": 0.7887678122380554, "eval_loss": 0.9751226902008057, "eval_runtime": 8.0004, "eval_samples_per_second": 124.994, "eval_steps_per_second": 7.875, "step": 85400 }, { "epoch": 5.53, "grad_norm": 18.882335662841797, "learning_rate": 3.303707951070337e-05, "loss": 0.873, "step": 85500 }, { "epoch": 5.54, "grad_norm": 8.189373970031738, "learning_rate": 3.298929663608563e-05, "loss": 0.9193, "step": 85600 }, { "epoch": 5.54, "eval_accuracy": 0.7846730528946272, "eval_loss": 0.9461279511451721, "eval_runtime": 7.9549, "eval_samples_per_second": 125.708, "eval_steps_per_second": 7.92, "step": 85600 }, { "epoch": 5.54, "grad_norm": 11.661227226257324, "learning_rate": 3.294151376146789e-05, "loss": 0.929, "step": 85700 }, { "epoch": 5.55, "grad_norm": 7.611550807952881, "learning_rate": 3.289373088685015e-05, "loss": 0.8955, "step": 85800 }, { "epoch": 5.55, "eval_accuracy": 0.8001693480101609, "eval_loss": 0.8574084639549255, "eval_runtime": 7.7204, "eval_samples_per_second": 129.526, "eval_steps_per_second": 8.16, "step": 85800 }, { "epoch": 5.55, "grad_norm": 8.986960411071777, "learning_rate": 3.284594801223242e-05, "loss": 0.9021, "step": 85900 }, { "epoch": 5.56, "grad_norm": 11.404871940612793, "learning_rate": 3.2798165137614676e-05, "loss": 0.8948, "step": 86000 }, { "epoch": 5.56, "eval_accuracy": 0.7777305567360816, "eval_loss": 0.982026994228363, "eval_runtime": 7.817, "eval_samples_per_second": 127.927, "eval_steps_per_second": 8.059, "step": 86000 }, { "epoch": 5.57, "grad_norm": 7.563215255737305, "learning_rate": 3.275038226299694e-05, "loss": 0.9157, "step": 86100 }, { "epoch": 5.57, "grad_norm": 9.517242431640625, "learning_rate": 3.270259938837921e-05, "loss": 0.9146, "step": 86200 }, { "epoch": 5.57, "eval_accuracy": 0.7886005954912803, "eval_loss": 0.9028939008712769, "eval_runtime": 8.1616, "eval_samples_per_second": 122.525, "eval_steps_per_second": 7.719, "step": 86200 }, { "epoch": 5.58, "grad_norm": 7.066720485687256, "learning_rate": 3.265481651376147e-05, "loss": 0.9036, "step": 86300 }, { "epoch": 5.59, "grad_norm": 9.641302108764648, "learning_rate": 3.260703363914373e-05, "loss": 0.897, "step": 86400 }, { "epoch": 5.59, "eval_accuracy": 0.7937181663837012, "eval_loss": 0.8961277008056641, "eval_runtime": 7.9401, "eval_samples_per_second": 125.942, "eval_steps_per_second": 7.934, "step": 86400 }, { "epoch": 5.59, "grad_norm": 11.103862762451172, "learning_rate": 3.2559250764526e-05, "loss": 0.8818, "step": 86500 }, { "epoch": 5.6, "grad_norm": 6.695640563964844, "learning_rate": 3.2511467889908256e-05, "loss": 0.8719, "step": 86600 }, { "epoch": 5.6, "eval_accuracy": 0.7838858581619806, "eval_loss": 0.965584933757782, "eval_runtime": 8.253, "eval_samples_per_second": 121.168, "eval_steps_per_second": 7.634, "step": 86600 }, { "epoch": 5.61, "grad_norm": 9.646697044372559, "learning_rate": 3.246368501529052e-05, "loss": 0.8937, "step": 86700 }, { "epoch": 5.61, "grad_norm": 8.870104789733887, "learning_rate": 3.241590214067278e-05, "loss": 0.9493, "step": 86800 }, { "epoch": 5.61, "eval_accuracy": 0.776890756302521, "eval_loss": 0.9335665702819824, "eval_runtime": 7.7853, "eval_samples_per_second": 128.447, "eval_steps_per_second": 8.092, "step": 86800 }, { "epoch": 5.62, "grad_norm": 10.724740982055664, "learning_rate": 3.2368119266055046e-05, "loss": 0.8703, "step": 86900 }, { "epoch": 5.63, "grad_norm": 12.347381591796875, "learning_rate": 3.232033639143731e-05, "loss": 0.8844, "step": 87000 }, { "epoch": 5.63, "eval_accuracy": 0.7954924874791319, "eval_loss": 0.9188147187232971, "eval_runtime": 7.5749, "eval_samples_per_second": 132.014, "eval_steps_per_second": 8.317, "step": 87000 }, { "epoch": 5.63, "grad_norm": 5.729376316070557, "learning_rate": 3.227255351681958e-05, "loss": 0.8928, "step": 87100 }, { "epoch": 5.64, "grad_norm": 6.503875732421875, "learning_rate": 3.2224770642201835e-05, "loss": 0.8491, "step": 87200 }, { "epoch": 5.64, "eval_accuracy": 0.7939548744146445, "eval_loss": 0.8775643110275269, "eval_runtime": 7.5352, "eval_samples_per_second": 132.71, "eval_steps_per_second": 8.361, "step": 87200 }, { "epoch": 5.65, "grad_norm": 7.970768928527832, "learning_rate": 3.21769877675841e-05, "loss": 0.8916, "step": 87300 }, { "epoch": 5.65, "grad_norm": 9.523329734802246, "learning_rate": 3.2129204892966367e-05, "loss": 0.887, "step": 87400 }, { "epoch": 5.65, "eval_accuracy": 0.7829554995801847, "eval_loss": 0.9380804300308228, "eval_runtime": 7.7339, "eval_samples_per_second": 129.3, "eval_steps_per_second": 8.146, "step": 87400 }, { "epoch": 5.66, "grad_norm": 9.840675354003906, "learning_rate": 3.2081422018348625e-05, "loss": 0.9446, "step": 87500 }, { "epoch": 5.66, "grad_norm": 5.532453536987305, "learning_rate": 3.2033639143730884e-05, "loss": 0.9011, "step": 87600 }, { "epoch": 5.66, "eval_accuracy": 0.7904520490071821, "eval_loss": 0.9112616181373596, "eval_runtime": 7.8734, "eval_samples_per_second": 127.01, "eval_steps_per_second": 8.002, "step": 87600 }, { "epoch": 5.67, "grad_norm": 9.928085327148438, "learning_rate": 3.198585626911315e-05, "loss": 0.918, "step": 87700 }, { "epoch": 5.68, "grad_norm": 5.921030521392822, "learning_rate": 3.1938073394495415e-05, "loss": 0.9091, "step": 87800 }, { "epoch": 5.68, "eval_accuracy": 0.7774946921443737, "eval_loss": 0.9265080690383911, "eval_runtime": 8.0079, "eval_samples_per_second": 124.877, "eval_steps_per_second": 7.867, "step": 87800 }, { "epoch": 5.68, "grad_norm": 5.009400367736816, "learning_rate": 3.1890290519877674e-05, "loss": 0.8514, "step": 87900 }, { "epoch": 5.69, "grad_norm": 8.697906494140625, "learning_rate": 3.184250764525994e-05, "loss": 0.8898, "step": 88000 }, { "epoch": 5.69, "eval_accuracy": 0.7924050632911392, "eval_loss": 0.879066526889801, "eval_runtime": 7.9939, "eval_samples_per_second": 125.096, "eval_steps_per_second": 7.881, "step": 88000 }, { "epoch": 5.7, "grad_norm": 10.87426471710205, "learning_rate": 3.1794724770642205e-05, "loss": 0.8846, "step": 88100 }, { "epoch": 5.7, "grad_norm": 8.635502815246582, "learning_rate": 3.174694189602447e-05, "loss": 0.9048, "step": 88200 }, { "epoch": 5.7, "eval_accuracy": 0.7936575052854122, "eval_loss": 0.9104540944099426, "eval_runtime": 8.051, "eval_samples_per_second": 124.209, "eval_steps_per_second": 7.825, "step": 88200 }, { "epoch": 5.71, "grad_norm": 5.57421875, "learning_rate": 3.169915902140673e-05, "loss": 0.8947, "step": 88300 }, { "epoch": 5.72, "grad_norm": 8.421577453613281, "learning_rate": 3.1651376146788995e-05, "loss": 0.9324, "step": 88400 }, { "epoch": 5.72, "eval_accuracy": 0.7782002534854245, "eval_loss": 0.9782570600509644, "eval_runtime": 7.7687, "eval_samples_per_second": 128.722, "eval_steps_per_second": 8.109, "step": 88400 }, { "epoch": 5.72, "grad_norm": 8.589170455932617, "learning_rate": 3.1603593272171253e-05, "loss": 0.8983, "step": 88500 }, { "epoch": 5.73, "grad_norm": 10.936827659606934, "learning_rate": 3.155581039755352e-05, "loss": 0.8671, "step": 88600 }, { "epoch": 5.73, "eval_accuracy": 0.7874049027895181, "eval_loss": 0.9338342547416687, "eval_runtime": 7.9373, "eval_samples_per_second": 125.988, "eval_steps_per_second": 7.937, "step": 88600 }, { "epoch": 5.74, "grad_norm": 8.629457473754883, "learning_rate": 3.150802752293578e-05, "loss": 0.8879, "step": 88700 }, { "epoch": 5.74, "grad_norm": 10.026850700378418, "learning_rate": 3.146024464831804e-05, "loss": 0.8957, "step": 88800 }, { "epoch": 5.74, "eval_accuracy": 0.7740989103101426, "eval_loss": 0.9649574160575867, "eval_runtime": 7.7904, "eval_samples_per_second": 128.364, "eval_steps_per_second": 8.087, "step": 88800 }, { "epoch": 5.75, "grad_norm": 7.236749649047852, "learning_rate": 3.141246177370031e-05, "loss": 0.8994, "step": 88900 }, { "epoch": 5.76, "grad_norm": 11.710840225219727, "learning_rate": 3.1364678899082574e-05, "loss": 0.9154, "step": 89000 }, { "epoch": 5.76, "eval_accuracy": 0.7919123841617524, "eval_loss": 0.9108350872993469, "eval_runtime": 7.8792, "eval_samples_per_second": 126.917, "eval_steps_per_second": 7.996, "step": 89000 }, { "epoch": 5.76, "grad_norm": 7.834099292755127, "learning_rate": 3.131689602446483e-05, "loss": 0.9056, "step": 89100 }, { "epoch": 5.77, "grad_norm": 11.162922859191895, "learning_rate": 3.12691131498471e-05, "loss": 0.9346, "step": 89200 }, { "epoch": 5.77, "eval_accuracy": 0.788339670468948, "eval_loss": 0.9112759828567505, "eval_runtime": 8.0275, "eval_samples_per_second": 124.571, "eval_steps_per_second": 7.848, "step": 89200 }, { "epoch": 5.77, "grad_norm": 9.490047454833984, "learning_rate": 3.122133027522936e-05, "loss": 0.8788, "step": 89300 }, { "epoch": 5.78, "grad_norm": 9.370221138000488, "learning_rate": 3.117354740061162e-05, "loss": 0.8768, "step": 89400 }, { "epoch": 5.78, "eval_accuracy": 0.7880184331797235, "eval_loss": 0.9196951389312744, "eval_runtime": 7.6599, "eval_samples_per_second": 130.549, "eval_steps_per_second": 8.225, "step": 89400 }, { "epoch": 5.79, "grad_norm": 8.26474666595459, "learning_rate": 3.112576452599388e-05, "loss": 0.9043, "step": 89500 }, { "epoch": 5.79, "grad_norm": 7.796627044677734, "learning_rate": 3.107798165137615e-05, "loss": 0.86, "step": 89600 }, { "epoch": 5.79, "eval_accuracy": 0.7930890924229809, "eval_loss": 0.9431032538414001, "eval_runtime": 7.584, "eval_samples_per_second": 131.856, "eval_steps_per_second": 8.307, "step": 89600 }, { "epoch": 5.8, "grad_norm": 7.791547775268555, "learning_rate": 3.103019877675841e-05, "loss": 0.8391, "step": 89700 }, { "epoch": 5.81, "grad_norm": 8.953518867492676, "learning_rate": 3.098241590214068e-05, "loss": 0.8931, "step": 89800 }, { "epoch": 5.81, "eval_accuracy": 0.7845766974015088, "eval_loss": 0.9420713186264038, "eval_runtime": 8.0931, "eval_samples_per_second": 123.562, "eval_steps_per_second": 7.784, "step": 89800 }, { "epoch": 5.81, "grad_norm": 5.745658874511719, "learning_rate": 3.093463302752294e-05, "loss": 0.8681, "step": 89900 }, { "epoch": 5.82, "grad_norm": 11.024269104003906, "learning_rate": 3.08868501529052e-05, "loss": 0.8727, "step": 90000 }, { "epoch": 5.82, "eval_accuracy": 0.796888141295206, "eval_loss": 0.9127826690673828, "eval_runtime": 7.7874, "eval_samples_per_second": 128.413, "eval_steps_per_second": 8.09, "step": 90000 }, { "epoch": 5.83, "grad_norm": 6.172329902648926, "learning_rate": 3.083906727828747e-05, "loss": 0.8917, "step": 90100 }, { "epoch": 5.83, "grad_norm": 7.93392276763916, "learning_rate": 3.079128440366973e-05, "loss": 0.8907, "step": 90200 }, { "epoch": 5.83, "eval_accuracy": 0.776796973518285, "eval_loss": 0.9231885671615601, "eval_runtime": 8.2084, "eval_samples_per_second": 121.827, "eval_steps_per_second": 7.675, "step": 90200 }, { "epoch": 5.84, "grad_norm": 5.323368072509766, "learning_rate": 3.0743501529051985e-05, "loss": 0.8932, "step": 90300 }, { "epoch": 5.85, "grad_norm": 10.454434394836426, "learning_rate": 3.069571865443425e-05, "loss": 0.8428, "step": 90400 }, { "epoch": 5.85, "eval_accuracy": 0.7824620573355818, "eval_loss": 0.9586480855941772, "eval_runtime": 8.2406, "eval_samples_per_second": 121.35, "eval_steps_per_second": 7.645, "step": 90400 }, { "epoch": 5.85, "grad_norm": 9.168259620666504, "learning_rate": 3.0647935779816516e-05, "loss": 0.8559, "step": 90500 }, { "epoch": 5.86, "grad_norm": 11.24862289428711, "learning_rate": 3.0600152905198775e-05, "loss": 0.8814, "step": 90600 }, { "epoch": 5.86, "eval_accuracy": 0.7797971259509721, "eval_loss": 0.9584033489227295, "eval_runtime": 7.8026, "eval_samples_per_second": 128.162, "eval_steps_per_second": 8.074, "step": 90600 }, { "epoch": 5.87, "grad_norm": 9.994714736938477, "learning_rate": 3.055237003058104e-05, "loss": 0.8643, "step": 90700 }, { "epoch": 5.87, "grad_norm": 6.701624870300293, "learning_rate": 3.0504587155963303e-05, "loss": 0.8722, "step": 90800 }, { "epoch": 5.87, "eval_accuracy": 0.8035638523546882, "eval_loss": 0.8553415536880493, "eval_runtime": 7.8088, "eval_samples_per_second": 128.061, "eval_steps_per_second": 8.068, "step": 90800 }, { "epoch": 5.88, "grad_norm": 9.148612022399902, "learning_rate": 3.045680428134557e-05, "loss": 0.8801, "step": 90900 }, { "epoch": 5.88, "grad_norm": 6.915698051452637, "learning_rate": 3.0409021406727827e-05, "loss": 0.8573, "step": 91000 }, { "epoch": 5.88, "eval_accuracy": 0.8027153160797624, "eval_loss": 0.9007756114006042, "eval_runtime": 7.4761, "eval_samples_per_second": 133.759, "eval_steps_per_second": 8.427, "step": 91000 }, { "epoch": 5.89, "grad_norm": 8.366974830627441, "learning_rate": 3.0361238532110093e-05, "loss": 0.8504, "step": 91100 }, { "epoch": 5.9, "grad_norm": 7.432284832000732, "learning_rate": 3.0313455657492358e-05, "loss": 0.8607, "step": 91200 }, { "epoch": 5.9, "eval_accuracy": 0.783989834815756, "eval_loss": 0.9024417400360107, "eval_runtime": 7.6192, "eval_samples_per_second": 131.247, "eval_steps_per_second": 8.269, "step": 91200 }, { "epoch": 5.9, "grad_norm": 6.147519588470459, "learning_rate": 3.026567278287462e-05, "loss": 0.8906, "step": 91300 }, { "epoch": 5.91, "grad_norm": 11.765032768249512, "learning_rate": 3.021788990825688e-05, "loss": 0.8854, "step": 91400 }, { "epoch": 5.91, "eval_accuracy": 0.7916317991631799, "eval_loss": 0.90964674949646, "eval_runtime": 8.0468, "eval_samples_per_second": 124.274, "eval_steps_per_second": 7.829, "step": 91400 }, { "epoch": 5.92, "grad_norm": 8.494417190551758, "learning_rate": 3.0170107033639145e-05, "loss": 0.9107, "step": 91500 }, { "epoch": 5.92, "grad_norm": 10.992939949035645, "learning_rate": 3.012232415902141e-05, "loss": 0.8916, "step": 91600 }, { "epoch": 5.92, "eval_accuracy": 0.7972175379426644, "eval_loss": 0.8819576501846313, "eval_runtime": 7.4986, "eval_samples_per_second": 133.358, "eval_steps_per_second": 8.402, "step": 91600 }, { "epoch": 5.93, "grad_norm": 7.120448589324951, "learning_rate": 3.0074541284403672e-05, "loss": 0.8994, "step": 91700 }, { "epoch": 5.94, "grad_norm": 8.606697082519531, "learning_rate": 3.002675840978593e-05, "loss": 0.8597, "step": 91800 }, { "epoch": 5.94, "eval_accuracy": 0.7826638477801269, "eval_loss": 0.9251160621643066, "eval_runtime": 8.0894, "eval_samples_per_second": 123.619, "eval_steps_per_second": 7.788, "step": 91800 }, { "epoch": 5.94, "grad_norm": 9.002483367919922, "learning_rate": 2.9978975535168197e-05, "loss": 0.8991, "step": 91900 }, { "epoch": 5.95, "grad_norm": 6.8238606452941895, "learning_rate": 2.9931192660550462e-05, "loss": 0.8775, "step": 92000 }, { "epoch": 5.95, "eval_accuracy": 0.7885183621781342, "eval_loss": 0.937637209892273, "eval_runtime": 8.5884, "eval_samples_per_second": 116.437, "eval_steps_per_second": 7.336, "step": 92000 }, { "epoch": 5.96, "grad_norm": 7.614147186279297, "learning_rate": 2.9883409785932724e-05, "loss": 0.8853, "step": 92100 }, { "epoch": 5.96, "grad_norm": 10.971732139587402, "learning_rate": 2.9835626911314983e-05, "loss": 0.9085, "step": 92200 }, { "epoch": 5.96, "eval_accuracy": 0.800836820083682, "eval_loss": 0.8575210571289062, "eval_runtime": 7.2543, "eval_samples_per_second": 137.848, "eval_steps_per_second": 8.684, "step": 92200 }, { "epoch": 5.97, "grad_norm": 9.091797828674316, "learning_rate": 2.978784403669725e-05, "loss": 0.8756, "step": 92300 }, { "epoch": 5.98, "grad_norm": 11.398760795593262, "learning_rate": 2.9740061162079514e-05, "loss": 0.8968, "step": 92400 }, { "epoch": 5.98, "eval_accuracy": 0.7831728756802009, "eval_loss": 0.9540303945541382, "eval_runtime": 7.8037, "eval_samples_per_second": 128.144, "eval_steps_per_second": 8.073, "step": 92400 }, { "epoch": 5.98, "grad_norm": 12.938034057617188, "learning_rate": 2.9692278287461776e-05, "loss": 0.8699, "step": 92500 }, { "epoch": 5.99, "grad_norm": 6.5428595542907715, "learning_rate": 2.9644495412844038e-05, "loss": 0.8692, "step": 92600 }, { "epoch": 5.99, "eval_accuracy": 0.7867956265769555, "eval_loss": 0.9559907913208008, "eval_runtime": 7.7035, "eval_samples_per_second": 129.812, "eval_steps_per_second": 8.178, "step": 92600 }, { "epoch": 5.99, "grad_norm": 8.186531066894531, "learning_rate": 2.95967125382263e-05, "loss": 0.8572, "step": 92700 }, { "epoch": 6.0, "grad_norm": 8.819890022277832, "learning_rate": 2.9548929663608566e-05, "loss": 0.8548, "step": 92800 }, { "epoch": 6.0, "eval_accuracy": 0.7899659863945578, "eval_loss": 0.8754277229309082, "eval_runtime": 7.5948, "eval_samples_per_second": 131.67, "eval_steps_per_second": 8.295, "step": 92800 }, { "epoch": 6.01, "grad_norm": 8.905518531799316, "learning_rate": 2.9501146788990825e-05, "loss": 0.8696, "step": 92900 }, { "epoch": 6.01, "grad_norm": 10.360737800598145, "learning_rate": 2.945336391437309e-05, "loss": 0.8421, "step": 93000 }, { "epoch": 6.01, "eval_accuracy": 0.7978233570531603, "eval_loss": 0.8433436155319214, "eval_runtime": 7.71, "eval_samples_per_second": 129.702, "eval_steps_per_second": 8.171, "step": 93000 }, { "epoch": 6.02, "grad_norm": 6.7970356941223145, "learning_rate": 2.9405581039755352e-05, "loss": 0.8451, "step": 93100 }, { "epoch": 6.03, "grad_norm": 6.2845587730407715, "learning_rate": 2.9357798165137618e-05, "loss": 0.8657, "step": 93200 }, { "epoch": 6.03, "eval_accuracy": 0.80041928721174, "eval_loss": 0.850225031375885, "eval_runtime": 7.6191, "eval_samples_per_second": 131.249, "eval_steps_per_second": 8.269, "step": 93200 }, { "epoch": 6.03, "grad_norm": 14.295214653015137, "learning_rate": 2.9310015290519877e-05, "loss": 0.8616, "step": 93300 }, { "epoch": 6.04, "grad_norm": 8.752495765686035, "learning_rate": 2.9262232415902142e-05, "loss": 0.8499, "step": 93400 }, { "epoch": 6.04, "eval_accuracy": 0.7892962494732406, "eval_loss": 0.8869959712028503, "eval_runtime": 7.3263, "eval_samples_per_second": 136.494, "eval_steps_per_second": 8.599, "step": 93400 }, { "epoch": 6.05, "grad_norm": 10.902244567871094, "learning_rate": 2.9214449541284404e-05, "loss": 0.8454, "step": 93500 }, { "epoch": 6.05, "grad_norm": 9.891865730285645, "learning_rate": 2.916666666666667e-05, "loss": 0.8083, "step": 93600 }, { "epoch": 6.05, "eval_accuracy": 0.78625, "eval_loss": 0.9436712265014648, "eval_runtime": 8.3743, "eval_samples_per_second": 119.413, "eval_steps_per_second": 7.523, "step": 93600 }, { "epoch": 6.06, "grad_norm": 9.00574779510498, "learning_rate": 2.911888379204893e-05, "loss": 0.8491, "step": 93700 }, { "epoch": 6.07, "grad_norm": 9.056849479675293, "learning_rate": 2.9071100917431194e-05, "loss": 0.8653, "step": 93800 }, { "epoch": 6.07, "eval_accuracy": 0.8015839933305544, "eval_loss": 0.830663800239563, "eval_runtime": 7.9884, "eval_samples_per_second": 125.182, "eval_steps_per_second": 7.886, "step": 93800 }, { "epoch": 6.07, "grad_norm": 8.541321754455566, "learning_rate": 2.9023318042813456e-05, "loss": 0.8378, "step": 93900 }, { "epoch": 6.08, "grad_norm": 9.547113418579102, "learning_rate": 2.897553516819572e-05, "loss": 0.8363, "step": 94000 }, { "epoch": 6.08, "eval_accuracy": 0.801926298157454, "eval_loss": 0.8954129219055176, "eval_runtime": 7.5137, "eval_samples_per_second": 133.091, "eval_steps_per_second": 8.385, "step": 94000 }, { "epoch": 6.09, "grad_norm": 11.00920295715332, "learning_rate": 2.892775229357798e-05, "loss": 0.8286, "step": 94100 }, { "epoch": 6.09, "grad_norm": 5.111521244049072, "learning_rate": 2.8879969418960246e-05, "loss": 0.8772, "step": 94200 }, { "epoch": 6.09, "eval_accuracy": 0.7840573597638127, "eval_loss": 0.8885440826416016, "eval_runtime": 8.2491, "eval_samples_per_second": 121.226, "eval_steps_per_second": 7.637, "step": 94200 }, { "epoch": 6.1, "grad_norm": 9.434537887573242, "learning_rate": 2.883218654434251e-05, "loss": 0.832, "step": 94300 }, { "epoch": 6.1, "grad_norm": 7.234210014343262, "learning_rate": 2.8784403669724774e-05, "loss": 0.8649, "step": 94400 }, { "epoch": 6.1, "eval_accuracy": 0.7865215571368773, "eval_loss": 0.9001644849777222, "eval_runtime": 8.317, "eval_samples_per_second": 120.235, "eval_steps_per_second": 7.575, "step": 94400 }, { "epoch": 6.11, "grad_norm": 13.038410186767578, "learning_rate": 2.8736620795107032e-05, "loss": 0.8276, "step": 94500 }, { "epoch": 6.12, "grad_norm": 5.259621620178223, "learning_rate": 2.8688837920489298e-05, "loss": 0.8936, "step": 94600 }, { "epoch": 6.12, "eval_accuracy": 0.7968815844922039, "eval_loss": 0.9160247445106506, "eval_runtime": 7.6527, "eval_samples_per_second": 130.673, "eval_steps_per_second": 8.232, "step": 94600 }, { "epoch": 6.12, "grad_norm": 6.667730808258057, "learning_rate": 2.8641055045871563e-05, "loss": 0.8402, "step": 94700 }, { "epoch": 6.13, "grad_norm": 7.143275260925293, "learning_rate": 2.8593272171253826e-05, "loss": 0.8821, "step": 94800 }, { "epoch": 6.13, "eval_accuracy": 0.8147837043259135, "eval_loss": 0.801220715045929, "eval_runtime": 8.2206, "eval_samples_per_second": 121.646, "eval_steps_per_second": 7.664, "step": 94800 }, { "epoch": 6.14, "grad_norm": 8.726034164428711, "learning_rate": 2.8545489296636084e-05, "loss": 0.8551, "step": 94900 }, { "epoch": 6.14, "grad_norm": 12.30269718170166, "learning_rate": 2.849770642201835e-05, "loss": 0.8772, "step": 95000 }, { "epoch": 6.14, "eval_accuracy": 0.7785684386772709, "eval_loss": 0.9351415038108826, "eval_runtime": 8.2132, "eval_samples_per_second": 121.755, "eval_steps_per_second": 7.671, "step": 95000 }, { "epoch": 6.15, "grad_norm": 4.616902828216553, "learning_rate": 2.8449923547400615e-05, "loss": 0.8594, "step": 95100 }, { "epoch": 6.16, "grad_norm": 8.903868675231934, "learning_rate": 2.8402140672782877e-05, "loss": 0.8179, "step": 95200 }, { "epoch": 6.16, "eval_accuracy": 0.7996647108130763, "eval_loss": 0.8784301280975342, "eval_runtime": 7.9503, "eval_samples_per_second": 125.781, "eval_steps_per_second": 7.924, "step": 95200 }, { "epoch": 6.16, "grad_norm": 8.224273681640625, "learning_rate": 2.8354357798165136e-05, "loss": 0.832, "step": 95300 }, { "epoch": 6.17, "grad_norm": 9.37580394744873, "learning_rate": 2.8306574923547402e-05, "loss": 0.8496, "step": 95400 }, { "epoch": 6.17, "eval_accuracy": 0.7993311036789298, "eval_loss": 0.9080206155776978, "eval_runtime": 7.3926, "eval_samples_per_second": 135.27, "eval_steps_per_second": 8.522, "step": 95400 }, { "epoch": 6.18, "grad_norm": 6.672393798828125, "learning_rate": 2.8258792048929667e-05, "loss": 0.8506, "step": 95500 }, { "epoch": 6.18, "grad_norm": 7.1164116859436035, "learning_rate": 2.8211009174311926e-05, "loss": 0.86, "step": 95600 }, { "epoch": 6.18, "eval_accuracy": 0.799492385786802, "eval_loss": 0.888501763343811, "eval_runtime": 7.6603, "eval_samples_per_second": 130.543, "eval_steps_per_second": 8.224, "step": 95600 }, { "epoch": 6.19, "grad_norm": 5.989982604980469, "learning_rate": 2.8163226299694188e-05, "loss": 0.8027, "step": 95700 }, { "epoch": 6.2, "grad_norm": 13.453332901000977, "learning_rate": 2.8115443425076454e-05, "loss": 0.8301, "step": 95800 }, { "epoch": 6.2, "eval_accuracy": 0.8060271646859083, "eval_loss": 0.8681384325027466, "eval_runtime": 8.063, "eval_samples_per_second": 124.024, "eval_steps_per_second": 7.814, "step": 95800 }, { "epoch": 6.2, "grad_norm": 8.304060935974121, "learning_rate": 2.806766055045872e-05, "loss": 0.9084, "step": 95900 }, { "epoch": 6.21, "grad_norm": 8.473187446594238, "learning_rate": 2.8019877675840978e-05, "loss": 0.8678, "step": 96000 }, { "epoch": 6.21, "eval_accuracy": 0.7994125052454889, "eval_loss": 0.8349708318710327, "eval_runtime": 8.2692, "eval_samples_per_second": 120.93, "eval_steps_per_second": 7.619, "step": 96000 }, { "epoch": 6.21, "grad_norm": 5.363618850708008, "learning_rate": 2.7972094801223243e-05, "loss": 0.8438, "step": 96100 }, { "epoch": 6.22, "grad_norm": 5.787271022796631, "learning_rate": 2.7924311926605506e-05, "loss": 0.8624, "step": 96200 }, { "epoch": 6.22, "eval_accuracy": 0.7824459234608985, "eval_loss": 0.9209972023963928, "eval_runtime": 7.5716, "eval_samples_per_second": 132.073, "eval_steps_per_second": 8.321, "step": 96200 }, { "epoch": 6.23, "grad_norm": 8.099592208862305, "learning_rate": 2.787652905198777e-05, "loss": 0.8464, "step": 96300 }, { "epoch": 6.23, "grad_norm": 9.677714347839355, "learning_rate": 2.782874617737003e-05, "loss": 0.8235, "step": 96400 }, { "epoch": 6.23, "eval_accuracy": 0.7833052276559865, "eval_loss": 0.9143757224082947, "eval_runtime": 7.5858, "eval_samples_per_second": 131.825, "eval_steps_per_second": 8.305, "step": 96400 }, { "epoch": 6.24, "grad_norm": 10.933472633361816, "learning_rate": 2.7780963302752295e-05, "loss": 0.8819, "step": 96500 }, { "epoch": 6.25, "grad_norm": 8.941792488098145, "learning_rate": 2.7733180428134558e-05, "loss": 0.8691, "step": 96600 }, { "epoch": 6.25, "eval_accuracy": 0.7958754208754208, "eval_loss": 0.8920222520828247, "eval_runtime": 7.6044, "eval_samples_per_second": 131.503, "eval_steps_per_second": 8.285, "step": 96600 }, { "epoch": 6.25, "grad_norm": 7.9691243171691895, "learning_rate": 2.7685397553516823e-05, "loss": 0.8361, "step": 96700 }, { "epoch": 6.26, "grad_norm": 6.835637092590332, "learning_rate": 2.7637614678899082e-05, "loss": 0.8193, "step": 96800 }, { "epoch": 6.26, "eval_accuracy": 0.7925894378194208, "eval_loss": 0.8974847197532654, "eval_runtime": 8.0305, "eval_samples_per_second": 124.525, "eval_steps_per_second": 7.845, "step": 96800 }, { "epoch": 6.27, "grad_norm": 5.817837238311768, "learning_rate": 2.7589831804281347e-05, "loss": 0.8445, "step": 96900 }, { "epoch": 6.27, "grad_norm": 9.832756042480469, "learning_rate": 2.754204892966361e-05, "loss": 0.8569, "step": 97000 }, { "epoch": 6.27, "eval_accuracy": 0.7938931297709924, "eval_loss": 0.8813791871070862, "eval_runtime": 8.4583, "eval_samples_per_second": 118.227, "eval_steps_per_second": 7.448, "step": 97000 }, { "epoch": 6.28, "grad_norm": 6.363796710968018, "learning_rate": 2.7494266055045875e-05, "loss": 0.8457, "step": 97100 }, { "epoch": 6.29, "grad_norm": 6.835550308227539, "learning_rate": 2.7446483180428134e-05, "loss": 0.8355, "step": 97200 }, { "epoch": 6.29, "eval_accuracy": 0.7822816548265775, "eval_loss": 0.9683089256286621, "eval_runtime": 7.3796, "eval_samples_per_second": 135.508, "eval_steps_per_second": 8.537, "step": 97200 }, { "epoch": 6.29, "grad_norm": 6.251167297363281, "learning_rate": 2.73987003058104e-05, "loss": 0.8537, "step": 97300 }, { "epoch": 6.3, "grad_norm": 6.859745502471924, "learning_rate": 2.735091743119266e-05, "loss": 0.8274, "step": 97400 }, { "epoch": 6.3, "eval_accuracy": 0.7925486875529213, "eval_loss": 0.8807522654533386, "eval_runtime": 8.4557, "eval_samples_per_second": 118.264, "eval_steps_per_second": 7.451, "step": 97400 }, { "epoch": 6.3, "grad_norm": 6.257714748382568, "learning_rate": 2.7303134556574927e-05, "loss": 0.8637, "step": 97500 }, { "epoch": 6.31, "grad_norm": 8.0186128616333, "learning_rate": 2.7255351681957186e-05, "loss": 0.8453, "step": 97600 }, { "epoch": 6.31, "eval_accuracy": 0.8033542976939203, "eval_loss": 0.8838856220245361, "eval_runtime": 7.8889, "eval_samples_per_second": 126.76, "eval_steps_per_second": 7.986, "step": 97600 }, { "epoch": 6.32, "grad_norm": 7.938974380493164, "learning_rate": 2.720756880733945e-05, "loss": 0.8753, "step": 97700 }, { "epoch": 6.32, "grad_norm": 9.223575592041016, "learning_rate": 2.7159785932721717e-05, "loss": 0.9004, "step": 97800 }, { "epoch": 6.32, "eval_accuracy": 0.7976539589442815, "eval_loss": 0.9368798732757568, "eval_runtime": 8.1175, "eval_samples_per_second": 123.191, "eval_steps_per_second": 7.761, "step": 97800 }, { "epoch": 6.33, "grad_norm": 7.561581611633301, "learning_rate": 2.7112003058103975e-05, "loss": 0.8297, "step": 97900 }, { "epoch": 6.34, "grad_norm": 11.252593040466309, "learning_rate": 2.7064220183486238e-05, "loss": 0.8082, "step": 98000 }, { "epoch": 6.34, "eval_accuracy": 0.7925398155909472, "eval_loss": 0.8953116536140442, "eval_runtime": 7.5952, "eval_samples_per_second": 131.662, "eval_steps_per_second": 8.295, "step": 98000 }, { "epoch": 6.34, "grad_norm": 6.762524604797363, "learning_rate": 2.7016437308868503e-05, "loss": 0.8209, "step": 98100 }, { "epoch": 6.35, "grad_norm": 6.294374942779541, "learning_rate": 2.696865443425077e-05, "loss": 0.8299, "step": 98200 }, { "epoch": 6.35, "eval_accuracy": 0.795798319327731, "eval_loss": 0.8667227625846863, "eval_runtime": 8.2208, "eval_samples_per_second": 121.643, "eval_steps_per_second": 7.663, "step": 98200 }, { "epoch": 6.36, "grad_norm": 7.540713787078857, "learning_rate": 2.6920871559633027e-05, "loss": 0.8648, "step": 98300 }, { "epoch": 6.36, "grad_norm": 9.391749382019043, "learning_rate": 2.687308868501529e-05, "loss": 0.8476, "step": 98400 }, { "epoch": 6.36, "eval_accuracy": 0.78562421185372, "eval_loss": 0.923201858997345, "eval_runtime": 7.8556, "eval_samples_per_second": 127.297, "eval_steps_per_second": 8.02, "step": 98400 }, { "epoch": 6.37, "grad_norm": 9.132808685302734, "learning_rate": 2.6825305810397555e-05, "loss": 0.8726, "step": 98500 }, { "epoch": 6.38, "grad_norm": 8.487500190734863, "learning_rate": 2.677752293577982e-05, "loss": 0.8073, "step": 98600 }, { "epoch": 6.38, "eval_accuracy": 0.8025316455696202, "eval_loss": 0.8385837078094482, "eval_runtime": 8.1835, "eval_samples_per_second": 122.197, "eval_steps_per_second": 7.698, "step": 98600 }, { "epoch": 6.38, "grad_norm": 7.330880641937256, "learning_rate": 2.672974006116208e-05, "loss": 0.816, "step": 98700 }, { "epoch": 6.39, "grad_norm": 5.89218282699585, "learning_rate": 2.668195718654434e-05, "loss": 0.8314, "step": 98800 }, { "epoch": 6.39, "eval_accuracy": 0.8, "eval_loss": 0.8430078029632568, "eval_runtime": 8.0833, "eval_samples_per_second": 123.712, "eval_steps_per_second": 7.794, "step": 98800 }, { "epoch": 6.4, "grad_norm": 10.489568710327148, "learning_rate": 2.6634174311926607e-05, "loss": 0.8639, "step": 98900 }, { "epoch": 6.4, "grad_norm": 5.168033123016357, "learning_rate": 2.6586391437308873e-05, "loss": 0.8268, "step": 99000 }, { "epoch": 6.4, "eval_accuracy": 0.8087866108786611, "eval_loss": 0.8594768047332764, "eval_runtime": 7.6492, "eval_samples_per_second": 130.733, "eval_steps_per_second": 8.236, "step": 99000 }, { "epoch": 6.41, "grad_norm": 8.712702751159668, "learning_rate": 2.653860856269113e-05, "loss": 0.8245, "step": 99100 }, { "epoch": 6.41, "grad_norm": 8.510950088500977, "learning_rate": 2.6490825688073397e-05, "loss": 0.7826, "step": 99200 }, { "epoch": 6.41, "eval_accuracy": 0.7817796610169492, "eval_loss": 0.9254732728004456, "eval_runtime": 8.1127, "eval_samples_per_second": 123.263, "eval_steps_per_second": 7.766, "step": 99200 }, { "epoch": 6.42, "grad_norm": 8.828670501708984, "learning_rate": 2.644304281345566e-05, "loss": 0.8284, "step": 99300 }, { "epoch": 6.43, "grad_norm": 11.803581237792969, "learning_rate": 2.6395259938837924e-05, "loss": 0.8202, "step": 99400 }, { "epoch": 6.43, "eval_accuracy": 0.7985791893021312, "eval_loss": 0.8763936758041382, "eval_runtime": 8.0604, "eval_samples_per_second": 124.063, "eval_steps_per_second": 7.816, "step": 99400 }, { "epoch": 6.43, "grad_norm": 9.902961730957031, "learning_rate": 2.6347477064220183e-05, "loss": 0.8061, "step": 99500 }, { "epoch": 6.44, "grad_norm": 8.136748313903809, "learning_rate": 2.629969418960245e-05, "loss": 0.8394, "step": 99600 }, { "epoch": 6.44, "eval_accuracy": 0.7845117845117845, "eval_loss": 0.9501782059669495, "eval_runtime": 7.4436, "eval_samples_per_second": 134.344, "eval_steps_per_second": 8.464, "step": 99600 }, { "epoch": 6.45, "grad_norm": 9.858920097351074, "learning_rate": 2.625191131498471e-05, "loss": 0.8237, "step": 99700 }, { "epoch": 6.45, "grad_norm": 7.0880208015441895, "learning_rate": 2.6204128440366976e-05, "loss": 0.8067, "step": 99800 }, { "epoch": 6.45, "eval_accuracy": 0.8018707482993197, "eval_loss": 0.8582417964935303, "eval_runtime": 7.5832, "eval_samples_per_second": 131.87, "eval_steps_per_second": 8.308, "step": 99800 }, { "epoch": 6.46, "grad_norm": 10.761656761169434, "learning_rate": 2.6156345565749235e-05, "loss": 0.8052, "step": 99900 }, { "epoch": 6.47, "grad_norm": 7.500642776489258, "learning_rate": 2.61085626911315e-05, "loss": 0.8371, "step": 100000 }, { "epoch": 6.47, "eval_accuracy": 0.8036164844407064, "eval_loss": 0.8842684030532837, "eval_runtime": 7.4355, "eval_samples_per_second": 134.49, "eval_steps_per_second": 8.473, "step": 100000 }, { "epoch": 6.47, "grad_norm": 11.73560619354248, "learning_rate": 2.6060779816513763e-05, "loss": 0.8244, "step": 100100 }, { "epoch": 6.48, "grad_norm": 7.773484230041504, "learning_rate": 2.601299694189603e-05, "loss": 0.8378, "step": 100200 }, { "epoch": 6.48, "eval_accuracy": 0.7966893039049237, "eval_loss": 0.8564867377281189, "eval_runtime": 7.3467, "eval_samples_per_second": 136.116, "eval_steps_per_second": 8.575, "step": 100200 }, { "epoch": 6.49, "grad_norm": 8.470911979675293, "learning_rate": 2.5965214067278287e-05, "loss": 0.8352, "step": 100300 }, { "epoch": 6.49, "grad_norm": 7.060744285583496, "learning_rate": 2.5917431192660553e-05, "loss": 0.8657, "step": 100400 }, { "epoch": 6.49, "eval_accuracy": 0.8036783575705732, "eval_loss": 0.8975025415420532, "eval_runtime": 7.553, "eval_samples_per_second": 132.398, "eval_steps_per_second": 8.341, "step": 100400 }, { "epoch": 6.5, "grad_norm": 4.939974784851074, "learning_rate": 2.5869648318042815e-05, "loss": 0.8576, "step": 100500 }, { "epoch": 6.51, "grad_norm": 8.578023910522461, "learning_rate": 2.5821865443425073e-05, "loss": 0.7972, "step": 100600 }, { "epoch": 6.51, "eval_accuracy": 0.7879303017424565, "eval_loss": 0.9334100484848022, "eval_runtime": 7.5409, "eval_samples_per_second": 132.611, "eval_steps_per_second": 8.354, "step": 100600 }, { "epoch": 6.51, "grad_norm": 8.303977012634277, "learning_rate": 2.577408256880734e-05, "loss": 0.8822, "step": 100700 }, { "epoch": 6.52, "grad_norm": 8.781500816345215, "learning_rate": 2.5726299694189605e-05, "loss": 0.8704, "step": 100800 }, { "epoch": 6.52, "eval_accuracy": 0.7973828619670748, "eval_loss": 0.891633152961731, "eval_runtime": 7.4323, "eval_samples_per_second": 134.549, "eval_steps_per_second": 8.477, "step": 100800 }, { "epoch": 6.52, "grad_norm": 7.294911861419678, "learning_rate": 2.567851681957187e-05, "loss": 0.7832, "step": 100900 }, { "epoch": 6.53, "grad_norm": 7.5453104972839355, "learning_rate": 2.563073394495413e-05, "loss": 0.8101, "step": 101000 }, { "epoch": 6.53, "eval_accuracy": 0.807061790668348, "eval_loss": 0.8556983470916748, "eval_runtime": 7.4472, "eval_samples_per_second": 134.278, "eval_steps_per_second": 8.46, "step": 101000 }, { "epoch": 6.54, "grad_norm": 6.810634613037109, "learning_rate": 2.558295107033639e-05, "loss": 0.8122, "step": 101100 }, { "epoch": 6.54, "grad_norm": 9.665030479431152, "learning_rate": 2.5535168195718656e-05, "loss": 0.7958, "step": 101200 }, { "epoch": 6.54, "eval_accuracy": 0.8002523128679563, "eval_loss": 0.8284469842910767, "eval_runtime": 7.2708, "eval_samples_per_second": 137.536, "eval_steps_per_second": 8.665, "step": 101200 }, { "epoch": 6.55, "grad_norm": 7.369723320007324, "learning_rate": 2.5487385321100922e-05, "loss": 0.7745, "step": 101300 }, { "epoch": 6.56, "grad_norm": 7.250212669372559, "learning_rate": 2.543960244648318e-05, "loss": 0.7995, "step": 101400 }, { "epoch": 6.56, "eval_accuracy": 0.7941426146010186, "eval_loss": 0.8724551200866699, "eval_runtime": 7.6018, "eval_samples_per_second": 131.547, "eval_steps_per_second": 8.287, "step": 101400 }, { "epoch": 6.56, "grad_norm": 8.48430061340332, "learning_rate": 2.5391819571865443e-05, "loss": 0.8226, "step": 101500 }, { "epoch": 6.57, "grad_norm": 7.700632572174072, "learning_rate": 2.534403669724771e-05, "loss": 0.8405, "step": 101600 }, { "epoch": 6.57, "eval_accuracy": 0.793918918918919, "eval_loss": 0.9304710030555725, "eval_runtime": 7.5396, "eval_samples_per_second": 132.633, "eval_steps_per_second": 8.356, "step": 101600 }, { "epoch": 6.58, "grad_norm": 8.074151992797852, "learning_rate": 2.5296253822629974e-05, "loss": 0.8571, "step": 101700 }, { "epoch": 6.58, "grad_norm": 10.547660827636719, "learning_rate": 2.5248470948012233e-05, "loss": 0.8348, "step": 101800 }, { "epoch": 6.58, "eval_accuracy": 0.7945263157894736, "eval_loss": 0.9371234178543091, "eval_runtime": 7.7096, "eval_samples_per_second": 129.708, "eval_steps_per_second": 8.172, "step": 101800 }, { "epoch": 6.59, "grad_norm": 7.635470867156982, "learning_rate": 2.5200688073394495e-05, "loss": 0.799, "step": 101900 }, { "epoch": 6.6, "grad_norm": 9.02029800415039, "learning_rate": 2.515290519877676e-05, "loss": 0.8207, "step": 102000 }, { "epoch": 6.6, "eval_accuracy": 0.7996611605252012, "eval_loss": 0.8464606404304504, "eval_runtime": 7.9596, "eval_samples_per_second": 125.634, "eval_steps_per_second": 7.915, "step": 102000 }, { "epoch": 6.6, "grad_norm": 8.539718627929688, "learning_rate": 2.5105122324159026e-05, "loss": 0.8412, "step": 102100 }, { "epoch": 6.61, "grad_norm": 8.250880241394043, "learning_rate": 2.5057339449541285e-05, "loss": 0.8113, "step": 102200 }, { "epoch": 6.61, "eval_accuracy": 0.799496644295302, "eval_loss": 0.8784326910972595, "eval_runtime": 7.8194, "eval_samples_per_second": 127.887, "eval_steps_per_second": 8.057, "step": 102200 }, { "epoch": 6.62, "grad_norm": 10.51920223236084, "learning_rate": 2.5009556574923547e-05, "loss": 0.8202, "step": 102300 }, { "epoch": 6.62, "grad_norm": 6.438490867614746, "learning_rate": 2.4961773700305812e-05, "loss": 0.8182, "step": 102400 }, { "epoch": 6.62, "eval_accuracy": 0.8060836501901141, "eval_loss": 0.8400623798370361, "eval_runtime": 7.574, "eval_samples_per_second": 132.031, "eval_steps_per_second": 8.318, "step": 102400 }, { "epoch": 6.63, "grad_norm": 8.204225540161133, "learning_rate": 2.4913990825688074e-05, "loss": 0.8409, "step": 102500 }, { "epoch": 6.63, "grad_norm": 7.048851490020752, "learning_rate": 2.486620795107034e-05, "loss": 0.8477, "step": 102600 }, { "epoch": 6.63, "eval_accuracy": 0.7963357477631018, "eval_loss": 0.8624759316444397, "eval_runtime": 7.5257, "eval_samples_per_second": 132.877, "eval_steps_per_second": 8.371, "step": 102600 }, { "epoch": 6.64, "grad_norm": 13.9310302734375, "learning_rate": 2.4818425076452602e-05, "loss": 0.826, "step": 102700 }, { "epoch": 6.65, "grad_norm": 8.92416000366211, "learning_rate": 2.4770642201834864e-05, "loss": 0.862, "step": 102800 }, { "epoch": 6.65, "eval_accuracy": 0.7905320485965647, "eval_loss": 0.925977885723114, "eval_runtime": 8.3206, "eval_samples_per_second": 120.184, "eval_steps_per_second": 7.572, "step": 102800 }, { "epoch": 6.65, "grad_norm": 7.824712753295898, "learning_rate": 2.4722859327217126e-05, "loss": 0.8031, "step": 102900 }, { "epoch": 6.66, "grad_norm": 9.589764595031738, "learning_rate": 2.467507645259939e-05, "loss": 0.8005, "step": 103000 }, { "epoch": 6.66, "eval_accuracy": 0.7892962494732406, "eval_loss": 0.9120190143585205, "eval_runtime": 7.8765, "eval_samples_per_second": 126.96, "eval_steps_per_second": 7.998, "step": 103000 }, { "epoch": 6.67, "grad_norm": 9.98837661743164, "learning_rate": 2.4627293577981654e-05, "loss": 0.8439, "step": 103100 }, { "epoch": 6.67, "grad_norm": 6.645535945892334, "learning_rate": 2.4579510703363913e-05, "loss": 0.8735, "step": 103200 }, { "epoch": 6.67, "eval_accuracy": 0.8116744780570941, "eval_loss": 0.8443155288696289, "eval_runtime": 7.9496, "eval_samples_per_second": 125.793, "eval_steps_per_second": 7.925, "step": 103200 }, { "epoch": 6.68, "grad_norm": 7.067810535430908, "learning_rate": 2.4531727828746178e-05, "loss": 0.8308, "step": 103300 }, { "epoch": 6.69, "grad_norm": 6.189504146575928, "learning_rate": 2.448394495412844e-05, "loss": 0.8057, "step": 103400 }, { "epoch": 6.69, "eval_accuracy": 0.7854149203688181, "eval_loss": 0.9068706631660461, "eval_runtime": 7.9623, "eval_samples_per_second": 125.591, "eval_steps_per_second": 7.912, "step": 103400 }, { "epoch": 6.69, "grad_norm": 7.151706218719482, "learning_rate": 2.4436162079510706e-05, "loss": 0.8374, "step": 103500 }, { "epoch": 6.7, "grad_norm": 8.603375434875488, "learning_rate": 2.4388379204892968e-05, "loss": 0.8634, "step": 103600 }, { "epoch": 6.7, "eval_accuracy": 0.8042194092827004, "eval_loss": 0.8885533809661865, "eval_runtime": 8.2339, "eval_samples_per_second": 121.45, "eval_steps_per_second": 7.651, "step": 103600 }, { "epoch": 6.71, "grad_norm": 6.844043254852295, "learning_rate": 2.434059633027523e-05, "loss": 0.8123, "step": 103700 }, { "epoch": 6.71, "grad_norm": 6.442944526672363, "learning_rate": 2.4292813455657492e-05, "loss": 0.8644, "step": 103800 }, { "epoch": 6.71, "eval_accuracy": 0.795425667090216, "eval_loss": 0.9474312663078308, "eval_runtime": 8.1366, "eval_samples_per_second": 122.902, "eval_steps_per_second": 7.743, "step": 103800 }, { "epoch": 6.72, "grad_norm": 9.47812271118164, "learning_rate": 2.4245030581039758e-05, "loss": 0.8246, "step": 103900 }, { "epoch": 6.73, "grad_norm": 6.981095314025879, "learning_rate": 2.419724770642202e-05, "loss": 0.8246, "step": 104000 }, { "epoch": 6.73, "eval_accuracy": 0.7934371055952881, "eval_loss": 0.9285019636154175, "eval_runtime": 8.4271, "eval_samples_per_second": 118.664, "eval_steps_per_second": 7.476, "step": 104000 }, { "epoch": 6.73, "grad_norm": 8.786751747131348, "learning_rate": 2.4149464831804282e-05, "loss": 0.8027, "step": 104100 }, { "epoch": 6.74, "grad_norm": 13.612492561340332, "learning_rate": 2.4101681957186544e-05, "loss": 0.8091, "step": 104200 }, { "epoch": 6.74, "eval_accuracy": 0.7998314369995786, "eval_loss": 0.8698335289955139, "eval_runtime": 8.3092, "eval_samples_per_second": 120.348, "eval_steps_per_second": 7.582, "step": 104200 }, { "epoch": 6.74, "grad_norm": 11.23725700378418, "learning_rate": 2.405389908256881e-05, "loss": 0.8395, "step": 104300 }, { "epoch": 6.75, "grad_norm": 5.598878383636475, "learning_rate": 2.4006116207951072e-05, "loss": 0.8301, "step": 104400 }, { "epoch": 6.75, "eval_accuracy": 0.8182199832073888, "eval_loss": 0.7797525525093079, "eval_runtime": 8.3811, "eval_samples_per_second": 119.316, "eval_steps_per_second": 7.517, "step": 104400 }, { "epoch": 6.76, "grad_norm": 6.9407958984375, "learning_rate": 2.3958333333333334e-05, "loss": 0.856, "step": 104500 }, { "epoch": 6.76, "grad_norm": 8.587924003601074, "learning_rate": 2.3910550458715596e-05, "loss": 0.8281, "step": 104600 }, { "epoch": 6.76, "eval_accuracy": 0.7925328807806534, "eval_loss": 0.8926629424095154, "eval_runtime": 8.3031, "eval_samples_per_second": 120.437, "eval_steps_per_second": 7.588, "step": 104600 }, { "epoch": 6.77, "grad_norm": 8.196210861206055, "learning_rate": 2.3862767584097862e-05, "loss": 0.8321, "step": 104700 }, { "epoch": 6.78, "grad_norm": 12.062895774841309, "learning_rate": 2.3814984709480124e-05, "loss": 0.8193, "step": 104800 }, { "epoch": 6.78, "eval_accuracy": 0.809363137916491, "eval_loss": 0.835158109664917, "eval_runtime": 7.7573, "eval_samples_per_second": 128.911, "eval_steps_per_second": 8.121, "step": 104800 }, { "epoch": 6.78, "grad_norm": 9.538196563720703, "learning_rate": 2.3767201834862386e-05, "loss": 0.8207, "step": 104900 }, { "epoch": 6.79, "grad_norm": 10.14394760131836, "learning_rate": 2.3719418960244648e-05, "loss": 0.8503, "step": 105000 }, { "epoch": 6.79, "eval_accuracy": 0.8043660789252729, "eval_loss": 0.8509929776191711, "eval_runtime": 7.4345, "eval_samples_per_second": 134.507, "eval_steps_per_second": 8.474, "step": 105000 }, { "epoch": 6.8, "grad_norm": 8.408036231994629, "learning_rate": 2.3671636085626914e-05, "loss": 0.8486, "step": 105100 }, { "epoch": 6.8, "grad_norm": 8.437019348144531, "learning_rate": 2.3623853211009176e-05, "loss": 0.8074, "step": 105200 }, { "epoch": 6.8, "eval_accuracy": 0.7911392405063291, "eval_loss": 0.8818649053573608, "eval_runtime": 7.7788, "eval_samples_per_second": 128.555, "eval_steps_per_second": 8.099, "step": 105200 }, { "epoch": 6.81, "grad_norm": 7.933304309844971, "learning_rate": 2.357607033639144e-05, "loss": 0.7818, "step": 105300 }, { "epoch": 6.82, "grad_norm": 7.470320701599121, "learning_rate": 2.35282874617737e-05, "loss": 0.831, "step": 105400 }, { "epoch": 6.82, "eval_accuracy": 0.7949685534591195, "eval_loss": 0.8892700672149658, "eval_runtime": 7.8305, "eval_samples_per_second": 127.705, "eval_steps_per_second": 8.045, "step": 105400 }, { "epoch": 6.82, "grad_norm": 8.107166290283203, "learning_rate": 2.3480504587155962e-05, "loss": 0.8092, "step": 105500 }, { "epoch": 6.83, "grad_norm": 8.14939022064209, "learning_rate": 2.3432721712538228e-05, "loss": 0.8415, "step": 105600 }, { "epoch": 6.83, "eval_accuracy": 0.7967959527824621, "eval_loss": 0.9460989236831665, "eval_runtime": 8.2042, "eval_samples_per_second": 121.888, "eval_steps_per_second": 7.679, "step": 105600 }, { "epoch": 6.84, "grad_norm": 6.977346897125244, "learning_rate": 2.338493883792049e-05, "loss": 0.8392, "step": 105700 }, { "epoch": 6.84, "grad_norm": 7.53939151763916, "learning_rate": 2.3337155963302752e-05, "loss": 0.8192, "step": 105800 }, { "epoch": 6.84, "eval_accuracy": 0.8046744574290484, "eval_loss": 0.8960748910903931, "eval_runtime": 8.0716, "eval_samples_per_second": 123.891, "eval_steps_per_second": 7.805, "step": 105800 }, { "epoch": 6.85, "grad_norm": 5.466322422027588, "learning_rate": 2.3289373088685014e-05, "loss": 0.845, "step": 105900 }, { "epoch": 6.85, "grad_norm": 10.160593032836914, "learning_rate": 2.324159021406728e-05, "loss": 0.8008, "step": 106000 }, { "epoch": 6.85, "eval_accuracy": 0.8000845308537616, "eval_loss": 0.8557285070419312, "eval_runtime": 8.0957, "eval_samples_per_second": 123.523, "eval_steps_per_second": 7.782, "step": 106000 }, { "epoch": 6.86, "grad_norm": 6.103987216949463, "learning_rate": 2.3193807339449542e-05, "loss": 0.885, "step": 106100 }, { "epoch": 6.87, "grad_norm": 8.6193208694458, "learning_rate": 2.3146024464831807e-05, "loss": 0.8458, "step": 106200 }, { "epoch": 6.87, "eval_accuracy": 0.78918692372171, "eval_loss": 0.9357724189758301, "eval_runtime": 8.202, "eval_samples_per_second": 121.922, "eval_steps_per_second": 7.681, "step": 106200 }, { "epoch": 6.87, "grad_norm": 8.533239364624023, "learning_rate": 2.3098241590214066e-05, "loss": 0.8097, "step": 106300 }, { "epoch": 6.88, "grad_norm": 9.609285354614258, "learning_rate": 2.305045871559633e-05, "loss": 0.8085, "step": 106400 }, { "epoch": 6.88, "eval_accuracy": 0.7938963210702341, "eval_loss": 0.9040654301643372, "eval_runtime": 7.8009, "eval_samples_per_second": 128.191, "eval_steps_per_second": 8.076, "step": 106400 }, { "epoch": 6.89, "grad_norm": 7.713673114776611, "learning_rate": 2.3002675840978594e-05, "loss": 0.8393, "step": 106500 }, { "epoch": 6.89, "grad_norm": 7.908583641052246, "learning_rate": 2.295489296636086e-05, "loss": 0.8277, "step": 106600 }, { "epoch": 6.89, "eval_accuracy": 0.8148460565162379, "eval_loss": 0.8399859666824341, "eval_runtime": 7.8879, "eval_samples_per_second": 126.776, "eval_steps_per_second": 7.987, "step": 106600 }, { "epoch": 6.9, "grad_norm": 8.78435230255127, "learning_rate": 2.290711009174312e-05, "loss": 0.8684, "step": 106700 }, { "epoch": 6.91, "grad_norm": 10.262534141540527, "learning_rate": 2.2859327217125384e-05, "loss": 0.8496, "step": 106800 }, { "epoch": 6.91, "eval_accuracy": 0.8127388535031848, "eval_loss": 0.8215518593788147, "eval_runtime": 7.7666, "eval_samples_per_second": 128.757, "eval_steps_per_second": 8.112, "step": 106800 }, { "epoch": 6.91, "grad_norm": 10.416743278503418, "learning_rate": 2.2811544342507646e-05, "loss": 0.8313, "step": 106900 }, { "epoch": 6.92, "grad_norm": 7.01884126663208, "learning_rate": 2.276376146788991e-05, "loss": 0.8257, "step": 107000 }, { "epoch": 6.92, "eval_accuracy": 0.8031825795644891, "eval_loss": 0.8514235019683838, "eval_runtime": 8.289, "eval_samples_per_second": 120.642, "eval_steps_per_second": 7.6, "step": 107000 }, { "epoch": 6.93, "grad_norm": 9.809340476989746, "learning_rate": 2.2715978593272173e-05, "loss": 0.8271, "step": 107100 }, { "epoch": 6.93, "grad_norm": 7.623280048370361, "learning_rate": 2.2668195718654435e-05, "loss": 0.7954, "step": 107200 }, { "epoch": 6.93, "eval_accuracy": 0.7956943858168003, "eval_loss": 0.9155858159065247, "eval_runtime": 7.6957, "eval_samples_per_second": 129.943, "eval_steps_per_second": 8.186, "step": 107200 }, { "epoch": 6.94, "grad_norm": 9.132543563842773, "learning_rate": 2.2620412844036698e-05, "loss": 0.8491, "step": 107300 }, { "epoch": 6.95, "grad_norm": 8.2843599319458, "learning_rate": 2.2572629969418963e-05, "loss": 0.8051, "step": 107400 }, { "epoch": 6.95, "eval_accuracy": 0.7936305732484077, "eval_loss": 0.889539361000061, "eval_runtime": 8.5033, "eval_samples_per_second": 117.601, "eval_steps_per_second": 7.409, "step": 107400 }, { "epoch": 6.95, "grad_norm": 22.00254249572754, "learning_rate": 2.2524847094801225e-05, "loss": 0.8344, "step": 107500 }, { "epoch": 6.96, "grad_norm": 4.783402919769287, "learning_rate": 2.2477064220183487e-05, "loss": 0.7783, "step": 107600 }, { "epoch": 6.96, "eval_accuracy": 0.799496644295302, "eval_loss": 0.9133100509643555, "eval_runtime": 7.5877, "eval_samples_per_second": 131.792, "eval_steps_per_second": 8.303, "step": 107600 }, { "epoch": 6.96, "grad_norm": 8.960688591003418, "learning_rate": 2.242928134556575e-05, "loss": 0.8592, "step": 107700 }, { "epoch": 6.97, "grad_norm": 9.379547119140625, "learning_rate": 2.2381498470948015e-05, "loss": 0.7917, "step": 107800 }, { "epoch": 6.97, "eval_accuracy": 0.8017789072426937, "eval_loss": 0.8997214436531067, "eval_runtime": 7.9763, "eval_samples_per_second": 125.371, "eval_steps_per_second": 7.898, "step": 107800 }, { "epoch": 6.98, "grad_norm": 7.59943151473999, "learning_rate": 2.2333715596330277e-05, "loss": 0.8255, "step": 107900 }, { "epoch": 6.98, "grad_norm": 9.025933265686035, "learning_rate": 2.228593272171254e-05, "loss": 0.7984, "step": 108000 }, { "epoch": 6.98, "eval_accuracy": 0.8046413502109705, "eval_loss": 0.8711063861846924, "eval_runtime": 7.7911, "eval_samples_per_second": 128.352, "eval_steps_per_second": 8.086, "step": 108000 }, { "epoch": 6.99, "grad_norm": 7.55230188369751, "learning_rate": 2.22381498470948e-05, "loss": 0.8127, "step": 108100 }, { "epoch": 7.0, "grad_norm": 8.109315872192383, "learning_rate": 2.2190366972477064e-05, "loss": 0.7999, "step": 108200 }, { "epoch": 7.0, "eval_accuracy": 0.7975792988313857, "eval_loss": 0.9040386080741882, "eval_runtime": 7.6862, "eval_samples_per_second": 130.104, "eval_steps_per_second": 8.197, "step": 108200 }, { "epoch": 7.0, "grad_norm": 7.071262359619141, "learning_rate": 2.214258409785933e-05, "loss": 0.8006, "step": 108300 }, { "epoch": 7.01, "grad_norm": 10.19521713256836, "learning_rate": 2.209480122324159e-05, "loss": 0.8014, "step": 108400 }, { "epoch": 7.01, "eval_accuracy": 0.8056137410976121, "eval_loss": 0.8504235744476318, "eval_runtime": 7.7942, "eval_samples_per_second": 128.301, "eval_steps_per_second": 8.083, "step": 108400 }, { "epoch": 7.02, "grad_norm": 8.479470252990723, "learning_rate": 2.2047018348623853e-05, "loss": 0.7931, "step": 108500 }, { "epoch": 7.02, "grad_norm": 8.284814834594727, "learning_rate": 2.1999235474006116e-05, "loss": 0.8287, "step": 108600 }, { "epoch": 7.02, "eval_accuracy": 0.8010033444816054, "eval_loss": 0.8911890983581543, "eval_runtime": 7.6139, "eval_samples_per_second": 131.34, "eval_steps_per_second": 8.274, "step": 108600 }, { "epoch": 7.03, "grad_norm": 11.088824272155762, "learning_rate": 2.195145259938838e-05, "loss": 0.7587, "step": 108700 }, { "epoch": 7.04, "grad_norm": 7.8180341720581055, "learning_rate": 2.1903669724770643e-05, "loss": 0.7853, "step": 108800 }, { "epoch": 7.04, "eval_accuracy": 0.8088851634534786, "eval_loss": 0.8645035028457642, "eval_runtime": 7.9095, "eval_samples_per_second": 126.43, "eval_steps_per_second": 7.965, "step": 108800 }, { "epoch": 7.04, "grad_norm": 7.924280643463135, "learning_rate": 2.1855886850152905e-05, "loss": 0.8087, "step": 108900 }, { "epoch": 7.05, "grad_norm": 6.721798896789551, "learning_rate": 2.1808103975535167e-05, "loss": 0.814, "step": 109000 }, { "epoch": 7.05, "eval_accuracy": 0.7934330839567747, "eval_loss": 0.9403297305107117, "eval_runtime": 7.6767, "eval_samples_per_second": 130.265, "eval_steps_per_second": 8.207, "step": 109000 }, { "epoch": 7.06, "grad_norm": 7.638154029846191, "learning_rate": 2.1760321100917433e-05, "loss": 0.7868, "step": 109100 }, { "epoch": 7.06, "grad_norm": 9.40565299987793, "learning_rate": 2.1712538226299695e-05, "loss": 0.8179, "step": 109200 }, { "epoch": 7.06, "eval_accuracy": 0.8088983050847458, "eval_loss": 0.8510028123855591, "eval_runtime": 8.2378, "eval_samples_per_second": 121.392, "eval_steps_per_second": 7.648, "step": 109200 }, { "epoch": 7.07, "grad_norm": 8.5471773147583, "learning_rate": 2.166475535168196e-05, "loss": 0.807, "step": 109300 }, { "epoch": 7.07, "grad_norm": 9.998223304748535, "learning_rate": 2.161697247706422e-05, "loss": 0.776, "step": 109400 }, { "epoch": 7.07, "eval_accuracy": 0.7866215071972904, "eval_loss": 0.8974270224571228, "eval_runtime": 7.6105, "eval_samples_per_second": 131.397, "eval_steps_per_second": 8.278, "step": 109400 }, { "epoch": 7.08, "grad_norm": 6.680050849914551, "learning_rate": 2.1569189602446485e-05, "loss": 0.7949, "step": 109500 }, { "epoch": 7.09, "grad_norm": 6.632976531982422, "learning_rate": 2.1521406727828747e-05, "loss": 0.7978, "step": 109600 }, { "epoch": 7.09, "eval_accuracy": 0.8093042749371333, "eval_loss": 0.804295539855957, "eval_runtime": 8.0573, "eval_samples_per_second": 124.111, "eval_steps_per_second": 7.819, "step": 109600 }, { "epoch": 7.09, "grad_norm": 7.297399520874023, "learning_rate": 2.1473623853211013e-05, "loss": 0.769, "step": 109700 }, { "epoch": 7.1, "grad_norm": 7.6808671951293945, "learning_rate": 2.142584097859327e-05, "loss": 0.7828, "step": 109800 }, { "epoch": 7.1, "eval_accuracy": 0.8004255319148936, "eval_loss": 0.8927489519119263, "eval_runtime": 7.2725, "eval_samples_per_second": 137.505, "eval_steps_per_second": 8.663, "step": 109800 }, { "epoch": 7.11, "grad_norm": 6.006447792053223, "learning_rate": 2.1378058103975537e-05, "loss": 0.7799, "step": 109900 }, { "epoch": 7.11, "grad_norm": 7.417251110076904, "learning_rate": 2.13302752293578e-05, "loss": 0.8062, "step": 110000 }, { "epoch": 7.11, "eval_accuracy": 0.7996611605252012, "eval_loss": 0.8884763121604919, "eval_runtime": 7.9299, "eval_samples_per_second": 126.105, "eval_steps_per_second": 7.945, "step": 110000 }, { "epoch": 7.12, "grad_norm": 6.604735851287842, "learning_rate": 2.1282492354740064e-05, "loss": 0.7838, "step": 110100 }, { "epoch": 7.13, "grad_norm": 8.088116645812988, "learning_rate": 2.1234709480122327e-05, "loss": 0.8215, "step": 110200 }, { "epoch": 7.13, "eval_accuracy": 0.7994137353433836, "eval_loss": 0.873217761516571, "eval_runtime": 7.4591, "eval_samples_per_second": 134.065, "eval_steps_per_second": 8.446, "step": 110200 }, { "epoch": 7.13, "grad_norm": 8.449728965759277, "learning_rate": 2.118692660550459e-05, "loss": 0.7817, "step": 110300 }, { "epoch": 7.14, "grad_norm": 7.570514678955078, "learning_rate": 2.113914373088685e-05, "loss": 0.7708, "step": 110400 }, { "epoch": 7.14, "eval_accuracy": 0.8042918454935623, "eval_loss": 0.8519502878189087, "eval_runtime": 7.3985, "eval_samples_per_second": 135.163, "eval_steps_per_second": 8.515, "step": 110400 }, { "epoch": 7.15, "grad_norm": 5.2633819580078125, "learning_rate": 2.1091360856269113e-05, "loss": 0.7777, "step": 110500 }, { "epoch": 7.15, "grad_norm": 9.224042892456055, "learning_rate": 2.104357798165138e-05, "loss": 0.7928, "step": 110600 }, { "epoch": 7.15, "eval_accuracy": 0.801433389544688, "eval_loss": 0.8718428015708923, "eval_runtime": 7.9715, "eval_samples_per_second": 125.446, "eval_steps_per_second": 7.903, "step": 110600 }, { "epoch": 7.16, "grad_norm": 8.180413246154785, "learning_rate": 2.0995795107033637e-05, "loss": 0.7912, "step": 110700 }, { "epoch": 7.17, "grad_norm": 10.361764907836914, "learning_rate": 2.0948012232415903e-05, "loss": 0.796, "step": 110800 }, { "epoch": 7.17, "eval_accuracy": 0.8005082592121983, "eval_loss": 0.8303714394569397, "eval_runtime": 7.8541, "eval_samples_per_second": 127.322, "eval_steps_per_second": 8.021, "step": 110800 }, { "epoch": 7.17, "grad_norm": 5.428534507751465, "learning_rate": 2.0900229357798165e-05, "loss": 0.7725, "step": 110900 }, { "epoch": 7.18, "grad_norm": 8.343489646911621, "learning_rate": 2.085244648318043e-05, "loss": 0.8385, "step": 111000 }, { "epoch": 7.18, "eval_accuracy": 0.8115881617340559, "eval_loss": 0.8125203847885132, "eval_runtime": 7.9144, "eval_samples_per_second": 126.352, "eval_steps_per_second": 7.96, "step": 111000 }, { "epoch": 7.18, "grad_norm": 9.176019668579102, "learning_rate": 2.0804663608562693e-05, "loss": 0.7905, "step": 111100 }, { "epoch": 7.19, "grad_norm": 7.977216720581055, "learning_rate": 2.0756880733944955e-05, "loss": 0.779, "step": 111200 }, { "epoch": 7.19, "eval_accuracy": 0.8046315789473685, "eval_loss": 0.8759897351264954, "eval_runtime": 7.6442, "eval_samples_per_second": 130.819, "eval_steps_per_second": 8.242, "step": 111200 }, { "epoch": 7.2, "grad_norm": 8.114157676696777, "learning_rate": 2.0709097859327217e-05, "loss": 0.7728, "step": 111300 }, { "epoch": 7.2, "grad_norm": 10.519754409790039, "learning_rate": 2.0661314984709482e-05, "loss": 0.8333, "step": 111400 }, { "epoch": 7.2, "eval_accuracy": 0.7947103274559194, "eval_loss": 0.900131106376648, "eval_runtime": 8.3421, "eval_samples_per_second": 119.873, "eval_steps_per_second": 7.552, "step": 111400 }, { "epoch": 7.21, "grad_norm": 10.321903228759766, "learning_rate": 2.0613532110091745e-05, "loss": 0.8089, "step": 111500 }, { "epoch": 7.22, "grad_norm": 8.312304496765137, "learning_rate": 2.0565749235474007e-05, "loss": 0.7953, "step": 111600 }, { "epoch": 7.22, "eval_accuracy": 0.8001682793437106, "eval_loss": 0.8327771425247192, "eval_runtime": 7.9167, "eval_samples_per_second": 126.316, "eval_steps_per_second": 7.958, "step": 111600 }, { "epoch": 7.22, "grad_norm": 7.290288925170898, "learning_rate": 2.051796636085627e-05, "loss": 0.7802, "step": 111700 }, { "epoch": 7.23, "grad_norm": 7.423213481903076, "learning_rate": 2.0470183486238534e-05, "loss": 0.7385, "step": 111800 }, { "epoch": 7.23, "eval_accuracy": 0.8086342880863429, "eval_loss": 0.8399224877357483, "eval_runtime": 7.8424, "eval_samples_per_second": 127.512, "eval_steps_per_second": 8.033, "step": 111800 }, { "epoch": 7.24, "grad_norm": 6.42292594909668, "learning_rate": 2.0422400611620796e-05, "loss": 0.798, "step": 111900 }, { "epoch": 7.24, "grad_norm": 5.441192626953125, "learning_rate": 2.037461773700306e-05, "loss": 0.8235, "step": 112000 }, { "epoch": 7.24, "eval_accuracy": 0.8052721088435374, "eval_loss": 0.8630133271217346, "eval_runtime": 8.4782, "eval_samples_per_second": 117.95, "eval_steps_per_second": 7.431, "step": 112000 }, { "epoch": 7.25, "grad_norm": 6.32123327255249, "learning_rate": 2.032683486238532e-05, "loss": 0.7694, "step": 112100 }, { "epoch": 7.26, "grad_norm": 7.5005974769592285, "learning_rate": 2.0279051987767586e-05, "loss": 0.8145, "step": 112200 }, { "epoch": 7.26, "eval_accuracy": 0.8042218543046358, "eval_loss": 0.8470879197120667, "eval_runtime": 8.0261, "eval_samples_per_second": 124.594, "eval_steps_per_second": 7.849, "step": 112200 }, { "epoch": 7.26, "grad_norm": 11.352280616760254, "learning_rate": 2.023126911314985e-05, "loss": 0.7829, "step": 112300 }, { "epoch": 7.27, "grad_norm": 7.281013488769531, "learning_rate": 2.018348623853211e-05, "loss": 0.7407, "step": 112400 }, { "epoch": 7.27, "eval_accuracy": 0.7982158028887001, "eval_loss": 0.8909757137298584, "eval_runtime": 7.4359, "eval_samples_per_second": 134.483, "eval_steps_per_second": 8.472, "step": 112400 }, { "epoch": 7.27, "grad_norm": 6.962748050689697, "learning_rate": 2.0135703363914373e-05, "loss": 0.7864, "step": 112500 }, { "epoch": 7.28, "grad_norm": 11.104227066040039, "learning_rate": 2.0087920489296638e-05, "loss": 0.7849, "step": 112600 }, { "epoch": 7.28, "eval_accuracy": 0.8043019822859553, "eval_loss": 0.8339477777481079, "eval_runtime": 9.2348, "eval_samples_per_second": 108.286, "eval_steps_per_second": 6.822, "step": 112600 }, { "epoch": 7.29, "grad_norm": 8.812771797180176, "learning_rate": 2.00401376146789e-05, "loss": 0.8036, "step": 112700 }, { "epoch": 7.29, "grad_norm": 9.127943992614746, "learning_rate": 1.9992354740061166e-05, "loss": 0.8064, "step": 112800 }, { "epoch": 7.29, "eval_accuracy": 0.8142259414225942, "eval_loss": 0.8057135343551636, "eval_runtime": 7.5753, "eval_samples_per_second": 132.007, "eval_steps_per_second": 8.316, "step": 112800 }, { "epoch": 7.3, "grad_norm": 6.521322250366211, "learning_rate": 1.9944571865443425e-05, "loss": 0.7501, "step": 112900 }, { "epoch": 7.31, "grad_norm": 9.14055061340332, "learning_rate": 1.9896788990825687e-05, "loss": 0.7755, "step": 113000 }, { "epoch": 7.31, "eval_accuracy": 0.8005877413937867, "eval_loss": 0.8729177117347717, "eval_runtime": 8.1542, "eval_samples_per_second": 122.636, "eval_steps_per_second": 7.726, "step": 113000 }, { "epoch": 7.31, "grad_norm": 5.6724090576171875, "learning_rate": 1.9849006116207952e-05, "loss": 0.7423, "step": 113100 }, { "epoch": 7.32, "grad_norm": 7.546582221984863, "learning_rate": 1.9801223241590214e-05, "loss": 0.7741, "step": 113200 }, { "epoch": 7.32, "eval_accuracy": 0.803781512605042, "eval_loss": 0.8778061866760254, "eval_runtime": 7.5078, "eval_samples_per_second": 133.194, "eval_steps_per_second": 8.391, "step": 113200 }, { "epoch": 7.33, "grad_norm": 7.4891533851623535, "learning_rate": 1.9753440366972477e-05, "loss": 0.7821, "step": 113300 }, { "epoch": 7.33, "grad_norm": 5.200985431671143, "learning_rate": 1.970565749235474e-05, "loss": 0.7423, "step": 113400 }, { "epoch": 7.33, "eval_accuracy": 0.8068325601012231, "eval_loss": 0.8130391240119934, "eval_runtime": 7.8883, "eval_samples_per_second": 126.77, "eval_steps_per_second": 7.986, "step": 113400 }, { "epoch": 7.34, "grad_norm": 7.421447277069092, "learning_rate": 1.9657874617737004e-05, "loss": 0.7813, "step": 113500 }, { "epoch": 7.35, "grad_norm": 8.951075553894043, "learning_rate": 1.9610091743119266e-05, "loss": 0.7733, "step": 113600 }, { "epoch": 7.35, "eval_accuracy": 0.791437049597287, "eval_loss": 0.8548867106437683, "eval_runtime": 6.6235, "eval_samples_per_second": 150.977, "eval_steps_per_second": 9.512, "step": 113600 }, { "epoch": 7.35, "grad_norm": 6.4592742919921875, "learning_rate": 1.9562308868501532e-05, "loss": 0.7636, "step": 113700 }, { "epoch": 7.36, "grad_norm": 8.847403526306152, "learning_rate": 1.951452599388379e-05, "loss": 0.7346, "step": 113800 }, { "epoch": 7.36, "eval_accuracy": 0.8126825198164372, "eval_loss": 0.7579282522201538, "eval_runtime": 8.3789, "eval_samples_per_second": 119.348, "eval_steps_per_second": 7.519, "step": 113800 }, { "epoch": 7.37, "grad_norm": 5.661942958831787, "learning_rate": 1.9466743119266056e-05, "loss": 0.7551, "step": 113900 }, { "epoch": 7.37, "grad_norm": 11.62883186340332, "learning_rate": 1.9418960244648318e-05, "loss": 0.8135, "step": 114000 }, { "epoch": 7.37, "eval_accuracy": 0.8149078726968174, "eval_loss": 0.8106382489204407, "eval_runtime": 8.2459, "eval_samples_per_second": 121.272, "eval_steps_per_second": 7.64, "step": 114000 }, { "epoch": 7.38, "grad_norm": 12.097107887268066, "learning_rate": 1.9371177370030584e-05, "loss": 0.8206, "step": 114100 }, { "epoch": 7.38, "grad_norm": 7.322299480438232, "learning_rate": 1.9323394495412846e-05, "loss": 0.794, "step": 114200 }, { "epoch": 7.38, "eval_accuracy": 0.8041543026706232, "eval_loss": 0.857973039150238, "eval_runtime": 7.6935, "eval_samples_per_second": 129.98, "eval_steps_per_second": 8.189, "step": 114200 }, { "epoch": 7.39, "grad_norm": 7.7154927253723145, "learning_rate": 1.9275611620795108e-05, "loss": 0.7658, "step": 114300 }, { "epoch": 7.4, "grad_norm": 7.391234397888184, "learning_rate": 1.922782874617737e-05, "loss": 0.7769, "step": 114400 }, { "epoch": 7.4, "eval_accuracy": 0.8109577582601422, "eval_loss": 0.7969567179679871, "eval_runtime": 7.7456, "eval_samples_per_second": 129.106, "eval_steps_per_second": 8.134, "step": 114400 }, { "epoch": 7.4, "grad_norm": 9.21231460571289, "learning_rate": 1.9180045871559636e-05, "loss": 0.8135, "step": 114500 }, { "epoch": 7.41, "grad_norm": 9.001349449157715, "learning_rate": 1.9132262996941898e-05, "loss": 0.759, "step": 114600 }, { "epoch": 7.41, "eval_accuracy": 0.8264568268821778, "eval_loss": 0.7757923007011414, "eval_runtime": 7.6646, "eval_samples_per_second": 130.47, "eval_steps_per_second": 8.22, "step": 114600 }, { "epoch": 7.42, "grad_norm": 5.489566802978516, "learning_rate": 1.908448012232416e-05, "loss": 0.7563, "step": 114700 }, { "epoch": 7.42, "grad_norm": 9.950483322143555, "learning_rate": 1.9036697247706422e-05, "loss": 0.7926, "step": 114800 }, { "epoch": 7.42, "eval_accuracy": 0.8016913319238901, "eval_loss": 0.8743452429771423, "eval_runtime": 7.4231, "eval_samples_per_second": 134.715, "eval_steps_per_second": 8.487, "step": 114800 }, { "epoch": 7.43, "grad_norm": 7.646630764007568, "learning_rate": 1.8988914373088688e-05, "loss": 0.7605, "step": 114900 }, { "epoch": 7.44, "grad_norm": 7.349968910217285, "learning_rate": 1.894113149847095e-05, "loss": 0.7955, "step": 115000 }, { "epoch": 7.44, "eval_accuracy": 0.8069883527454242, "eval_loss": 0.8461830019950867, "eval_runtime": 7.8943, "eval_samples_per_second": 126.674, "eval_steps_per_second": 7.98, "step": 115000 }, { "epoch": 7.44, "grad_norm": 10.858098030090332, "learning_rate": 1.8893348623853212e-05, "loss": 0.7418, "step": 115100 }, { "epoch": 7.45, "grad_norm": 8.511702537536621, "learning_rate": 1.8845565749235474e-05, "loss": 0.7988, "step": 115200 }, { "epoch": 7.45, "eval_accuracy": 0.8078859060402684, "eval_loss": 0.8202550411224365, "eval_runtime": 7.9332, "eval_samples_per_second": 126.052, "eval_steps_per_second": 7.941, "step": 115200 }, { "epoch": 7.46, "grad_norm": 9.882185935974121, "learning_rate": 1.879778287461774e-05, "loss": 0.8022, "step": 115300 }, { "epoch": 7.46, "grad_norm": 8.880218505859375, "learning_rate": 1.8750000000000002e-05, "loss": 0.787, "step": 115400 }, { "epoch": 7.46, "eval_accuracy": 0.8009181969949917, "eval_loss": 0.8274089097976685, "eval_runtime": 8.1943, "eval_samples_per_second": 122.037, "eval_steps_per_second": 7.688, "step": 115400 }, { "epoch": 7.47, "grad_norm": 8.132078170776367, "learning_rate": 1.8702217125382264e-05, "loss": 0.786, "step": 115500 }, { "epoch": 7.48, "grad_norm": 6.304058074951172, "learning_rate": 1.8654434250764526e-05, "loss": 0.79, "step": 115600 }, { "epoch": 7.48, "eval_accuracy": 0.7911259941398074, "eval_loss": 0.9037956595420837, "eval_runtime": 7.5723, "eval_samples_per_second": 132.061, "eval_steps_per_second": 8.32, "step": 115600 }, { "epoch": 7.48, "grad_norm": 7.565331935882568, "learning_rate": 1.8606651376146788e-05, "loss": 0.7908, "step": 115700 }, { "epoch": 7.49, "grad_norm": 5.3656463623046875, "learning_rate": 1.8558868501529054e-05, "loss": 0.7751, "step": 115800 }, { "epoch": 7.49, "eval_accuracy": 0.8052166596550273, "eval_loss": 0.7966258525848389, "eval_runtime": 8.1149, "eval_samples_per_second": 123.231, "eval_steps_per_second": 7.764, "step": 115800 }, { "epoch": 7.49, "grad_norm": 11.758662223815918, "learning_rate": 1.8511085626911316e-05, "loss": 0.7861, "step": 115900 }, { "epoch": 7.5, "grad_norm": 7.712491035461426, "learning_rate": 1.8463302752293578e-05, "loss": 0.7752, "step": 116000 }, { "epoch": 7.5, "eval_accuracy": 0.8173766343315056, "eval_loss": 0.7711414694786072, "eval_runtime": 7.5967, "eval_samples_per_second": 131.635, "eval_steps_per_second": 8.293, "step": 116000 }, { "epoch": 7.51, "grad_norm": 6.041299819946289, "learning_rate": 1.841551987767584e-05, "loss": 0.8083, "step": 116100 }, { "epoch": 7.51, "grad_norm": 7.197325229644775, "learning_rate": 1.8367737003058106e-05, "loss": 0.7761, "step": 116200 }, { "epoch": 7.51, "eval_accuracy": 0.799163179916318, "eval_loss": 0.8413881063461304, "eval_runtime": 8.1339, "eval_samples_per_second": 122.942, "eval_steps_per_second": 7.745, "step": 116200 }, { "epoch": 7.52, "grad_norm": 7.622585296630859, "learning_rate": 1.8319954128440368e-05, "loss": 0.7944, "step": 116300 }, { "epoch": 7.53, "grad_norm": 5.909121036529541, "learning_rate": 1.827217125382263e-05, "loss": 0.7927, "step": 116400 }, { "epoch": 7.53, "eval_accuracy": 0.8081494057724957, "eval_loss": 0.829373300075531, "eval_runtime": 7.739, "eval_samples_per_second": 129.216, "eval_steps_per_second": 8.141, "step": 116400 }, { "epoch": 7.53, "grad_norm": 7.117471694946289, "learning_rate": 1.8224388379204892e-05, "loss": 0.7419, "step": 116500 }, { "epoch": 7.54, "grad_norm": 7.335910797119141, "learning_rate": 1.8176605504587158e-05, "loss": 0.7452, "step": 116600 }, { "epoch": 7.54, "eval_accuracy": 0.816708229426434, "eval_loss": 0.8427916169166565, "eval_runtime": 7.791, "eval_samples_per_second": 128.353, "eval_steps_per_second": 8.086, "step": 116600 }, { "epoch": 7.55, "grad_norm": 7.440423965454102, "learning_rate": 1.812882262996942e-05, "loss": 0.7784, "step": 116700 }, { "epoch": 7.55, "grad_norm": 10.298877716064453, "learning_rate": 1.8081039755351685e-05, "loss": 0.8061, "step": 116800 }, { "epoch": 7.55, "eval_accuracy": 0.8123953098827471, "eval_loss": 0.8214563131332397, "eval_runtime": 7.8474, "eval_samples_per_second": 127.43, "eval_steps_per_second": 8.028, "step": 116800 }, { "epoch": 7.56, "grad_norm": 7.216022491455078, "learning_rate": 1.8033256880733944e-05, "loss": 0.7611, "step": 116900 }, { "epoch": 7.57, "grad_norm": 7.257930755615234, "learning_rate": 1.798547400611621e-05, "loss": 0.7687, "step": 117000 }, { "epoch": 7.57, "eval_accuracy": 0.8050341296928327, "eval_loss": 0.8460366725921631, "eval_runtime": 7.8079, "eval_samples_per_second": 128.075, "eval_steps_per_second": 8.069, "step": 117000 }, { "epoch": 7.57, "grad_norm": 9.324633598327637, "learning_rate": 1.793769113149847e-05, "loss": 0.7517, "step": 117100 }, { "epoch": 7.58, "grad_norm": 12.260783195495605, "learning_rate": 1.7889908256880737e-05, "loss": 0.7742, "step": 117200 }, { "epoch": 7.58, "eval_accuracy": 0.8030936454849499, "eval_loss": 0.8482139706611633, "eval_runtime": 8.3259, "eval_samples_per_second": 120.107, "eval_steps_per_second": 7.567, "step": 117200 }, { "epoch": 7.59, "grad_norm": 5.510626316070557, "learning_rate": 1.7842125382262996e-05, "loss": 0.7546, "step": 117300 }, { "epoch": 7.59, "grad_norm": 6.497379302978516, "learning_rate": 1.779434250764526e-05, "loss": 0.7906, "step": 117400 }, { "epoch": 7.59, "eval_accuracy": 0.808955223880597, "eval_loss": 0.847764253616333, "eval_runtime": 7.5252, "eval_samples_per_second": 132.887, "eval_steps_per_second": 8.372, "step": 117400 }, { "epoch": 7.6, "grad_norm": 8.562554359436035, "learning_rate": 1.7746559633027524e-05, "loss": 0.755, "step": 117500 }, { "epoch": 7.6, "grad_norm": 7.017602920532227, "learning_rate": 1.769877675840979e-05, "loss": 0.7792, "step": 117600 }, { "epoch": 7.6, "eval_accuracy": 0.8005061155630535, "eval_loss": 0.878292977809906, "eval_runtime": 7.9844, "eval_samples_per_second": 125.245, "eval_steps_per_second": 7.89, "step": 117600 }, { "epoch": 7.61, "grad_norm": 8.146720886230469, "learning_rate": 1.765099388379205e-05, "loss": 0.7947, "step": 117700 }, { "epoch": 7.62, "grad_norm": 10.628425598144531, "learning_rate": 1.7603211009174313e-05, "loss": 0.7615, "step": 117800 }, { "epoch": 7.62, "eval_accuracy": 0.80668358714044, "eval_loss": 0.8206958174705505, "eval_runtime": 7.9553, "eval_samples_per_second": 125.702, "eval_steps_per_second": 7.919, "step": 117800 }, { "epoch": 7.62, "grad_norm": 10.741018295288086, "learning_rate": 1.7555428134556575e-05, "loss": 0.7761, "step": 117900 }, { "epoch": 7.63, "grad_norm": 10.584565162658691, "learning_rate": 1.7507645259938838e-05, "loss": 0.7419, "step": 118000 }, { "epoch": 7.63, "eval_accuracy": 0.8019023986765922, "eval_loss": 0.8421455025672913, "eval_runtime": 8.1273, "eval_samples_per_second": 123.043, "eval_steps_per_second": 7.752, "step": 118000 }, { "epoch": 7.64, "grad_norm": 4.195310592651367, "learning_rate": 1.7459862385321103e-05, "loss": 0.7665, "step": 118100 }, { "epoch": 7.64, "grad_norm": 8.840511322021484, "learning_rate": 1.7412079510703362e-05, "loss": 0.7644, "step": 118200 }, { "epoch": 7.64, "eval_accuracy": 0.8088983050847458, "eval_loss": 0.821992039680481, "eval_runtime": 7.8499, "eval_samples_per_second": 127.391, "eval_steps_per_second": 8.026, "step": 118200 }, { "epoch": 7.65, "grad_norm": 7.339004039764404, "learning_rate": 1.7364296636085627e-05, "loss": 0.7917, "step": 118300 }, { "epoch": 7.66, "grad_norm": 6.73811149597168, "learning_rate": 1.731651376146789e-05, "loss": 0.7909, "step": 118400 }, { "epoch": 7.66, "eval_accuracy": 0.8049496644295302, "eval_loss": 0.8142576813697815, "eval_runtime": 8.0133, "eval_samples_per_second": 124.793, "eval_steps_per_second": 7.862, "step": 118400 }, { "epoch": 7.66, "grad_norm": 6.679124355316162, "learning_rate": 1.7268730886850155e-05, "loss": 0.7321, "step": 118500 }, { "epoch": 7.67, "grad_norm": 8.392818450927734, "learning_rate": 1.7220948012232417e-05, "loss": 0.7521, "step": 118600 }, { "epoch": 7.67, "eval_accuracy": 0.8185654008438819, "eval_loss": 0.7626935243606567, "eval_runtime": 8.2683, "eval_samples_per_second": 120.944, "eval_steps_per_second": 7.619, "step": 118600 }, { "epoch": 7.68, "grad_norm": 8.990431785583496, "learning_rate": 1.717316513761468e-05, "loss": 0.8175, "step": 118700 }, { "epoch": 7.68, "grad_norm": 7.373343467712402, "learning_rate": 1.712538226299694e-05, "loss": 0.754, "step": 118800 }, { "epoch": 7.68, "eval_accuracy": 0.8031001256807708, "eval_loss": 0.898451566696167, "eval_runtime": 8.1522, "eval_samples_per_second": 122.667, "eval_steps_per_second": 7.728, "step": 118800 }, { "epoch": 7.69, "grad_norm": 9.680159568786621, "learning_rate": 1.7077599388379207e-05, "loss": 0.7873, "step": 118900 }, { "epoch": 7.7, "grad_norm": 6.915431022644043, "learning_rate": 1.702981651376147e-05, "loss": 0.7452, "step": 119000 }, { "epoch": 7.7, "eval_accuracy": 0.8051673019906819, "eval_loss": 0.840599000453949, "eval_runtime": 7.5131, "eval_samples_per_second": 133.1, "eval_steps_per_second": 8.385, "step": 119000 }, { "epoch": 7.7, "grad_norm": 7.43475866317749, "learning_rate": 1.698203363914373e-05, "loss": 0.7886, "step": 119100 }, { "epoch": 7.71, "grad_norm": 8.908156394958496, "learning_rate": 1.6934250764525993e-05, "loss": 0.7875, "step": 119200 }, { "epoch": 7.71, "eval_accuracy": 0.8130957314546208, "eval_loss": 0.8372833728790283, "eval_runtime": 7.6235, "eval_samples_per_second": 131.174, "eval_steps_per_second": 8.264, "step": 119200 }, { "epoch": 7.71, "grad_norm": 7.716223239898682, "learning_rate": 1.688646788990826e-05, "loss": 0.7603, "step": 119300 }, { "epoch": 7.72, "grad_norm": 9.764396667480469, "learning_rate": 1.683868501529052e-05, "loss": 0.7672, "step": 119400 }, { "epoch": 7.72, "eval_accuracy": 0.8075959933222037, "eval_loss": 0.8411709666252136, "eval_runtime": 7.9292, "eval_samples_per_second": 126.116, "eval_steps_per_second": 7.945, "step": 119400 }, { "epoch": 7.73, "grad_norm": 9.699015617370605, "learning_rate": 1.6790902140672783e-05, "loss": 0.7757, "step": 119500 }, { "epoch": 7.73, "grad_norm": 5.045684814453125, "learning_rate": 1.6743119266055045e-05, "loss": 0.7912, "step": 119600 }, { "epoch": 7.73, "eval_accuracy": 0.8068038639227215, "eval_loss": 0.8446268439292908, "eval_runtime": 7.5089, "eval_samples_per_second": 133.175, "eval_steps_per_second": 8.39, "step": 119600 }, { "epoch": 7.74, "grad_norm": 10.084988594055176, "learning_rate": 1.669533639143731e-05, "loss": 0.7613, "step": 119700 }, { "epoch": 7.75, "grad_norm": 11.244512557983398, "learning_rate": 1.6647553516819573e-05, "loss": 0.7868, "step": 119800 }, { "epoch": 7.75, "eval_accuracy": 0.8263246425567704, "eval_loss": 0.7883582711219788, "eval_runtime": 7.9431, "eval_samples_per_second": 125.896, "eval_steps_per_second": 7.931, "step": 119800 }, { "epoch": 7.75, "grad_norm": 12.212841987609863, "learning_rate": 1.6599770642201835e-05, "loss": 0.7545, "step": 119900 }, { "epoch": 7.76, "grad_norm": 8.695969581604004, "learning_rate": 1.6551987767584097e-05, "loss": 0.7598, "step": 120000 }, { "epoch": 7.76, "eval_accuracy": 0.8086627417998318, "eval_loss": 0.8349902629852295, "eval_runtime": 7.3877, "eval_samples_per_second": 135.361, "eval_steps_per_second": 8.528, "step": 120000 }, { "epoch": 7.77, "grad_norm": 13.7140474319458, "learning_rate": 1.6504204892966363e-05, "loss": 0.7603, "step": 120100 }, { "epoch": 7.77, "grad_norm": 6.1484880447387695, "learning_rate": 1.6456422018348625e-05, "loss": 0.7812, "step": 120200 }, { "epoch": 7.77, "eval_accuracy": 0.825885328836425, "eval_loss": 0.7970248460769653, "eval_runtime": 7.6762, "eval_samples_per_second": 130.273, "eval_steps_per_second": 8.207, "step": 120200 }, { "epoch": 7.78, "grad_norm": 7.882537364959717, "learning_rate": 1.640863914373089e-05, "loss": 0.7312, "step": 120300 }, { "epoch": 7.79, "grad_norm": 7.9782915115356445, "learning_rate": 1.636085626911315e-05, "loss": 0.7864, "step": 120400 }, { "epoch": 7.79, "eval_accuracy": 0.8200677392040644, "eval_loss": 0.7592423558235168, "eval_runtime": 8.2525, "eval_samples_per_second": 121.175, "eval_steps_per_second": 7.634, "step": 120400 }, { "epoch": 7.79, "grad_norm": 6.920046329498291, "learning_rate": 1.631307339449541e-05, "loss": 0.7674, "step": 120500 }, { "epoch": 7.8, "grad_norm": 9.237918853759766, "learning_rate": 1.6265290519877677e-05, "loss": 0.763, "step": 120600 }, { "epoch": 7.8, "eval_accuracy": 0.8154433602036487, "eval_loss": 0.8269646167755127, "eval_runtime": 8.3278, "eval_samples_per_second": 120.08, "eval_steps_per_second": 7.565, "step": 120600 }, { "epoch": 7.81, "grad_norm": 7.292183876037598, "learning_rate": 1.621750764525994e-05, "loss": 0.7959, "step": 120700 }, { "epoch": 7.81, "grad_norm": 9.117610931396484, "learning_rate": 1.61697247706422e-05, "loss": 0.7576, "step": 120800 }, { "epoch": 7.81, "eval_accuracy": 0.8012738853503185, "eval_loss": 0.8719707727432251, "eval_runtime": 7.9233, "eval_samples_per_second": 126.21, "eval_steps_per_second": 7.951, "step": 120800 }, { "epoch": 7.82, "grad_norm": 8.232123374938965, "learning_rate": 1.6121941896024463e-05, "loss": 0.7718, "step": 120900 }, { "epoch": 7.82, "grad_norm": 7.845206260681152, "learning_rate": 1.607415902140673e-05, "loss": 0.7831, "step": 121000 }, { "epoch": 7.82, "eval_accuracy": 0.8167583580194668, "eval_loss": 0.7924259305000305, "eval_runtime": 7.798, "eval_samples_per_second": 128.237, "eval_steps_per_second": 8.079, "step": 121000 }, { "epoch": 7.83, "grad_norm": 6.8974080085754395, "learning_rate": 1.602637614678899e-05, "loss": 0.7441, "step": 121100 }, { "epoch": 7.84, "grad_norm": 7.729079246520996, "learning_rate": 1.5978593272171256e-05, "loss": 0.7662, "step": 121200 }, { "epoch": 7.84, "eval_accuracy": 0.7989886219974716, "eval_loss": 0.8805814385414124, "eval_runtime": 7.7493, "eval_samples_per_second": 129.044, "eval_steps_per_second": 8.13, "step": 121200 }, { "epoch": 7.84, "grad_norm": 13.465519905090332, "learning_rate": 1.5930810397553515e-05, "loss": 0.7287, "step": 121300 }, { "epoch": 7.85, "grad_norm": 10.649438858032227, "learning_rate": 1.588302752293578e-05, "loss": 0.7513, "step": 121400 }, { "epoch": 7.85, "eval_accuracy": 0.8106609808102345, "eval_loss": 0.8287137746810913, "eval_runtime": 7.9332, "eval_samples_per_second": 126.053, "eval_steps_per_second": 7.941, "step": 121400 }, { "epoch": 7.86, "grad_norm": 6.002485275268555, "learning_rate": 1.5835244648318043e-05, "loss": 0.7299, "step": 121500 }, { "epoch": 7.86, "grad_norm": 7.347038269042969, "learning_rate": 1.578746177370031e-05, "loss": 0.774, "step": 121600 }, { "epoch": 7.86, "eval_accuracy": 0.7955778055903212, "eval_loss": 0.8997684121131897, "eval_runtime": 8.3408, "eval_samples_per_second": 119.893, "eval_steps_per_second": 7.553, "step": 121600 }, { "epoch": 7.87, "grad_norm": 10.17648983001709, "learning_rate": 1.573967889908257e-05, "loss": 0.7997, "step": 121700 }, { "epoch": 7.88, "grad_norm": 11.544589042663574, "learning_rate": 1.5691896024464833e-05, "loss": 0.7633, "step": 121800 }, { "epoch": 7.88, "eval_accuracy": 0.7956265769554247, "eval_loss": 0.8756794333457947, "eval_runtime": 8.1796, "eval_samples_per_second": 122.255, "eval_steps_per_second": 7.702, "step": 121800 }, { "epoch": 7.88, "grad_norm": 5.650937080383301, "learning_rate": 1.5644113149847095e-05, "loss": 0.742, "step": 121900 }, { "epoch": 7.89, "grad_norm": 12.037466049194336, "learning_rate": 1.559633027522936e-05, "loss": 0.745, "step": 122000 }, { "epoch": 7.89, "eval_accuracy": 0.819023569023569, "eval_loss": 0.7672392129898071, "eval_runtime": 7.5345, "eval_samples_per_second": 132.724, "eval_steps_per_second": 8.362, "step": 122000 }, { "epoch": 7.9, "grad_norm": 6.637892723083496, "learning_rate": 1.5548547400611622e-05, "loss": 0.7837, "step": 122100 }, { "epoch": 7.9, "grad_norm": 8.093043327331543, "learning_rate": 1.5500764525993885e-05, "loss": 0.7498, "step": 122200 }, { "epoch": 7.9, "eval_accuracy": 0.8123938879456706, "eval_loss": 0.7847465872764587, "eval_runtime": 7.7246, "eval_samples_per_second": 129.457, "eval_steps_per_second": 8.156, "step": 122200 }, { "epoch": 7.91, "grad_norm": 10.415103912353516, "learning_rate": 1.5452981651376147e-05, "loss": 0.7228, "step": 122300 }, { "epoch": 7.92, "grad_norm": 7.7208333015441895, "learning_rate": 1.5405198776758412e-05, "loss": 0.7536, "step": 122400 }, { "epoch": 7.92, "eval_accuracy": 0.809282700421941, "eval_loss": 0.8364168405532837, "eval_runtime": 7.8406, "eval_samples_per_second": 127.542, "eval_steps_per_second": 8.035, "step": 122400 }, { "epoch": 7.92, "grad_norm": 10.398211479187012, "learning_rate": 1.5357415902140674e-05, "loss": 0.7404, "step": 122500 }, { "epoch": 7.93, "grad_norm": 8.01150894165039, "learning_rate": 1.5309633027522937e-05, "loss": 0.7768, "step": 122600 }, { "epoch": 7.93, "eval_accuracy": 0.820730671197961, "eval_loss": 0.7737154960632324, "eval_runtime": 8.218, "eval_samples_per_second": 121.685, "eval_steps_per_second": 7.666, "step": 122600 }, { "epoch": 7.93, "grad_norm": 10.308253288269043, "learning_rate": 1.52618501529052e-05, "loss": 0.7817, "step": 122700 }, { "epoch": 7.94, "grad_norm": 9.390105247497559, "learning_rate": 1.5214067278287464e-05, "loss": 0.7571, "step": 122800 }, { "epoch": 7.94, "eval_accuracy": 0.8082706766917294, "eval_loss": 0.8100508451461792, "eval_runtime": 7.9936, "eval_samples_per_second": 125.099, "eval_steps_per_second": 7.881, "step": 122800 }, { "epoch": 7.95, "grad_norm": 9.319555282592773, "learning_rate": 1.5166284403669726e-05, "loss": 0.7654, "step": 122900 }, { "epoch": 7.95, "grad_norm": 8.8595609664917, "learning_rate": 1.5118501529051987e-05, "loss": 0.7924, "step": 123000 }, { "epoch": 7.95, "eval_accuracy": 0.8110367892976589, "eval_loss": 0.7933071851730347, "eval_runtime": 7.7356, "eval_samples_per_second": 129.272, "eval_steps_per_second": 8.144, "step": 123000 }, { "epoch": 7.96, "grad_norm": 9.398641586303711, "learning_rate": 1.5070718654434252e-05, "loss": 0.7763, "step": 123100 }, { "epoch": 7.97, "grad_norm": 6.877560615539551, "learning_rate": 1.5022935779816513e-05, "loss": 0.7247, "step": 123200 }, { "epoch": 7.97, "eval_accuracy": 0.8076278290025146, "eval_loss": 0.8338378667831421, "eval_runtime": 8.402, "eval_samples_per_second": 119.02, "eval_steps_per_second": 7.498, "step": 123200 }, { "epoch": 7.97, "grad_norm": 9.126778602600098, "learning_rate": 1.4975152905198778e-05, "loss": 0.7573, "step": 123300 }, { "epoch": 7.98, "grad_norm": 30.833770751953125, "learning_rate": 1.4927370030581039e-05, "loss": 0.7527, "step": 123400 }, { "epoch": 7.98, "eval_accuracy": 0.8233799237611181, "eval_loss": 0.8319187164306641, "eval_runtime": 7.775, "eval_samples_per_second": 128.618, "eval_steps_per_second": 8.103, "step": 123400 }, { "epoch": 7.99, "grad_norm": 9.372103691101074, "learning_rate": 1.4879587155963304e-05, "loss": 0.7547, "step": 123500 }, { "epoch": 7.99, "grad_norm": 9.207969665527344, "learning_rate": 1.4831804281345565e-05, "loss": 0.7665, "step": 123600 }, { "epoch": 7.99, "eval_accuracy": 0.8137378845343447, "eval_loss": 0.7576901316642761, "eval_runtime": 7.6599, "eval_samples_per_second": 130.549, "eval_steps_per_second": 8.225, "step": 123600 }, { "epoch": 8.0, "grad_norm": 6.7428412437438965, "learning_rate": 1.478402140672783e-05, "loss": 0.761, "step": 123700 }, { "epoch": 8.01, "grad_norm": 5.204671859741211, "learning_rate": 1.4736238532110092e-05, "loss": 0.7274, "step": 123800 }, { "epoch": 8.01, "eval_accuracy": 0.7953781512605042, "eval_loss": 0.872694730758667, "eval_runtime": 7.8609, "eval_samples_per_second": 127.212, "eval_steps_per_second": 8.014, "step": 123800 }, { "epoch": 8.01, "grad_norm": 13.898552894592285, "learning_rate": 1.4688455657492356e-05, "loss": 0.7334, "step": 123900 }, { "epoch": 8.02, "grad_norm": 6.748899459838867, "learning_rate": 1.4640672782874618e-05, "loss": 0.7336, "step": 124000 }, { "epoch": 8.02, "eval_accuracy": 0.8010956595027392, "eval_loss": 0.8294017910957336, "eval_runtime": 7.9804, "eval_samples_per_second": 125.308, "eval_steps_per_second": 7.894, "step": 124000 }, { "epoch": 8.03, "grad_norm": 10.345788955688477, "learning_rate": 1.4592889908256882e-05, "loss": 0.7463, "step": 124100 }, { "epoch": 8.03, "grad_norm": 7.612587928771973, "learning_rate": 1.4545107033639144e-05, "loss": 0.7426, "step": 124200 }, { "epoch": 8.03, "eval_accuracy": 0.8020390824129142, "eval_loss": 0.8507386445999146, "eval_runtime": 8.1341, "eval_samples_per_second": 122.939, "eval_steps_per_second": 7.745, "step": 124200 }, { "epoch": 8.04, "grad_norm": 5.837127208709717, "learning_rate": 1.4497324159021408e-05, "loss": 0.6998, "step": 124300 }, { "epoch": 8.04, "grad_norm": 8.116684913635254, "learning_rate": 1.444954128440367e-05, "loss": 0.7503, "step": 124400 }, { "epoch": 8.04, "eval_accuracy": 0.825809199318569, "eval_loss": 0.7829322218894958, "eval_runtime": 8.1792, "eval_samples_per_second": 122.261, "eval_steps_per_second": 7.702, "step": 124400 }, { "epoch": 8.05, "grad_norm": 5.820298194885254, "learning_rate": 1.4401758409785934e-05, "loss": 0.7661, "step": 124500 }, { "epoch": 8.06, "grad_norm": 6.217636585235596, "learning_rate": 1.4353975535168196e-05, "loss": 0.7397, "step": 124600 }, { "epoch": 8.06, "eval_accuracy": 0.7940931780366056, "eval_loss": 0.8937223553657532, "eval_runtime": 8.2234, "eval_samples_per_second": 121.605, "eval_steps_per_second": 7.661, "step": 124600 }, { "epoch": 8.06, "grad_norm": 6.162384986877441, "learning_rate": 1.430619266055046e-05, "loss": 0.7316, "step": 124700 }, { "epoch": 8.07, "grad_norm": 6.513411998748779, "learning_rate": 1.4258409785932722e-05, "loss": 0.7275, "step": 124800 }, { "epoch": 8.07, "eval_accuracy": 0.8116123642439432, "eval_loss": 0.8490036725997925, "eval_runtime": 7.7599, "eval_samples_per_second": 128.868, "eval_steps_per_second": 8.119, "step": 124800 }, { "epoch": 8.08, "grad_norm": 14.129610061645508, "learning_rate": 1.4210626911314986e-05, "loss": 0.734, "step": 124900 }, { "epoch": 8.08, "grad_norm": 9.94810962677002, "learning_rate": 1.4162844036697248e-05, "loss": 0.7389, "step": 125000 }, { "epoch": 8.08, "eval_accuracy": 0.8116496598639455, "eval_loss": 0.8185622692108154, "eval_runtime": 7.7916, "eval_samples_per_second": 128.343, "eval_steps_per_second": 8.086, "step": 125000 }, { "epoch": 8.09, "grad_norm": 8.512467384338379, "learning_rate": 1.4115061162079512e-05, "loss": 0.7332, "step": 125100 }, { "epoch": 8.1, "grad_norm": 8.828963279724121, "learning_rate": 1.4067278287461774e-05, "loss": 0.7309, "step": 125200 }, { "epoch": 8.1, "eval_accuracy": 0.8020304568527918, "eval_loss": 0.8440992832183838, "eval_runtime": 8.0928, "eval_samples_per_second": 123.567, "eval_steps_per_second": 7.785, "step": 125200 }, { "epoch": 8.1, "grad_norm": 11.332906723022461, "learning_rate": 1.4019495412844038e-05, "loss": 0.7391, "step": 125300 }, { "epoch": 8.11, "grad_norm": 18.117919921875, "learning_rate": 1.39717125382263e-05, "loss": 0.7273, "step": 125400 }, { "epoch": 8.11, "eval_accuracy": 0.8245243128964059, "eval_loss": 0.7802507877349854, "eval_runtime": 6.722, "eval_samples_per_second": 148.766, "eval_steps_per_second": 9.372, "step": 125400 }, { "epoch": 8.12, "grad_norm": 7.615926265716553, "learning_rate": 1.3923929663608562e-05, "loss": 0.7442, "step": 125500 }, { "epoch": 8.12, "grad_norm": 7.539056301116943, "learning_rate": 1.3876146788990826e-05, "loss": 0.7252, "step": 125600 }, { "epoch": 8.12, "eval_accuracy": 0.8192114093959731, "eval_loss": 0.7613576650619507, "eval_runtime": 7.6072, "eval_samples_per_second": 131.454, "eval_steps_per_second": 8.282, "step": 125600 }, { "epoch": 8.13, "grad_norm": 13.62399673461914, "learning_rate": 1.3828363914373088e-05, "loss": 0.7683, "step": 125700 }, { "epoch": 8.14, "grad_norm": 6.626446723937988, "learning_rate": 1.3780581039755352e-05, "loss": 0.7373, "step": 125800 }, { "epoch": 8.14, "eval_accuracy": 0.8178752107925801, "eval_loss": 0.7946348786354065, "eval_runtime": 8.3911, "eval_samples_per_second": 119.174, "eval_steps_per_second": 7.508, "step": 125800 }, { "epoch": 8.14, "grad_norm": 6.773547172546387, "learning_rate": 1.3732798165137614e-05, "loss": 0.724, "step": 125900 }, { "epoch": 8.15, "grad_norm": 7.1286492347717285, "learning_rate": 1.3685015290519878e-05, "loss": 0.7291, "step": 126000 }, { "epoch": 8.15, "eval_accuracy": 0.8261234775304493, "eval_loss": 0.7585474848747253, "eval_runtime": 8.0079, "eval_samples_per_second": 124.877, "eval_steps_per_second": 7.867, "step": 126000 }, { "epoch": 8.15, "grad_norm": 6.850154876708984, "learning_rate": 1.363723241590214e-05, "loss": 0.7495, "step": 126100 }, { "epoch": 8.16, "grad_norm": 6.7210307121276855, "learning_rate": 1.3589449541284404e-05, "loss": 0.7056, "step": 126200 }, { "epoch": 8.16, "eval_accuracy": 0.8132377740303541, "eval_loss": 0.8614723086357117, "eval_runtime": 8.5135, "eval_samples_per_second": 117.461, "eval_steps_per_second": 7.4, "step": 126200 }, { "epoch": 8.17, "grad_norm": 9.575725555419922, "learning_rate": 1.3541666666666666e-05, "loss": 0.7068, "step": 126300 }, { "epoch": 8.17, "grad_norm": 6.572535991668701, "learning_rate": 1.3493883792048932e-05, "loss": 0.7362, "step": 126400 }, { "epoch": 8.17, "eval_accuracy": 0.8116063919259883, "eval_loss": 0.7763041257858276, "eval_runtime": 8.3976, "eval_samples_per_second": 119.081, "eval_steps_per_second": 7.502, "step": 126400 }, { "epoch": 8.18, "grad_norm": 7.413606643676758, "learning_rate": 1.3446100917431192e-05, "loss": 0.7494, "step": 126500 }, { "epoch": 8.19, "grad_norm": 13.239013671875, "learning_rate": 1.3398318042813458e-05, "loss": 0.7334, "step": 126600 }, { "epoch": 8.19, "eval_accuracy": 0.8078529657477026, "eval_loss": 0.8084008693695068, "eval_runtime": 7.7985, "eval_samples_per_second": 128.23, "eval_steps_per_second": 8.079, "step": 126600 }, { "epoch": 8.19, "grad_norm": 12.320777893066406, "learning_rate": 1.3350535168195718e-05, "loss": 0.7119, "step": 126700 }, { "epoch": 8.2, "grad_norm": 10.723286628723145, "learning_rate": 1.3302752293577984e-05, "loss": 0.7578, "step": 126800 }, { "epoch": 8.2, "eval_accuracy": 0.7996668054977093, "eval_loss": 0.8521745800971985, "eval_runtime": 7.6354, "eval_samples_per_second": 130.969, "eval_steps_per_second": 8.251, "step": 126800 }, { "epoch": 8.21, "grad_norm": 10.421616554260254, "learning_rate": 1.3254969418960244e-05, "loss": 0.7025, "step": 126900 }, { "epoch": 8.21, "grad_norm": 4.972078800201416, "learning_rate": 1.320718654434251e-05, "loss": 0.7128, "step": 127000 }, { "epoch": 8.21, "eval_accuracy": 0.8273834523309533, "eval_loss": 0.7296645641326904, "eval_runtime": 7.9343, "eval_samples_per_second": 126.035, "eval_steps_per_second": 7.94, "step": 127000 }, { "epoch": 8.22, "grad_norm": 7.216896057128906, "learning_rate": 1.3159403669724772e-05, "loss": 0.684, "step": 127100 }, { "epoch": 8.23, "grad_norm": 8.853894233703613, "learning_rate": 1.3111620795107035e-05, "loss": 0.7079, "step": 127200 }, { "epoch": 8.23, "eval_accuracy": 0.8150190920661858, "eval_loss": 0.8213258981704712, "eval_runtime": 7.4305, "eval_samples_per_second": 134.581, "eval_steps_per_second": 8.479, "step": 127200 }, { "epoch": 8.23, "grad_norm": 6.123105525970459, "learning_rate": 1.3063837920489298e-05, "loss": 0.7689, "step": 127300 }, { "epoch": 8.24, "grad_norm": 8.155814170837402, "learning_rate": 1.3016055045871561e-05, "loss": 0.7403, "step": 127400 }, { "epoch": 8.24, "eval_accuracy": 0.8063439065108514, "eval_loss": 0.8069650530815125, "eval_runtime": 6.6641, "eval_samples_per_second": 150.057, "eval_steps_per_second": 9.454, "step": 127400 }, { "epoch": 8.24, "grad_norm": 10.278502464294434, "learning_rate": 1.2968272171253824e-05, "loss": 0.7371, "step": 127500 }, { "epoch": 8.25, "grad_norm": 9.407280921936035, "learning_rate": 1.2920489296636087e-05, "loss": 0.7488, "step": 127600 }, { "epoch": 8.25, "eval_accuracy": 0.8128183361629882, "eval_loss": 0.803422749042511, "eval_runtime": 8.1111, "eval_samples_per_second": 123.288, "eval_steps_per_second": 7.767, "step": 127600 }, { "epoch": 8.26, "grad_norm": 7.52683687210083, "learning_rate": 1.287270642201835e-05, "loss": 0.7401, "step": 127700 }, { "epoch": 8.26, "grad_norm": 7.437029838562012, "learning_rate": 1.2824923547400613e-05, "loss": 0.741, "step": 127800 }, { "epoch": 8.26, "eval_accuracy": 0.8226415094339623, "eval_loss": 0.7285290360450745, "eval_runtime": 7.8127, "eval_samples_per_second": 127.997, "eval_steps_per_second": 8.064, "step": 127800 }, { "epoch": 8.27, "grad_norm": 12.162575721740723, "learning_rate": 1.2777140672782875e-05, "loss": 0.7017, "step": 127900 }, { "epoch": 8.28, "grad_norm": 11.191537857055664, "learning_rate": 1.2729357798165138e-05, "loss": 0.7392, "step": 128000 }, { "epoch": 8.28, "eval_accuracy": 0.8089792460821685, "eval_loss": 0.8140987753868103, "eval_runtime": 8.377, "eval_samples_per_second": 119.375, "eval_steps_per_second": 7.521, "step": 128000 }, { "epoch": 8.28, "grad_norm": 8.444432258605957, "learning_rate": 1.2681574923547401e-05, "loss": 0.7546, "step": 128100 }, { "epoch": 8.29, "grad_norm": 5.596407413482666, "learning_rate": 1.2633792048929664e-05, "loss": 0.7063, "step": 128200 }, { "epoch": 8.29, "eval_accuracy": 0.8119694397283531, "eval_loss": 0.8159799575805664, "eval_runtime": 7.4063, "eval_samples_per_second": 135.02, "eval_steps_per_second": 8.506, "step": 128200 }, { "epoch": 8.3, "grad_norm": 8.66211223602295, "learning_rate": 1.2586009174311927e-05, "loss": 0.7092, "step": 128300 }, { "epoch": 8.3, "grad_norm": 9.436258316040039, "learning_rate": 1.253822629969419e-05, "loss": 0.6894, "step": 128400 }, { "epoch": 8.3, "eval_accuracy": 0.8111769686706182, "eval_loss": 0.8495166301727295, "eval_runtime": 8.1502, "eval_samples_per_second": 122.696, "eval_steps_per_second": 7.73, "step": 128400 }, { "epoch": 8.31, "grad_norm": 5.215428829193115, "learning_rate": 1.2490443425076453e-05, "loss": 0.7165, "step": 128500 }, { "epoch": 8.32, "grad_norm": 9.976493835449219, "learning_rate": 1.2442660550458717e-05, "loss": 0.7472, "step": 128600 }, { "epoch": 8.32, "eval_accuracy": 0.8076760860396457, "eval_loss": 0.8014414310455322, "eval_runtime": 8.1382, "eval_samples_per_second": 122.878, "eval_steps_per_second": 7.741, "step": 128600 }, { "epoch": 8.32, "grad_norm": 9.394295692443848, "learning_rate": 1.239487767584098e-05, "loss": 0.7419, "step": 128700 }, { "epoch": 8.33, "grad_norm": 7.96061372756958, "learning_rate": 1.2347094801223243e-05, "loss": 0.7139, "step": 128800 }, { "epoch": 8.33, "eval_accuracy": 0.8221574344023324, "eval_loss": 0.705885648727417, "eval_runtime": 7.8444, "eval_samples_per_second": 127.48, "eval_steps_per_second": 8.031, "step": 128800 }, { "epoch": 8.34, "grad_norm": 6.173077583312988, "learning_rate": 1.2299311926605505e-05, "loss": 0.7328, "step": 128900 }, { "epoch": 8.34, "grad_norm": 6.394617080688477, "learning_rate": 1.2251529051987769e-05, "loss": 0.7223, "step": 129000 }, { "epoch": 8.34, "eval_accuracy": 0.8127606338615513, "eval_loss": 0.8478501439094543, "eval_runtime": 7.8905, "eval_samples_per_second": 126.735, "eval_steps_per_second": 7.984, "step": 129000 }, { "epoch": 8.35, "grad_norm": 7.204352378845215, "learning_rate": 1.2203746177370031e-05, "loss": 0.7501, "step": 129100 }, { "epoch": 8.35, "grad_norm": 8.715394020080566, "learning_rate": 1.2155963302752295e-05, "loss": 0.729, "step": 129200 }, { "epoch": 8.35, "eval_accuracy": 0.8288135593220339, "eval_loss": 0.7385056614875793, "eval_runtime": 7.6772, "eval_samples_per_second": 130.255, "eval_steps_per_second": 8.206, "step": 129200 }, { "epoch": 8.36, "grad_norm": 7.9653401374816895, "learning_rate": 1.2108180428134557e-05, "loss": 0.7119, "step": 129300 }, { "epoch": 8.37, "grad_norm": 7.9833550453186035, "learning_rate": 1.206039755351682e-05, "loss": 0.7055, "step": 129400 }, { "epoch": 8.37, "eval_accuracy": 0.8102650399663441, "eval_loss": 0.8119258284568787, "eval_runtime": 7.6876, "eval_samples_per_second": 130.08, "eval_steps_per_second": 8.195, "step": 129400 }, { "epoch": 8.37, "grad_norm": 4.663378715515137, "learning_rate": 1.2012614678899083e-05, "loss": 0.7377, "step": 129500 }, { "epoch": 8.38, "grad_norm": 10.961010932922363, "learning_rate": 1.1964831804281345e-05, "loss": 0.7484, "step": 129600 }, { "epoch": 8.38, "eval_accuracy": 0.8262322472848789, "eval_loss": 0.750911295413971, "eval_runtime": 7.545, "eval_samples_per_second": 132.537, "eval_steps_per_second": 8.35, "step": 129600 }, { "epoch": 8.39, "grad_norm": 9.50354290008545, "learning_rate": 1.191704892966361e-05, "loss": 0.7578, "step": 129700 }, { "epoch": 8.39, "grad_norm": 5.847717761993408, "learning_rate": 1.1869266055045871e-05, "loss": 0.6909, "step": 129800 }, { "epoch": 8.39, "eval_accuracy": 0.8150510204081632, "eval_loss": 0.8111701607704163, "eval_runtime": 7.5311, "eval_samples_per_second": 132.784, "eval_steps_per_second": 8.365, "step": 129800 }, { "epoch": 8.4, "grad_norm": 8.764124870300293, "learning_rate": 1.1821483180428135e-05, "loss": 0.7218, "step": 129900 }, { "epoch": 8.41, "grad_norm": 11.613401412963867, "learning_rate": 1.1773700305810397e-05, "loss": 0.7241, "step": 130000 }, { "epoch": 8.41, "eval_accuracy": 0.8079191238416176, "eval_loss": 0.7943395972251892, "eval_runtime": 7.8928, "eval_samples_per_second": 126.698, "eval_steps_per_second": 7.982, "step": 130000 }, { "epoch": 8.41, "grad_norm": 8.501906394958496, "learning_rate": 1.1725917431192661e-05, "loss": 0.7305, "step": 130100 }, { "epoch": 8.42, "grad_norm": 8.404809951782227, "learning_rate": 1.1678134556574923e-05, "loss": 0.7313, "step": 130200 }, { "epoch": 8.42, "eval_accuracy": 0.8154106689246401, "eval_loss": 0.8001417517662048, "eval_runtime": 8.1915, "eval_samples_per_second": 122.078, "eval_steps_per_second": 7.691, "step": 130200 }, { "epoch": 8.43, "grad_norm": 9.390600204467773, "learning_rate": 1.1630351681957187e-05, "loss": 0.7062, "step": 130300 }, { "epoch": 8.43, "grad_norm": 6.487125873565674, "learning_rate": 1.1582568807339451e-05, "loss": 0.7102, "step": 130400 }, { "epoch": 8.43, "eval_accuracy": 0.8259058725531029, "eval_loss": 0.758506715297699, "eval_runtime": 7.7494, "eval_samples_per_second": 129.042, "eval_steps_per_second": 8.13, "step": 130400 }, { "epoch": 8.44, "grad_norm": 9.233036041259766, "learning_rate": 1.1534785932721713e-05, "loss": 0.7511, "step": 130500 }, { "epoch": 8.45, "grad_norm": 6.114567279815674, "learning_rate": 1.1487003058103977e-05, "loss": 0.7447, "step": 130600 }, { "epoch": 8.45, "eval_accuracy": 0.8232067510548523, "eval_loss": 0.8079655766487122, "eval_runtime": 8.2896, "eval_samples_per_second": 120.634, "eval_steps_per_second": 7.6, "step": 130600 }, { "epoch": 8.45, "grad_norm": 9.064861297607422, "learning_rate": 1.1439220183486239e-05, "loss": 0.7364, "step": 130700 }, { "epoch": 8.46, "grad_norm": 8.170574188232422, "learning_rate": 1.1391437308868503e-05, "loss": 0.7457, "step": 130800 }, { "epoch": 8.46, "eval_accuracy": 0.8133669609079445, "eval_loss": 0.7995606660842896, "eval_runtime": 7.4347, "eval_samples_per_second": 134.504, "eval_steps_per_second": 8.474, "step": 130800 }, { "epoch": 8.46, "grad_norm": 10.995182991027832, "learning_rate": 1.1343654434250765e-05, "loss": 0.6975, "step": 130900 }, { "epoch": 8.47, "grad_norm": 8.729532241821289, "learning_rate": 1.1295871559633029e-05, "loss": 0.7079, "step": 131000 }, { "epoch": 8.47, "eval_accuracy": 0.8155789473684211, "eval_loss": 0.7947694063186646, "eval_runtime": 7.7168, "eval_samples_per_second": 129.588, "eval_steps_per_second": 8.164, "step": 131000 }, { "epoch": 8.48, "grad_norm": 8.210700988769531, "learning_rate": 1.1248088685015291e-05, "loss": 0.7913, "step": 131100 }, { "epoch": 8.48, "grad_norm": 9.624825477600098, "learning_rate": 1.1200305810397555e-05, "loss": 0.7114, "step": 131200 }, { "epoch": 8.48, "eval_accuracy": 0.8049600672551492, "eval_loss": 0.8039013743400574, "eval_runtime": 7.7419, "eval_samples_per_second": 129.167, "eval_steps_per_second": 8.138, "step": 131200 }, { "epoch": 8.49, "grad_norm": 6.144248008728027, "learning_rate": 1.1152522935779817e-05, "loss": 0.6914, "step": 131300 }, { "epoch": 8.5, "grad_norm": 11.953121185302734, "learning_rate": 1.110474006116208e-05, "loss": 0.701, "step": 131400 }, { "epoch": 8.5, "eval_accuracy": 0.8135168961201502, "eval_loss": 0.8107746839523315, "eval_runtime": 8.1017, "eval_samples_per_second": 123.431, "eval_steps_per_second": 7.776, "step": 131400 }, { "epoch": 8.5, "grad_norm": 6.737626552581787, "learning_rate": 1.1056957186544343e-05, "loss": 0.7478, "step": 131500 }, { "epoch": 8.51, "grad_norm": 6.151169776916504, "learning_rate": 1.1009174311926607e-05, "loss": 0.7443, "step": 131600 }, { "epoch": 8.51, "eval_accuracy": 0.8168067226890756, "eval_loss": 0.7872596383094788, "eval_runtime": 7.6028, "eval_samples_per_second": 131.53, "eval_steps_per_second": 8.286, "step": 131600 }, { "epoch": 8.52, "grad_norm": 5.478426456451416, "learning_rate": 1.096139143730887e-05, "loss": 0.7334, "step": 131700 }, { "epoch": 8.52, "grad_norm": 7.354321002960205, "learning_rate": 1.0913608562691131e-05, "loss": 0.7063, "step": 131800 }, { "epoch": 8.52, "eval_accuracy": 0.8219584569732937, "eval_loss": 0.7806402444839478, "eval_runtime": 7.6837, "eval_samples_per_second": 130.146, "eval_steps_per_second": 8.199, "step": 131800 }, { "epoch": 8.53, "grad_norm": 6.47011661529541, "learning_rate": 1.0865825688073395e-05, "loss": 0.6932, "step": 131900 }, { "epoch": 8.54, "grad_norm": 11.405448913574219, "learning_rate": 1.0818042813455657e-05, "loss": 0.7537, "step": 132000 }, { "epoch": 8.54, "eval_accuracy": 0.8033755274261604, "eval_loss": 0.8408818244934082, "eval_runtime": 7.8092, "eval_samples_per_second": 128.055, "eval_steps_per_second": 8.067, "step": 132000 }, { "epoch": 8.54, "grad_norm": 11.40829849243164, "learning_rate": 1.077025993883792e-05, "loss": 0.724, "step": 132100 }, { "epoch": 8.55, "grad_norm": 5.700746059417725, "learning_rate": 1.0722477064220183e-05, "loss": 0.7059, "step": 132200 }, { "epoch": 8.55, "eval_accuracy": 0.8078547297297297, "eval_loss": 0.8162618279457092, "eval_runtime": 8.1253, "eval_samples_per_second": 123.073, "eval_steps_per_second": 7.754, "step": 132200 }, { "epoch": 8.56, "grad_norm": 7.390756607055664, "learning_rate": 1.0674694189602447e-05, "loss": 0.7054, "step": 132300 }, { "epoch": 8.56, "grad_norm": 6.836921215057373, "learning_rate": 1.0626911314984709e-05, "loss": 0.6998, "step": 132400 }, { "epoch": 8.56, "eval_accuracy": 0.8095838587641866, "eval_loss": 0.8128502368927002, "eval_runtime": 8.0834, "eval_samples_per_second": 123.711, "eval_steps_per_second": 7.794, "step": 132400 }, { "epoch": 8.57, "grad_norm": 7.362022876739502, "learning_rate": 1.0579128440366973e-05, "loss": 0.6897, "step": 132500 }, { "epoch": 8.57, "grad_norm": 8.185832023620605, "learning_rate": 1.0531345565749237e-05, "loss": 0.704, "step": 132600 }, { "epoch": 8.57, "eval_accuracy": 0.8274111675126904, "eval_loss": 0.7488585114479065, "eval_runtime": 7.9314, "eval_samples_per_second": 126.082, "eval_steps_per_second": 7.943, "step": 132600 }, { "epoch": 8.58, "grad_norm": 5.221545696258545, "learning_rate": 1.0483562691131499e-05, "loss": 0.7318, "step": 132700 }, { "epoch": 8.59, "grad_norm": 5.5884528160095215, "learning_rate": 1.0435779816513762e-05, "loss": 0.6971, "step": 132800 }, { "epoch": 8.59, "eval_accuracy": 0.828171404327535, "eval_loss": 0.746841311454773, "eval_runtime": 8.0578, "eval_samples_per_second": 124.103, "eval_steps_per_second": 7.818, "step": 132800 }, { "epoch": 8.59, "grad_norm": 6.386440277099609, "learning_rate": 1.0387996941896025e-05, "loss": 0.6954, "step": 132900 }, { "epoch": 8.6, "grad_norm": 10.249175071716309, "learning_rate": 1.0340214067278288e-05, "loss": 0.7938, "step": 133000 }, { "epoch": 8.6, "eval_accuracy": 0.8095637583892618, "eval_loss": 0.8094672560691833, "eval_runtime": 8.1066, "eval_samples_per_second": 123.357, "eval_steps_per_second": 7.771, "step": 133000 }, { "epoch": 8.61, "grad_norm": 8.478610038757324, "learning_rate": 1.029243119266055e-05, "loss": 0.7057, "step": 133100 }, { "epoch": 8.61, "grad_norm": 9.11021614074707, "learning_rate": 1.0244648318042814e-05, "loss": 0.7348, "step": 133200 }, { "epoch": 8.61, "eval_accuracy": 0.8168776371308016, "eval_loss": 0.7772917747497559, "eval_runtime": 7.9661, "eval_samples_per_second": 125.532, "eval_steps_per_second": 7.909, "step": 133200 }, { "epoch": 8.62, "grad_norm": 9.279688835144043, "learning_rate": 1.0196865443425077e-05, "loss": 0.7131, "step": 133300 }, { "epoch": 8.63, "grad_norm": 8.099907875061035, "learning_rate": 1.014908256880734e-05, "loss": 0.7237, "step": 133400 }, { "epoch": 8.63, "eval_accuracy": 0.8158225198827962, "eval_loss": 0.8500449657440186, "eval_runtime": 7.6565, "eval_samples_per_second": 130.607, "eval_steps_per_second": 8.228, "step": 133400 }, { "epoch": 8.63, "grad_norm": 9.314995765686035, "learning_rate": 1.0101299694189603e-05, "loss": 0.7375, "step": 133500 }, { "epoch": 8.64, "grad_norm": 8.181074142456055, "learning_rate": 1.0053516819571866e-05, "loss": 0.7168, "step": 133600 }, { "epoch": 8.64, "eval_accuracy": 0.8197478991596638, "eval_loss": 0.757551372051239, "eval_runtime": 7.5559, "eval_samples_per_second": 132.347, "eval_steps_per_second": 8.338, "step": 133600 }, { "epoch": 8.65, "grad_norm": 6.913029670715332, "learning_rate": 1.0005733944954128e-05, "loss": 0.6955, "step": 133700 }, { "epoch": 8.65, "grad_norm": 8.009628295898438, "learning_rate": 9.957951070336392e-06, "loss": 0.6974, "step": 133800 }, { "epoch": 8.65, "eval_accuracy": 0.8238255033557047, "eval_loss": 0.7969841957092285, "eval_runtime": 7.8247, "eval_samples_per_second": 127.801, "eval_steps_per_second": 8.051, "step": 133800 }, { "epoch": 8.66, "grad_norm": 7.864684581756592, "learning_rate": 9.910168195718656e-06, "loss": 0.6953, "step": 133900 }, { "epoch": 8.67, "grad_norm": 5.862139701843262, "learning_rate": 9.862385321100918e-06, "loss": 0.7185, "step": 134000 }, { "epoch": 8.67, "eval_accuracy": 0.8188344594594594, "eval_loss": 0.8055108785629272, "eval_runtime": 8.4864, "eval_samples_per_second": 117.835, "eval_steps_per_second": 7.424, "step": 134000 }, { "epoch": 8.67, "grad_norm": 6.3240461349487305, "learning_rate": 9.814602446483182e-06, "loss": 0.7214, "step": 134100 }, { "epoch": 8.68, "grad_norm": 7.2695393562316895, "learning_rate": 9.766819571865444e-06, "loss": 0.7219, "step": 134200 }, { "epoch": 8.68, "eval_accuracy": 0.8270929743374, "eval_loss": 0.7788463234901428, "eval_runtime": 8.2651, "eval_samples_per_second": 120.99, "eval_steps_per_second": 7.622, "step": 134200 }, { "epoch": 8.68, "grad_norm": 6.897620677947998, "learning_rate": 9.719036697247706e-06, "loss": 0.7325, "step": 134300 }, { "epoch": 8.69, "grad_norm": 7.224691867828369, "learning_rate": 9.671253822629969e-06, "loss": 0.7121, "step": 134400 }, { "epoch": 8.69, "eval_accuracy": 0.8139337298215803, "eval_loss": 0.7928226590156555, "eval_runtime": 7.8365, "eval_samples_per_second": 127.609, "eval_steps_per_second": 8.039, "step": 134400 }, { "epoch": 8.7, "grad_norm": 9.247693061828613, "learning_rate": 9.623470948012232e-06, "loss": 0.7159, "step": 134500 }, { "epoch": 8.7, "grad_norm": 7.786310195922852, "learning_rate": 9.575688073394496e-06, "loss": 0.723, "step": 134600 }, { "epoch": 8.7, "eval_accuracy": 0.8256688963210702, "eval_loss": 0.7563433647155762, "eval_runtime": 7.639, "eval_samples_per_second": 130.908, "eval_steps_per_second": 8.247, "step": 134600 }, { "epoch": 8.71, "grad_norm": 6.415633678436279, "learning_rate": 9.527905198776758e-06, "loss": 0.6756, "step": 134700 }, { "epoch": 8.72, "grad_norm": 10.057195663452148, "learning_rate": 9.480122324159022e-06, "loss": 0.7048, "step": 134800 }, { "epoch": 8.72, "eval_accuracy": 0.8142259414225942, "eval_loss": 0.7795703411102295, "eval_runtime": 7.8229, "eval_samples_per_second": 127.829, "eval_steps_per_second": 8.053, "step": 134800 }, { "epoch": 8.72, "grad_norm": 11.102579116821289, "learning_rate": 9.432339449541284e-06, "loss": 0.6993, "step": 134900 }, { "epoch": 8.73, "grad_norm": 6.531528949737549, "learning_rate": 9.384556574923548e-06, "loss": 0.6966, "step": 135000 }, { "epoch": 8.73, "eval_accuracy": 0.8357324301439458, "eval_loss": 0.7363231182098389, "eval_runtime": 7.7531, "eval_samples_per_second": 128.981, "eval_steps_per_second": 8.126, "step": 135000 }, { "epoch": 8.74, "grad_norm": 5.510344982147217, "learning_rate": 9.33677370030581e-06, "loss": 0.7007, "step": 135100 }, { "epoch": 8.74, "grad_norm": 6.190596103668213, "learning_rate": 9.288990825688074e-06, "loss": 0.6814, "step": 135200 }, { "epoch": 8.74, "eval_accuracy": 0.8161209068010076, "eval_loss": 0.7883857488632202, "eval_runtime": 7.969, "eval_samples_per_second": 125.486, "eval_steps_per_second": 7.906, "step": 135200 }, { "epoch": 8.75, "grad_norm": 8.472126960754395, "learning_rate": 9.241207951070336e-06, "loss": 0.747, "step": 135300 }, { "epoch": 8.76, "grad_norm": 7.907455921173096, "learning_rate": 9.1934250764526e-06, "loss": 0.6793, "step": 135400 }, { "epoch": 8.76, "eval_accuracy": 0.8190196899874319, "eval_loss": 0.7533921599388123, "eval_runtime": 7.6081, "eval_samples_per_second": 131.44, "eval_steps_per_second": 8.281, "step": 135400 }, { "epoch": 8.76, "grad_norm": 6.509176731109619, "learning_rate": 9.145642201834862e-06, "loss": 0.7224, "step": 135500 }, { "epoch": 8.77, "grad_norm": 8.534215927124023, "learning_rate": 9.097859327217126e-06, "loss": 0.6972, "step": 135600 }, { "epoch": 8.77, "eval_accuracy": 0.8164983164983165, "eval_loss": 0.7757536768913269, "eval_runtime": 7.9711, "eval_samples_per_second": 125.454, "eval_steps_per_second": 7.904, "step": 135600 }, { "epoch": 8.78, "grad_norm": 9.41044807434082, "learning_rate": 9.050076452599388e-06, "loss": 0.753, "step": 135700 }, { "epoch": 8.78, "grad_norm": 9.346735954284668, "learning_rate": 9.002293577981652e-06, "loss": 0.7071, "step": 135800 }, { "epoch": 8.78, "eval_accuracy": 0.8241525423728814, "eval_loss": 0.7600921392440796, "eval_runtime": 7.7427, "eval_samples_per_second": 129.154, "eval_steps_per_second": 8.137, "step": 135800 }, { "epoch": 8.79, "grad_norm": 8.269803047180176, "learning_rate": 8.954510703363916e-06, "loss": 0.7219, "step": 135900 }, { "epoch": 8.79, "grad_norm": 7.849192142486572, "learning_rate": 8.906727828746178e-06, "loss": 0.7235, "step": 136000 }, { "epoch": 8.79, "eval_accuracy": 0.8091993185689949, "eval_loss": 0.8072012066841125, "eval_runtime": 7.8275, "eval_samples_per_second": 127.754, "eval_steps_per_second": 8.049, "step": 136000 }, { "epoch": 8.8, "grad_norm": 8.0281343460083, "learning_rate": 8.858944954128442e-06, "loss": 0.7081, "step": 136100 }, { "epoch": 8.81, "grad_norm": 12.589245796203613, "learning_rate": 8.811162079510704e-06, "loss": 0.7093, "step": 136200 }, { "epoch": 8.81, "eval_accuracy": 0.8242220353238016, "eval_loss": 0.7667783498764038, "eval_runtime": 7.4479, "eval_samples_per_second": 134.267, "eval_steps_per_second": 8.459, "step": 136200 }, { "epoch": 8.81, "grad_norm": 11.658758163452148, "learning_rate": 8.763379204892968e-06, "loss": 0.7089, "step": 136300 }, { "epoch": 8.82, "grad_norm": 8.338024139404297, "learning_rate": 8.71559633027523e-06, "loss": 0.6792, "step": 136400 }, { "epoch": 8.82, "eval_accuracy": 0.8229299363057325, "eval_loss": 0.7329905033111572, "eval_runtime": 8.4133, "eval_samples_per_second": 118.86, "eval_steps_per_second": 7.488, "step": 136400 }, { "epoch": 8.83, "grad_norm": 8.125370979309082, "learning_rate": 8.667813455657494e-06, "loss": 0.6977, "step": 136500 }, { "epoch": 8.83, "grad_norm": 10.400834083557129, "learning_rate": 8.620030581039756e-06, "loss": 0.6629, "step": 136600 }, { "epoch": 8.83, "eval_accuracy": 0.8227524499360886, "eval_loss": 0.7728596329689026, "eval_runtime": 7.7156, "eval_samples_per_second": 129.608, "eval_steps_per_second": 8.165, "step": 136600 }, { "epoch": 8.84, "grad_norm": 7.687889099121094, "learning_rate": 8.57224770642202e-06, "loss": 0.71, "step": 136700 }, { "epoch": 8.85, "grad_norm": 6.274188041687012, "learning_rate": 8.524464831804282e-06, "loss": 0.7543, "step": 136800 }, { "epoch": 8.85, "eval_accuracy": 0.8182586644125106, "eval_loss": 0.7521857619285583, "eval_runtime": 7.6914, "eval_samples_per_second": 130.015, "eval_steps_per_second": 8.191, "step": 136800 }, { "epoch": 8.85, "grad_norm": 8.565081596374512, "learning_rate": 8.476681957186544e-06, "loss": 0.7227, "step": 136900 }, { "epoch": 8.86, "grad_norm": 5.995691776275635, "learning_rate": 8.428899082568808e-06, "loss": 0.7397, "step": 137000 }, { "epoch": 8.86, "eval_accuracy": 0.806747188671387, "eval_loss": 0.7870930433273315, "eval_runtime": 7.6328, "eval_samples_per_second": 131.014, "eval_steps_per_second": 8.254, "step": 137000 }, { "epoch": 8.87, "grad_norm": 12.075162887573242, "learning_rate": 8.38111620795107e-06, "loss": 0.7026, "step": 137100 }, { "epoch": 8.87, "grad_norm": 10.687235832214355, "learning_rate": 8.333333333333334e-06, "loss": 0.7595, "step": 137200 }, { "epoch": 8.87, "eval_accuracy": 0.8133110087902888, "eval_loss": 0.7542468309402466, "eval_runtime": 8.2217, "eval_samples_per_second": 121.63, "eval_steps_per_second": 7.663, "step": 137200 }, { "epoch": 8.88, "grad_norm": 7.822646617889404, "learning_rate": 8.285550458715596e-06, "loss": 0.6953, "step": 137300 }, { "epoch": 8.89, "grad_norm": 6.248695373535156, "learning_rate": 8.23776758409786e-06, "loss": 0.6782, "step": 137400 }, { "epoch": 8.89, "eval_accuracy": 0.8106857383256205, "eval_loss": 0.800755500793457, "eval_runtime": 7.6048, "eval_samples_per_second": 131.496, "eval_steps_per_second": 8.284, "step": 137400 }, { "epoch": 8.89, "grad_norm": 7.12138032913208, "learning_rate": 8.189984709480122e-06, "loss": 0.7258, "step": 137500 }, { "epoch": 8.9, "grad_norm": 7.537871360778809, "learning_rate": 8.142201834862386e-06, "loss": 0.6862, "step": 137600 }, { "epoch": 8.9, "eval_accuracy": 0.8141481791544579, "eval_loss": 0.7893263101577759, "eval_runtime": 8.1751, "eval_samples_per_second": 122.322, "eval_steps_per_second": 7.706, "step": 137600 }, { "epoch": 8.9, "grad_norm": 6.232292175292969, "learning_rate": 8.094418960244648e-06, "loss": 0.7333, "step": 137700 }, { "epoch": 8.91, "grad_norm": 6.799365520477295, "learning_rate": 8.046636085626912e-06, "loss": 0.7136, "step": 137800 }, { "epoch": 8.91, "eval_accuracy": 0.8158225198827962, "eval_loss": 0.7326998114585876, "eval_runtime": 7.7048, "eval_samples_per_second": 129.789, "eval_steps_per_second": 8.177, "step": 137800 }, { "epoch": 8.92, "grad_norm": 8.412924766540527, "learning_rate": 7.998853211009175e-06, "loss": 0.6914, "step": 137900 }, { "epoch": 8.92, "grad_norm": 5.799706935882568, "learning_rate": 7.951070336391438e-06, "loss": 0.7152, "step": 138000 }, { "epoch": 8.92, "eval_accuracy": 0.8220806794055202, "eval_loss": 0.7795132398605347, "eval_runtime": 7.4004, "eval_samples_per_second": 135.128, "eval_steps_per_second": 8.513, "step": 138000 }, { "epoch": 8.93, "grad_norm": 8.070270538330078, "learning_rate": 7.903287461773701e-06, "loss": 0.6784, "step": 138100 }, { "epoch": 8.94, "grad_norm": 7.864658355712891, "learning_rate": 7.855504587155964e-06, "loss": 0.7297, "step": 138200 }, { "epoch": 8.94, "eval_accuracy": 0.8169962137147665, "eval_loss": 0.7879621386528015, "eval_runtime": 7.8214, "eval_samples_per_second": 127.854, "eval_steps_per_second": 8.055, "step": 138200 }, { "epoch": 8.94, "grad_norm": 7.502408027648926, "learning_rate": 7.807721712538227e-06, "loss": 0.7038, "step": 138300 }, { "epoch": 8.95, "grad_norm": 10.054288864135742, "learning_rate": 7.75993883792049e-06, "loss": 0.7334, "step": 138400 }, { "epoch": 8.95, "eval_accuracy": 0.8270042194092827, "eval_loss": 0.7593047022819519, "eval_runtime": 8.15, "eval_samples_per_second": 122.7, "eval_steps_per_second": 7.73, "step": 138400 }, { "epoch": 8.96, "grad_norm": 8.451773643493652, "learning_rate": 7.712155963302753e-06, "loss": 0.7163, "step": 138500 }, { "epoch": 8.96, "grad_norm": 7.086478233337402, "learning_rate": 7.664373088685015e-06, "loss": 0.6943, "step": 138600 }, { "epoch": 8.96, "eval_accuracy": 0.8157341186369373, "eval_loss": 0.8297237157821655, "eval_runtime": 7.9114, "eval_samples_per_second": 126.4, "eval_steps_per_second": 7.963, "step": 138600 }, { "epoch": 8.97, "grad_norm": 7.14725399017334, "learning_rate": 7.616590214067279e-06, "loss": 0.7064, "step": 138700 }, { "epoch": 8.98, "grad_norm": 8.565879821777344, "learning_rate": 7.568807339449542e-06, "loss": 0.7142, "step": 138800 }, { "epoch": 8.98, "eval_accuracy": 0.8322005097706032, "eval_loss": 0.684068500995636, "eval_runtime": 8.2166, "eval_samples_per_second": 121.705, "eval_steps_per_second": 7.667, "step": 138800 }, { "epoch": 8.98, "grad_norm": 11.216293334960938, "learning_rate": 7.521024464831805e-06, "loss": 0.7618, "step": 138900 }, { "epoch": 8.99, "grad_norm": 8.761273384094238, "learning_rate": 7.473241590214068e-06, "loss": 0.7025, "step": 139000 }, { "epoch": 8.99, "eval_accuracy": 0.8047138047138047, "eval_loss": 0.8144974708557129, "eval_runtime": 7.5949, "eval_samples_per_second": 131.667, "eval_steps_per_second": 8.295, "step": 139000 }, { "epoch": 9.0, "grad_norm": 9.155555725097656, "learning_rate": 7.425458715596331e-06, "loss": 0.7245, "step": 139100 }, { "epoch": 9.0, "grad_norm": 8.836409568786621, "learning_rate": 7.377675840978594e-06, "loss": 0.7225, "step": 139200 }, { "epoch": 9.0, "eval_accuracy": 0.8060836501901141, "eval_loss": 0.7922817468643188, "eval_runtime": 7.7454, "eval_samples_per_second": 129.109, "eval_steps_per_second": 8.134, "step": 139200 }, { "epoch": 9.01, "grad_norm": 9.514657020568848, "learning_rate": 7.3298929663608555e-06, "loss": 0.6822, "step": 139300 }, { "epoch": 9.01, "grad_norm": 7.135375022888184, "learning_rate": 7.282110091743119e-06, "loss": 0.7129, "step": 139400 }, { "epoch": 9.01, "eval_accuracy": 0.8240740740740741, "eval_loss": 0.7534127235412598, "eval_runtime": 7.7522, "eval_samples_per_second": 128.996, "eval_steps_per_second": 8.127, "step": 139400 }, { "epoch": 9.02, "grad_norm": 7.658737659454346, "learning_rate": 7.234327217125382e-06, "loss": 0.6559, "step": 139500 }, { "epoch": 9.03, "grad_norm": 8.988152503967285, "learning_rate": 7.186544342507645e-06, "loss": 0.6436, "step": 139600 }, { "epoch": 9.03, "eval_accuracy": 0.8322175732217573, "eval_loss": 0.7328826189041138, "eval_runtime": 7.7439, "eval_samples_per_second": 129.134, "eval_steps_per_second": 8.135, "step": 139600 }, { "epoch": 9.03, "grad_norm": 7.880275249481201, "learning_rate": 7.138761467889908e-06, "loss": 0.7133, "step": 139700 }, { "epoch": 9.04, "grad_norm": 7.209887981414795, "learning_rate": 7.090978593272171e-06, "loss": 0.6941, "step": 139800 }, { "epoch": 9.04, "eval_accuracy": 0.8182579564489112, "eval_loss": 0.7689694762229919, "eval_runtime": 7.4588, "eval_samples_per_second": 134.071, "eval_steps_per_second": 8.446, "step": 139800 }, { "epoch": 9.05, "grad_norm": 9.182413101196289, "learning_rate": 7.043195718654434e-06, "loss": 0.6704, "step": 139900 }, { "epoch": 9.05, "grad_norm": 9.056734085083008, "learning_rate": 6.995412844036697e-06, "loss": 0.6953, "step": 140000 }, { "epoch": 9.05, "eval_accuracy": 0.8066275167785235, "eval_loss": 0.8176834583282471, "eval_runtime": 7.8028, "eval_samples_per_second": 128.16, "eval_steps_per_second": 8.074, "step": 140000 }, { "epoch": 9.06, "grad_norm": 7.498508930206299, "learning_rate": 6.94762996941896e-06, "loss": 0.672, "step": 140100 }, { "epoch": 9.07, "grad_norm": 6.672116756439209, "learning_rate": 6.899847094801223e-06, "loss": 0.6671, "step": 140200 }, { "epoch": 9.07, "eval_accuracy": 0.8401518346689161, "eval_loss": 0.6529211401939392, "eval_runtime": 7.5823, "eval_samples_per_second": 131.886, "eval_steps_per_second": 8.309, "step": 140200 }, { "epoch": 9.07, "grad_norm": 23.415481567382812, "learning_rate": 6.852064220183486e-06, "loss": 0.6671, "step": 140300 }, { "epoch": 9.08, "grad_norm": 10.630614280700684, "learning_rate": 6.804281345565749e-06, "loss": 0.6834, "step": 140400 }, { "epoch": 9.08, "eval_accuracy": 0.8243243243243243, "eval_loss": 0.756993293762207, "eval_runtime": 7.7678, "eval_samples_per_second": 128.737, "eval_steps_per_second": 8.11, "step": 140400 }, { "epoch": 9.09, "grad_norm": 8.077528953552246, "learning_rate": 6.756498470948012e-06, "loss": 0.6925, "step": 140500 }, { "epoch": 9.09, "grad_norm": 7.447731971740723, "learning_rate": 6.708715596330275e-06, "loss": 0.6712, "step": 140600 }, { "epoch": 9.09, "eval_accuracy": 0.8194854491775622, "eval_loss": 0.7503960132598877, "eval_runtime": 8.0069, "eval_samples_per_second": 124.893, "eval_steps_per_second": 7.868, "step": 140600 }, { "epoch": 9.1, "grad_norm": 4.514816761016846, "learning_rate": 6.660932721712539e-06, "loss": 0.7285, "step": 140700 }, { "epoch": 9.11, "grad_norm": 8.40754222869873, "learning_rate": 6.613149847094802e-06, "loss": 0.6974, "step": 140800 }, { "epoch": 9.11, "eval_accuracy": 0.8351419031719532, "eval_loss": 0.7281838655471802, "eval_runtime": 8.2464, "eval_samples_per_second": 121.265, "eval_steps_per_second": 7.64, "step": 140800 }, { "epoch": 9.11, "grad_norm": 7.763525485992432, "learning_rate": 6.565366972477065e-06, "loss": 0.7261, "step": 140900 }, { "epoch": 9.12, "grad_norm": 7.034289360046387, "learning_rate": 6.517584097859328e-06, "loss": 0.703, "step": 141000 }, { "epoch": 9.12, "eval_accuracy": 0.8195615514333895, "eval_loss": 0.755873441696167, "eval_runtime": 8.1847, "eval_samples_per_second": 122.179, "eval_steps_per_second": 7.697, "step": 141000 }, { "epoch": 9.12, "grad_norm": 5.577447414398193, "learning_rate": 6.469801223241591e-06, "loss": 0.7158, "step": 141100 }, { "epoch": 9.13, "grad_norm": 8.0060453414917, "learning_rate": 6.422018348623854e-06, "loss": 0.6795, "step": 141200 }, { "epoch": 9.13, "eval_accuracy": 0.8253768844221105, "eval_loss": 0.7303311824798584, "eval_runtime": 7.8612, "eval_samples_per_second": 127.207, "eval_steps_per_second": 8.014, "step": 141200 }, { "epoch": 9.14, "grad_norm": 5.7979536056518555, "learning_rate": 6.374235474006117e-06, "loss": 0.6807, "step": 141300 }, { "epoch": 9.14, "grad_norm": 5.291042804718018, "learning_rate": 6.32645259938838e-06, "loss": 0.6547, "step": 141400 }, { "epoch": 9.14, "eval_accuracy": 0.817798397300717, "eval_loss": 0.778563380241394, "eval_runtime": 8.4624, "eval_samples_per_second": 118.17, "eval_steps_per_second": 7.445, "step": 141400 }, { "epoch": 9.15, "grad_norm": 9.27120590209961, "learning_rate": 6.278669724770643e-06, "loss": 0.659, "step": 141500 }, { "epoch": 9.16, "grad_norm": 5.64165735244751, "learning_rate": 6.230886850152905e-06, "loss": 0.6938, "step": 141600 }, { "epoch": 9.16, "eval_accuracy": 0.8175675675675675, "eval_loss": 0.7912856340408325, "eval_runtime": 7.9017, "eval_samples_per_second": 126.556, "eval_steps_per_second": 7.973, "step": 141600 }, { "epoch": 9.16, "grad_norm": 8.031712532043457, "learning_rate": 6.183103975535169e-06, "loss": 0.7261, "step": 141700 }, { "epoch": 9.17, "grad_norm": 10.13143253326416, "learning_rate": 6.135321100917432e-06, "loss": 0.7219, "step": 141800 }, { "epoch": 9.17, "eval_accuracy": 0.8136439267886856, "eval_loss": 0.7786973714828491, "eval_runtime": 8.2307, "eval_samples_per_second": 121.496, "eval_steps_per_second": 7.654, "step": 141800 }, { "epoch": 9.18, "grad_norm": 5.492425918579102, "learning_rate": 6.087538226299695e-06, "loss": 0.6935, "step": 141900 }, { "epoch": 9.18, "grad_norm": 8.030416488647461, "learning_rate": 6.039755351681958e-06, "loss": 0.7066, "step": 142000 }, { "epoch": 9.18, "eval_accuracy": 0.8276586801176965, "eval_loss": 0.7543783187866211, "eval_runtime": 7.5355, "eval_samples_per_second": 132.706, "eval_steps_per_second": 8.36, "step": 142000 }, { "epoch": 9.19, "grad_norm": 7.0246148109436035, "learning_rate": 5.991972477064221e-06, "loss": 0.6695, "step": 142100 }, { "epoch": 9.2, "grad_norm": 7.843145847320557, "learning_rate": 5.944189602446484e-06, "loss": 0.6903, "step": 142200 }, { "epoch": 9.2, "eval_accuracy": 0.819555182543013, "eval_loss": 0.7760767936706543, "eval_runtime": 8.1004, "eval_samples_per_second": 123.451, "eval_steps_per_second": 7.777, "step": 142200 }, { "epoch": 9.2, "grad_norm": 10.429040908813477, "learning_rate": 5.896406727828747e-06, "loss": 0.6623, "step": 142300 }, { "epoch": 9.21, "grad_norm": 7.632176876068115, "learning_rate": 5.84862385321101e-06, "loss": 0.6851, "step": 142400 }, { "epoch": 9.21, "eval_accuracy": 0.8289695945945946, "eval_loss": 0.7314964532852173, "eval_runtime": 7.8161, "eval_samples_per_second": 127.94, "eval_steps_per_second": 8.06, "step": 142400 }, { "epoch": 9.21, "grad_norm": 7.133945941925049, "learning_rate": 5.800840978593272e-06, "loss": 0.687, "step": 142500 }, { "epoch": 9.22, "grad_norm": 6.438443183898926, "learning_rate": 5.753058103975535e-06, "loss": 0.6743, "step": 142600 }, { "epoch": 9.22, "eval_accuracy": 0.832282471626734, "eval_loss": 0.7007901668548584, "eval_runtime": 7.6667, "eval_samples_per_second": 130.435, "eval_steps_per_second": 8.217, "step": 142600 }, { "epoch": 9.23, "grad_norm": 8.936053276062012, "learning_rate": 5.705275229357799e-06, "loss": 0.7084, "step": 142700 }, { "epoch": 9.23, "grad_norm": 10.085330963134766, "learning_rate": 5.657492354740062e-06, "loss": 0.6741, "step": 142800 }, { "epoch": 9.23, "eval_accuracy": 0.8216857263871241, "eval_loss": 0.7984895706176758, "eval_runtime": 8.0665, "eval_samples_per_second": 123.97, "eval_steps_per_second": 7.81, "step": 142800 }, { "epoch": 9.24, "grad_norm": 6.818421363830566, "learning_rate": 5.609709480122325e-06, "loss": 0.6668, "step": 142900 }, { "epoch": 9.25, "grad_norm": 7.357988357543945, "learning_rate": 5.561926605504588e-06, "loss": 0.6732, "step": 143000 }, { "epoch": 9.25, "eval_accuracy": 0.8253497244595167, "eval_loss": 0.7807667255401611, "eval_runtime": 7.9301, "eval_samples_per_second": 126.101, "eval_steps_per_second": 7.944, "step": 143000 }, { "epoch": 9.25, "grad_norm": 9.597208976745605, "learning_rate": 5.5141437308868506e-06, "loss": 0.6738, "step": 143100 }, { "epoch": 9.26, "grad_norm": 9.097168922424316, "learning_rate": 5.4663608562691136e-06, "loss": 0.7116, "step": 143200 }, { "epoch": 9.26, "eval_accuracy": 0.8276586801176965, "eval_loss": 0.713204562664032, "eval_runtime": 7.5521, "eval_samples_per_second": 132.413, "eval_steps_per_second": 8.342, "step": 143200 }, { "epoch": 9.27, "grad_norm": 6.845571517944336, "learning_rate": 5.4185779816513765e-06, "loss": 0.675, "step": 143300 }, { "epoch": 9.27, "grad_norm": 10.701016426086426, "learning_rate": 5.3707951070336395e-06, "loss": 0.7123, "step": 143400 }, { "epoch": 9.27, "eval_accuracy": 0.8127659574468085, "eval_loss": 0.8158736824989319, "eval_runtime": 7.6286, "eval_samples_per_second": 131.086, "eval_steps_per_second": 8.258, "step": 143400 }, { "epoch": 9.28, "grad_norm": 11.90999984741211, "learning_rate": 5.3230122324159025e-06, "loss": 0.6742, "step": 143500 }, { "epoch": 9.29, "grad_norm": 7.140121936798096, "learning_rate": 5.2752293577981655e-06, "loss": 0.6686, "step": 143600 }, { "epoch": 9.29, "eval_accuracy": 0.8193979933110368, "eval_loss": 0.7775002717971802, "eval_runtime": 7.6664, "eval_samples_per_second": 130.439, "eval_steps_per_second": 8.218, "step": 143600 }, { "epoch": 9.29, "grad_norm": 12.276994705200195, "learning_rate": 5.227446483180428e-06, "loss": 0.6842, "step": 143700 }, { "epoch": 9.3, "grad_norm": 7.2004899978637695, "learning_rate": 5.1796636085626915e-06, "loss": 0.6857, "step": 143800 }, { "epoch": 9.3, "eval_accuracy": 0.8280922431865828, "eval_loss": 0.7257367372512817, "eval_runtime": 7.9792, "eval_samples_per_second": 125.325, "eval_steps_per_second": 7.895, "step": 143800 }, { "epoch": 9.31, "grad_norm": 7.0967583656311035, "learning_rate": 5.1318807339449544e-06, "loss": 0.7133, "step": 143900 }, { "epoch": 9.31, "grad_norm": 10.92885971069336, "learning_rate": 5.084097859327217e-06, "loss": 0.6555, "step": 144000 }, { "epoch": 9.31, "eval_accuracy": 0.819327731092437, "eval_loss": 0.7765259742736816, "eval_runtime": 7.719, "eval_samples_per_second": 129.551, "eval_steps_per_second": 8.162, "step": 144000 }, { "epoch": 9.32, "grad_norm": 9.067501068115234, "learning_rate": 5.03631498470948e-06, "loss": 0.6753, "step": 144100 }, { "epoch": 9.32, "grad_norm": 7.1480865478515625, "learning_rate": 4.988532110091743e-06, "loss": 0.6626, "step": 144200 }, { "epoch": 9.32, "eval_accuracy": 0.8178766261015527, "eval_loss": 0.8132330179214478, "eval_runtime": 7.5687, "eval_samples_per_second": 132.123, "eval_steps_per_second": 8.324, "step": 144200 }, { "epoch": 9.33, "grad_norm": 10.091357231140137, "learning_rate": 4.940749235474006e-06, "loss": 0.6734, "step": 144300 }, { "epoch": 9.34, "grad_norm": 8.80782413482666, "learning_rate": 4.892966360856269e-06, "loss": 0.6808, "step": 144400 }, { "epoch": 9.34, "eval_accuracy": 0.819891349770163, "eval_loss": 0.7785417437553406, "eval_runtime": 8.3462, "eval_samples_per_second": 119.815, "eval_steps_per_second": 7.548, "step": 144400 }, { "epoch": 9.34, "grad_norm": 6.368628978729248, "learning_rate": 4.845183486238532e-06, "loss": 0.6997, "step": 144500 }, { "epoch": 9.35, "grad_norm": 12.430789947509766, "learning_rate": 4.797400611620795e-06, "loss": 0.7125, "step": 144600 }, { "epoch": 9.35, "eval_accuracy": 0.8199152542372882, "eval_loss": 0.7778120040893555, "eval_runtime": 8.4752, "eval_samples_per_second": 117.991, "eval_steps_per_second": 7.433, "step": 144600 }, { "epoch": 9.36, "grad_norm": 10.412111282348633, "learning_rate": 4.749617737003058e-06, "loss": 0.68, "step": 144700 }, { "epoch": 9.36, "grad_norm": 8.765597343444824, "learning_rate": 4.701834862385321e-06, "loss": 0.7004, "step": 144800 }, { "epoch": 9.36, "eval_accuracy": 0.8175335570469798, "eval_loss": 0.7783469557762146, "eval_runtime": 8.0701, "eval_samples_per_second": 123.914, "eval_steps_per_second": 7.807, "step": 144800 }, { "epoch": 9.37, "grad_norm": 5.8738508224487305, "learning_rate": 4.654051987767584e-06, "loss": 0.6865, "step": 144900 }, { "epoch": 9.38, "grad_norm": 6.892227649688721, "learning_rate": 4.606269113149847e-06, "loss": 0.6728, "step": 145000 }, { "epoch": 9.38, "eval_accuracy": 0.8298319327731093, "eval_loss": 0.7655338048934937, "eval_runtime": 7.6382, "eval_samples_per_second": 130.921, "eval_steps_per_second": 8.248, "step": 145000 }, { "epoch": 9.38, "grad_norm": 10.24010181427002, "learning_rate": 4.55848623853211e-06, "loss": 0.6831, "step": 145100 }, { "epoch": 9.39, "grad_norm": 7.489282131195068, "learning_rate": 4.510703363914373e-06, "loss": 0.6994, "step": 145200 }, { "epoch": 9.39, "eval_accuracy": 0.8219235615287694, "eval_loss": 0.7583600878715515, "eval_runtime": 8.087, "eval_samples_per_second": 123.655, "eval_steps_per_second": 7.79, "step": 145200 }, { "epoch": 9.4, "grad_norm": 7.743747234344482, "learning_rate": 4.462920489296636e-06, "loss": 0.7037, "step": 145300 }, { "epoch": 9.4, "grad_norm": 9.136314392089844, "learning_rate": 4.415137614678899e-06, "loss": 0.7104, "step": 145400 }, { "epoch": 9.4, "eval_accuracy": 0.8356569497253908, "eval_loss": 0.7070457935333252, "eval_runtime": 7.9266, "eval_samples_per_second": 126.158, "eval_steps_per_second": 7.948, "step": 145400 }, { "epoch": 9.41, "grad_norm": 4.839635848999023, "learning_rate": 4.367354740061162e-06, "loss": 0.6962, "step": 145500 }, { "epoch": 9.42, "grad_norm": 7.43444299697876, "learning_rate": 4.319571865443425e-06, "loss": 0.6602, "step": 145600 }, { "epoch": 9.42, "eval_accuracy": 0.8231578947368421, "eval_loss": 0.7417013049125671, "eval_runtime": 7.775, "eval_samples_per_second": 128.618, "eval_steps_per_second": 8.103, "step": 145600 }, { "epoch": 9.42, "grad_norm": 9.913743019104004, "learning_rate": 4.271788990825688e-06, "loss": 0.6724, "step": 145700 }, { "epoch": 9.43, "grad_norm": 6.908413410186768, "learning_rate": 4.224006116207951e-06, "loss": 0.6773, "step": 145800 }, { "epoch": 9.43, "eval_accuracy": 0.8236030025020851, "eval_loss": 0.785513162612915, "eval_runtime": 7.9593, "eval_samples_per_second": 125.639, "eval_steps_per_second": 7.915, "step": 145800 }, { "epoch": 9.43, "grad_norm": 7.145389080047607, "learning_rate": 4.176223241590214e-06, "loss": 0.7045, "step": 145900 }, { "epoch": 9.44, "grad_norm": 11.873164176940918, "learning_rate": 4.128440366972477e-06, "loss": 0.7141, "step": 146000 }, { "epoch": 9.44, "eval_accuracy": 0.8200339558573854, "eval_loss": 0.7557082772254944, "eval_runtime": 8.3505, "eval_samples_per_second": 119.754, "eval_steps_per_second": 7.544, "step": 146000 }, { "epoch": 9.45, "grad_norm": 6.072300434112549, "learning_rate": 4.080657492354741e-06, "loss": 0.6494, "step": 146100 }, { "epoch": 9.45, "grad_norm": 6.347719669342041, "learning_rate": 4.032874617737004e-06, "loss": 0.6645, "step": 146200 }, { "epoch": 9.45, "eval_accuracy": 0.8272229245680573, "eval_loss": 0.7042951583862305, "eval_runtime": 8.3019, "eval_samples_per_second": 120.454, "eval_steps_per_second": 7.589, "step": 146200 }, { "epoch": 9.46, "grad_norm": 5.351022720336914, "learning_rate": 3.985091743119266e-06, "loss": 0.669, "step": 146300 }, { "epoch": 9.47, "grad_norm": 9.34628963470459, "learning_rate": 3.937308868501529e-06, "loss": 0.6782, "step": 146400 }, { "epoch": 9.47, "eval_accuracy": 0.8155339805825242, "eval_loss": 0.7763045430183411, "eval_runtime": 7.8167, "eval_samples_per_second": 127.932, "eval_steps_per_second": 8.06, "step": 146400 }, { "epoch": 9.47, "grad_norm": 8.982083320617676, "learning_rate": 3.889525993883792e-06, "loss": 0.714, "step": 146500 }, { "epoch": 9.48, "grad_norm": 6.254167079925537, "learning_rate": 3.841743119266055e-06, "loss": 0.7168, "step": 146600 }, { "epoch": 9.48, "eval_accuracy": 0.8272650296359018, "eval_loss": 0.7693815231323242, "eval_runtime": 8.2709, "eval_samples_per_second": 120.906, "eval_steps_per_second": 7.617, "step": 146600 }, { "epoch": 9.49, "grad_norm": 10.633997917175293, "learning_rate": 3.793960244648318e-06, "loss": 0.7175, "step": 146700 }, { "epoch": 9.49, "grad_norm": 8.950175285339355, "learning_rate": 3.7461773700305814e-06, "loss": 0.7046, "step": 146800 }, { "epoch": 9.49, "eval_accuracy": 0.8080766028309742, "eval_loss": 0.820652186870575, "eval_runtime": 8.2817, "eval_samples_per_second": 120.748, "eval_steps_per_second": 7.607, "step": 146800 }, { "epoch": 9.5, "grad_norm": 5.7409749031066895, "learning_rate": 3.6983944954128444e-06, "loss": 0.701, "step": 146900 }, { "epoch": 9.51, "grad_norm": 4.563392639160156, "learning_rate": 3.6506116207951073e-06, "loss": 0.6721, "step": 147000 }, { "epoch": 9.51, "eval_accuracy": 0.8197820620284996, "eval_loss": 0.7432876229286194, "eval_runtime": 8.0881, "eval_samples_per_second": 123.638, "eval_steps_per_second": 7.789, "step": 147000 }, { "epoch": 9.51, "grad_norm": 6.814408779144287, "learning_rate": 3.6028287461773703e-06, "loss": 0.7024, "step": 147100 }, { "epoch": 9.52, "grad_norm": 10.546920776367188, "learning_rate": 3.5550458715596333e-06, "loss": 0.689, "step": 147200 }, { "epoch": 9.52, "eval_accuracy": 0.821294363256785, "eval_loss": 0.782240629196167, "eval_runtime": 7.9223, "eval_samples_per_second": 126.226, "eval_steps_per_second": 7.952, "step": 147200 }, { "epoch": 9.53, "grad_norm": 9.842469215393066, "learning_rate": 3.5072629969418963e-06, "loss": 0.7124, "step": 147300 }, { "epoch": 9.53, "grad_norm": 6.657950401306152, "learning_rate": 3.4594801223241597e-06, "loss": 0.7324, "step": 147400 }, { "epoch": 9.53, "eval_accuracy": 0.8199233716475096, "eval_loss": 0.7816979289054871, "eval_runtime": 8.0291, "eval_samples_per_second": 124.547, "eval_steps_per_second": 7.846, "step": 147400 }, { "epoch": 9.54, "grad_norm": 8.194591522216797, "learning_rate": 3.411697247706422e-06, "loss": 0.728, "step": 147500 }, { "epoch": 9.54, "grad_norm": 9.726399421691895, "learning_rate": 3.363914373088685e-06, "loss": 0.7052, "step": 147600 }, { "epoch": 9.54, "eval_accuracy": 0.8187919463087249, "eval_loss": 0.8199048042297363, "eval_runtime": 8.1714, "eval_samples_per_second": 122.378, "eval_steps_per_second": 7.71, "step": 147600 }, { "epoch": 9.55, "grad_norm": 8.845316886901855, "learning_rate": 3.316131498470948e-06, "loss": 0.732, "step": 147700 }, { "epoch": 9.56, "grad_norm": 7.419039726257324, "learning_rate": 3.268348623853211e-06, "loss": 0.6826, "step": 147800 }, { "epoch": 9.56, "eval_accuracy": 0.8016877637130801, "eval_loss": 0.8470125794410706, "eval_runtime": 8.1729, "eval_samples_per_second": 122.355, "eval_steps_per_second": 7.708, "step": 147800 }, { "epoch": 9.56, "grad_norm": 6.060827732086182, "learning_rate": 3.220565749235474e-06, "loss": 0.6868, "step": 147900 }, { "epoch": 9.57, "grad_norm": 10.684419631958008, "learning_rate": 3.172782874617737e-06, "loss": 0.6891, "step": 148000 }, { "epoch": 9.57, "eval_accuracy": 0.8284518828451883, "eval_loss": 0.7346253395080566, "eval_runtime": 8.145, "eval_samples_per_second": 122.774, "eval_steps_per_second": 7.735, "step": 148000 }, { "epoch": 9.58, "grad_norm": 7.2785491943359375, "learning_rate": 3.125e-06, "loss": 0.694, "step": 148100 }, { "epoch": 9.58, "grad_norm": 10.50632095336914, "learning_rate": 3.077217125382263e-06, "loss": 0.7168, "step": 148200 }, { "epoch": 9.58, "eval_accuracy": 0.8148767237776849, "eval_loss": 0.8013838529586792, "eval_runtime": 7.9337, "eval_samples_per_second": 126.045, "eval_steps_per_second": 7.941, "step": 148200 }, { "epoch": 9.59, "grad_norm": 7.734995365142822, "learning_rate": 3.029434250764526e-06, "loss": 0.6825, "step": 148300 }, { "epoch": 9.6, "grad_norm": 7.202228546142578, "learning_rate": 2.981651376146789e-06, "loss": 0.715, "step": 148400 }, { "epoch": 9.6, "eval_accuracy": 0.8241805023414219, "eval_loss": 0.789307713508606, "eval_runtime": 7.621, "eval_samples_per_second": 131.216, "eval_steps_per_second": 8.267, "step": 148400 }, { "epoch": 9.6, "grad_norm": 5.188666343688965, "learning_rate": 2.933868501529052e-06, "loss": 0.7509, "step": 148500 }, { "epoch": 9.61, "grad_norm": 9.7349214553833, "learning_rate": 2.886085626911315e-06, "loss": 0.698, "step": 148600 }, { "epoch": 9.61, "eval_accuracy": 0.8174936921783011, "eval_loss": 0.8003532886505127, "eval_runtime": 7.4621, "eval_samples_per_second": 134.011, "eval_steps_per_second": 8.443, "step": 148600 }, { "epoch": 9.62, "grad_norm": 8.891592979431152, "learning_rate": 2.838302752293578e-06, "loss": 0.6802, "step": 148700 }, { "epoch": 9.62, "grad_norm": 7.455615520477295, "learning_rate": 2.790519877675841e-06, "loss": 0.747, "step": 148800 }, { "epoch": 9.62, "eval_accuracy": 0.8260500636402206, "eval_loss": 0.7079195976257324, "eval_runtime": 7.7035, "eval_samples_per_second": 129.812, "eval_steps_per_second": 8.178, "step": 148800 }, { "epoch": 9.63, "grad_norm": 12.296592712402344, "learning_rate": 2.742737003058104e-06, "loss": 0.7018, "step": 148900 }, { "epoch": 9.64, "grad_norm": 7.029719829559326, "learning_rate": 2.6949541284403674e-06, "loss": 0.7246, "step": 149000 }, { "epoch": 9.64, "eval_accuracy": 0.8235042735042735, "eval_loss": 0.7714970707893372, "eval_runtime": 7.5428, "eval_samples_per_second": 132.577, "eval_steps_per_second": 8.352, "step": 149000 }, { "epoch": 9.64, "grad_norm": 9.380716323852539, "learning_rate": 2.64717125382263e-06, "loss": 0.7118, "step": 149100 }, { "epoch": 9.65, "grad_norm": 10.494823455810547, "learning_rate": 2.599388379204893e-06, "loss": 0.7005, "step": 149200 }, { "epoch": 9.65, "eval_accuracy": 0.8210347752332485, "eval_loss": 0.7164011001586914, "eval_runtime": 7.5889, "eval_samples_per_second": 131.772, "eval_steps_per_second": 8.302, "step": 149200 }, { "epoch": 9.65, "grad_norm": 6.685737609863281, "learning_rate": 2.551605504587156e-06, "loss": 0.7013, "step": 149300 }, { "epoch": 9.66, "grad_norm": 9.050190925598145, "learning_rate": 2.503822629969419e-06, "loss": 0.693, "step": 149400 }, { "epoch": 9.66, "eval_accuracy": 0.8197551709582103, "eval_loss": 0.8076345324516296, "eval_runtime": 7.8603, "eval_samples_per_second": 127.221, "eval_steps_per_second": 8.015, "step": 149400 }, { "epoch": 9.67, "grad_norm": 9.88638687133789, "learning_rate": 2.4560397553516823e-06, "loss": 0.7017, "step": 149500 }, { "epoch": 9.67, "grad_norm": 9.526541709899902, "learning_rate": 2.4082568807339453e-06, "loss": 0.7131, "step": 149600 }, { "epoch": 9.67, "eval_accuracy": 0.815345699831366, "eval_loss": 0.7799870371818542, "eval_runtime": 7.577, "eval_samples_per_second": 131.978, "eval_steps_per_second": 8.315, "step": 149600 }, { "epoch": 9.68, "grad_norm": 8.296611785888672, "learning_rate": 2.360474006116208e-06, "loss": 0.712, "step": 149700 }, { "epoch": 9.69, "grad_norm": 6.662433624267578, "learning_rate": 2.312691131498471e-06, "loss": 0.741, "step": 149800 }, { "epoch": 9.69, "eval_accuracy": 0.8093829247675401, "eval_loss": 0.8108393549919128, "eval_runtime": 8.2911, "eval_samples_per_second": 120.611, "eval_steps_per_second": 7.598, "step": 149800 }, { "epoch": 9.69, "grad_norm": 8.503557205200195, "learning_rate": 2.264908256880734e-06, "loss": 0.7286, "step": 149900 }, { "epoch": 9.7, "grad_norm": 8.654755592346191, "learning_rate": 2.2171253822629973e-06, "loss": 0.7189, "step": 150000 }, { "epoch": 9.7, "eval_accuracy": 0.8136170212765957, "eval_loss": 0.8295705318450928, "eval_runtime": 7.4618, "eval_samples_per_second": 134.015, "eval_steps_per_second": 8.443, "step": 150000 }, { "epoch": 9.71, "grad_norm": 6.883498191833496, "learning_rate": 2.1693425076452602e-06, "loss": 0.7217, "step": 150100 }, { "epoch": 9.71, "grad_norm": 8.103069305419922, "learning_rate": 2.1215596330275232e-06, "loss": 0.717, "step": 150200 }, { "epoch": 9.71, "eval_accuracy": 0.8273984080435693, "eval_loss": 0.7452117800712585, "eval_runtime": 8.1249, "eval_samples_per_second": 123.079, "eval_steps_per_second": 7.754, "step": 150200 }, { "epoch": 9.72, "grad_norm": 9.827529907226562, "learning_rate": 2.073776758409786e-06, "loss": 0.7145, "step": 150300 }, { "epoch": 9.73, "grad_norm": 7.635141849517822, "learning_rate": 2.0259938837920488e-06, "loss": 0.7639, "step": 150400 }, { "epoch": 9.73, "eval_accuracy": 0.8142006802721088, "eval_loss": 0.7829443216323853, "eval_runtime": 8.1678, "eval_samples_per_second": 122.431, "eval_steps_per_second": 7.713, "step": 150400 }, { "epoch": 9.73, "grad_norm": 7.6774983406066895, "learning_rate": 1.978211009174312e-06, "loss": 0.719, "step": 150500 }, { "epoch": 9.74, "grad_norm": 7.132800102233887, "learning_rate": 1.930428134556575e-06, "loss": 0.7279, "step": 150600 }, { "epoch": 9.74, "eval_accuracy": 0.8162574089754445, "eval_loss": 0.8080362677574158, "eval_runtime": 7.7309, "eval_samples_per_second": 129.351, "eval_steps_per_second": 8.149, "step": 150600 }, { "epoch": 9.75, "grad_norm": 10.469795227050781, "learning_rate": 1.8826452599388381e-06, "loss": 0.7477, "step": 150700 }, { "epoch": 9.75, "grad_norm": 7.547222137451172, "learning_rate": 1.8348623853211011e-06, "loss": 0.7225, "step": 150800 }, { "epoch": 9.75, "eval_accuracy": 0.8222128378378378, "eval_loss": 0.7676485180854797, "eval_runtime": 7.8819, "eval_samples_per_second": 126.872, "eval_steps_per_second": 7.993, "step": 150800 }, { "epoch": 9.76, "grad_norm": 7.592052936553955, "learning_rate": 1.787079510703364e-06, "loss": 0.7254, "step": 150900 }, { "epoch": 9.76, "grad_norm": 7.741146087646484, "learning_rate": 1.7392966360856269e-06, "loss": 0.699, "step": 151000 }, { "epoch": 9.76, "eval_accuracy": 0.8168836623267535, "eval_loss": 0.7707257866859436, "eval_runtime": 7.5962, "eval_samples_per_second": 131.645, "eval_steps_per_second": 8.294, "step": 151000 }, { "epoch": 9.77, "grad_norm": 7.196103096008301, "learning_rate": 1.6915137614678899e-06, "loss": 0.707, "step": 151100 }, { "epoch": 9.78, "grad_norm": 6.654726982116699, "learning_rate": 1.643730886850153e-06, "loss": 0.6629, "step": 151200 }, { "epoch": 9.78, "eval_accuracy": 0.820577164366374, "eval_loss": 0.7866977453231812, "eval_runtime": 7.8218, "eval_samples_per_second": 127.849, "eval_steps_per_second": 8.054, "step": 151200 }, { "epoch": 9.78, "grad_norm": 10.104480743408203, "learning_rate": 1.595948012232416e-06, "loss": 0.6799, "step": 151300 }, { "epoch": 9.79, "grad_norm": 7.979597091674805, "learning_rate": 1.548165137614679e-06, "loss": 0.6999, "step": 151400 }, { "epoch": 9.79, "eval_accuracy": 0.8137873055905843, "eval_loss": 0.8228275179862976, "eval_runtime": 8.3817, "eval_samples_per_second": 119.307, "eval_steps_per_second": 7.516, "step": 151400 }, { "epoch": 9.8, "grad_norm": 10.689424514770508, "learning_rate": 1.500382262996942e-06, "loss": 0.7038, "step": 151500 }, { "epoch": 9.8, "grad_norm": 8.917847633361816, "learning_rate": 1.452599388379205e-06, "loss": 0.7289, "step": 151600 }, { "epoch": 9.8, "eval_accuracy": 0.8061139028475712, "eval_loss": 0.8209790587425232, "eval_runtime": 7.5496, "eval_samples_per_second": 132.458, "eval_steps_per_second": 8.345, "step": 151600 }, { "epoch": 9.81, "grad_norm": 9.84281063079834, "learning_rate": 1.404816513761468e-06, "loss": 0.7042, "step": 151700 }, { "epoch": 9.82, "grad_norm": 8.299826622009277, "learning_rate": 1.357033639143731e-06, "loss": 0.6717, "step": 151800 }, { "epoch": 9.82, "eval_accuracy": 0.8222866611433306, "eval_loss": 0.7866979837417603, "eval_runtime": 7.7367, "eval_samples_per_second": 129.254, "eval_steps_per_second": 8.143, "step": 151800 }, { "epoch": 9.82, "grad_norm": 5.905616760253906, "learning_rate": 1.309250764525994e-06, "loss": 0.7279, "step": 151900 }, { "epoch": 9.83, "grad_norm": 11.690291404724121, "learning_rate": 1.261467889908257e-06, "loss": 0.6862, "step": 152000 }, { "epoch": 9.83, "eval_accuracy": 0.8238297872340425, "eval_loss": 0.7613499164581299, "eval_runtime": 7.6728, "eval_samples_per_second": 130.331, "eval_steps_per_second": 8.211, "step": 152000 }, { "epoch": 9.84, "grad_norm": 6.986787796020508, "learning_rate": 1.2136850152905199e-06, "loss": 0.6714, "step": 152100 }, { "epoch": 9.84, "grad_norm": 7.176079273223877, "learning_rate": 1.1659021406727829e-06, "loss": 0.7073, "step": 152200 }, { "epoch": 9.84, "eval_accuracy": 0.8051024675867837, "eval_loss": 0.8180536031723022, "eval_runtime": 7.5563, "eval_samples_per_second": 132.34, "eval_steps_per_second": 8.337, "step": 152200 }, { "epoch": 9.85, "grad_norm": 9.051605224609375, "learning_rate": 1.118119266055046e-06, "loss": 0.6926, "step": 152300 }, { "epoch": 9.86, "grad_norm": 11.95942211151123, "learning_rate": 1.0703363914373088e-06, "loss": 0.7237, "step": 152400 }, { "epoch": 9.86, "eval_accuracy": 0.7987473903966598, "eval_loss": 0.8719861507415771, "eval_runtime": 7.3578, "eval_samples_per_second": 135.91, "eval_steps_per_second": 8.562, "step": 152400 }, { "epoch": 9.86, "grad_norm": 11.356337547302246, "learning_rate": 1.0225535168195718e-06, "loss": 0.7053, "step": 152500 }, { "epoch": 9.87, "grad_norm": 32.668212890625, "learning_rate": 9.74770642201835e-07, "loss": 0.7238, "step": 152600 }, { "epoch": 9.87, "eval_accuracy": 0.8225198827961491, "eval_loss": 0.7187554836273193, "eval_runtime": 7.7396, "eval_samples_per_second": 129.206, "eval_steps_per_second": 8.14, "step": 152600 }, { "epoch": 9.87, "grad_norm": 9.082686424255371, "learning_rate": 9.269877675840978e-07, "loss": 0.6847, "step": 152700 }, { "epoch": 9.88, "grad_norm": 11.404294967651367, "learning_rate": 8.792048929663609e-07, "loss": 0.7053, "step": 152800 }, { "epoch": 9.88, "eval_accuracy": 0.819327731092437, "eval_loss": 0.7904229760169983, "eval_runtime": 8.05, "eval_samples_per_second": 124.223, "eval_steps_per_second": 7.826, "step": 152800 }, { "epoch": 9.89, "grad_norm": 11.163743019104004, "learning_rate": 8.31422018348624e-07, "loss": 0.7174, "step": 152900 }, { "epoch": 9.89, "grad_norm": 7.6204986572265625, "learning_rate": 7.836391437308868e-07, "loss": 0.6858, "step": 153000 }, { "epoch": 9.89, "eval_accuracy": 0.8233305156382079, "eval_loss": 0.7522888779640198, "eval_runtime": 7.6663, "eval_samples_per_second": 130.441, "eval_steps_per_second": 8.218, "step": 153000 }, { "epoch": 9.9, "grad_norm": 10.06446647644043, "learning_rate": 7.358562691131498e-07, "loss": 0.6881, "step": 153100 }, { "epoch": 9.91, "grad_norm": 6.690400123596191, "learning_rate": 6.880733944954129e-07, "loss": 0.6934, "step": 153200 }, { "epoch": 9.91, "eval_accuracy": 0.8171768707482994, "eval_loss": 0.7971600294113159, "eval_runtime": 8.0777, "eval_samples_per_second": 123.797, "eval_steps_per_second": 7.799, "step": 153200 }, { "epoch": 9.91, "grad_norm": 9.36272144317627, "learning_rate": 6.402905198776759e-07, "loss": 0.7353, "step": 153300 }, { "epoch": 9.92, "grad_norm": 8.29776668548584, "learning_rate": 5.925076452599389e-07, "loss": 0.7187, "step": 153400 }, { "epoch": 9.92, "eval_accuracy": 0.8138747884940778, "eval_loss": 0.8160595297813416, "eval_runtime": 7.7483, "eval_samples_per_second": 129.061, "eval_steps_per_second": 8.131, "step": 153400 }, { "epoch": 9.93, "grad_norm": 8.7488374710083, "learning_rate": 5.447247706422019e-07, "loss": 0.7179, "step": 153500 }, { "epoch": 9.93, "grad_norm": 9.236979484558105, "learning_rate": 4.969418960244648e-07, "loss": 0.6988, "step": 153600 }, { "epoch": 9.93, "eval_accuracy": 0.8261603375527427, "eval_loss": 0.7793501615524292, "eval_runtime": 7.802, "eval_samples_per_second": 128.172, "eval_steps_per_second": 8.075, "step": 153600 }, { "epoch": 9.94, "grad_norm": 8.467421531677246, "learning_rate": 4.4915902140672783e-07, "loss": 0.6932, "step": 153700 }, { "epoch": 9.95, "grad_norm": 8.666404724121094, "learning_rate": 4.0137614678899087e-07, "loss": 0.6887, "step": 153800 }, { "epoch": 9.95, "eval_accuracy": 0.8170266836086404, "eval_loss": 0.7777916193008423, "eval_runtime": 7.8134, "eval_samples_per_second": 127.985, "eval_steps_per_second": 8.063, "step": 153800 }, { "epoch": 9.95, "grad_norm": 10.528636932373047, "learning_rate": 3.5359327217125385e-07, "loss": 0.6971, "step": 153900 }, { "epoch": 9.96, "grad_norm": 8.976885795593262, "learning_rate": 3.0581039755351683e-07, "loss": 0.7082, "step": 154000 }, { "epoch": 9.96, "eval_accuracy": 0.8223628691983123, "eval_loss": 0.7097562551498413, "eval_runtime": 7.6258, "eval_samples_per_second": 131.133, "eval_steps_per_second": 8.261, "step": 154000 }, { "epoch": 9.97, "grad_norm": 12.30357551574707, "learning_rate": 2.580275229357798e-07, "loss": 0.7189, "step": 154100 }, { "epoch": 9.97, "grad_norm": 9.566823959350586, "learning_rate": 2.102446483180428e-07, "loss": 0.6767, "step": 154200 }, { "epoch": 9.97, "eval_accuracy": 0.8210970464135021, "eval_loss": 0.7847117185592651, "eval_runtime": 7.5289, "eval_samples_per_second": 132.822, "eval_steps_per_second": 8.368, "step": 154200 }, { "epoch": 9.98, "grad_norm": 11.999798774719238, "learning_rate": 1.624617737003058e-07, "loss": 0.7304, "step": 154300 }, { "epoch": 9.98, "grad_norm": 6.774813652038574, "learning_rate": 1.1467889908256882e-07, "loss": 0.7232, "step": 154400 }, { "epoch": 9.98, "eval_accuracy": 0.8188436830835117, "eval_loss": 0.8175400495529175, "eval_runtime": 7.6391, "eval_samples_per_second": 130.905, "eval_steps_per_second": 8.247, "step": 154400 }, { "epoch": 9.99, "grad_norm": 11.216461181640625, "learning_rate": 6.68960244648318e-08, "loss": 0.6794, "step": 154500 }, { "epoch": 10.0, "grad_norm": 9.180341720581055, "learning_rate": 1.9113149847094802e-08, "loss": 0.6781, "step": 154600 }, { "epoch": 10.0, "eval_accuracy": 0.8220910623946037, "eval_loss": 0.786108136177063, "eval_runtime": 7.8655, "eval_samples_per_second": 127.137, "eval_steps_per_second": 8.01, "step": 154600 } ], "logging_steps": 100, "max_steps": 154640, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 200, "total_flos": 1.804302319604119e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }